refactor streaming decoding

2025-09-18 21:44:18 +00:00 · 2022-06-09 20:37:16 +08:00 · 2022-06-09 20:37:16 +08:00 · 7f09720403
commit 7f09720403
parent 734d97c47b
4 changed files with 421 additions and 434 deletions
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/emformer.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/emformer.py
@ -41,8 +41,8 @@ LOG_EPSILON = math.log(1e-10)
 def unstack_states(
-    states,
+    states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]]
-) -> List[List[List[torch.Tensor]]]:
+) -> List[Tuple[List[List[torch.Tensor]], List[torch.Tensor]]]:
    # TODO: modify doc
    """Unstack the emformer state corresponding to a batch of utterances
    into a list of states, where the i-th entry is the state from the i-th
@ -50,18 +50,14 @@ def unstack_states(
    Args:
      states:
-        A list-of-list of tensors. ``len(states)`` equals to number of
+        A list-of-list of tensors.
-        layers in the emformer. ``states[i]]`` contains the states for
+        ``len(states[0])`` and ``len(states[1])`` equal the number of layers.
        the i-th layer. ``states[i][k]`` is either a 3-D tensor of shape
        ``(T, N, C)`` or a 2-D tensor of shape ``(C, N)``
    """
-    past_lens, attn_caches, conv_caches = states
+    attn_caches, conv_caches = states
-    batch_size = past_lens.size(0)
+    batch_size = conv_caches[0].size(0)
    num_layers = len(attn_caches)
    list_past_len = past_lens.tolist()
    list_attn_caches = [None] * batch_size
    for i in range(batch_size):
        list_attn_caches[i] = [[] for _ in range(num_layers)]
@ -81,14 +77,14 @@ def unstack_states(
    ans = [None] * batch_size
    for i in range(batch_size):
-        ans[i] = [list_past_len[i], list_attn_caches[i], list_conv_caches[i]]
+        ans[i] = [list_attn_caches[i], list_conv_caches[i]]
    return ans
 def stack_states(
-    state_list,
+    state_list: List[Tuple[List[List[torch.Tensor]], List[torch.Tensor]]]
-) -> List[List[torch.Tensor]]:
+) -> Tuple[List[List[torch.Tensor]], List[torch.Tensor]]:
    # TODO: modify doc
    """Stack list of emformer states that correspond to separate utterances
    into a single emformer state so that it can be used as an input for
@ -108,18 +104,15 @@ def stack_states(
    """
    batch_size = len(state_list)
    past_lens = [states[0] for states in state_list]
    past_lens = torch.tensor([past_lens])
    attn_caches = []
-    for layer in state_list[0][1]:
+    for layer in state_list[0][0]:
        if batch_size > 1:
            # Note: We will stack attn_caches[layer][s][] later to get attn_caches[layer][s]  # noqa
            attn_caches.append([[s] for s in layer])
        else:
            attn_caches.append([s.unsqueeze(1) for s in layer])
    for b, states in enumerate(state_list[1:], 1):
-        for li, layer in enumerate(states[1]):
+        for li, layer in enumerate(states[0]):
            for si, s in enumerate(layer):
                attn_caches[li][si].append(s)
                if b == batch_size - 1:
@ -128,19 +121,19 @@ def stack_states(
                    )
    conv_caches = []
-    for layer in state_list[0][2]:
+    for layer in state_list[0][1]:
        if batch_size > 1:
            # Note: We will stack conv_caches[layer][] later to get attn_caches[layer]  # noqa
            conv_caches.append([layer])
        else:
            conv_caches.append(layer.unsqueeze(0))
    for b, states in enumerate(state_list[1:], 1):
-        for li, layer in enumerate(states[2]):
+        for li, layer in enumerate(states[1]):
            conv_caches[li].append(layer)
            if b == batch_size - 1:
                conv_caches[li] = torch.stack(conv_caches[li], dim=0)
-    return [past_lens, attn_caches, conv_caches]
+    return [attn_caches, conv_caches]
 class ConvolutionModule(nn.Module):
@ -1489,13 +1482,12 @@ class EmformerEncoder(nn.Module):
        self,
        x: torch.Tensor,
        lengths: torch.Tensor,
-        states: List[
+        num_processed_frames: torch.Tensor,
-            torch.Tensor, List[List[torch.Tensor]], List[torch.Tensor]
+        states: Tuple[Tuple[List[List[torch.Tensor]], List[torch.Tensor]]],
        ],
    ) -> Tuple[
        torch.Tensor,
        torch.Tensor,
-        List[torch.Tensor, List[List[torch.Tensor]], List[torch.Tensor]],
+        Tuple[Tuple[List[List[torch.Tensor]], List[torch.Tensor]]],
    ]:
        """Forward pass for streaming inference.
@ -1526,10 +1518,9 @@ class EmformerEncoder(nn.Module):
              right_context at the end.
            - updated states from current chunk's computation.
        """
-        past_lens = states[0]
+        assert num_processed_frames.shape == (x.size(1),)
        assert past_lens.shape == (x.size(1),), past_lens.shape
-        attn_caches = states[1]
+        attn_caches = states[0]
        assert len(attn_caches) == self.num_encoder_layers, len(attn_caches)
        for i in range(len(attn_caches)):
            assert attn_caches[i][0].shape == (
@ -1548,24 +1539,23 @@ class EmformerEncoder(nn.Module):
                self.d_model,
            ), attn_caches[i][2].shape
-        conv_caches = states[2]
+        conv_caches = states[1]
        assert len(conv_caches) == self.num_encoder_layers, len(conv_caches)
        for i in range(len(conv_caches)):
            assert conv_caches[i].shape == (
                x.size(1),
                self.d_model,
-                self.cnn_module_kernel,
+                self.cnn_module_kernel - 1,
            ), conv_caches[i].shape
-        assert x.size(0) == self.chunk_length + self.right_context_length, (
+        # assert x.size(0) == self.chunk_length + self.right_context_length, (
-            "Per configured chunk_length and right_context_length, "
+        #    "Per configured chunk_length and right_context_length, "
-            f"expected size of {self.chunk_length + self.right_context_length} "
+        #    f"expected size of {self.chunk_length + self.right_context_length} "
-            f"for dimension 1 of x, but got {x.size(1)}."
+        #    f"for dimension 1 of x, but got {x.size(0)}."
-        )
+        # )
-        right_context_start_idx = x.size(0) - self.right_context_length
+        right_context = x[-self.right_context_length :]
-        right_context = x[right_context_start_idx:]
+        utterance = x[: -self.right_context_length]
        utterance = x[:right_context_start_idx]
        output_lengths = torch.clamp(lengths - self.right_context_length, min=0)
        memory = (
            self.init_memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
@ -1574,29 +1564,29 @@ class EmformerEncoder(nn.Module):
        )
        # calculate padding mask to mask out initial zero caches
-        chunk_mask = make_pad_mask(output_lengths)
+        # chunk_mask = make_pad_mask(output_lengths).to(x.device)
-        memory_mask = (
+        # memory_mask = (
-            (past_lens // self.chunk_length).view(x.size(1), 1)
+        #    (past_lens // self.chunk_length).view(x.size(1), 1)
-            <= torch.arange(self.memory_size, device=x.device).expand(
+        #    <= torch.arange(self.memory_size, device=x.device).expand(
-                x.size(1), self.memory_size
+        #        x.size(1), self.memory_size
-            )
+        #    )
-        ).flip(1)
+        # ).flip(1)
-        left_context_mask = (
+        # left_context_mask = (
-            past_lens.view(x.size(1), 1)
+        #    past_lens.view(x.size(1), 1)
-            <= torch.arange(self.left_context_length, device=x.device).expand(
+        #    <= torch.arange(self.left_context_length, device=x.device).expand(
-                x.size(1), self.left_context_length
+        #        x.size(1), self.left_context_length
-            )
+        #    )
-        ).flip(1)
+        # ).flip(1)
-        right_context_mask = torch.zeros(
+        # right_context_mask = torch.zeros(
-            x.size(1),
+        #    x.size(1),
-            self.right_context_length,
+        #    self.right_context_length,
-            dtype=torch.bool,
+        #    dtype=torch.bool,
-            device=x.device,
+        #    device=x.device,
-        )
+        # )
-        padding_mask = torch.cat(
+        # padding_mask = torch.cat(
-            [memory_mask, left_context_mask, right_context_mask, chunk_mask],
+        #    [memory_mask, left_context_mask, right_context_mask, chunk_mask],
-            dim=1,
+        #    dim=1,
-        )
+        # )
        output = utterance
        output_attn_caches: List[List[torch.Tensor]] = []
@ -1612,19 +1602,14 @@ class EmformerEncoder(nn.Module):
                output,
                right_context,
                memory,
-                padding_mask=padding_mask,
+                # padding_mask=padding_mask,
                attn_cache=attn_caches[layer_idx],
                conv_cache=conv_caches[layer_idx],
            )
            output_attn_caches.append(output_attn_cache)
            output_conv_caches.append(output_conv_cache)
-        output_past_lens = past_lens + output_lengths
+        output_states = [output_attn_caches, output_conv_caches]
        output_states = [
            output_past_lens,
            output_attn_caches,
            output_conv_caches,
        ]
        return output, output_lengths, output_states
@ -1738,6 +1723,7 @@ class Emformer(EncoderInterface):
        self,
        x: torch.Tensor,
        x_lens: torch.Tensor,
        num_processed_frames: torch.Tensor,
        states: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        """Forward pass for streaming inference.
@ -1770,16 +1756,17 @@ class Emformer(EncoderInterface):
            - updated states from current chunk's computation.
        """
        x = self.encoder_embed(x)
        # drop the first and last frames
        x = x[:, 1:-1, :]
        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
        # Caution: We assume the subsampling factor is 4!
-        with warnings.catch_warnings():
+        x_lens = (((x_lens - 1) >> 1) - 1) >> 1
-            warnings.simplefilter("ignore")
+        x_lens -= 2
            x_lens = ((x_lens - 1) // 2 - 1) // 2
        assert x.size(0) == x_lens.max().item()
        output, output_lengths, output_states = self.encoder.infer(
-            x, x_lens, states
+            x, x_lens, num_processed_frames, states
        )
        output = output.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/stream.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/stream.py
@ -18,7 +18,7 @@ import math
 from typing import List, Optional, Tuple
 import torch
-from beam_search import HypothesisList
+from beam_search import Hypothesis, HypothesisList
 from kaldifeat import FbankOptions, OnlineFbank, OnlineFeature
 from icefall.utils import AttributeDict
@ -48,6 +48,7 @@ class Stream(object):
        self,
        params: AttributeDict,
        device: torch.device = torch.device("cpu"),
        LOG_EPS: float = math.log(1e-10),
    ) -> None:
        """
        Args:
@ -57,11 +58,14 @@ class Stream(object):
            The device to run this stream.
        """
        self.device = device
        self.LOG_EPS = LOG_EPS
        # Containing attention caches and convolution caches
-        self.states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]] = None
+        self.states: Optional[
            Tuple[List[List[torch.Tensor]], List[torch.Tensor]]
        ] = None
        # Initialize zero states.
-        self.init_states()
+        self.init_states(params)
        # It uses different attributes for different decoding methods.
        self.context_size = params.context_size
@ -70,6 +74,12 @@ class Stream(object):
            self.hyp = [params.blank_id] * params.context_size
        elif params.decoding_method == "modified_beam_search":
            self.hyps = HypothesisList()
            self.hyps.add(
                Hypothesis(
                    ys=[params.blank_id] * params.context_size,
                    log_prob=torch.zeros(1, dtype=torch.float32, device=device),
                )
            )
        else:
            raise ValueError(
                f"Unsupported decoding method: {params.decoding_method}"
@ -77,7 +87,7 @@ class Stream(object):
        self.ground_truth: str = ""
-        self.feature: torch.Tensor = None
+        self.feature: Optional[torch.Tensor] = None
        # Make sure all feature frames can be used.
        # Add 2 here since we will drop the first and last after subsampling.
        self.chunk_length = params.chunk_length
@ -91,14 +101,14 @@ class Stream(object):
        self._done = False
    def set_feature(self, feature: torch.Tensor) -> None:
-        assert feature.dim == 2, feature.dim
+        assert feature.dim() == 2, feature.dim()
        self.num_frames = feature.size(0)
        # tail padding
        self.feature = torch.nn.functional.pad(
            feature,
            (0, 0, 0, self.pad_length),
            mode="constant",
-            value=math.log(1e-10),
+            value=self.LOG_EPS,
        )
    def set_ground_truth(self, ground_truth: str) -> None:
@ -140,9 +150,11 @@ class Stream(object):
        )
        ret_length = update_length + self.pad_length
-        ret_feature = self.feature[:ret_length]
+        ret_feature = self.feature[
            self.num_processed_frames : self.num_processed_frames + ret_length
        ]
        # Cut off used frames.
-        self.feature = self.feature[update_length:]
+        # self.feature = self.feature[update_length:]
        self.num_processed_frames += update_length
        if self.num_processed_frames >= self.num_frames:
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/streaming_decode.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/streaming_decode.py
@ -18,9 +18,10 @@
 import argparse
 import logging
 import math
 import warnings
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 import k2
 from lhotse import CutSet
@ -31,15 +32,24 @@ import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
 from beam_search import Hypothesis, HypothesisList, get_hyps_shape
 from emformer import LOG_EPSILON, stack_states, unstack_states
-from streaming_feature_extractor import Stream
+from kaldifeat import Fbank, FbankOptions
 from stream import Stream
 from torch.nn.utils.rnn import pad_sequence
 from train import add_model_arguments, get_params, get_transducer_model
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
 )
-from icefall.utils import AttributeDict, setup_logger
+from icefall.utils import (
    AttributeDict,
    setup_logger,
    store_transcripts,
    str2bool,
    write_error_stats,
 )
 def get_parser():
@ -55,6 +65,16 @@ def get_parser():
        "Note: Epoch counts from 0.",
    )
    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )
    parser.add_argument(
        "--avg",
        type=int,
@ -65,14 +85,14 @@ def get_parser():
    )
    parser.add_argument(
-        "--avg-last-n",
+        "--use-averaged-model",
-        type=int,
+        type=str2bool,
-        default=0,
+        default=False,
-        help="""If positive, --epoch and --avg are ignored and it
+        help="Whether to load averaged model. Currently it only supports "
-        will use the last n checkpoints exp_dir/checkpoint-xxx.pt
+        "using --epoch. If True, it would decode with the averaged model "
-        where xxx is the number of processed batches while
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
-        saving that checkpoint.
+        "Actually only the models with epoch number of `epoch-avg` and "
-        """,
+        "`epoch` are loaded for averaging. ",
    )
    parser.add_argument(
@ -172,52 +192,39 @@ def get_parser():
 def greedy_search(
    model: nn.Module,
    streams: List[Stream],
    encoder_out: torch.Tensor,
-    sp: spm.SentencePieceProcessor,
+    streams: List[Stream],
-):
+) -> List[List[int]]:
-    """
+
    Args:
      model:
        The RNN-T model.
      streams:
        A list of stream objects.
      encoder_out:
        A 3-D tensor of shape (N, T, encoder_out_dim) containing the output of
        the encoder model.
      sp:
        The BPE model.
    """
    assert len(streams) == encoder_out.size(0)
    assert encoder_out.ndim == 3
    blank_id = model.decoder.blank_id
    context_size = model.decoder.context_size
-    device = model.device
+    device = next(model.parameters()).device
    T = encoder_out.size(1)
-    if streams[0].decoder_out is None:
+    decoder_input = torch.tensor(
-        for stream in streams:
+        [stream.hyp[-context_size:] for stream in streams],
-            stream.hyp = [blank_id] * context_size
+        device=device,
-        decoder_input = torch.tensor(
+        dtype=torch.int64,
-            [stream.hyp[-context_size:] for stream in streams],
+    )
-            device=device,
+    # decoder_out is of shape (N, decoder_out_dim)
-            dtype=torch.int64,
+    decoder_out = model.decoder(decoder_input, need_pad=False)
-        )
+    decoder_out = model.joiner.decoder_proj(decoder_out)
-        decoder_out = model.decoder(decoder_input, need_pad=False).squeeze(1)
+    # logging.info(f"decoder_out shape : {decoder_out.shape}")
        # decoder_out is of shape (N, decoder_out_dim)
    else:
        decoder_out = torch.stack(
            [stream.decoder_out for stream in streams],
            dim=0,
        )
    for t in range(T):
-        current_encoder_out = encoder_out[:, t]
+        # current_encoder_out's shape: (batch_size, 1, encoder_out_dim)
-        # current_encoder_out's shape: (batch_size, encoder_out_dim)
+        current_encoder_out = encoder_out[:, t : t + 1, :]  # noqa
-        logits = model.joiner(current_encoder_out, decoder_out)
+        logits = model.joiner(
            current_encoder_out.unsqueeze(2),
            decoder_out.unsqueeze(1),
            project_input=False,
        )
        # logits'shape (batch_size,  vocab_size)
        logits = logits.squeeze(1).squeeze(1)
        assert logits.ndim == 2, logits.shape
        y = logits.argmax(dim=1).tolist()
@ -236,227 +243,64 @@ def greedy_search(
            decoder_out = model.decoder(
                decoder_input,
                need_pad=False,
            ).squeeze(1)
            for k, stream in enumerate(streams):
                result = sp.decode(stream.decoding_result())
                logging.info(f"Partial result {k}:\n{result}")
    decoder_out_list = decoder_out.unbind(dim=0)
    for i, d in enumerate(decoder_out_list):
        streams[i].decoder_out = d
 def modified_beam_search(
    model: nn.Module,
    streams: List[Stream],
    encoder_out: torch.Tensor,
    sp: spm.SentencePieceProcessor,
    beam: int = 4,
 ):
    """
    Args:
      model:
        The RNN-T model.
      streams:
        A list of stream objects.
      encoder_out:
        A 3-D tensor of shape (N, T, encoder_out_dim) containing the output of
        the encoder model.
      sp:
        The BPE model.
      beam:
        Number of active paths during the beam search.
    """
    assert encoder_out.ndim == 3, encoder_out.shape
    assert len(streams) == encoder_out.size(0)
    blank_id = model.decoder.blank_id
    context_size = model.decoder.context_size
    device = model.device
    batch_size = len(streams)
    T = encoder_out.size(1)
    for stream in streams:
        if len(stream.hyps) == 0:
            stream.hyps.add(
                Hypothesis(
                    ys=[blank_id] * context_size,
                    log_prob=torch.zeros(1, dtype=torch.float32, device=device),
                )
            )
-    B = [stream.hyps for stream in streams]
+            decoder_out = model.joiner.decoder_proj(decoder_out)
    for t in range(T):
        current_encoder_out = encoder_out[:, t]
        # current_encoder_out's shape: (batch_size, encoder_out_dim)
        hyps_shape = get_hyps_shape(B).to(device)
        A = [list(b) for b in B]
        B = [HypothesisList() for _ in range(batch_size)]
        ys_log_probs = torch.stack(
            [hyp.log_prob.reshape(1) for hyps in A for hyp in hyps], dim=0
        )  # (num_hyps, 1)
        decoder_input = torch.tensor(
            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
            device=device,
            dtype=torch.int64,
        )  # (num_hyps, context_size)
        decoder_out = model.decoder(decoder_input, need_pad=False).squeeze(1)
        # decoder_out is of shape (num_hyps, decoder_output_dim)
        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
        # as index, so we use `to(torch.int64)` below.
        current_encoder_out = torch.index_select(
            current_encoder_out,
            dim=0,
            index=hyps_shape.row_ids(1).to(torch.int64),
        )  # (num_hyps, encoder_out_dim)
        logits = model.joiner(current_encoder_out, decoder_out)
        # logits is of shape (num_hyps, vocab_size)
        log_probs = logits.log_softmax(dim=-1)  # (num_hyps, vocab_size)
        log_probs.add_(ys_log_probs)
        vocab_size = log_probs.size(-1)
        log_probs = log_probs.reshape(-1)
        row_splits = hyps_shape.row_splits(1) * vocab_size
        log_probs_shape = k2.ragged.create_ragged_shape2(
            row_splits=row_splits, cached_tot_size=log_probs.numel()
        )
        ragged_log_probs = k2.RaggedTensor(
            shape=log_probs_shape, value=log_probs
        )
        for i in range(batch_size):
            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
                topk_token_indexes = (topk_indexes % vocab_size).tolist()
            for k in range(len(topk_hyp_indexes)):
                hyp_idx = topk_hyp_indexes[k]
                hyp = A[i][hyp_idx]
                new_ys = hyp.ys[:]
                new_token = topk_token_indexes[k]
                if new_token != blank_id:
                    new_ys.append(new_token)
                new_log_prob = topk_log_probs[k]
                new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob)
                B[i].add(new_hyp)
            streams[i].hyps = B[i]
            result = sp.decode(streams[i].decoding_result())
            logging.info(f"Partial result {i}:\n{result}")
-def build_batch(
+def decode_one_chunk(
    decode_steams: List[Stream],
    chunk_length: int,
    segment_length: int,
 ) -> Tuple[
    Optional[torch.Tensor],
    Optional[torch.tensor],
    Optional[List[Stream]],
 ]:
    """
    Args:
      chunk_length:
        Number of frames for each chunk. It equals to
        ``segment_length + right_context_length``.
      segment_length
        Number of frames for each segment.
    Returns:
      Return a tuple containing:
        - features, a 3-D tensor of shape ``(num_active_streams, T, C)``
        - active_streams, a list of active streams. We say a stream is
          active when it has enough feature frames to be fed into the
          encoder model.
    """
    feature_list = []
    length_list = []
    stream_list = []
    for stream in decode_steams:
        if len(stream.feature_frames) >= chunk_length:
            # this_chunk is a list of tensors, each of which
            # has a shape (1, feature_dim)
            chunk = stream.feature_frames[:chunk_length]
            stream.feature_frames = stream.feature_frames[segment_length:]
            features = torch.cat(chunk, dim=0)
            feature_list.append(features)
            length_list.append(chunk_length)
            stream_list.append(stream)
        elif stream.done and len(stream.feature_frames) > 0:
            chunk = stream.feature_frames[:chunk_length]
            stream.feature_frames = []
            features = torch.cat(chunk, dim=0)
            length_list.append(features.size(0))
            features = torch.nn.functional.pad(
                features,
                (0, 0, 0, chunk_length - features.size(0)),
                mode="constant",
                value=LOG_EPSILON,
            )
            feature_list.append(features)
            stream_list.append(stream)
    if len(feature_list) == 0:
        return None, None, None
    features = torch.stack(feature_list, dim=0)
    lengths = torch.cat(length_list)
    return features, lengths, stream_list
 def process_features(
    model: nn.Module,
    features: torch.Tensor,
    feature_lens: torch.Tensor,
    streams: List[Stream],
    params: AttributeDict,
    sp: spm.SentencePieceProcessor,
-) -> None:
+) -> List[int]:
-    """Process features for each stream in parallel.
+    device = next(model.parameters()).device
-    Args:
+    feature_list = []
-      model:
+    feature_len_list = []
-        The RNN-T model.
+    state_list = []
-      features:
+    num_processed_frames_list = []
        A 3-D tensor of shape (N, T, C).
      streams:
        A list of streams of size (N,).
      params:
        It is the return value of :func:`get_params`.
      sp:
        The BPE model.
    """
    assert features.ndim == 3
    assert features.size(0) == len(streams)
    assert feature_lens.size(0) == len(streams)
-    device = model.device
+    for stream in streams:
-    features = features.to(device)
+        feature, feature_len = stream.get_feature_chunk()
        feature_list.append(feature)
        feature_len_list.append(feature_len)
        state_list.append(stream.states)
        num_processed_frames_list.append(stream.num_processed_frames)
-    state_list = [stream.states for stream in streams]
+    features = pad_sequence(
        feature_list, batch_first=True, padding_value=LOG_EPSILON
    ).to(device)
    feature_lens = torch.tensor(feature_len_list, device=device)
    num_processed_frames = torch.tensor(
        num_processed_frames_list, device=device
    )
    # Make sure it has at least 1 frame after subsampling, first-and-last-frame cutting, and right context cutting  # noqa
    tail_length = (
        3 * params.subsampling_factor + params.right_context_length + 3
    )
    if features.size(1) < tail_length:
        pad_length = tail_length - features.size(1)
        feature_lens += pad_length
        features = torch.nn.functional.pad(
            features,
            (0, 0, 0, pad_length),
            mode="constant",
            value=LOG_EPSILON,
        )
    # print(features.shape)
    # stack states of all streams
    states = stack_states(state_list)
    encoder_out, encoder_out_lens, states = model.encoder.infer(
-        features,
+        x=features,
-        feature_lens,
+        x_lens=feature_lens,
-        states,
+        states=states,
        num_processed_frames=num_processed_frames,
    )
    encoder_out = model.joiner.encoder_proj(encoder_out)
    # update cached states of each stream
    state_list = unstack_states(states)
    for i, s in enumerate(state_list):
        streams[i].states = s
@ -466,26 +310,47 @@ def process_features(
            model=model,
            streams=streams,
            encoder_out=encoder_out,
            sp=sp,
        )
    elif params.decoding_method == "modified_beam_search":
        modified_beam_search(
            model=model,
            streams=streams,
            encoder_out=encoder_out,
            sp=sp,
            beam=params.beam_size,
        )
    # elif params.decoding_method == "modified_beam_search":
    #    modified_beam_search(
    #        model=model,
    #        streams=streams,
    #        encoder_out=encoder_out,
    #        sp=sp,
    #        beam=params.beam_size,
    #    )
    else:
        raise ValueError(
            f"Unsupported decoding method: {params.decoding_method}"
        )
    finished_streams = [i for i, stream in enumerate(streams) if stream.done]
    return finished_streams
 def create_streaming_feature_extractor() -> Fbank:
    """Create a CPU streaming feature extractor.
    At present, we assume it returns a fbank feature extractor with
    fixed options. In the future, we will support passing in the options
    from outside.
    Returns:
      Return a CPU streaming feature extractor.
    """
    opts = FbankOptions()
    opts.device = "cpu"
    opts.frame_opts.dither = 0
    opts.frame_opts.snip_edges = False
    opts.frame_opts.samp_freq = 16000
    opts.mel_opts.num_bins = 80
    return Fbank(opts)
 def decode_dataset(
    params: AttributeDict,
    cuts: CutSet,
    model: nn.Module,
    params: AttributeDict,
    sp: spm.SentencePieceProcessor,
 ):
    """Decode dataset.
@ -493,72 +358,126 @@ def decode_dataset(
    """
    device = next(model.parameters()).device
-    # number of frames before subsampling
+    opts = FbankOptions()
-    segment_length = model.encoder.segment_length
+    opts.device = device
-    right_context_length = model.encoder.right_context_length
+    opts.frame_opts.dither = 0
-    # 5 = 3 + 2
+    opts.frame_opts.snip_edges = False
-    # 1) add 3 here since the subsampling method is using
+    opts.frame_opts.samp_freq = 16000
-    #    ((len - 1) // 2 - 1) // 2)
+    opts.mel_opts.num_bins = 80
-    # 2) add 2 here we will drop first and last frame after subsampling
+
-    chunk_length = (segment_length + 5) + right_context_length
+    log_interval = 300
    decode_results = []
    streams = []
    for num, cut in enumerate(cuts):
        # Each utterance has a Stream.
        stream = Stream(params=params, device=device, LOG_EPS=LOG_EPSILON)
        audio: np.ndarray = cut.load_audio()
        # audio.shape: (1, num_samples)
        assert len(audio.shape) == 2
        assert audio.shape[0] == 1, "Should be single channel"
        assert audio.dtype == np.float32, audio.dtype
        # The trained model is using normalized samples
        assert audio.max() <= 1, "Should be normalized to [-1, 1])"
        samples = torch.from_numpy(audio).squeeze(0)
        fbank = create_streaming_feature_extractor()
        feature = fbank(samples)
        stream.set_feature(feature)
        stream.set_ground_truth(cut.supervisions[0].text)
        # Each utterance has a Stream
        stream = Stream(
            params=params,
            audio_sample=samples,
            ground_truth=cut.supervisions[0].text,
            device=device,
        )
        streams.append(stream)
        while len(streams) >= params.num_decode_streams:
-            for stream in streams:
+            finished_streams = decode_one_chunk(
-                stream.accept_waveform()
+                model=model,
-
+                streams=streams,
-            # try to build batch
+                params=params,
-            features, active_streams = build_batch(
+                sp=sp,
                chunk_length=chunk_length,
                segment_length=segment_length,
            )
            if features is not None:
                process_features(
                    model=model,
                    features=features,
                    streams=active_streams,
                    params=params,
                    sp=sp,
                )
-            new_streams = []
+            for i in sorted(finished_streams, reverse=True):
-            for stream in streams:
+                decode_results.append(
-                if stream.done:
+                    (
-                    decode_results.append(
+                        streams[i].ground_truth.split(),
-                        (
+                        sp.decode(streams[i].decoding_result()).split(),
                            stream.ground_truth.split(),
                            sp.decode(stream.decoding_result()).split(),
                        )
                    )
-                else:
+                )
-                    new_streams.append(stream)
+                print(decode_results[-1])
-            del streams
+                del streams[i]
-            streams = new_streams
+                # print("delete", i, len(streams))
        if num % log_interval == 0:
            logging.info(f"Cuts processed until now is {num}.")
    while len(streams) > 0:
        finished_streams = decode_one_chunk(
            model=model,
            streams=streams,
            params=params,
            sp=sp,
        )
        for i in sorted(finished_streams, reverse=True):
            decode_results.append(
                (
                    streams[i].ground_truth.split(),
                    sp.decode(streams[i].decoding_result()).split(),
                )
            )
            del streams[i]
    if params.decoding_method == "greedy_search":
        return {"greedy_search": decode_results}
    else:
        return {f"beam_size_{params.beam_size}": decode_results}
 def save_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[List[str], List[str]]]],
 ):
    test_set_wers = dict()
    for key, results in results_dict.items():
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        store_transcripts(filename=recog_path, texts=sorted(results))
        logging.info(f"The transcripts are stored in {recog_path}")
        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
        errs_filename = (
            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
                f, f"{test_set_name}-{key}", results, enable_log=True
            )
            test_set_wers[key] = wer
        logging.info("Wrote detailed error stats to {}".format(errs_filename))
    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    errs_info = (
        params.res_dir
        / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
    )
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
        for key, val in test_set_wers:
            print("{}\t{}".format(key, val), file=f)
    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
    note = "\tbest for {}".format(test_set_name)
    for key, val in test_set_wers:
        s += "{}\t{}{}\n".format(key, val, note)
        note = ""
    logging.info(s)
@torch.no_grad()
 def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
@ -571,6 +490,32 @@ def main():
    # Note: params.decoding_method is currently not used.
    params.res_dir = params.exp_dir / "streaming" / params.decoding_method
    if params.iter > 0:
        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
    # for streaming
    params.suffix += f"-streaming-chunk-length-{params.chunk_length}"
    params.suffix += f"-left-context-length-{params.left_context_length}"
    params.suffix += f"-right-context-length-{params.right_context_length}"
    params.suffix += f"-memory-size-{params.memory_size}"
    if "fast_beam_search" in params.decoding_method:
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
    elif "beam_search" in params.decoding_method:
        params.suffix += (
            f"-{params.decoding_method}-beam-size-{params.beam_size}"
        )
    else:
        params.suffix += f"-context-{params.context_size}"
        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
    if params.use_averaged_model:
        params.suffix += "-use-averaged-model"
    setup_logger(f"{params.res_dir}/log-streaming-decode")
    logging.info("Decoding started")
@ -595,24 +540,83 @@ def main():
    logging.info("About to create model")
    model = get_transducer_model(params)
-    if params.avg_last_n > 0:
+    if not params.use_averaged_model:
-        filenames = find_checkpoints(params.exp_dir)[: params.avg_last_n]
+        if params.iter > 0:
-        logging.info(f"averaging {filenames}")
+            filenames = find_checkpoints(
-        model.to(device)
+                params.exp_dir, iteration=-params.iter
-        model.load_state_dict(average_checkpoints(filenames, device=device))
+            )[: params.avg]
-    elif params.avg == 1:
+            if len(filenames) == 0:
-        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if i >= 1:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
-        start = params.epoch - params.avg + 1
+        if params.iter > 0:
-        filenames = []
+            filenames = find_checkpoints(
-        for i in range(start, params.epoch + 1):
+                params.exp_dir, iteration=-params.iter
-            if start >= 0:
+            )[: params.avg + 1]
-                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            if len(filenames) == 0:
-        logging.info(f"averaging {filenames}")
+                raise ValueError(
-        model.to(device)
+                    f"No checkpoints found for"
-        model.load_state_dict(average_checkpoints(filenames, device=device))
+                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
    model.to(device)
    model.eval()
    model.device = device
@ -622,42 +626,26 @@ def main():
    librispeech = LibriSpeechAsrDataModule(args)
    test_clean_cuts = librispeech.test_clean_cuts()
    test_other_cuts = librispeech.test_other_cuts()
-    batch_size = 3
+    test_sets = ["test-clean", "test-other"]
    test_cuts = [test_clean_cuts, test_other_cuts]
-    ground_truth = []
+    for test_set, test_cut in zip(test_sets, test_cuts):
-    batched_samples = []
+        results_dict = decode_dataset(
-    for num, cut in enumerate(test_clean_cuts):
+            cuts=test_cut,
-        audio: np.ndarray = cut.load_audio()
+            model=model,
-        # audio.shape: (1, num_samples)
+            params=params,
-        assert len(audio.shape) == 2
+            sp=sp,
-        assert audio.shape[0] == 1, "Should be single channel"
+        )
        assert audio.dtype == np.float32, audio.dtype
-        # The trained model is using normalized samples
+        save_results(
-        assert audio.max() <= 1, "Should be normalized to [-1, 1])"
+            params=params,
            test_set_name=test_set,
            results_dict=results_dict,
        )
-        samples = torch.from_numpy(audio).squeeze(0)
+    logging.info("Done!")
        # batched_samples.append(samples)
        # ground_truth.append(cut.supervisions[0].text)
        if len(batched_samples) >= batch_size:
            decoded_results = decode_batch(
                batched_samples=batched_samples,
                model=model,
                params=params,
                sp=sp,
            )
            s = "\n"
            for i, (hyp, ref) in enumerate(zip(decoded_results, ground_truth)):
                s += f"hyp {i}:\n{hyp}\n"
                s += f"ref {i}:\n{ref}\n\n"
            logging.info(s)
            batched_samples = []
            ground_truth = []
            # break after processing the first batch for test purposes
            break
 if __name__ == "__main__":
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py
@ -449,7 +449,7 @@ def get_encoder_model(params: AttributeDict) -> nn.Module:
        cnn_module_kernel=params.cnn_module_kernel,
        left_context_length=params.left_context_length,
        right_context_length=params.right_context_length,
-        max_memory_size=params.memory_size,
+        memory_size=params.memory_size,
    )
    return encoder