Add streaming feature extractor.

2025-12-11 06:55:27 +00:00 · 2022-04-10 23:07:41 +08:00 · 2022-04-10 23:07:41 +08:00 · f16b759397
commit f16b759397
parent 189ca555b1
6 changed files with 515 additions and 94 deletions
--- a/.flake8
+++ b/.flake8
@ -14,3 +14,7 @@ exclude =
  .git,
  **/data/**,
  icefall/shared/make_kn_lm.py
 ignore =
  # E203 whitespace before ':'
  E203,
--- a/egs/librispeech/ASR/transducer_emformer/emformer.py
+++ b/egs/librispeech/ASR/transducer_emformer/emformer.py
@ -63,11 +63,11 @@ class Emformer(EncoderInterface):
          num_encoder_layers:
            Number of encoder layers.
          segment_length:
-            Number of frames per segment.
+            Number of frames per segment before subsampling.
          left_context_length:
-            Number of frames in the left context.
+            Number of frames in the left context before subsampling.
          right_context_length:
-            Number of frames in the right context.
+            Number of frames in the right context before subsampling.
          max_memory_size:
            TODO.
          dropout:
@ -94,6 +94,7 @@ class Emformer(EncoderInterface):
        else:
            self.encoder_embed = Conv2dSubsampling(num_features, d_model)
        self.segment_length = segment_length
        self.right_context_length = right_context_length
        assert right_context_length % subsampling_factor == 0
--- a/egs/librispeech/ASR/transducer_emformer/export.py
+++ b/egs/librispeech/ASR/transducer_emformer/export.py
@ -0,0 +1,184 @@
 #!/usr/bin/env python3
 #
 # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This script converts several saved checkpoints
 # to a single one using model averaging.
 """
 Usage:
 ./transducer_emformer/export.py \
  --exp-dir ./transducer_emformer/exp \
  --bpe-model data/lang_bpe_500/bpe.model \
  --epoch 20 \
  --avg 10
 It will generate a file exp_dir/pretrained.pt
 To use the generated file with `transducer_emformer/decode.py`,
 you can do:
    cd /path/to/exp_dir
    ln -s pretrained.pt epoch-9999.pt
    cd /path/to/egs/librispeech/ASR
    ./transducer_emformer/decode.py \
        --exp-dir ./transducer_emformer/exp \
        --epoch 9999 \
        --avg 1 \
        --max-duration 1000 \
        --bpe-model data/lang_bpe_500/bpe.model
 """
 import argparse
 import logging
 from pathlib import Path
 import sentencepiece as spm
 import torch
 from train import add_model_arguments, get_params, get_transducer_model
 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.utils import str2bool
 def get_parser():
     """Create the argument parser of the export script.
     The options registered by ``add_model_arguments`` are added on top of
     the ones declared here.
     Returns:
       The populated ``argparse.ArgumentParser``.
     """
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
     parser.add_argument(
         "--epoch",
         type=int,
         default=28,
         # The trailing space keeps the two concatenated literals from
         # being rendered as "...decoding.Note: ...".
         help="It specifies the checkpoint to use for decoding. "
         "Note: Epoch counts from 0.",
     )
     parser.add_argument(
         "--avg",
         type=int,
         default=15,
         help="Number of checkpoints to average. Automatically select "
         "consecutive checkpoints before the checkpoint specified by "
         "'--epoch'. ",
     )
     # NOTE(review): this default names pruned_transducer_stateless/exp while
     # the usage text of this script passes ./transducer_emformer/exp --
     # confirm which directory is intended before changing it.
     parser.add_argument(
         "--exp-dir",
         type=str,
         default="pruned_transducer_stateless/exp",
         help="""It specifies the directory where all training related
         files, e.g., checkpoints, log, etc, are saved
         """,
     )
     parser.add_argument(
         "--bpe-model",
         type=str,
         default="data/lang_bpe_500/bpe.model",
         help="Path to the BPE model",
     )
     parser.add_argument(
         "--jit",
         type=str2bool,
         default=False,
         help="""True to save a model after applying torch.jit.script.
         """,
     )
     parser.add_argument(
         "--context-size",
         type=int,
         default=2,
         help="The context size in the decoder. 1 means bigram; "
         "2 means tri-gram",
     )
     add_model_arguments(parser)
     return parser
 def main():
     """Export the (averaged) model weights into ``--exp-dir``.
     With ``--avg 1`` the checkpoint ``epoch-{epoch}.pt`` is loaded as is;
     otherwise the ``--avg`` consecutive epoch checkpoints ending at
     ``--epoch`` are averaged. The result is moved to CPU, put in eval mode
     and saved as ``pretrained.pt`` (a dict with a ``"model"`` state dict),
     or as ``cpu_jit.pt`` when ``--jit`` is set, which the assertion below
     currently rejects.
     """
     args = get_parser().parse_args()
     args.exp_dir = Path(args.exp_dir)
     assert args.jit is False, "Support torchscript will be added later"
     params = get_params()
     params.update(vars(args))
     device = torch.device("cpu")
     if torch.cuda.is_available():
         device = torch.device("cuda", 0)
     logging.info(f"device: {device}")
     sp = spm.SentencePieceProcessor()
     sp.load(params.bpe_model)
     # <blk> is defined in local/train_bpe_model.py
     params.blank_id = sp.piece_to_id("<blk>")
     params.vocab_size = sp.get_piece_size()
     logging.info(params)
     logging.info("About to create model")
     model = get_transducer_model(params)
     model.to(device)
     if params.avg == 1:
         load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
     else:
         start = params.epoch - params.avg + 1
         # Epochs count from 0, so negative indices (produced when --avg is
         # larger than --epoch + 1) are skipped. The check has to be made
         # per epoch ``i``: testing ``start`` would discard every file in
         # that case and leave nothing to average.
         filenames = [
             f"{params.exp_dir}/epoch-{i}.pt"
             for i in range(start, params.epoch + 1)
             if i >= 0
         ]
         logging.info(f"averaging {filenames}")
         model.load_state_dict(average_checkpoints(filenames, device=device))
     # Always export from CPU in eval mode.
     model.to("cpu")
     model.eval()
     if params.jit:
         logging.info("Using torch.jit.script")
         model = torch.jit.script(model)
         filename = params.exp_dir / "cpu_jit.pt"
         model.save(str(filename))
         logging.info(f"Saved to {filename}")
     else:
         logging.info("Not using torch.jit.script")
         # Save it using a format so that it can be loaded
         # by :func:`load_checkpoint`
         filename = params.exp_dir / "pretrained.pt"
         torch.save({"model": model.state_dict()}, str(filename))
         logging.info(f"Saved to {filename}")
 if __name__ == "__main__":
     # Each log record shows the time, the level and the emitting file:line.
     logging.basicConfig(
         format=(
             "%(asctime)s %(levelname)s "
             "[%(filename)s:%(lineno)d] %(message)s"
         ),
         level=logging.INFO,
     )
     main()
--- a/egs/librispeech/ASR/transducer_emformer/streaming_decode.py
+++ b/egs/librispeech/ASR/transducer_emformer/streaming_decode.py
@ -20,14 +20,14 @@ import argparse
 import logging
 import time
 from pathlib import Path
 from typing import List, Optional
 import kaldifeat
 import numpy as np
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
 from emformer import LOG_EPSILON
 from streaming_feature_extractor import Stream
 from train import add_model_arguments, get_params, get_transducer_model
 from icefall.checkpoint import (
@ -147,10 +147,10 @@ def get_parser():
    )
    parser.add_argument(
-        "--sample-rate",
+        "--sampling-rate",
-        type=int,
+        type=float,
        default=16000,
-        help="The sample rate of the input sound file",
+        help="Sample rate of the audio",
    )
    add_model_arguments(parser)
@ -158,32 +158,159 @@ def get_parser():
    return parser
-def get_feature_extractor(
+def greedy_search(
-    params: AttributeDict,
+    model: nn.Module,
-) -> kaldifeat.Fbank:
+    stream: Stream,
-    logging.info("Constructing Fbank computer")
+    encoder_out: torch.Tensor,
-    opts = kaldifeat.FbankOptions()
+    sp: spm.SentencePieceProcessor,
-    opts.device = params.device
+):
-    opts.frame_opts.dither = 0
+    """
-    opts.frame_opts.snip_edges = True
+    Args:
-    opts.frame_opts.samp_freq = params.sample_rate
+      model:
-    opts.mel_opts.num_bins = params.feature_dim
+        The RNN-T model.
      stream:
        A stream object.
      encoder_out:
        A 2-D tensor of shape (T, encoder_out_dim) containing the output of
        the encoder model.
      sp:
        The BPE model.
    """
    blank_id = model.decoder.blank_id
    context_size = model.decoder.context_size
    device = model.device
-    return kaldifeat.Fbank(opts)
+    if stream.decoder_out is None:
        decoder_input = torch.tensor(
            [stream.hyp.ys[-context_size:]],
            device=device,
            dtype=torch.int64,
        )
        stream.decoder_out = model.decoder(
            decoder_input,
            need_pad=False,
        ).unsqueeze(1)
        # stream.decoder_out is of shape (1, 1, decoder_out_dim)
    assert encoder_out.ndim == 2
    T = encoder_out.size(0)
    for t in range(T):
        current_encoder_out = encoder_out[t].reshape(
            1, 1, 1, encoder_out.size(-1)
        )
        logits = model.joiner(current_encoder_out, stream.decoder_out)
        # logits is of shape (1, 1, 1, vocab_size)
        y = logits.argmax().item()
        if y == blank_id:
            continue
        stream.hyp.ys.append(y)
        decoder_input = torch.tensor(
            [stream.hyp.ys[-context_size:]],
            device=device,
            dtype=torch.int64,
        )
        stream.decoder_out = model.decoder(
            decoder_input,
            need_pad=False,
        ).unsqueeze(1)
        logging.info(
            f"Partial result:\n{sp.decode(stream.hyp.ys[context_size:])}"
        )
 def process_feature_frames(
     model: nn.Module,
     stream: Stream,
     sp: spm.SentencePieceProcessor,
 ):
     """Decode the feature frames buffered in ``stream.feature_frames``.
     Full chunks are decoded for as long as enough frames are buffered.
     Once the feature extractor reports that the last frame has been
     fetched, the remaining frames are padded to a full chunk and decoded
     as well.
     Args:
       model:
         The RNN-T model.
       stream:
         The stream corresponding to the input audio samples. Its
         ``feature_frames`` and ``states`` are updated in place.
       sp:
         The BPE model.
     """
     encoder = model.encoder
     # All lengths below are numbers of frames before subsampling.
     hop_length = encoder.segment_length
     # NOTE(review): the 3 extra frames presumably compensate for the
     # subsampling not being an exact n // 4 -- confirm against the encoder.
     chunk_length = (hop_length + 3) + encoder.right_context_length
     device = model.device
     def decode_chunk(features: torch.Tensor) -> None:
         # ``features`` is (1, T, feature_dim) and already on ``device``.
         feature_lens = torch.tensor([features.size(1)], device=device)
         encoder_out, _, stream.states = encoder.streaming_forward(
             features,
             feature_lens,
             stream.states,
         )
         greedy_search(
             model=model,
             stream=stream,
             encoder_out=encoder_out[0],
             sp=sp,
         )
     while len(stream.feature_frames) >= chunk_length:
         # Consecutive chunks overlap: a chunk spans ``chunk_length`` frames
         # but only ``hop_length`` frames are dropped from the buffer.
         chunk_frames = stream.feature_frames[:chunk_length]
         stream.feature_frames = stream.feature_frames[hop_length:]
         # cat -> (T, feature_dim); unsqueeze -> (1, T, feature_dim)
         decode_chunk(torch.cat(chunk_frames, dim=0).to(device).unsqueeze(0))
     last_fetched = stream.num_fetched_frames - 1
     if not stream.feature_extractor.is_last_frame(last_fetched):
         return
     assert len(stream.feature_frames) < chunk_length
     if len(stream.feature_frames) == 0:
         return
     tail_frames = stream.feature_frames[:chunk_length]
     stream.feature_frames = []
     features = torch.cat(tail_frames, dim=0).to(device).unsqueeze(0)
     # Right-pad the time axis with LOG_EPSILON up to a full chunk.
     features = torch.nn.functional.pad(
         features,
         (0, 0, 0, chunk_length - features.size(1)),
         value=LOG_EPSILON,
     )
     decode_chunk(features)
 def decode_one_utterance(
    audio_samples: torch.Tensor,
    model: nn.Module,
-    fbank: kaldifeat.Fbank,
+    stream: Stream,
    params: AttributeDict,
    sp: spm.SentencePieceProcessor,
 ):
    """Decode one utterance.
    Args:
      audio_samples:
-        A 1-D float32 tensor of shape (num_samples,) containing the normalized
+        A 1-D float32 tensor of shape (num_samples,) containing the
-        audio samples. Normalized means the samples is in the range [-1, 1].
+        audio samples.
      model:
        The RNN-T model.
      feature_extractor:
@ -193,80 +320,23 @@ def decode_one_utterance(
      sp:
        The BPE model.
    """
    sample_rate = params.sample_rate
    frame_shift = sample_rate * fbank.opts.frame_opts.frame_shift_ms / 1000
    frame_shift = int(frame_shift)  # number of samples
    # Note: We add 3 here because the subsampling method ((n-1)//2-1))//2
    # is not equal to n//4. We will switch to a subsampling method that
    # satisfies n//4, where n is the number of input frames.
    segment_length = (params.segment_length + 3) * frame_shift
    right_context_length = params.right_context_length * frame_shift
    chunk_size = segment_length + right_context_length
    opts = fbank.opts.frame_opts
    chunk_size += (
        (opts.frame_length_ms - opts.frame_shift_ms) / 1000 * sample_rate
    )
    chunk_size = int(chunk_size)
    states: Optional[List[List[torch.Tensor]]] = None
    blank_id = model.decoder.blank_id
    context_size = model.decoder.context_size
    device = model.device
    hyp = [blank_id] * context_size
    decoder_input = torch.tensor(hyp, device=device, dtype=torch.int64).reshape(
        1, context_size
    )
    decoder_out = model.decoder(decoder_input, need_pad=False)
    i = 0
    num_samples = audio_samples.size(0)
    while i < num_samples:
-        # Note: The current approach of computing the features is not ideal
+        # Simulate streaming.
-        # since it re-computes the features for the right context.
+        this_chunk_num_samples = torch.randint(2000, 5000, (1,)).item()
        chunk = audio_samples[i : i + chunk_size]  # noqa
        i += segment_length
        if chunk.size(0) < chunk_size:
            chunk = torch.nn.functional.pad(
                chunk, pad=(0, chunk_size - chunk.size(0))
            )
        features = fbank(chunk)
        feature_lens = torch.tensor([features.size(0)], device=params.device)
-        features = features.unsqueeze(0)  # (1, T, C)
+        thiks_chunk_samples = audio_samples[i : (i + this_chunk_num_samples)]
        i += this_chunk_num_samples
-        encoder_out, encoder_out_lens, states = model.encoder.streaming_forward(
+        stream.accept_waveform(
-            features,
+            sampling_rate=params.sampling_rate,
-            feature_lens,
+            waveform=thiks_chunk_samples,
            states,
        )
-        for t in range(encoder_out_lens.item()):
+        process_feature_frames(model=model, stream=stream, sp=sp)
            # fmt: off
            current_encoder_out = encoder_out[0:1, t:t+1, :].unsqueeze(2)
            # fmt: on
            logits = model.joiner(current_encoder_out, decoder_out.unsqueeze(1))
            # logits is (1, 1, 1, vocab_size)
            y = logits.argmax().item()
            if y == blank_id:
                continue
-            hyp.append(y)
+    stream.input_finished()
-
+    process_feature_frames(model=model, stream=stream, sp=sp)
            decoder_input = torch.tensor(
                [hyp[-context_size:]], device=device, dtype=torch.int64
            ).reshape(1, context_size)
            decoder_out = model.decoder(decoder_input, need_pad=False)
        logging.info(f"Partial result:\n{sp.decode(hyp[context_size:])}")
@torch.no_grad()
@ -333,10 +403,12 @@ def main():
    test_clean_cuts = librispeech.test_clean_cuts()
    fbank = get_feature_extractor(params)
    for num, cut in enumerate(test_clean_cuts):
-        logging.info("Processing {num}")
+        logging.info(f"Processing {num}")
        stream = Stream(
            context_size=model.decoder.context_size,
            blank_id=model.decoder.blank_id,
        )
        audio: np.ndarray = cut.load_audio()
        # audio.shape: (1, num_samples)
@ -347,16 +419,17 @@ def main():
        decode_one_utterance(
            audio_samples=torch.from_numpy(audio).squeeze(0).to(device),
            model=model,
-            fbank=fbank,
+            stream=stream,
            params=params,
            sp=sp,
        )
        logging.info(f"The ground truth is:\n{cut.supervisions[0].text}")
-        if num >= 0:
+        if num >= 2:
            break
        time.sleep(2)  # So that you can see the decoded results
 if __name__ == "__main__":
     # Seed torch's global RNG: decode_one_utterance() draws the sizes of its
     # simulated streaming chunks with torch.randint().
     torch.manual_seed(20220410)
     main()
--- a/egs/librispeech/ASR/transducer_emformer/streaming_feature_extractor.py
+++ b/egs/librispeech/ASR/transducer_emformer/streaming_feature_extractor.py
@ -0,0 +1,106 @@
 # Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from typing import List, Optional
 import torch
 from beam_search import Hypothesis
 from kaldifeat import FbankOptions, OnlineFbank, OnlineFeature
 def _create_streaming_feature_extractr() -> OnlineFeature:
     """Build the online fbank extractor with a fixed configuration.
     The options are hard-coded for now (CPU device, dither 0,
     ``snip_edges=False``, a sampling frequency of 16000 and 80 mel bins);
     passing them in from outside is left for the future.
     Returns:
       Return a CPU streaming feature extractor.
     """
     fbank_opts = FbankOptions()
     fbank_opts.device = "cpu"
     fbank_opts.mel_opts.num_bins = 80
     fbank_opts.frame_opts.samp_freq = 16000
     fbank_opts.frame_opts.snip_edges = False
     fbank_opts.frame_opts.dither = 0
     return OnlineFbank(fbank_opts)
 class Stream:
     """State of one audio stream: feature extractor plus decoding state."""
     def __init__(self, context_size: int, blank_id: int = 0) -> None:
         """
         Args:
           context_size:
             Context size of the RNN-T decoder model.
           blank_id:
             ID of the blank symbol. The hypothesis starts with
             ``context_size`` copies of it.
         """
         self.feature_extractor = _create_streaming_feature_extractr()
         # For greedy search; will be extended to beam search.
         initial_tokens = [blank_id] * context_size
         self.hyp = Hypothesis(
             ys=initial_tokens,
             log_prob=torch.tensor([0.0]),
         )
         # For the RNN-T decoder: the decoder output corresponding to the
         # decoder input self.hyp.ys[-context_size:]
         self.decoder_out: Optional[torch.Tensor] = None
         # For the emformer model: the states of each encoder layer.
         self.states: Optional[List[List[torch.Tensor]]] = None
         # Frames fetched from the extractor, one tensor per frame, in order.
         self.feature_frames: List[torch.Tensor] = []
         # Number of frames fetched so far, i.e. the index of the next frame
         # to request from the extractor.
         self.num_fetched_frames = 0
     def accept_waveform(
         self,
         sampling_rate: float,
         waveform: torch.Tensor,
     ) -> None:
         """Feed audio samples to the feature extractor and fetch the frames
         that became available.
         Caution:
           The range of the audio samples should match the one used in the
           training. That is, if you use the range [-1, 1] in the training,
           then the input audio samples should also be normalized to [-1, 1].
         Args:
           sampling_rate:
             The sampling rate of the input audio samples. It is used for
             sanity check to ensure that the input sampling rate equals to
             the one used in the extractor. If they are not equal, then no
             resampling will be performed; instead an error will be thrown.
           waveform:
             A 1-D torch tensor of dtype torch.float32 containing audio
             samples. It should be on CPU.
         """
         extractor = self.feature_extractor
         extractor.accept_waveform(
             sampling_rate=sampling_rate,
             waveform=waveform,
         )
         self._fetch_frames()
     def input_finished(self) -> None:
         """Signal that no more audio samples are available, so that the
         feature extractor flushes its buffered samples, then fetch the
         resulting frames.
         """
         self.feature_extractor.input_finished()
         self._fetch_frames()
     def _fetch_frames(self) -> None:
         """Move every frame that is ready but not yet fetched from the
         feature extractor into ``self.feature_frames``."""
         extractor = self.feature_extractor
         while self.num_fetched_frames < extractor.num_frames_ready:
             self.feature_frames.append(
                 extractor.get_frame(self.num_fetched_frames)
             )
             self.num_fetched_frames += 1
--- a/egs/librispeech/ASR/transducer_emformer/test_streaming_feature_extractor.py
+++ b/egs/librispeech/ASR/transducer_emformer/test_streaming_feature_extractor.py
@ -0,0 +1,53 @@
 #!/usr/bin/env python3
 # Copyright    2022  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 To run this file, do:
    cd icefall/egs/librispeech/ASR
    python ./transducer_emformer/test_streaming_feature_extractor.py
 """
 import torch
 from streaming_feature_extractor import Stream
 def test_streaming_feature_extractor():
     """Feed 16000 random samples to a ``Stream`` in random-sized chunks.
     Besides printing the number of buffered frames after every chunk, it
     checks that this number never decreases, that it always equals
     ``stream.num_fetched_frames`` and that frames exist once the input is
     finished.
     """
     stream = Stream(context_size=2, blank_id=0)
     samples = torch.rand(16000)
     start = 0
     num_frames = 0
     while True:
         n = torch.randint(50, 500, (1,)).item()
         end = start + n
         this_chunk = samples[start:end]
         start = end
         if len(this_chunk) == 0:
             break
         stream.accept_waveform(sampling_rate=16000, waveform=this_chunk)
         print(len(stream.feature_frames))
         # Nothing consumes frames here, so the buffer can only grow and
         # must stay in sync with the fetch counter.
         assert len(stream.feature_frames) >= num_frames
         assert len(stream.feature_frames) == stream.num_fetched_frames
         num_frames = len(stream.feature_frames)
     stream.input_finished()
     print(len(stream.feature_frames))
     assert len(stream.feature_frames) >= num_frames
     assert len(stream.feature_frames) == stream.num_fetched_frames
     assert len(stream.feature_frames) > 0
 def main() -> None:
     """Run the streaming feature extractor test without a test runner."""
     test_streaming_feature_extractor()
 # Allow running this file directly as a script.
 if __name__ == "__main__":
     main()