Start to add streaming feature extractors.

2022-04-02 12:07:59 +08:00 · 2022-04-02 12:07:59 +08:00 · 2399cc8993
commit 2399cc8993
parent 4aab351344
1 changed files with 85 additions and 0 deletions
--- a/kaldifeat/csrc/online-feature-itf.h
+++ b/kaldifeat/csrc/online-feature-itf.h
@ -0,0 +1,85 @@
+// kaldifeat/csrc/online-feature-itf.h
+//
+// Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+
+// This file is copied/modified from kaldi/src/itf/online-feature-itf.h
+
+#ifndef KALDIFEAT_CSRC_ONLINE_FEATURE_ITF_H_
+#define KALDIFEAT_CSRC_ONLINE_FEATURE_ITF_H_
+
+#include <vector>
+
+#include "torch/script.h"
+
+namespace kaldifeat {
+
+class OnlineFeatureInterface {
+ public:
+  virtual ~OnlineFeatureInterface() = default;
+
+  virtual int32_t Dim() const = 0;  /// returns the feature dimension.
+
+  /// Returns the total number of frames, since the start of the utterance, that
+  /// are now available.  In an online-decoding context, this will likely
+  /// increase with time as more data becomes available.
+  virtual int32_t NumFramesReady() const = 0;
+
+  /// Returns true if this is the last frame.  Frame indices are zero-based, so
+  /// the first frame is zero.  IsLastFrame(-1) will return false, unless the
+  /// file is empty (which is a case that I'm not sure all the code will handle,
+  /// so be careful).  This function may return false for some frame if we
+  /// haven't yet decided to terminate decoding, but later true if we decide to
+  /// terminate decoding.  This function exists mainly to correctly handle end
+  /// effects in feature extraction, and is not a mechanism to determine how
+  /// many frames are in the decodable object (as it used to be, and for
+  /// backward compatibility, still is, in the Decodable interface).
+  virtual bool IsLastFrame(int32_t frame) const = 0;
+
+  /// Gets the feature vector for this frame.  Before calling this for a given
+  /// frame, it is assumed that you called NumFramesReady() and it returned a
+  /// number greater than "frame".  Otherwise this call will likely crash with
+  /// an assert failure.  This function is not declared const, in case there is
+  /// some kind of caching going on, but most of the time it shouldn't modify
+  /// the class.
+  ///
+  /// The returned tensor has shape (1, Dim()).
+  virtual torch::Tensor GetFrame(int32_t frame) = 0;
+
+  /// This is like GetFrame() but for a collection of frames.  There is a
+  /// default implementation that just gets the frames one by one, but it
+  /// may be overridden for efficiency by child classes (since sometimes
+  /// it's more efficient to do things in a batch).
+  ///
+  /// The returned tensor has shape (frames.size(), Dim()).
+  virtual torch::Tensor GetFrames(const std::vector<int32_t> &frames) {
+    std::vector<torch::Tensor> features;
+    features.reserve(frames.size());
+
+    for (auto i : frames) {
+      torch::Tensor f = GetFrame(i);
+      features.push_back(std::move(f));
+    }
+
+    return torch::cat(features, /*dim*/ 0);
+  }
+
+  // Returns frame shift in seconds.  Helps to estimate duration from frame
+  // counts.
+  virtual float FrameShiftInSeconds() const = 0;
+
+  /// This would be called from the application, when you get more wave data.
+  /// Note: the sampling_rate is typically only provided so the code can assert
+  /// that it matches the sampling rate expected in the options.
+  virtual void AcceptWaveform(float sampling_rate,
+                              const torch::Tensor &waveform) = 0;
+
+  /// InputFinished() tells the class you won't be providing any
+  /// more waveform.  This will help flush out the last few frames
+  /// of delta or LDA features (it will typically affect the return value
+  /// of IsLastFrame.
+  virtual void InputFinished() = 0;
+};
+
+}  // namespace kaldifeat
+
+#endif  // KALDIFEAT_CSRC_ONLINE_FEATURE_ITF_H_