auc

2023-03-16 12:37:46 +08:00 · 2023-03-16 12:37:46 +08:00 · 39c0ae7749
commit 39c0ae7749
parent a49817385a
2 changed files with 394 additions and 0 deletions
--- a/egs/himia/wuw/ctc_tdnn/decode.py
+++ b/egs/himia/wuw/ctc_tdnn/decode.py
@ -0,0 +1,279 @@
 #!/usr/bin/env python3
 # Copyright    2023  Xiaomi Corp.        (Author: Weiji Zhuang,
 #                                                 Liyong Guo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import copy
 import logging
 from concurrent.futures import ProcessPoolExecutor
 from typing import Tuple
 import numpy as np
 from lhotse.features.io import NumpyHdf5Reader
 from tqdm import tqdm
 from icefall.utils import AttributeDict
 from train import get_params
 from graph import ctc_trivial_decoding_graph
 class Arc:
    """A single transition of the decoding graph.

    Every field is coerced with ``int()``, so an arc can be built directly
    from the whitespace-split string fields of a text graph line.
    """

    def __init__(
        self, src_state: int, dst_state: int, ilabel: int, olabel: int
    ) -> None:
        """
        Args:
          src_state: id of the state this arc leaves.
          dst_state: id of the state this arc enters.
          ilabel: input label consumed by this arc.
          olabel: output label emitted by this arc.
        """
        self.src_state = int(src_state)
        self.dst_state = int(dst_state)
        self.ilabel = int(ilabel)
        self.olabel = int(olabel)

    def next_state(self) -> int:
        """Return the id of the state this arc leads to."""
        return self.dst_state
 class State:
    """A state of the decoding graph, holding the arcs that leave it."""

    def __init__(self) -> None:
        # Outgoing arcs, kept in the order they were added.
        self.arc_list = []

    def add_arc(self, arc: Arc) -> None:
        """Register ``arc`` as one more outgoing arc of this state."""
        outgoing = self.arc_list
        outgoing.append(arc)
 class FiniteStateTransducer:
    """Represents a decoding graph for wake word detection.

    The graph is parsed from text, one entry per line:
      * ``src dst ilabel olabel`` (4 fields) describes an arc;
      * a line with 1 or 2 fields marks its first field as the final state.
    Blank lines are ignored.
    """

    def __init__(self, graph: str) -> None:
        """
        Args:
          graph: text form of the graph. Arc lines are expected to be sorted
            by source state.
        Raises:
          AssertionError: if a non-blank line does not have 1, 2 or 4 fields.
        """
        self.state_list = []
        # Stays None when the text contains no final-state line.
        self.final_state_id = None
        for arc_str in graph.split("\n"):
            fields = arc_str.strip().split()
            if not fields:
                continue
            # 1 and 2 for final state
            # 4 for non-final state
            assert len(fields) in [1, 2, 4], f"{len(fields)} {arc_str}"
            if len(fields) != 4:
                self.final_state_id = int(fields[0])
                continue
            src_state = int(fields[0])
            # Create every state up to and including src_state, so that a
            # state without outgoing arcs in the middle of the numbering
            # does not make the indexing below fail.
            while len(self.state_list) <= src_state:
                self.state_list.append(State())
            self.state_list[src_state].add_arc(
                Arc(fields[0], fields[1], fields[2], fields[3])
            )

    def to_str(self) -> str:
        """Serialize the graph back to its text form.

        Returns:
          One ``src dst ilabel olabel`` line per arc, in state order,
          followed by a line holding the final state id when one is known.
          An empty string for a graph without arcs and without final state.
        """
        lines = [
            f"{arc.src_state} {arc.dst_state} {arc.ilabel} {arc.olabel}\n"
            for state in self.state_list
            for arc in state.arc_list
        ]
        # Report the recorded final state rather than the destination of
        # whichever arc happened to be listed last.
        if self.final_state_id is not None:
            lines.append(f"{self.final_state_id}\n")
        return "".join(lines)
 class Token:
    """Best-path bookkeeping attached to one graph state while decoding."""

    def __init__(self) -> None:
        # False until some path has reached this token.
        self.is_active = False
        # Accumulated acoustic score of the best path into this token.
        self.total_score = -float("inf")
        # Number of wake word frames on that best path.
        self.keyword_frames = 0
        # Mean acoustic score over those wake word frames.
        self.average_keyword_score = -float("inf")
        # Not read or updated by this class; kept so the attribute set of a
        # token stays unchanged.
        self.average_max_keyword_score = 0.0

    def set_token(
        self,
        src_token,
        is_keyword_ilabel: bool,
        acoustic_score: float,
    ) -> None:
        """
        A dynamic programming process computing the highest score for a token
        from all possible paths which could reach this token.

        The token is only overwritten when it is still inactive or when the
        path coming from ``src_token`` has a strictly higher total score.

        Args:
          src_token: The source token connected to current token with an arc.
          is_keyword_ilabel: If true, the arc consumes an input label which is
            a part of wake word. Otherwise, the input label is
            blank or unknown, i.e. current token is still not part of wake word.
          acoustic_score: acoustic score of this arc.
        """
        candidate_score = src_token.total_score + acoustic_score
        if self.is_active and self.total_score >= candidate_score:
            # The path already stored in this token is at least as good.
            return

        self.is_active = True
        self.total_score = candidate_score
        if is_keyword_ilabel:
            # Running mean of the acoustic scores over wake word frames.
            self.average_keyword_score = (
                acoustic_score
                + src_token.average_keyword_score * src_token.keyword_frames
            ) / (src_token.keyword_frames + 1)
            self.keyword_frames = src_token.keyword_frames + 1
        else:
            # Not inside the wake word: restart the statistics. The frame
            # count must be cleared together with the average, otherwise a
            # count left by a keyword path that was just overwritten would
            # dilute the next running mean.
            self.average_keyword_score = 0.0
            self.keyword_frames = 0
 class SingleDecodable:
    """Frame-by-frame token passing over the decoding graph for one utterance."""

    def __init__(
        self,
        model_output,
        keyword_ilabel_start,
        graph,
    ):
        """
        Args:
          model_output: log_softmax(logit) with shape [T, C]
          keyword_ilabel_start: index of the first token of the wake word.
            In this recipe, tokens that are not part of the wake word have
            smaller indexes, i.e. blank 0; unk 1.
          graph: decoding graph of the wake word.
        """
        self.graph = graph
        self.model_output = model_output
        self.keyword_ilabel_start = keyword_ilabel_start
        # Number of frames, taken from the first axis of model_output.
        self.T = model_output.shape[0]
        self.current_frame_index = 0
        # Highest exp(average keyword score) seen at the final state so far.
        self.utt_score = 0.0
        # One pristine token per graph state, used as a template.
        self.init_token_list = [Token() for _ in graph.state_list]
        self.reset_token_list()
        self.number_tokens = len(self.cur_token_list)

    def reset_token_list(self) -> None:
        """
        Put all tokens back to the condition where no acoustic frame has been
        consumed: only the token of state 0 is active, with zero scores.
        """
        self.cur_token_list = copy.deepcopy(self.init_token_list)
        self.expand_token_list = copy.deepcopy(self.init_token_list)
        start_token = self.cur_token_list[0]
        start_token.is_active = True
        start_token.total_score = 0
        start_token.average_keyword_score = 0

    def process_oneframe(self) -> None:
        """
        Consume the frame at ``current_frame_index`` and update every token.
        """
        # Propagate each active token along all arcs leaving its state.
        for state_id, token in enumerate(self.cur_token_list):
            if not token.is_active:
                continue
            for arc in self.graph.state_list[state_id].arc_list:
                acoustic_score = self.model_output[self.current_frame_index][
                    arc.ilabel
                ]
                target = self.expand_token_list[arc.next_state()]
                target.set_token(
                    token,
                    arc.ilabel >= self.keyword_ilabel_start,
                    acoustic_score,
                )

        # Find the best total score among the active expanded tokens; it is
        # subtracted below to keep total_score in a good range.
        self.best_state_id = 0
        best_score = self.expand_token_list[0].total_score
        for state_id in range(self.number_tokens):
            candidate = self.expand_token_list[state_id]
            if candidate.is_active and best_score < candidate.total_score:
                best_score = candidate.total_score
                self.best_state_id = state_id

        # The expanded tokens become the current ones; start the next frame
        # from a fresh copy of the template.
        self.cur_token_list = self.expand_token_list
        self.expand_token_list = copy.deepcopy(self.init_token_list)
        for state_id in range(self.number_tokens):
            self.cur_token_list[state_id].total_score -= best_score

        # Keep the best score observed at the final state over all frames.
        final_token = self.cur_token_list[self.graph.final_state_id]
        potential_score = np.exp(final_token.average_keyword_score)
        if not (potential_score <= self.utt_score) and (
            potential_score > self.utt_score
        ):
            self.utt_score = potential_score
        self.current_frame_index += 1
 def decode_utt(
    params: AttributeDict, utt_id: str, post_file, graph: FiniteStateTransducer
 ) -> Tuple[str, float]:
    """
    Run the wake word decoder over one utterance.

    Args:
      params:
        The return value of :func:`get_params`; its ``wakeup_word_tokens[0]``
        is used as the first wake word token index.
      utt_id: key under which the posterior matrix is read from post_file.
      post_file: storage opened with NumpyHdf5Reader to read the posterior.
      graph: decoding graph.
    Returns:
      utt_id and its corresponding probability to be a wake word.
    """
    posterior = NumpyHdf5Reader(post_file).read(utt_id)
    decodable = SingleDecodable(
        model_output=posterior,
        keyword_ilabel_start=params.wakeup_word_tokens[0],
        graph=graph,
    )
    # One decoding step per frame of the posterior.
    for _ in range(decodable.T):
        decodable.process_oneframe()
    return utt_id, decodable.utt_score
 def get_parser():
    """Build the command line parser of the wake word decoder.

    Returns:
      An argparse.ArgumentParser with the options --decoding-graph,
      --post-h5 and --score-file.
    """
    parser = argparse.ArgumentParser(
        description="A simple FST decoder for the wake word detection\n"
    )
    # (flag, keyword arguments) for every supported option.
    options = (
        (
            "--decoding-graph",
            {"help": "decoding graph", "default": "himia_ctc_graph.txt"},
        ),
        ("--post-h5", {"help": "model output in h5 format"}),
        ("--score-file", {"help": "file to save scores of each utterance"}),
    )
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    return parser
 def main():
    """Decode every utterance found in --post-h5 and write its score.

    One ``<utt_id> <score>`` line per utterance is written to --score-file.
    Utterances are decoded in worker processes.
    """
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO
    )
    args = get_parser().parse_args()
    # Command line options are merged into the recipe parameters.
    params = get_params()
    params.update(vars(args))

    keys = NumpyHdf5Reader(params.post_h5).hdf.keys()
    graph_text = ctc_trivial_decoding_graph(params.wakeup_word_tokens)
    graph = FiniteStateTransducer(graph_text)
    logging.info(f"Graph used:\n{graph.to_str()}")
    logging.info("About to load data to decoder.")
    with ProcessPoolExecutor() as executor, open(
        params.score_file, "w", encoding="utf8"
    ) as fout:
        # Submit all utterances first, then collect results in that order.
        futures = []
        for key in tqdm(keys):
            futures.append(
                executor.submit(decode_utt, params, key, params.post_h5, graph)
            )
        logging.info("Decoding.")
        for future in tqdm(futures):
            utt_id, score = future.result()
            fout.write(" ".join((str(utt_id), str(score))) + "\n")
 # Run the decoder only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/egs/himia/wuw/local/auc.py
+++ b/egs/himia/wuw/local/auc.py
@ -0,0 +1,115 @@
 #!/usr/bin/env python3
 # Copyright    2023  Xiaomi Corp.        (Author: Weiji Zhuang,
 #                                                 Liyong Guo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 from typing import Dict, Tuple
 import matplotlib.pyplot as plt
 import numpy as np
 from pathlib import Path
 from sklearn.metrics import roc_curve, auc
 def get_args():
    """Parse the command line arguments of the ROC/AUC script.

    Returns:
      An argparse.Namespace with the attributes positive_score_file,
      negative_score_file and legend. All three options are required.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--positive-score-file", required=True, help="score file of positive data"
    )
    parser.add_argument(
        "--negative-score-file", required=True, help="score file of negative data"
    )
    # The value labels the plotted curve and names the saved figure.
    parser.add_argument(
        "--legend",
        required=True,
        help="legend label of the ROC curve, also used to name the output pdf",
    )
    return parser.parse_args()
 def load_score(score_file: Path) -> Dict[str, float]:
    """
    Load per-utterance scores from a text file.

    Args:
      score_file: Path to score file (anything accepted by ``open``).
        Each non-blank line has two columns.
        The first column is utt-id, and the second one is score.
        This score could be viewed as probability of being wakeup word.
    Returns:
      A dict whose key is utt-id and whose value is the corresponding score.
      If an utt-id appears more than once, the last score wins.
    Raises:
      AssertionError: if a non-blank line does not have exactly two columns.
      ValueError: if the second column is not a valid float.
    """
    score_dict = {}
    with open(score_file, "r", encoding="utf8") as fin:
        for line_no, line in enumerate(fin, start=1):
            arr = line.split()
            if not arr:
                # Tolerate blank lines, e.g. a trailing empty line.
                continue
            if len(arr) != 2:
                # Raised explicitly (instead of using ``assert``) so the
                # check survives ``python -O`` while callers still see the
                # same exception type as before.
                raise AssertionError(
                    f"{score_file}:{line_no}: expected 2 columns, "
                    f"got {len(arr)}: {line.strip()!r}"
                )
            key, score = arr
            score_dict[key] = float(score)
    return score_dict
 def get_roc_and_auc(
    pos_dict: Dict,
    neg_dict: Dict,
 ) -> Tuple[np.ndarray, np.ndarray, float]:
    """
    Compute the ROC curve and its area from positive and negative scores.

    Only the dict values are used; the utt-id keys are ignored.

    Args:
      pos_dict: scores of positive samples.
      neg_dict: scores of negative samples.
    Return:
      A tuple of three elements, which will be used to plot roc curve.
      The first and second elements are the ``fpr`` and ``tpr`` arrays
      returned by sklearn.metrics.roc_curve; refer to it for their meaning.
      The third element is area under the roc curve(AUC).
    """
    pos_scores = np.fromiter(pos_dict.values(), dtype=float)
    neg_scores = np.fromiter(neg_dict.values(), dtype=float)
    scores = np.concatenate([pos_scores, neg_scores])
    # Labels follow the same order as ``scores``: 1 for every positive
    # sample, then 0 for every negative sample.
    y = np.concatenate(
        [
            np.ones_like(pos_scores, dtype=int),
            np.zeros_like(neg_scores, dtype=int),
        ]
    )
    # The thresholds returned by roc_curve are not needed here.
    fpr, tpr, _ = roc_curve(y, scores, pos_label=1)
    roc_auc = auc(fpr, tpr)
    return fpr, tpr, roc_auc
 def main():
    """Plot the ROC curve of a model and report its AUC.

    Scores are read from --positive-score-file and --negative-score-file.
    The figure is saved as ``<legend>.pdf`` in the directory that contains
    the positive score file, and the AUC is logged.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    args = get_args()
    logging.basicConfig(format=formatter, level=logging.INFO)
    pos_dict = load_score(args.positive_score_file)
    neg_dict = load_score(args.negative_score_file)
    fpr, tpr, roc_auc = get_roc_and_auc(pos_dict, neg_dict)

    plt.figure(figsize=(16, 9))
    # Format the AUC inside the f-string: applying the ``%`` operator to the
    # already-interpolated text would misinterpret any "%" in the legend.
    plt.plot(fpr, tpr, label=f"{args.legend}(AUC = {roc_auc:1.8f})")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic(ROC)")
    plt.legend(loc="lower right")

    # The figure is written next to the positive score file.
    output_path = Path(args.positive_score_file).parent
    logging.info(f"AUC of {args.legend} {output_path}: {roc_auc}")
    plt.savefig(f"{output_path}/{args.legend}.pdf", bbox_inches="tight")
 # Compute and plot the ROC curve only when this file is run as a script.
 if __name__ == "__main__":
    main()