Merge 0fb43289f477aa1a9f1d88215684f7808c1c0fd8 into abd9437e6d5419a497707748eb935e50976c3b7b

Liyong.Guo, 2025-06-27 11:33:00 +00:00, committed by GitHub
commit b5ae175e8e
16 changed files with 2422 additions and 0 deletions

egs/himia/wuw/README.md Normal file

@@ -0,0 +1,10 @@
# Pretrained models and related logs/results.
## CTC TDNN model (number of model parameters: 1,502,169)
AUC results for different epochs can be found at <https://huggingface.co/GuoLiyong/himia_ctc_tdnn_baseline/tree/main>
E.g. for epoch 15 and avg 1, the result log file is: <https://huggingface.co/GuoLiyong/himia_ctc_tdnn_baseline/blob/main/exp_max_duration_100/post/epoch_15-avg_1/log/log-auc-himia_aishell-2023-03-16-17-42-14>
The corresponding ROC curve is: <https://huggingface.co/GuoLiyong/himia_ctc_tdnn_baseline/blob/main/exp_max_duration_100/post/epoch_15-avg_1/himia_aishell.png>

egs/himia/wuw/RESULTS.md Normal file

@@ -0,0 +1,16 @@
## Results
### CTC TDNN model (number of model parameters: 1,502,169)
AUC results for different epochs can be found at <https://huggingface.co/GuoLiyong/himia_ctc_tdnn_baseline/tree/main>
Here is the result for epoch_15-avg_1 (the checkpoint with the highest AUC).

| test set | HiMia-Aishell | HiMia-CW |
| ---- | ---- | ---- |
| AUC | 0.9597 | 0.9292 |
![himia_aishell](https://huggingface.co/GuoLiyong/himia_ctc_tdnn_baseline/resolve/main/exp_max_duration_100/post/epoch_15-avg_1/himia_aishell.png)
![himia_cw](https://huggingface.co/GuoLiyong/himia_ctc_tdnn_baseline/resolve/main/exp_max_duration_100/post/epoch_15-avg_1/himia_cw.png)

egs/himia/wuw/ctc_tdnn/asr_datamodule.py

@@ -0,0 +1,423 @@
# Copyright 2022 Xiaomi Corporation (Author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
CutConcatenate,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
SpecAugment,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader
from icefall.utils import str2bool
class _SeedWorkers:
def __init__(self, seed: int):
self.seed = seed
def __call__(self, worker_id: int):
fix_random_seed(self.seed + worker_id)
class HiMiaWuwDataModule:
"""
DataModule for Himia wake word experiments.
It contains common data pipeline modules e.g.:
- dynamic batch size,
- bucketing samplers,
- cut concatenation,
- augmentation,
- on-the-fly feature extraction
"""
def __init__(self, args: argparse.Namespace):
self.args = args
@classmethod
def add_arguments(cls, parser: argparse.ArgumentParser):
group = parser.add_argument_group(
title="Data related options",
description="These options are used for the preparation of "
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
"effective batch sizes, sampling strategies, applied data "
"augmentations, etc.",
)
group.add_argument(
"--manifest-dir",
type=Path,
default=Path("data/fbank"),
help="Path to directory with train/valid/test cuts.",
)
group.add_argument(
"--max-duration",
type=int,
default=200,
help="Maximum pooled recordings duration (seconds) in a "
"single batch. You can reduce it if it causes CUDA OOM.",
)
group.add_argument(
"--bucketing-sampler",
type=str2bool,
default=False,
help="When enabled, the batches will come from buckets of "
"similar duration (saves padding frames).",
)
group.add_argument(
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
"--concatenate-cuts",
type=str2bool,
default=False,
help="When enabled, utterances (cuts) will be concatenated "
"to minimize the amount of padding.",
)
group.add_argument(
"--duration-factor",
type=float,
default=1.0,
help="Determines the maximum duration of a concatenated cut "
"relative to the duration of the longest cut in a batch.",
)
group.add_argument(
"--gap",
type=float,
default=1.0,
help="The amount of padding (in seconds) inserted between "
"concatenated cuts. This padding is filled with noise when "
"noise augmentation is used.",
)
group.add_argument(
"--on-the-fly-feats",
type=str2bool,
default=False,
help="When enabled, use on-the-fly cut mixing and feature "
"extraction. Will drop existing precomputed feature manifests "
"if available.",
)
group.add_argument(
"--shuffle",
type=str2bool,
default=True,
help="When enabled (=default), the examples will be "
"shuffled for each epoch.",
)
group.add_argument(
"--drop-last",
type=str2bool,
default=True,
help="Whether to drop last batch. Used by sampler.",
)
group.add_argument(
"--return-cuts",
type=str2bool,
default=True,
help="When enabled, each batch will have the "
"field: batch['supervisions']['cut'] with the cuts that "
"were used to construct it.",
)
group.add_argument(
"--num-workers",
type=int,
default=2,
help="The number of training dataloader workers that "
"collect the batches.",
)
group.add_argument(
"--enable-spec-aug",
type=str2bool,
default=True,
help="When enabled, use SpecAugment for training dataset.",
)
group.add_argument(
"--spec-aug-time-warp-factor",
type=int,
default=80,
help="Used only when --enable-spec-aug is True. "
"It specifies the factor for time warping in SpecAugment. "
"Larger values mean more warping. "
"A value less than 1 means to disable time warp.",
)
group.add_argument(
"--enable-musan",
type=str2bool,
default=True,
help="When enabled, select noise from MUSAN and mix it"
"with training dataset. ",
)
group.add_argument(
"--input-strategy",
type=str,
default="PrecomputedFeatures",
help="AudioSamples or PrecomputedFeatures",
)
group.add_argument(
"--train-channel",
type=str,
default="_7_01",
help="""channel of HI_MIA train dataset.
All channels are used if it is set "all".
Please refer to stage 6 in prepare.sh for its meaning and other
potential values. Currently, Only "_7_01" is verified.
""",
)
group.add_argument(
"--dev-channel",
type=str,
default="_7_01",
help="""channel of HI_MIA dev dataset.
All channels are used if it is set "all".
Please refer to stage 6 in prepare.sh for its meaning and other
potential values. Currently, Only "_7_01" is verified.
""",
)
def train_dataloaders(
self,
cuts_train: CutSet,
sampler_state_dict: Optional[Dict[str, Any]] = None,
) -> DataLoader:
"""
Args:
cuts_train:
CutSet for training.
sampler_state_dict:
The state dict for the training sampler.
"""
transforms = []
if self.args.enable_musan:
logging.info("Enable MUSAN")
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
transforms.append(
CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
)
else:
logging.info("Disable MUSAN")
if self.args.concatenate_cuts:
logging.info(
f"Using cut concatenation with duration factor "
f"{self.args.duration_factor} and gap {self.args.gap}."
)
# Cut concatenation should be the first transform in the list,
# so that if we e.g. mix noise in, it will fill the gaps between
# different utterances.
transforms = [
CutConcatenate(
duration_factor=self.args.duration_factor, gap=self.args.gap
)
] + transforms
input_transforms = []
if self.args.enable_spec_aug:
logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
input_transforms.append(
SpecAugment(
time_warp_factor=self.args.spec_aug_time_warp_factor,
num_frame_masks=10,
features_mask_size=27,
num_feature_masks=2,
frames_mask_size=100,
)
)
else:
logging.info("Disable SpecAugment")
logging.info("About to create train dataset")
train = K2SpeechRecognitionDataset(
input_strategy=eval(self.args.input_strategy)(),
cut_transforms=transforms,
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
if self.args.on_the_fly_feats:
# NOTE: the PerturbSpeed transform should be added only if we
# remove it from data prep stage.
# Add on-the-fly speed perturbation; since originally it would
# have increased epoch size by 3, we will apply prob 2/3 and use
# 3x more epochs.
# Speed perturbation probably should come first before
# concatenation, but in principle the transforms order doesn't have
# to be strict (e.g. could be randomized)
# transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
# Drop feats to be on the safe side.
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
if self.args.bucketing_sampler:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
drop_last=self.args.drop_last,
)
else:
logging.info("Using SingleCutSampler.")
train_sampler = SingleCutSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
)
logging.info("About to create train dataloader")
if sampler_state_dict is not None:
logging.info("Loading sampler state dict")
train_sampler.load_state_dict(sampler_state_dict)
# 'seed' is derived from the current random state, which will have
# previously been set in the main process.
seed = torch.randint(0, 100000, ()).item()
worker_init_fn = _SeedWorkers(seed)
train_dl = DataLoader(
train,
sampler=train_sampler,
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=False,
worker_init_fn=worker_init_fn,
)
return train_dl
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
transforms = []
if self.args.concatenate_cuts:
transforms = [
CutConcatenate(
duration_factor=self.args.duration_factor, gap=self.args.gap
)
] + transforms
logging.info("About to create dev dataset")
if self.args.on_the_fly_feats:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
return_cuts=self.args.return_cuts,
)
else:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.info("About to create dev dataloader")
valid_dl = DataLoader(
validate,
sampler=valid_sampler,
batch_size=None,
num_workers=2,
persistent_workers=False,
)
return valid_dl
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
logging.debug("About to create test dataset")
test = K2SpeechRecognitionDataset(
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if self.args.on_the_fly_feats
else eval(self.args.input_strategy)(),
return_cuts=self.args.return_cuts,
)
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
num_buckets=2,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
test,
batch_size=None,
sampler=sampler,
num_workers=self.args.num_workers,
)
return test_dl
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
train_cuts_file = (
f"cuts_train_himia{self.args.train_channel}-aishell-shuf.jsonl.gz"
)
if "all" == self.args.train_channel:
train_cuts_file = "cuts_train_himia-aishell-shuf.jsonl.gz"
return load_manifest_lazy(self.args.manifest_dir / f"{train_cuts_file}")
@lru_cache()
def aishell_test_cuts(self) -> CutSet:
logging.info("About to get aishell test cuts")
return load_manifest_lazy(self.args.manifest_dir / "aishell_cuts_test.jsonl.gz")
@lru_cache()
def cw_test_cuts(self) -> CutSet:
logging.info("About to get HI-MIA-CW test cuts")
return load_manifest_lazy(self.args.manifest_dir / "cuts_cw_test.jsonl.gz")
@lru_cache()
def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
dev_cuts_file = "cuts_dev.jsonl.gz"
if "all" != self.args.dev_channel:
dev_cuts_file = f"cuts_dev{self.args.dev_channel}.jsonl.gz"
return load_manifest_lazy(self.args.manifest_dir / f"{dev_cuts_file}")
@lru_cache()
def test_cuts(self) -> CutSet:
logging.info("About to get test cuts")
# 7_01 is short for microphone 7 and channel 1.
return load_manifest_lazy(self.args.manifest_dir / "cuts_test_7_01.jsonl.gz")
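A minimal usage sketch of this data module (the argument values below are illustrative, not recipe defaults):

```python
# Sketch: HiMiaWuwDataModule is driven entirely by argparse options,
# so a script wires it up roughly like this.
import argparse

from asr_datamodule import HiMiaWuwDataModule

parser = argparse.ArgumentParser()
HiMiaWuwDataModule.add_arguments(parser)
args = parser.parse_args(["--manifest-dir", "data/fbank", "--max-duration", "100"])

himia = HiMiaWuwDataModule(args)
train_dl = himia.train_dataloaders(himia.train_cuts())
valid_dl = himia.valid_dataloaders(himia.dev_cuts())
test_dl = himia.test_dataloaders(himia.test_cuts())
```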

egs/himia/wuw/ctc_tdnn/decode.py Executable file

@@ -0,0 +1,316 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (Author: Weiji Zhuang,
# Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import copy
import logging
from concurrent.futures import ProcessPoolExecutor
from typing import Tuple
from pathlib import Path
import numpy as np
from lhotse.features.io import NumpyHdf5Reader
from tqdm import tqdm
from icefall.utils import (
AttributeDict,
setup_logger,
)
from train import get_params
from graph import ctc_trivial_decoding_graph
class Arc:
def __init__(
self, src_state: int, dst_state: int, ilabel: int, olabel: int
) -> None:
self.src_state = int(src_state)
self.dst_state = int(dst_state)
self.ilabel = int(ilabel)
self.olabel = int(olabel)
def next_state(self) -> int:
return self.dst_state
class State:
def __init__(self) -> None:
self.arc_list = list()
def add_arc(self, arc: Arc) -> None:
self.arc_list.append(arc)
class FiniteStateTransducer:
"""Represents a decoding graph for wake word detection."""
def __init__(self, graph: str) -> None:
"""
Construct a decoding graph in FST format given string format graph.
Args:
graph: A string format fst. Each arc is separated by "\n".
"""
self.state_list = list()
for arc_str in graph.split("\n"):
arc = arc_str.strip().split()
if len(arc) == 0:
continue
# An arc may contain 1, 2 or 4 elements, with format:
# src_state [dst_state] [ilabel] [olabel]
# 1 and 2 for final state
# 4 for non-final state
assert len(arc) in [1, 2, 4], f"{len(arc)} {arc_str}"
arc = [int(element) for element in arc]
src_state_id = arc[0]
max_state_id = len(self.state_list) - 1
if len(arc) == 4: # Non-final state
assert max_state_id <= src_state_id, (
f"Fsa must be sorted by src_state, "
f"while {max_state_id} <= {src_state_id}. Check your graph."
)
if max_state_id < src_state_id:
new_state = State()
self.state_list.append(new_state)
self.state_list[src_state_id].add_arc(
Arc(src_state_id, arc[1], arc[2], arc[3])
)
else:
assert (
max_state_id == src_state_id
), "Final state seems unreachable. Check your graph."
self.final_state_id = src_state_id
def to_str(self) -> str:
fst_str = ""
number_states = len(self.state_list)
if number_states == 0:
return fst_str
for state_idx in range(number_states):
cur_state = self.state_list[state_idx]
for arc_idx in range(len(cur_state.arc_list)):
cur_arc = cur_state.arc_list[arc_idx]
ilabel = cur_arc.ilabel
olabel = cur_arc.olabel
src_state = cur_arc.src_state
dst_state = cur_arc.dst_state
fst_str += f"{src_state} {dst_state} {ilabel} {olabel}\n"
fst_str += f"{dst_state}\n"
return fst_str
class Token:
def __init__(self) -> None:
self.is_active = False
self.total_score = -float("inf")
self.keyword_frames = 0
self.average_keyword_score = -float("inf")
self.average_max_keyword_score = 0.0
def set_token(
self,
src_token, # Token connected to the current token.
is_keyword_ilabel: bool,
acoustic_score: float,
) -> None:
"""
A dynamic programming process computing the highest score for a token
from all possible paths which could reach this token.
Args:
src_token: The source token connected to current token with an arc.
is_keyword_ilabel: If true, the arc consumes an input label which is
part of the wake word. Otherwise, the input label is
blank or unknown, i.e. the current token is still not part of the wake word.
acoustic_score: acoustic score of this arc.
"""
if (
not self.is_active
or self.total_score < src_token.total_score + acoustic_score
):
self.is_active = True
self.total_score = src_token.total_score + acoustic_score
if is_keyword_ilabel:
self.average_keyword_score = (
acoustic_score
+ src_token.average_keyword_score * src_token.keyword_frames
) / (src_token.keyword_frames + 1)
self.keyword_frames = src_token.keyword_frames + 1
else:
self.average_keyword_score = 0.0
class SingleDecodable:
def __init__(
self,
model_output: np.array,
keyword_ilabel_start: int,
graph: FiniteStateTransducer,
):
"""
Args:
model_output: log_softmax(logit) with shape [T, C]
keyword_ilabel_start: index of the first token of the wake word.
In this recipe, tokens not belonging to the wake word have smaller
token indices, i.e. blank 0 and unk 1.
graph: decoding graph of the wake word.
"""
self.init_token_list = [Token() for i in range(len(graph.state_list))]
self.reset_token_list()
self.model_output = model_output
self.T = model_output.shape[0]
self.utt_score = 0.0
self.current_frame_index = 0
self.keyword_ilabel_start = keyword_ilabel_start
self.graph = graph
self.number_tokens = len(self.cur_token_list)
def reset_token_list(self) -> None:
"""
Reset all tokens to a condition without consuming any acoustic frames.
"""
self.cur_token_list = copy.deepcopy(self.init_token_list)
self.expand_token_list = copy.deepcopy(self.init_token_list)
self.cur_token_list[0].is_active = True
self.cur_token_list[0].total_score = 0
self.cur_token_list[0].average_keyword_score = 0
def process_oneframe(self) -> None:
"""
Decode a frame and update all tokens.
"""
for state_id, cur_token in enumerate(self.cur_token_list):
if cur_token.is_active:
for arc_id in self.graph.state_list[state_id].arc_list:
acoustic_score = self.model_output[self.current_frame_index][
arc_id.ilabel
]
is_keyword_ilabel = arc_id.ilabel >= self.keyword_ilabel_start
self.expand_token_list[arc_id.next_state()].set_token(
cur_token,
is_keyword_ilabel,
acoustic_score,
)
# use best_score to keep total_score in a good range
self.best_state_id = 0
best_score = self.expand_token_list[0].total_score
for state_id in range(self.number_tokens):
if self.expand_token_list[state_id].is_active:
if best_score < self.expand_token_list[state_id].total_score:
best_score = self.expand_token_list[state_id].total_score
self.best_state_id = state_id
self.cur_token_list = self.expand_token_list
for state_id in range(self.number_tokens):
self.cur_token_list[state_id].total_score -= best_score
self.expand_token_list = copy.deepcopy(self.init_token_list)
potential_score = np.exp(
self.cur_token_list[self.graph.final_state_id].average_keyword_score
)
if potential_score > self.utt_score:
self.utt_score = potential_score
self.current_frame_index += 1
def decode_utt(
params: AttributeDict,
utt_id: str,
post_file: str,
graph: FiniteStateTransducer,
) -> Tuple[str, float]:
"""
Decode a single utterance.
Args:
params:
The return value of :func:`get_params`.
utt_id: utt_id to be decoded, used to fetch posterior matrix from post_file.
post_file: file containing posteriors for the whole test set.
graph: decoding graph in FiniteStateTransducer format.
Returns:
utt_id and its corresponding probability to be a wake word.
"""
reader = NumpyHdf5Reader(post_file)
model_output = reader.read(utt_id)
keyword_ilabel_start = params.wakeup_word_tokens[0]
decodable = SingleDecodable(
model_output=model_output,
keyword_ilabel_start=keyword_ilabel_start,
graph=graph,
)
for t in range(decodable.T):
decodable.process_oneframe()
return utt_id, decodable.utt_score
def get_parser():
parser = argparse.ArgumentParser(
description="A simple FST decoder for the wake word detection\n"
)
parser.add_argument(
"--post-h5",
type=str,
help="model output in h5 format",
)
parser.add_argument(
"--score-file",
type=str,
help="file to save scores of each utterance",
)
return parser
def main():
parser = get_parser()
args = parser.parse_args()
params = get_params()
params.update(vars(args))
post_dir = Path(params.post_h5).parent
test_set = Path(params.post_h5).stem
setup_logger(f"{post_dir}/log/log-decode-{test_set}")
graph = FiniteStateTransducer(ctc_trivial_decoding_graph(params.wakeup_word_tokens))
logging.info(f"Graph used:\n{graph.to_str()}")
logging.info(f"About to load {test_set}.")
keys = NumpyHdf5Reader(params.post_h5).hdf.keys()
with ProcessPoolExecutor() as executor, open(
params.score_file, "w", encoding="utf8"
) as fout:
futures = [
executor.submit(decode_utt, params, key, params.post_h5, graph)
for key in tqdm(keys)
]
logging.info(f"Decoding {test_set}.")
for future in tqdm(futures):
k, v = future.result()
fout.write(str(k) + " " + str(v) + "\n")
logging.info(f"Finish decoding {test_set}.")
if __name__ == "__main__":
main()
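A hedged, self-contained sketch of how the classes above fit together, using a fake posterior matrix and the wake-word token ids from train.py:

```python
# Build the trivial CTC graph, wrap it in the FST class, and score
# randomly generated log-posteriors frame by frame.
import numpy as np

from decode import FiniteStateTransducer, SingleDecodable
from graph import ctc_trivial_decoding_graph

wakeup_word_tokens = [2, 3, 4, 5, 6, 3, 7, 8]  # "你好米雅", see get_params() in train.py
graph = FiniteStateTransducer(ctc_trivial_decoding_graph(wakeup_word_tokens))

T, C = 50, 9  # 50 frames, 9 classes (blank, unk, 7 wake-word tokens)
model_output = np.log(np.random.dirichlet(np.ones(C), size=T))  # fake log-softmax output

decodable = SingleDecodable(
    model_output=model_output,
    keyword_ilabel_start=wakeup_word_tokens[0],
    graph=graph,
)
for _ in range(decodable.T):
    decodable.process_oneframe()
print(f"wake-word score: {decodable.utt_score:.4f}")
```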

egs/himia/wuw/ctc_tdnn/graph.py

@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (Author: Weiji Zhuang,
# Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
def ctc_trivial_decoding_graph(wakeup_word_tokens: List[int]) -> str:
"""
A trivial decoding graph: blank/unknown self-loops followed by the wake word.
Args:
wakeup_word_tokens: A sequence of token ids corresponding to wakeup_word.
It should not contain 0 or 1.
We assume 0 is for blank and 1 is for unknown.
Returns:
A finite-state transducer in string format,
used as a decoding graph.
Arcs are separated by "\n".
"""
assert 0 not in wakeup_word_tokens
assert 1 not in wakeup_word_tokens
assert len(wakeup_word_tokens) >= 2
keyword_ilabel_start = wakeup_word_tokens[0]
fst_graph = ""
for non_wake_word_token in range(keyword_ilabel_start):
fst_graph += f"0 0 {non_wake_word_token} 0\n"
cur_state = 1
for token in wakeup_word_tokens[:-1]:
fst_graph += f"{cur_state - 1} {cur_state} {token} 0\n"
fst_graph += f"{cur_state} {cur_state} {token} 0\n"
cur_state += 1
token = wakeup_word_tokens[-1]
fst_graph += f"{cur_state - 1} {cur_state} {token} 1\n"
fst_graph += f"{cur_state} {cur_state} {token} 0\n"
fst_graph += f"{cur_state}\n"
return fst_graph
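For illustration, here is the string the function returns for a toy wake word with token ids [2, 3, 4] (ids 0 and 1 are reserved for blank and unknown):

```python
from graph import ctc_trivial_decoding_graph

print(ctc_trivial_decoding_graph([2, 3, 4]))
# 0 0 0 0   <- state 0 self-loops consuming blank (0) and unknown (1)
# 0 0 1 0
# 0 1 2 0   <- one new state per wake-word token, each with a self-loop
# 1 1 2 0
# 1 2 3 0
# 2 2 3 0
# 2 3 4 1   <- the last token emits olabel 1, marking a detection
# 3 3 4 0
# 3         <- final state
```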

egs/himia/wuw/ctc_tdnn/inference.py

@@ -0,0 +1,206 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corporation (Author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
import torch
from lhotse.features.io import NumpyHdf5Writer
from icefall.checkpoint import average_checkpoints, load_checkpoint
from icefall.env import get_env_info
from icefall.utils import (
AttributeDict,
setup_logger,
)
from asr_datamodule import HiMiaWuwDataModule
from tdnn import Tdnn
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=10,
help="It specifies the checkpoint to use for decoding."
"Note: Epoch counts from 1.",
)
parser.add_argument(
"--avg",
type=int,
default=1,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch'. ",
)
parser.add_argument(
"--exp-dir",
type=str,
default="ctc_tdnn/exp",
help="The experiment dir",
)
return parser
def get_params() -> AttributeDict:
params = AttributeDict(
{
"env_info": get_env_info(),
"feature_dim": 80,
"num_class": 9,
}
)
return params
def inference_dataset(
dl: torch.utils.data.DataLoader,
params: AttributeDict,
model: torch.nn.Module,
test_set: str,
):
"""Compute and save model output of each utterance.
Args:
dl:
PyTorch's dataloader containing the dataset to decode.
params:
It is returned by :func:`get_params`.
model:
The neural model.
test_set:
Name of test set.
"""
num_cuts = 0
try:
num_batches = len(dl)
except TypeError:
num_batches = "?"
writer = NumpyHdf5Writer(f"{params.out_dir}/{test_set}")
for batch_idx, batch in enumerate(dl):
device = params.device
feature = batch["inputs"]
assert feature.ndim == 3
supervisions = batch["supervisions"]
start_frames = supervisions["start_frame"]
end_frames = start_frames + supervisions["num_frames"]
feature = feature.to(device)
# model_output is log_softmax(logit) with shape [N, T, C]
model_output = model(feature)
for i in range(feature.size(0)):
assert start_frames[i] == 0
cut = batch["supervisions"]["cut"][i]
cur_target = model_output[i][start_frames[i] : end_frames[i]]
writer.store_array(key=cut.id, value=cur_target.cpu().numpy())
num_cuts += len(batch["supervisions"]["text"])
if batch_idx % 100 == 0:
batch_str = f"{batch_idx}/{num_batches}"
logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
@torch.no_grad()
def main():
parser = get_parser()
HiMiaWuwDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
params = get_params()
params.update(vars(args))
out_dir = f"{params.exp_dir}/post/epoch_{params.epoch}-avg_{params.avg}/"
Path(out_dir).mkdir(parents=True, exist_ok=True)
params.out_dir = out_dir
setup_logger(f"{out_dir}/log/log-inference")
logging.info("Decoding started")
logging.info(params)
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
logging.info(f"device: {device}")
model = Tdnn(params.feature_dim, params.num_class)
if params.avg == 1:
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model, strict=True)
else:
start = params.epoch - params.avg + 1
filenames = []
for i in range(start, params.epoch + 1):
if i >= 1:  # epoch counts from 1
filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(
average_checkpoints(filenames, device=device), strict=True
)
model.to(device)
model.eval()
params.device = device
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
himia = HiMiaWuwDataModule(args)
aishell_test_cuts = himia.aishell_test_cuts()
test_cuts = himia.test_cuts()
cw_test_cuts = himia.cw_test_cuts()
aishell_test_dl = himia.test_dataloaders(aishell_test_cuts)
test_dl = himia.test_dataloaders(test_cuts)
cw_test_dl = himia.test_dataloaders(cw_test_cuts)
test_sets = ["aishell_test", "test", "cw_test"]
test_dls = [aishell_test_dl, test_dl, cw_test_dl]
for test_set, test_dl in zip(test_sets, test_dls):
logging.info(f"About to inference {test_set}")
inference_dataset(
dl=test_dl,
params=params,
model=model,
test_set=test_set,
)
logging.info(f"finish inferencing {test_set}")
logging.info("Done!")
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
if __name__ == "__main__":
main()
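A hedged sketch of reading back the posteriors written above (the path and keys are illustrative); this is the same NumpyHdf5Reader access pattern decode.py relies on:

```python
from lhotse.features.io import NumpyHdf5Reader

reader = NumpyHdf5Reader("ctc_tdnn/exp/post/epoch_15-avg_1/test.h5")
for key in reader.hdf.keys():   # one entry per cut id
    post = reader.read(key)     # log-posteriors, shape (T, num_class)
    print(key, post.shape)
    break
```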

egs/himia/wuw/ctc_tdnn/tdnn.py

@@ -0,0 +1,108 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (Author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from torch import nn, Tensor
class Tdnn(nn.Module):
"""
Args:
num_features (int): Number of input features
num_classes (int): Number of output classes
"""
def __init__(self, num_features: int, num_classes: int) -> None:
super().__init__()
self.num_features = num_features
self.num_classes = num_classes
self.tdnn = nn.Sequential(
nn.Conv1d(
in_channels=num_features,
out_channels=240,
kernel_size=3,
stride=1,
padding=1,
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=240, affine=False),
nn.Conv1d(
in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=240, affine=False),
nn.Conv1d(
in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=240, affine=False),
nn.Conv1d(
in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=240, affine=False),
nn.Conv1d(
in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=240, affine=False),
nn.Conv1d(
in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=240, affine=False),
nn.Conv1d(
in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=240, affine=False),
nn.Conv1d(
in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=240, affine=False),
nn.Conv1d(
in_channels=240, out_channels=240, kernel_size=3, stride=1, padding=1
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=240, affine=False),
nn.Conv1d(
in_channels=240, out_channels=240, kernel_size=1, stride=1, padding=0
),
nn.ReLU(inplace=True),
nn.BatchNorm1d(num_features=240, affine=False),
nn.Conv1d(
in_channels=240,
out_channels=num_classes,
kernel_size=1,
stride=1,
padding=0,
),
nn.LogSoftmax(1),
)
def forward(self, x: Tensor) -> Tensor:
r"""
Args:
x (torch.Tensor): Tensor of dimension (N, T, num_features).
Returns:
Tensor: Output tensor of dimension (N, T, num_classes).
"""
x = x.transpose(1, 2)
x = self.tdnn(x)
x = x.transpose(1, 2)
return x
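A minimal shape check for the model (the batch size, frame count, and feature/class dims below are the ones assumed in this recipe):

```python
import torch

from tdnn import Tdnn

model = Tdnn(num_features=80, num_classes=9)
model.eval()
with torch.no_grad():
    x = torch.randn(4, 100, 80)  # (N, T, num_features) fbank features
    y = model(x)                 # (N, T, num_classes) log-probabilities
assert y.shape == (4, 100, 9)
# LogSoftmax over classes: probabilities sum to 1 per frame.
assert torch.allclose(y.exp().sum(-1), torch.ones(4, 100), atol=1e-5)
```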

egs/himia/wuw/ctc_tdnn/tokenizer.py

@@ -0,0 +1,101 @@
# Copyright 2023 Xiaomi Corp. (Author: Liyong Guo)
#
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import torch
from typing import List, Tuple
class WakeupWordTokenizer(object):
def __init__(
self,
wakeup_word: str,
wakeup_word_tokens: List[int],
) -> None:
"""
Args:
wakeup_word: content of positive samples.
A sample will be treated as a negative sample unless its content
is exactly the same as wakeup_word.
wakeup_word_tokens: A list of int representing token ids of wakeup_word.
For example: the pronunciation of "你好米雅" is
"n i h ao m i y a".
Suppose we are using the following lexicon:
blk 0
unk 1
n 2
i 3
h 4
ao 5
m 6
y 7
a 8
Then wakeup_word_tokens for "你好米雅" is:
n i h ao m i y a
[2, 3, 4, 5, 6, 3, 7, 8]
"""
super().__init__()
assert wakeup_word is not None
assert wakeup_word_tokens is not None
assert (
0 not in wakeup_word_tokens
), f"0 is kept for blank. Please Remove 0 from {wakeup_word_tokens}"
assert 1 not in wakeup_word_tokens, (
f"1 is kept for unknown and negative samples. "
f" Please Remove 1 from {wakeup_word_tokens}"
)
self.wakeup_word = wakeup_word
self.wakeup_word_tokens = wakeup_word_tokens
self.positive_number_tokens = len(wakeup_word_tokens)
self.negative_word_tokens = [1]
self.negative_number_tokens = 1
def texts_to_token_ids(
self, texts: List[str]
) -> Tuple[torch.Tensor, torch.Tensor, int]:
"""Convert a list of texts to parameters needed by CTC loss.
Args:
texts:
It is a list of strings,
each element is a reference text for an audio.
Returns:
Return a tuple of 3 elements.
The first one is a 1-D torch.Tensor containing the token sequences
of all samples concatenated, as expected by CTC loss.
The second one is the number of tokens for each sample,
mainly used by CTC loss.
The last one is number_positive_samples,
used to track the proportion of positive samples in each batch.
"""
batch_token_ids = []
target_lengths = []
number_positive_samples = 0
for utt_text in texts:
if utt_text == self.wakeup_word:
batch_token_ids.extend(self.wakeup_word_tokens)
target_lengths.append(self.positive_number_tokens)
number_positive_samples += 1
else:
batch_token_ids.extend(self.negative_word_tokens)
target_lengths.append(self.negative_number_tokens)
target = torch.tensor(batch_token_ids)
target_lengths = torch.tensor(target_lengths)
return target, target_lengths, number_positive_samples
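A hedged usage sketch with the token ids assumed elsewhere in this recipe:

```python
from tokenizer import WakeupWordTokenizer

tokenizer = WakeupWordTokenizer(
    wakeup_word="你好米雅",
    wakeup_word_tokens=[2, 3, 4, 5, 6, 3, 7, 8],
)
texts = ["你好米雅", "anything else"]  # one positive, one negative sample
target, target_lengths, num_pos = tokenizer.texts_to_token_ids(texts)
# target:         tensor([2, 3, 4, 5, 6, 3, 7, 8, 1])  (flat CTC-style concatenation)
# target_lengths: tensor([8, 1])
# num_pos:        1
```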

egs/himia/wuw/ctc_tdnn/train.py Executable file

@@ -0,0 +1,667 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (Author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0"
./ctc_tdnn/train.py \
--exp-dir ./ctc_tdnn/exp \
--world-size 1 \
--max-duration 100 \
--num-epochs 20
"""
import argparse
import logging
from pathlib import Path
from shutil import copyfile
from typing import Optional, Tuple
import torch
import torch.multiprocessing as mp
import torch.nn as nn
from asr_datamodule import HiMiaWuwDataModule
from tdnn import Tdnn
from lhotse.utils import fix_random_seed
from torch import Tensor
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.utils import clip_grad_norm_
from torch.utils.tensorboard import SummaryWriter
from tokenizer import WakeupWordTokenizer
from icefall.checkpoint import load_checkpoint
from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.utils import (
AttributeDict,
MetricsTracker,
setup_logger,
str2bool,
)
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--world-size",
type=int,
default=1,
help="Number of GPUs for DDP training.",
)
parser.add_argument(
"--master-port",
type=int,
default=12354,
help="Master port to use for DDP training.",
)
parser.add_argument(
"--tensorboard",
type=str2bool,
default=True,
help="Should various information be logged in tensorboard.",
)
parser.add_argument(
"--num-epochs",
type=int,
default=20,
help="Number of epochs to train.",
)
parser.add_argument(
"--start-epoch",
type=int,
default=1,
help="""Resume training from from this epoch.
If it is positive, it will load checkpoint from
ctc_tdnn/exp/epoch-{start_epoch-1}.pt
""",
)
parser.add_argument(
"--exp-dir",
type=str,
default="ctc_tdnn/exp",
help="""The experiment dir.
It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
""",
)
parser.add_argument(
"--lr-factor",
type=float,
default=0.001,
help="The lr_factor for optimizer",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="The seed for random generators intended for reproducibility",
)
return parser
def get_params() -> AttributeDict:
"""Return a dict containing training parameters.
All training related parameters that are not passed from the commandline
are saved in the variable `params`.
Commandline options are merged into `params` after they are parsed, so
you can also access them via `params`.
Explanation of options saved in `params`:
- best_train_loss: Best training loss so far. It is used to select
the model that has the lowest training loss. It is
updated during the training.
- best_valid_loss: Best validation loss so far. It is used to select
the model that has the lowest validation loss. It is
updated during the training.
- best_train_epoch: It is the epoch that has the best training loss.
- best_valid_epoch: It is the epoch that has the best validation loss.
- batch_idx_train: Used for writing statistics to tensorboard. It
contains number of batches trained so far across
epochs.
- log_interval: Print training loss if batch_idx % log_interval is 0
- reset_interval: Reset statistics if batch_idx % reset_interval is 0
- valid_interval: Run validation if batch_idx % valid_interval is 0
- feature_dim: The model input dim. It has to match the one used
in computing features.
- num_class: Number of classes. Each token will have a token id
from [0, num_class).
In this recipe, 0 is usually kept for blank,
and 1 is usually kept for negative words.
- wakeup_word: Text of wakeup word, i.e. positive samples.
- wakeup_word_tokens: A sequence of token ids corresponding to wakeup_word.
- weight_decay: The weight_decay for the optimizer.
"""
params = AttributeDict(
{
"best_train_loss": float("inf"),
"best_valid_loss": float("inf"),
"best_train_epoch": -1,
"best_valid_epoch": -1,
"batch_idx_train": 0,
"log_interval": 5,
"reset_interval": 200,
"valid_interval": 3000,
# parameters for model
"feature_dim": 80,
"num_class": 9,
# parameters for tokenizer
"wakeup_word": "你好米雅",
"wakeup_word_tokens": [2, 3, 4, 5, 6, 3, 7, 8],
# parameters for Optimizer
"weight_decay": 1e-6,
"env_info": get_env_info(),
}
)
return params
def load_checkpoint_if_available(
params: AttributeDict,
model: nn.Module,
optimizer: Optional[torch.optim.Optimizer] = None,
scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
) -> None:
"""Load checkpoint from file.
If params.start_epoch is larger than 1, it will load the checkpoint from
`params.start_epoch - 1`. Otherwise, this function does nothing.
Apart from loading state dict for `model`, `optimizer` and `scheduler`,
it also updates `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
and `best_valid_loss` in `params`.
Args:
params:
The return value of :func:`get_params`.
model:
The training model.
optimizer:
The optimizer that we are using.
scheduler:
The learning rate scheduler we are using.
Returns:
Return None.
"""
if params.start_epoch > 1:
filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
else:
return None
saved_params = load_checkpoint(
filename,
model=model,
optimizer=optimizer,
scheduler=scheduler,
)
keys = [
"best_train_epoch",
"best_valid_epoch",
"batch_idx_train",
"best_train_loss",
"best_valid_loss",
]
for k in keys:
params[k] = saved_params[k]
return saved_params
def save_checkpoint(
params: AttributeDict,
model: nn.Module,
optimizer: Optional[torch.optim.Optimizer] = None,
scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
rank: int = 0,
) -> None:
"""Save model, optimizer, scheduler and training stats to file.
Args:
params:
It is returned by :func:`get_params`.
model:
The training model.
"""
if rank != 0:
return
filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
save_checkpoint_impl(
filename=filename,
model=model,
params=params,
optimizer=optimizer,
scheduler=scheduler,
rank=rank,
)
if params.best_train_epoch == params.cur_epoch:
best_train_filename = params.exp_dir / "best-train-loss.pt"
copyfile(src=filename, dst=best_train_filename)
if params.best_valid_epoch == params.cur_epoch:
best_valid_filename = params.exp_dir / "best-valid-loss.pt"
copyfile(src=filename, dst=best_valid_filename)
def compute_loss(
params: AttributeDict,
model: nn.Module,
batch: dict,
tokenizer: WakeupWordTokenizer,
is_training: bool,
) -> Tuple[Tensor, MetricsTracker]:
"""
Compute CTC loss given the model and its inputs.
Args:
params:
Parameters for training. See :func:`get_params`.
model:
The model for training. It is an instance of Tdnn in our case.
batch:
A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
for the content in it.
tokenizer:
Maps the text of a positive sample to the wake-word token sequence,
and the text of a negative sample to unknown regardless of its content.
is_training:
True for training. False for validation. When it is True, this
function enables autograd during computation; when it is False, it
disables autograd.
"""
device = model.device
feature = batch["inputs"]
# at entry, feature is (N, T, C)
assert feature.ndim == 3
N, T, C = feature.shape
feature = feature.to(device)
supervisions = batch["supervisions"]
texts = supervisions["text"]
with torch.set_grad_enabled(is_training):
# model_output is log_softmax(logit) with shape [N, T, C]
model_output = model(feature)
assert torch.all(supervisions["start_frame"] == 0)
num_frames = supervisions["num_frames"].to(device)
target, target_lengths, number_positive_samples = tokenizer.texts_to_token_ids(
texts
) # noqa E501
target = target.to(device)
target_lengths = target_lengths.to(device)
ctc_loss = nn.CTCLoss(reduction="sum")
# [N, T, C] --> [T, N, C]
model_output = model_output.transpose(0, 1)
loss = ctc_loss(model_output, target, num_frames, target_lengths)
loss /= num_frames.sum()
assert loss.requires_grad == is_training
info = MetricsTracker()
info["frames"] = num_frames.sum().item()
info["loss"] = loss.detach().cpu().item() * info["frames"]
# `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa
info["utterances"] = feature.size(0)
# averaged input duration in frames over utterances
info["utt_duration"] = supervisions["num_frames"].sum().item()
# averaged padding proportion over utterances
info["utt_pad_proportion"] = (
((feature.size(1) - supervisions["num_frames"]) / feature.size(1)).sum().item()
)
info["number_positive_cuts_ratio"] = (number_positive_samples / N) * info["frames"]
return loss, info
def compute_validation_loss(
params: AttributeDict,
model: nn.Module,
tokenizer: WakeupWordTokenizer,
valid_dl: torch.utils.data.DataLoader,
world_size: int = 1,
) -> MetricsTracker:
"""Run the validation process."""
model.eval()
tot_loss = MetricsTracker()
for batch_idx, batch in enumerate(valid_dl):
loss, loss_info = compute_loss(
params=params,
model=model,
batch=batch,
tokenizer=tokenizer,
is_training=False,
)
assert loss.requires_grad is False
tot_loss = tot_loss + loss_info
if world_size > 1:
tot_loss.reduce(loss.device)
loss_value = tot_loss["loss"] / tot_loss["frames"]
if loss_value < params.best_valid_loss:
params.best_valid_epoch = params.cur_epoch
params.best_valid_loss = loss_value
return tot_loss
def train_one_epoch(
params: AttributeDict,
model: nn.Module,
optimizer: torch.optim.Optimizer,
tokenizer: WakeupWordTokenizer,
train_dl: torch.utils.data.DataLoader,
valid_dl: torch.utils.data.DataLoader,
tb_writer: Optional[SummaryWriter] = None,
world_size: int = 1,
) -> None:
"""Train the model for one epoch.
The training loss from the mean of all frames is saved in
`params.train_loss`. It runs the validation process every
`params.valid_interval` batches.
Args:
params:
It is returned by :func:`get_params`.
model:
The model for training.
optimizer:
The optimizer we are using.
tokenizer:
Maps the text of a positive sample to the wake-word token sequence,
and the text of a negative sample to unknown regardless of its content.
train_dl:
Dataloader for the training dataset.
valid_dl:
Dataloader for the validation dataset.
tb_writer:
Writer to write log messages to tensorboard.
world_size:
Number of nodes in DDP training. If it is 1, DDP is disabled.
"""
model.train()
tot_loss = MetricsTracker()
for batch_idx, batch in enumerate(train_dl):
params.batch_idx_train += 1
batch_size = len(batch["supervisions"]["text"])
loss, loss_info = compute_loss(
params=params,
model=model,
batch=batch,
tokenizer=tokenizer,
is_training=True,
)
# summary stats
tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
# NOTE: We use reduction==sum and loss is computed over utterances
# in the batch and there is no normalization to it so far.
optimizer.zero_grad()
loss.backward()
clip_grad_norm_(model.parameters(), 5.0, 2.0)
optimizer.step()
if batch_idx % params.log_interval == 0:
logging.info(
f"Epoch {params.cur_epoch}, "
f"batch {batch_idx}, loss[{loss_info}], "
f"tot_loss[{tot_loss}], batch size: {batch_size}"
)
if batch_idx % params.log_interval == 0:
if tb_writer is not None:
loss_info.write_summary(
tb_writer, "train/current_", params.batch_idx_train
)
tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
if batch_idx > 0 and batch_idx % params.valid_interval == 0:
logging.info("Computing validation loss")
valid_info = compute_validation_loss(
params=params,
model=model,
tokenizer=tokenizer,
valid_dl=valid_dl,
world_size=world_size,
)
model.train()
logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
if tb_writer is not None:
valid_info.write_summary(
tb_writer, "train/valid_", params.batch_idx_train
)
loss_value = tot_loss["loss"] / tot_loss["frames"]
params.train_loss = loss_value
if params.train_loss < params.best_train_loss:
params.best_train_epoch = params.cur_epoch
params.best_train_loss = params.train_loss
def run(rank, world_size, args):
"""
Args:
rank:
It is a value between 0 and `world_size-1`, which is
passed automatically by `mp.spawn()` in :func:`main`.
The node with rank 0 is responsible for saving checkpoint.
world_size:
Number of GPUs for DDP training.
args:
The return value of get_parser().parse_args()
"""
params = get_params()
params.update(vars(args))
fix_random_seed(params.seed)
if world_size > 1:
setup_dist(rank, world_size, params.master_port)
setup_logger(f"{params.exp_dir}/log/log-train")
logging.info("Training started")
logging.info(params)
if args.tensorboard and rank == 0:
tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
else:
tb_writer = None
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", rank)
tokenizer = WakeupWordTokenizer(
wakeup_word=params.wakeup_word,
wakeup_word_tokens=params.wakeup_word_tokens,
)
logging.info("About to create model")
model = Tdnn(params.feature_dim, params.num_class)
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
checkpoints = load_checkpoint_if_available(params=params, model=model)
model.to(device)
if world_size > 1:
model = DDP(model, device_ids=[rank])
model.device = device
optimizer = torch.optim.Adam(
model.parameters(),
lr=params.lr_factor,
weight_decay=params.weight_decay,
)
if checkpoints:
optimizer.load_state_dict(checkpoints["optimizer"])
himia = HiMiaWuwDataModule(args)
train_cuts = himia.train_cuts()
train_dl = himia.train_dataloaders(train_cuts)
valid_cuts = himia.dev_cuts()
valid_dl = himia.valid_dataloaders(valid_cuts)
scan_pessimistic_batches_for_oom(
model=model,
train_dl=train_dl,
optimizer=optimizer,
tokenizer=tokenizer,
params=params,
)
for epoch in range(params.start_epoch, params.num_epochs + 1):
fix_random_seed(params.seed + epoch)
train_dl.sampler.set_epoch(epoch)
# TODO: Support lr scheduler
cur_lr = params.lr_factor
if tb_writer is not None:
tb_writer.add_scalar("train/learning_rate", cur_lr, params.batch_idx_train)
tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
if rank == 0:
logging.info("epoch {}, learning rate {}".format(epoch, cur_lr))
params.cur_epoch = epoch
train_one_epoch(
params=params,
model=model,
optimizer=optimizer,
tokenizer=tokenizer,
train_dl=train_dl,
valid_dl=valid_dl,
tb_writer=tb_writer,
world_size=world_size,
)
save_checkpoint(
params=params,
model=model,
optimizer=optimizer,
rank=rank,
)
logging.info("Done!")
if world_size > 1:
torch.distributed.barrier()
cleanup_dist()
def scan_pessimistic_batches_for_oom(
model: nn.Module,
train_dl: torch.utils.data.DataLoader,
optimizer: torch.optim.Optimizer,
tokenizer: WakeupWordTokenizer,
params: AttributeDict,
):
from lhotse.dataset import find_pessimistic_batches
logging.info(
"Sanity check -- see if any of the batches in epoch 0 would cause OOM."
)
batches, crit_values = find_pessimistic_batches(train_dl.sampler)
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
optimizer.zero_grad()
loss, _ = compute_loss(
params=params,
model=model,
batch=batch,
tokenizer=tokenizer,
is_training=True,
)
loss.backward()
clip_grad_norm_(model.parameters(), 5.0, 2.0)
optimizer.step()
except RuntimeError as e:
if "CUDA out of memory" in str(e):
logging.error(
"Your GPU ran out of memory with the current "
"max_duration setting. We recommend decreasing "
"max_duration and trying again.\n"
f"Failing criterion: {criterion} "
f"(={crit_values[criterion]}) ..."
)
raise
def main():
parser = get_parser()
HiMiaWuwDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
world_size = args.world_size
assert world_size >= 1
if world_size > 1:
mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
else:
run(rank=0, world_size=1, args=args)
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
if __name__ == "__main__":
main()
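For reference, a hedged illustration of the CTC loss call conventions used in compute_loss() above (shapes and token ids are illustrative):

```python
import torch
import torch.nn as nn

N, T, C = 2, 100, 9
# log-probs must be (T, N, C) for nn.CTCLoss; the model emits (N, T, C).
log_probs = torch.randn(N, T, C).log_softmax(-1).transpose(0, 1)
# Targets are a flat 1-D concatenation: one positive + one negative sample.
targets = torch.tensor([2, 3, 4, 5, 6, 3, 7, 8, 1])
input_lengths = torch.tensor([100, 100])
target_lengths = torch.tensor([8, 1])
loss = nn.CTCLoss(reduction="sum")(log_probs, targets, input_lengths, target_lengths)
loss = loss / input_lengths.sum()  # per-frame normalization, as in compute_loss()
```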

egs/himia/wuw/local/auc.py Executable file

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (Author: Weiji Zhuang,
# Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from typing import Dict, Tuple
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from sklearn.metrics import roc_curve, auc
from icefall.utils import setup_logger
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--positive-score-file",
type=str,
required=True,
help="score file of positive data",
)
parser.add_argument(
"--negative-score-file",
type=str,
required=True,
help="score file of negative data",
)
parser.add_argument(
"--legend",
type=str,
required=True,
help="legend of ROC curve picture.",
)
return parser.parse_args()
def load_score(score_file: Path) -> Dict[str, float]:
"""
Args:
score_file: Path to score file. Each line has two columns.
The first column is utt-id, and the second one is score.
This score could be viewed as probability of being wakeup word.
Returns:
A dict whose keys are utt-ids and whose values are the corresponding scores.
"""
pos_dict = {}
with open(score_file, "r", encoding="utf8") as fin:
for line in fin:
arr = line.strip().split()
assert len(arr) == 2
key = arr[0]
score = float(arr[1])
pos_dict[key] = score
return pos_dict
def get_roc_and_auc(
pos_dict: Dict,
neg_dict: Dict,
) -> Tuple[np.array, np.array, float]:
"""
Args:
pos_dict: scores of positive samples.
neg_dict: scores of negative samples.
Return:
A tuple of three elements, which will be used to plot the ROC curve.
Refer to sklearn.metrics.roc_curve for the meaning of the first two elements.
The third element is the area under the ROC curve (AUC).
"""
pos_scores = np.fromiter(pos_dict.values(), dtype=float)
neg_scores = np.fromiter(neg_dict.values(), dtype=float)
pos_y = np.ones_like(pos_scores, dtype=int)
neg_y = np.zeros_like(neg_scores, dtype=int)
scores = np.concatenate([pos_scores, neg_scores])
y = np.concatenate([pos_y, neg_y])
fpr, tpr, thresholds = roc_curve(y, scores, pos_label=1)
roc_auc = auc(fpr, tpr)
return fpr, tpr, roc_auc
def main():
args = get_args()
score_dir = Path(args.positive_score_file).parent
setup_logger(f"{score_dir}/log/log-auc-{args.legend}")
logging.info(f"About to compute AUC of {args.legend}")
pos_dict = load_score(args.positive_score_file)
neg_dict = load_score(args.negative_score_file)
fpr, tpr, roc_auc = get_roc_and_auc(pos_dict, neg_dict)
plt.figure(figsize=(16, 9))
plt.plot(fpr, tpr, label=f"{args.legend} (AUC = {roc_auc:.8f})")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic(ROC)")
plt.legend(loc="lower right")
output_path = Path(args.positive_score_file).parent
logging.info(f"AUC of {args.legend} {output_path}: {roc_auc}")
plt.savefig(f"{output_path}/{args.legend}.png", bbox_inches="tight")
if __name__ == "__main__":
main()
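A tiny hedged example of the scoring path on made-up utterance scores:

```python
from auc import get_roc_and_auc

pos = {"pos_utt1": 0.95, "pos_utt2": 0.80}  # wake-word utterances
neg = {"neg_utt1": 0.10, "neg_utt2": 0.40}  # everything else
fpr, tpr, roc_auc = get_roc_and_auc(pos, neg)
print(roc_auc)  # 1.0 -- the toy positives are perfectly separated
```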

egs/himia/wuw/local/compute_fbank_aishell.py

@@ -0,0 +1 @@
../../../aishell/ASR/local/compute_fbank_aishell.py

egs/himia/wuw/local/compute_fbank_himia.py

@@ -0,0 +1,139 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (Author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file computes the fbank features of the HI_MIA and HI_MIA_CW datasets.
It looks for manifests in the directory data/manifests.
The generated fbank features are saved in data/fbank.
"""
import argparse
import logging
import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor, str2bool
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--train-set-channel",
type=str,
default="_7_01",
help="""channel of HI_MIA dataset.
All channels are used if it is set "all".
""",
)
parser.add_argument(
"--enable-speed-perturb",
type=str2bool,
default=False,
help="""channel of training set.
""",
)
return parser.parse_args()
def compute_fbank_himia(
train_set_channel: str = None,
enable_speed_perturb: bool = True,
):
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
num_jobs = min(40, os.cpu_count())
num_mel_bins = 80
if "all" == train_set_channel:
dataset_parts = (
"train",
"dev",
"test",
"cw_test",
)
else:
dataset_parts = (
f"train{train_set_channel}",
f"dev{train_set_channel}",
f"test{train_set_channel}",
"cw_test",
)

    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, prefix="himia", output_dir=src_dir
    )
    assert manifests is not None

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            if (output_dir / f"cuts_{partition}.jsonl.gz").is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if "train" in partition and enable_speed_perturb:
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.resample(16000)
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomHdf5Writer,
            )
            # `partition` already carries the channel suffix (e.g. "train_7_01")
            # when a single channel is selected, so the channel must not be
            # appended again; prepare.sh expects e.g. cuts_train_7_01.jsonl.gz.
            cut_set.to_file(output_dir / f"cuts_{partition}.jsonl.gz")
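
# The cut manifests written by compute_fbank_himia() can later be loaded
# lazily, e.g. (a sketch; the path assumes the default "_7_01" channel):
#   from lhotse import load_manifest_lazy
#   cuts = load_manifest_lazy("data/fbank/cuts_train_7_01.jsonl.gz")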


def main():
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    args = get_args()
    logging.basicConfig(format=formatter, level=logging.INFO)

    compute_fbank_himia(
        train_set_channel=args.train_set_channel,
        enable_speed_perturb=args.enable_speed_perturb,
    )


if __name__ == "__main__":
    main()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/local/compute_fbank_musan.py

193
egs/himia/wuw/prepare.sh Executable file
View File

@ -0,0 +1,193 @@
#!/usr/bin/env bash
set -eou pipefail
stage=0
stop_stage=6
# The HI_MIA and aishell datasets are used in this experiment.
# The musan dataset is used for data augmentation.
#
# For aishell dataset downloading and preparation,
# refer to icefall/egs/aishell/ASR/prepare.sh.
#
# For the HI_MIA and HI_MIA_CW datasets,
# we assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
# These files will then be extracted to $dl_dir/HiMia/
#
# - $dl_dir/train.tar.gz
#   HiMia training dataset.
#   From https://www.openslr.org/85
#
# - $dl_dir/dev.tar.gz
#   HiMia development dataset.
#   From https://www.openslr.org/85
#
# - $dl_dir/test_v2.tar.gz
#   HiMia test dataset.
#   From https://www.openslr.org/85
#
# - $dl_dir/data.tgz
#   HiMia confusion words (HI_MIA_CW) test dataset.
#   From https://www.openslr.org/120
#
# - $dl_dir/resource.tgz
#   Transcripts of the HI_MIA_CW test dataset.
#   From https://www.openslr.org/120
dl_dir=$PWD/download
train_set_channel=_7_01
enable_speed_perturb=False
. shared/parse_options.sh || exit 1
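
# The variables above can be overridden from the command line via
# shared/parse_options.sh, e.g. (a hypothetical invocation):
#   ./prepare.sh --stage 4 --stop-stage 4 --enable-speed-perturb True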
# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"

  # If you have pre-downloaded the HI_MIA and HI_MIA_CW datasets to /path/to/himia,
  # you can create a symlink
  #
  #   ln -sfv /path/to/himia $dl_dir/
  #
  if [ ! -f $dl_dir/train.tar.gz ]; then
    lhotse download himia $dl_dir/
  fi

  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink
  #
  #   ln -sfv /path/to/musan $dl_dir/
  #
  if [ ! -d $dl_dir/musan ]; then
    lhotse download musan $dl_dir
  fi

  # If you have pre-downloaded it to /path/to/aishell,
  # you can create a symlink
  #
  #   ln -sfv /path/to/aishell $dl_dir/aishell
  #
  # The directory structure is
  #   aishell/
  #   |-- data_aishell
  #   |   |-- transcript
  #   |   `-- wav
  #   `-- resource_aishell
  #       |-- lexicon.txt
  #       `-- speaker.info
  if [ ! -d $dl_dir/aishell/data_aishell/wav/train ]; then
    lhotse download aishell $dl_dir
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare HI_MIA and HI_MIA_CW manifests"
  mkdir -p data/manifests
  if [ ! -e data/manifests/.himia.done ]; then
    lhotse prepare himia $dl_dir/HiMia data/manifests
    touch data/manifests/.himia.done
  fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  mkdir -p data/manifests
  if [ ! -e data/manifests/.musan.done ]; then
    lhotse prepare musan $dl_dir/musan data/manifests
    touch data/manifests/.musan.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Prepare aishell manifest"
  # We assume that you have downloaded the aishell corpus
  # to $dl_dir/aishell
  if [ ! -f data/manifests/.aishell_manifests.done ]; then
    mkdir -p data/manifests
    lhotse prepare aishell $dl_dir/aishell data/manifests
    touch data/manifests/.aishell_manifests.done
  fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for aishell"
  if [ ! -f data/fbank/.aishell.done ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_aishell.py \
      --enable-speed-perturb=${enable_speed_perturb}
    touch data/fbank/.aishell.done
  fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Compute fbank for musan"
  mkdir -p data/fbank
  if [ ! -e data/fbank/.musan.done ]; then
    ./local/compute_fbank_musan.py
    touch data/fbank/.musan.done
  fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Compute fbank for the HI_MIA and HI_MIA_CW datasets"
  # The format of train_set_channel is "microphone position"_"channel".
  # Microphones 1 to 6 are arrays with 16 channels each;
  # microphone 7 has only a single channel.
  # So valid values of train_set_channel are:
  #   _1_01, ..., _1_16
  #   _2_01, ..., _2_16
  #   ...
  #   _6_01, ..., _6_16
  #   _7_01
  for subset in train dev test; do
    for file_type in recordings supervisions; do
      src=data/manifests/himia_${file_type}_${subset}.jsonl.gz
      dst=data/manifests/himia_${file_type}_${subset}${train_set_channel}.jsonl.gz
      gunzip -c ${src} | \
        grep ${train_set_channel} | \
        gzip -c > ${dst}
    done
  done
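
  # For example, with train_set_channel=_7_01 the loop above filters
  #   data/manifests/himia_recordings_train.jsonl.gz
  # down to the single-channel
  #   data/manifests/himia_recordings_train_7_01.jsonl.gz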

  mkdir -p data/fbank
  if [ ! -e data/fbank/.himia.done ]; then
    ./local/compute_fbank_himia.py \
      --train-set-channel=${train_set_channel} \
      --enable-speed-perturb=${enable_speed_perturb}
    touch data/fbank/.himia.done
  fi

  train_file=data/fbank/cuts_train_himia${train_set_channel}-aishell-shuf.jsonl.gz
  if [ ! -f ${train_file} ]; then
    # SingleCutSampler is preferred over DynamicBucketingSampler for this
    # experiment: negative audio (Aishell) tends to be longer than positive
    # audio (HiMia), so with DynamicBucketingSampler a batch could contain
    # only negative or only positive samples. Hence the combined training
    # data is shuffled here with `shuf` and loaded with SingleCutSampler.
    # `grep -v _sp` drops speed-perturbed copies.
    cat <(gunzip -c data/fbank/aishell_cuts_train.jsonl.gz) \
      <(gunzip -c data/fbank/cuts_train${train_set_channel}.jsonl.gz) | \
      grep -v _sp | \
      shuf | gzip -c > ${train_file}
  fi
fi

View File

@ -0,0 +1,57 @@
#!/usr/bin/env bash
set -eou pipefail
# You need to execute ./prepare.sh to prepare datasets.
stage=0
stop_stage=2
epoch=20
avg=1
max_duration=200
exp_dir=./ctc_tdnn/exp_max_duration_${max_duration}/
epoch_avg=epoch_${epoch}-avg_${avg}
post_dir=${exp_dir}/post/${epoch_avg}
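
# With the defaults above, post_dir resolves to
#   ./ctc_tdnn/exp_max_duration_200/post/epoch_20-avg_1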
. shared/parse_options.sh || exit 1
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Model training"
  python ./ctc_tdnn/train.py \
    --num-epochs $epoch \
    --exp-dir $exp_dir \
    --max-duration $max_duration
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Get posteriors (log_softmax(logits)) of the test sets"
  python ctc_tdnn/inference.py \
    --avg $avg \
    --epoch $epoch \
    --exp-dir $exp_dir
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Decode and compute the area under the ROC curve (AUC)"
  for test_set in test aishell_test cw_test; do
    python ctc_tdnn/decode.py \
      --post-h5 ${post_dir}/${test_set}.h5 \
      --score-file ${post_dir}/fst_${test_set}_score.txt
  done

  python ./local/auc.py \
    --legend himia_cw \
    --positive-score-file ${post_dir}/fst_test_score.txt \
    --negative-score-file ${post_dir}/fst_cw_test_score.txt

  python ./local/auc.py \
    --legend himia_aishell \
    --positive-score-file ${post_dir}/fst_test_score.txt \
    --negative-score-file ${post_dir}/fst_aishell_test_score.txt
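
  # local/auc.py writes its log and the ROC curve next to the score files,
  # e.g. with the settings above:
  #   ${post_dir}/log/log-auc-himia_aishell-<timestamp>
  #   ${post_dir}/himia_aishell.png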
fi

1
egs/himia/wuw/shared Symbolic link
View File

@ -0,0 +1 @@
../../../icefall/shared