decoder hubert model

commit fb9c0c3971 (parent 9d48f1ce7d)
@@ -34,7 +34,7 @@ from lhotse.dataset import (
     SingleCutSampler,
     SpecAugment,
 )
-from lhotse.dataset.input_strategies import OnTheFlyFeatures
+from lhotse.dataset.input_strategies import AudioSamples, OnTheFlyFeatures
 from lhotse.utils import fix_random_seed
 from torch.utils.data import DataLoader
 
@@ -192,6 +192,13 @@ class LibriSpeechAsrDataModule:
             "with training dataset. ",
         )
 
+        group.add_argument(
+            "--input-strategy",
+            type=str,
+            default="PrecomputedFeatures",
+            help="AudioSamples or PrecomputedFeatures",
+        )
+
     def train_dataloaders(
         self,
         cuts_train: CutSet,
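The new flag selects lhotse's input strategy by name: AudioSamples batches raw waveforms of shape (B, num_samples), which is what the HuBERT model consumes, while PrecomputedFeatures batches feature matrices of shape (B, num_frames, num_features). A minimal sketch, not part of the commit, of how the parsed flag maps to a strategy instance:

import argparse

from lhotse.dataset.input_strategies import AudioSamples, PrecomputedFeatures

parser = argparse.ArgumentParser()
parser.add_argument("--input-strategy", type=str, default="PrecomputedFeatures")
args = parser.parse_args(["--input-strategy", "AudioSamples"])

# Mirror of the dispatch used in train_dataloaders() below.
strategy = (
    AudioSamples()
    if args.input_strategy == "AudioSamples"
    else PrecomputedFeatures()
)
print(type(strategy).__name__)  # -> AudioSamples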
@@ -263,6 +270,9 @@ class LibriSpeechAsrDataModule:
 
         logging.info("About to create train dataset")
         train = K2SpeechRecognitionDataset(
+            input_strategy=AudioSamples()
+            if self.args.input_strategy == "AudioSamples"
+            else PrecomputedFeatures(),
             cut_transforms=transforms,
             input_transforms=input_transforms,
             return_cuts=self.args.return_cuts,
@@ -371,7 +381,7 @@ class LibriSpeechAsrDataModule:
         test = K2SpeechRecognitionDataset(
             input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
             if self.args.on_the_fly_feats
-            else PrecomputedFeatures(),
+            else eval(self.args.input_strategy)(),
             return_cuts=self.args.return_cuts,
         )
         sampler = BucketingSampler(
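The test branch resolves the class with eval(self.args.input_strategy)(), which works for the two documented values but evaluates arbitrary strings. A minimal alternative sketch, not part of the commit (the helper name is hypothetical), that fails loudly on typos instead:

from lhotse.dataset.input_strategies import AudioSamples, PrecomputedFeatures

_INPUT_STRATEGIES = {
    "AudioSamples": AudioSamples,
    "PrecomputedFeatures": PrecomputedFeatures,
}


def make_input_strategy(name: str):
    """Return an input-strategy instance for a whitelisted name."""
    try:
        return _INPUT_STRATEGIES[name]()
    except KeyError:
        raise ValueError(f"Unknown --input-strategy: {name!r}")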
egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_decode.py (new executable file, 211 lines)
@@ -0,0 +1,211 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corporation (Author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple

import torch
import torch.nn as nn
from fairseq.data.data_utils import post_process

from asr_datamodule import LibriSpeechAsrDataModule
from hubert_utils import (
    extract_layers_result,
    get_parser,
    load_hubert_model,
    vq_config,
)

from icefall.utils import (
    AttributeDict,
    setup_logger,
    store_transcripts,
    write_error_stats,
)


def decode_dataset(
    dl: torch.utils.data.DataLoader,
    model: nn.Module,
    processor,
    params,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
    """Decode dataset with CTC greedy search.

    Args:
      dl:
        PyTorch's dataloader containing the dataset to decode.
      model:
        The fine-tuned HuBERT model.
      processor:
        The fairseq target dictionary used to map token ids to strings.
      params:
        Parameters combining the command-line arguments and vq_config.

    Returns:
      Return a dict whose key is the decoding method, here always
      "ctc_greedy_search". Its value is a list of tuples. Each tuple
      contains two elements: The first is the reference transcript,
      and the second is the predicted result.
    """
    num_cuts = 0

    try:
        num_batches = len(dl)
    except TypeError:
        num_batches = "?"

    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        w2v_model = model.w2v_encoder.w2v_model
        layer_results = extract_layers_result(
            w2v_model, batch=batch, device=params.device
        )

        encoder_out = w2v_model.encoder.layer_norm(
            layer_results[params.total_layers - 1][0]
        )
        encoder_out = model.w2v_encoder.proj(encoder_out.transpose(0, 1))

        # CTC greedy search: frame-wise argmax, merge consecutive repeats,
        # then drop the blank token (id 0).
        toks = encoder_out.argmax(dim=-1)
        blank = 0
        toks = [tok.unique_consecutive() for tok in toks]
        hyps = [processor.string(tok[tok != blank].int().cpu()) for tok in toks]
        hyps = [post_process(hyp, "letter") for hyp in hyps]

        texts = batch["supervisions"]["text"]

        this_batch = []
        assert len(hyps) == len(texts)
        for hyp_text, ref_text in zip(hyps, texts):
            ref_words = ref_text.split()
            hyp_words = hyp_text.split()
            this_batch.append((ref_words, hyp_words))

        results["ctc_greedy_search"].extend(this_batch)

        num_cuts += len(texts)

        if batch_idx % 20 == 0:
            batch_str = f"{batch_idx}/{num_batches}"

            logging.info(
                f"batch {batch_str}, cuts processed until now is {num_cuts}"
            )
    return results
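The greedy-search collapse above can be checked in isolation. A minimal, self-contained sketch with a toy three-symbol vocabulary (0 is the blank):

import torch

# Frame-wise posteriors over a toy vocabulary {0: blank, 1: "A", 2: "B"}.
log_probs = torch.tensor(
    [
        [0.1, 0.8, 0.1],
        [0.1, 0.8, 0.1],
        [0.9, 0.05, 0.05],
        [0.1, 0.1, 0.8],
    ]
)
toks = log_probs.argmax(dim=-1)   # tensor([1, 1, 0, 2])
toks = toks.unique_consecutive()  # merge repeats -> tensor([1, 0, 2])
toks = toks[toks != 0]            # drop blanks   -> tensor([1, 2]), i.e. "A B"
print(toks)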


def save_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
):
    test_set_wers = dict()
    for key, results in results_dict.items():
        recog_path = params.exp_dir / f"hubert-recogs-{test_set_name}-{key}.txt"
        store_transcripts(filename=recog_path, texts=results)

        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
        errs_filename = (
            params.exp_dir / f"hubert-errs-{test_set_name}-{key}.txt"
        )
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
                f, f"{test_set_name}-{key}", results, enable_log=True
            )
            test_set_wers[key] = wer

        logging.info(
            "Wrote detailed error stats to {}".format(errs_filename)
        )

    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    errs_info = params.exp_dir / f"hubert-wer-summary-{test_set_name}.txt"
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
        for key, val in test_set_wers:
            print("{}\t{}".format(key, val), file=f)

    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
    note = "\tbest for {}".format(test_set_name)
    for key, val in test_set_wers:
        s += "{}\t{}{}\n".format(key, val, note)
        note = ""
    logging.info(s)
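save_results sorts the settings by WER ascending, so the first row of the summary is the best one. A toy illustration with made-up numbers (this script itself only ever produces the "ctc_greedy_search" key):

# Hypothetical WERs for two decoding settings.
test_set_wers = {"ctc_greedy_search": 2.6, "another_setting": 3.1}
for key, val in sorted(test_set_wers.items(), key=lambda x: x[1]):
    print(f"{key}\t{val}")
# ctc_greedy_search  2.6
# another_setting    3.1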


@torch.no_grad()
def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)

    params = AttributeDict()
    params.update(vars(args))
    params.update(vq_config)

    setup_logger(f"{params.exp_dir}/log-ctc_greedy_search/log-decode")
    logging.info("Decoding started")
    logging.info(params)

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    logging.info(f"device: {device}")
    params.device = device

    model, processor = load_hubert_model(params)

    librispeech = LibriSpeechAsrDataModule(params)

    test_clean_cuts = librispeech.test_clean_cuts()
    test_other_cuts = librispeech.test_other_cuts()

    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
    test_other_dl = librispeech.test_dataloaders(test_other_cuts)

    test_sets = ["test-clean", "test-other"]
    test_dls = [test_clean_dl, test_other_dl]

    for test_set, test_dl in zip(test_sets, test_dls):
        results_dict = decode_dataset(
            dl=test_dl,
            model=model,
            processor=processor,
            params=params,
        )

        save_results(
            params=params, test_set_name=test_set, results_dict=results_dict
        )

    logging.info("Done!")


torch.set_num_threads(1)
torch.set_num_interop_threads(1)

if __name__ == "__main__":
    main()

egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_utils.py (new file, 162 lines)
@@ -0,0 +1,162 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
from pathlib import Path
from typing import Dict, List, Tuple

import torch

from fairseq import (
    checkpoint_utils,
    tasks,
    utils,
)
from fairseq.models.hubert.hubert import HubertModel
from omegaconf import OmegaConf

vq_config = {
    # parameters about hubert model inference.
    "model_dir": "./vq_pruned_transducer_stateless2/exp/hubert_models/",
    "model_id": "hubert_xtralarge_ll60k_finetune_ls960",
    "input_strategy": "AudioSamples",
    "enable_spec_aug": False,
    "enable_musan": False,
    "total_layers": 48,
    "memory_embedding_dim": 1280,
    # parameters about quantizer.
    "num_utts": 100,
    "memory_layer": 36,
    "memory_dir": "./vq_pruned_transducer_stateless2/exp/mem/",
    "bytes_per_frame": 8,
    "refine_iter": 5,
    "enable_refine": True,
    # parameters about extracted codebook index.
    "data_dir": "./data/",
}
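hubert_decode.py layers these settings over the parsed command-line arguments with params.update(vars(args)) followed by params.update(vq_config), so vq_config wins on any overlapping key. A minimal sketch of that precedence, using the vq_config dict defined above (the stand-in CLI dict is hypothetical):

from icefall.utils import AttributeDict

params = AttributeDict()
params.update({"input_strategy": "PrecomputedFeatures"})  # e.g. a CLI default
params.update(vq_config)  # vq_config overrides: forces "AudioSamples"
print(params.input_strategy)  # -> AudioSamples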


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--subset",
        type=str,
    )

    parser.add_argument(
        "--job-idx",
        type=int,
    )

    parser.add_argument(
        "--num-splits",
        type=int,
    )

    parser.add_argument(
        "--quantizer-id",
        type=str,
        default=None,
        help="The quantizer id. Manually set this in case of mistake.",
    )

    parser.add_argument(
        "--refine-iter",
        type=int,
        default=-1,
        help="number of refine iterations when extracting codebook indices",
    )

    parser.add_argument(
        "--ori-manifest-dir",
        type=str,
        default=None,
    )

    return parser
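A brief usage sketch with hypothetical argument values (--subset, --job-idx and --num-splits presumably shard the codebook-index extraction across parallel jobs):

parser = get_parser()
args = parser.parse_args(
    ["--subset", "train-clean-100", "--job-idx", "0", "--num-splits", "4"]
)
print(args.subset, args.job_idx, args.num_splits)  # train-clean-100 0 4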


def load_hubert_model(params):
    cfg_task = OmegaConf.create(
        {
            "_name": "hubert_pretraining",
            "single_target": True,
            "fine_tuning": True,
            "data": params.model_dir,
        }
    )
    model_path = Path(params.model_dir) / (params.model_id + ".pt")
    task = tasks.setup_task(cfg_task)
    processor = task.target_dictionary
    models, saved_cfg = checkpoint_utils.load_model_ensemble(
        utils.split_paths(str(model_path), separator="\\"),
        arg_overrides={},
        strict=True,
        suffix="",
        num_shards=1,
    )
    model = models[0]
    model.to(params.device)
    model.eval()

    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    return model, processor
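A usage sketch, assuming the checkpoint <model_dir>/<model_id>.pt and the fairseq dictionary files already exist under model_dir (the paths are the defaults from vq_config above):

import torch

from icefall.utils import AttributeDict

params = AttributeDict(
    {
        "model_dir": "./vq_pruned_transducer_stateless2/exp/hubert_models/",
        "model_id": "hubert_xtralarge_ll60k_finetune_ls960",
        "device": torch.device("cpu"),
    }
)
# "processor" is the fairseq target dictionary, used later to map
# CTC token ids back to letter strings.
model, processor = load_hubert_model(params)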


# Modified from HubertModel.forward to extract the output of all
# intermediate layers.
def extract_layers_result(
    model: HubertModel,
    batch: Dict,
    device: torch.device,
) -> List[Tuple]:
    features = batch["inputs"]

    # corresponds to task.normalize in fairseq
    features = torch.nn.functional.layer_norm(features, features.shape)

    supervisions = batch["supervisions"]
    num_samples = supervisions["num_samples"]
    B, T = features.shape
    padding_mask = torch.arange(0, T).expand(B, T) > num_samples.reshape(
        [-1, 1]
    )

    padding_mask = padding_mask.to(device)
    features = features.to(device)

    features = model.forward_features(features)

    features = features.transpose(1, 2)
    features = model.layer_norm(features)

    if padding_mask is not None:
        padding_mask = model.forward_padding_mask(features, padding_mask)

    if model.post_extract_proj is not None:
        features = model.post_extract_proj(features)

    _, layer_results = model.encoder(
        features,
        padding_mask=padding_mask,
    )
    return layer_results
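The padding mask built above can be sanity-checked on its own. A minimal sketch with toy lengths: positions whose index exceeds an utterance's sample count are marked True and later excluded by the encoder.

import torch

# Toy lengths for a batch of two "waveforms" padded to T = 5 samples.
num_samples = torch.tensor([3, 5])
B, T = 2, 5
padding_mask = torch.arange(0, T).expand(B, T) > num_samples.reshape([-1, 1])
print(padding_mask)
# tensor([[False, False, False, False,  True],
#         [False, False, False, False, False]])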