Mirror of https://github.com/k2-fsa/icefall.git, synced 2025-09-06 23:54:17 +00:00
codebook index extraction
commit 0cb3303a5b (parent cc1dcafc70)
egs/librispeech/ASR/vq_pruned_transducer_stateless2/hubert_code_indices.py (executable file, 219 lines added)
@@ -0,0 +1,219 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (author: Liyong Guo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
from pathlib import Path

import numpy as np
import torch
from asr_datamodule import LibriSpeechAsrDataModule
from hubert_utils import (
    extract_layers_result,
    get_parser,
    load_hubert_model,
    vq_config,
)
from lhotse import CutSet, load_manifest
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler
from lhotse.dataset.input_strategies import AudioSamples
from lhotse.features.io import NumpyHdf5Writer
from quantization import Quantizer
from torch.utils.data import DataLoader

from icefall.utils import AttributeDict, setup_logger


def compute_codeindices(
    model: torch.nn.Module,
    processor: None,
    dl: torch.utils.data.DataLoader,
    quantizer: None,
    params: AttributeDict,
    writer: None,
) -> CutSet:
    """Compute framewise codebook indices for a dataset.

    Args:
      model:
        The neural network model.
      dl:
        Dataloader containing the dataset.
      params:
        Parameters for computing codebook indices.
      writer:
        The writer used to store each cut's codebook indices on disk.

    Returns:
      Return a CutSet whose cuts carry references to the stored
      codebook indices.
    """
    num_cuts = 0
    cuts = []
    total_frames = 0
    for batch_idx, batch in enumerate(dl):
        w2v_model = model.w2v_encoder.w2v_model
        layer_results = extract_layers_result(
            w2v_model, batch=batch, device=params.device
        )

        assert len(layer_results) == params.total_layers
        memory_embeddings = layer_results[params.memory_layer - 1][0]
        encoder_memory = memory_embeddings.transpose(0, 1)  # (N, T, C)

        refine_indexes_iters = params.refine_iter

        codebook_indices = quantizer.encode(
            encoder_memory, refine_indexes_iters=refine_indexes_iters
        )

        # (N, T, num_codebooks); note that ids >= 128 wrap to negative
        # values when cast to int8.
        codebook_indices = codebook_indices.to("cpu").numpy().astype(np.int8)

        supervisions = batch["supervisions"]
        cut_list = supervisions["cut"]
        assert len(cut_list) == codebook_indices.shape[0]

        assert all(c.start == 0 for c in cut_list)
        for idx, cut in enumerate(cut_list):
            # HuBERT's conv front end downsamples 16 kHz audio by 320x,
            # i.e. one frame per 20 ms.
            num_frames = supervisions["num_samples"][idx] // 320
            cut.codebook_indices = writer.store_array(
                key=cut.id,
                value=codebook_indices[idx][:num_frames],
                frame_shift=0.02,
                temporal_dim=0,
                start=0,
            )
            total_frames += num_frames

        cuts += cut_list
        num_cuts += len(cut_list)
        logging.info(
            f"processed {total_frames} frames and {num_cuts} cuts; "
            f"batch {batch_idx}; "
            f"refine_indexes_iters: {refine_indexes_iters}"
        )
    return CutSet.from_cuts(cuts)
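
# Illustrative aside: the `// 320` above reflects HuBERT's convolutional
# front end, whose total stride is 320 samples on 16 kHz input, so each
# output frame covers 320 / 16000 = 0.02 s, exactly the `frame_shift=0.02`
# value passed to `writer.store_array()`. A minimal check of that
# bookkeeping:
#
#     sample_rate = 16000
#     total_stride = 320
#     num_samples = 3 * sample_rate              # a 3-second utterance
#     assert num_samples // total_stride == 150  # 150 frames
#     assert total_stride / sample_rate == 0.02  # 20 ms frame shift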


@torch.no_grad()
def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()

    assert args.subset in ["clean-100", "clean-360", "other-500"], args.subset

    assert args.return_cuts is True
    assert args.concatenate_cuts is False

    params = AttributeDict()
    params.update(vars(args))
    params.update(vq_config)
    # job_idx is 0-based; manifest_idx is 1-based.

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    params["device"] = device

    cdidx_dir = (
        Path(params.data_dir)
        / f"globalrandom-scaledquantizer-refine_iter-{params.refine_iter}-{params.num_utts}-{params.model_id}-{params.memory_layer}layer-{params.quantizer_id}-bytes_per_frame-{params.bytes_per_frame}-enable-refine-{params.enable_refine}"  # noqa: E501
        / f"splits{params.num_splits}"
    )
    cdidx_dir.mkdir(parents=True, exist_ok=True)

    setup_logger(f"{cdidx_dir}/log/codebook_index")

    logging.info(params)

    logging.info("About to create model")
    quantizer_fn = (
        Path(params.memory_dir)
        / f"globalrandom-{params.num_utts}-{params.model_id}-{params.memory_layer}layer-{params.quantizer_id}-bytes_per_frame_{params.bytes_per_frame}enable_refine_{params.enable_refine}-quantizer.pt"  # noqa: E501
    )
    assert os.path.isfile(quantizer_fn), f"{quantizer_fn}"

    model, processor = load_hubert_model(params)

    quantizer = Quantizer(
        dim=params.memory_embedding_dim,
        num_codebooks=params.bytes_per_frame,
        codebook_size=256,
    )

    quantizer.load_state_dict(torch.load(quantizer_fn))
    # Move the quantizer to the selected device (falls back to CPU when
    # no GPU is available).
    quantizer.to(device)

    model.to(device)
    model.eval()

    cuts = load_manifest(
        Path(params.ori_manifest_dir)
        / f"cuts_train-{params.subset}.{params.manifest_idx}.json.gz"
    )
    sampler = SingleCutSampler(
        cuts,
        max_duration=params.max_duration,
        shuffle=False,
    )
    dataset = K2SpeechRecognitionDataset(
        input_strategy=AudioSamples(),
        return_cuts=True,
    )
    dl = DataLoader(
        dataset,
        sampler=sampler,
        batch_size=None,
        num_workers=params.num_workers,
        persistent_workers=False,
    )

    with NumpyHdf5Writer(
        cdidx_dir / f"{params.subset}-{params.manifest_idx}"
    ) as writer:
        cut_set = compute_codeindices(
            model=model,
            processor=processor,
            dl=dl,
            quantizer=quantizer,
            params=params,
            writer=writer,
        )
        cut_set.to_json(
            cdidx_dir
            / f"cuts_train-{params.subset}-{params.manifest_idx}.json.gz"
        )


torch.set_num_threads(1)
torch.set_num_interop_threads(1)

if __name__ == "__main__":
    main()
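
For context, a minimal sketch of the quantizer round trip that main() sets up. The constructor mirrors the call above, and encode() is used exactly as in compute_codeindices(); the concrete sizes are assumptions (dim=768, the HuBERT-base embedding size, stands in for params.memory_embedding_dim; num_codebooks=4 for params.bytes_per_frame; refine_indexes_iters=5 for params.refine_iter), and the checkpoint path is a placeholder:

import torch
from quantization import Quantizer

quantizer = Quantizer(dim=768, num_codebooks=4, codebook_size=256)
quantizer.load_state_dict(torch.load("quantizer.pt"))  # placeholder path

# (N, T, C) memory embeddings, as in compute_codeindices()
embeddings = torch.randn(2, 100, 768)
indices = quantizer.encode(embeddings, refine_indexes_iters=5)
print(indices.shape)  # expected: (2, 100, 4), one codebook index per byte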

@@ -62,8 +62,9 @@ def get_parser():
     )
 
     parser.add_argument(
-        "--job-idx",
+        "--manifest-idx",
         type=int,
+        help="Split manifest is 1-based."
     )
 
     parser.add_argument(
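
Downstream, the emitted manifest can be read back. The sketch below assumes lhotse's custom-field convention, under which the cut.codebook_indices = writer.store_array(...) assignment above becomes loadable via cut.load_codebook_indices(); the manifest path is a placeholder following the naming scheme in main():

import numpy as np
from lhotse import load_manifest

cuts = load_manifest("cuts_train-clean-100-1.json.gz")  # placeholder path
cut = next(iter(cuts))
# int8 array of shape (T, num_codebooks)
indices = cut.load_codebook_indices()
# int8 storage wraps codebook ids >= 128 to negatives; reinterpret the
# raw bytes as unsigned to recover the full 0..255 range:
indices = indices.view(np.uint8)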