Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-09 01:52:41 +00:00)

commit 448a4eeea7
parent d742043e75

    update hf dataset loading into lhotse
@@ -2,6 +2,7 @@
 # Copyright 2021 Johns Hopkins University (Piotr Żelasko)
 # Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
 # Copyright 2023 Xiaomi Corp. (Zengrui Jin)
+# Copyright 2025 Nvidia (Yuekai Zhang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -23,12 +24,7 @@ from pathlib import Path
 
 import torch
 from datasets import load_dataset
-from lhotse import (
-    CutSet,
-    LilcomChunkyWriter,
-    WhisperFbank,
-    WhisperFbankConfig,
-)
+from lhotse import CutSet, LilcomChunkyWriter, WhisperFbank, WhisperFbankConfig
 
 from icefall.utils import str2bool
 
@@ -93,7 +89,12 @@ def get_parser():
         default="answer",
         help="The key in the Huggingface dataset containing the text data",
     )
+    parser.add_argument(
+        "--prefix",
+        type=str,
+        default="belle",
+        help="""The dataset prefix to use when saving the features""",
+    )
     return parser
 
 
@@ -114,27 +115,28 @@ def compute_fbank(args):
             WhisperFbankConfig(num_filters=args.num_mel_bins, device=device)
         )
     else:
-        extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
+        raise NotImplementedError("Only WhisperFbank is implemented.")
 
     logging.info(f"device: {device}")
 
-    start = 0
-    stop = 1601
+    dataset = load_dataset(
+        args.huggingface_dataset_path_or_name, streaming=True, split="train"
+    )
+    num_shards = dataset.num_shards
     num_digits = 5
-    for i in range(start, stop):
+    for i in range(num_shards):
+        shard = dataset.shard(num_shards, i)
+        shard = shard.take(10)  # for testing
+        logging.info(
+            f"Loading dataset shard {i} from {args.huggingface_dataset_path_or_name}"
+        )
+
         idx = f"{i}".zfill(num_digits)
-        # dataset = load_dataset(args.huggingface_dataset_path_or_name, streaming=True, split=partition)
-        parquet_files = [
-            f"data/train-{idx}-of-01601.parquet",
-        ]
-        parquet_files = [f"{args.huggingface_dataset_path_or_name}/{f}" for f in parquet_files]
-        file_name = parquet_files[0]
-        logging.info(f"Loading dataset from {file_name}")
-        dataset = load_dataset('parquet', data_files=parquet_files, streaming=True, split='train')
 
-        cut_set = CutSet.from_huggingface_dataset(dataset, audio_key=args.audio_key, text_key=args.text_key)
+        cut_set = CutSet.from_huggingface_dataset(
+            shard, audio_key=args.audio_key, text_key=args.text_key
+        )
 
-        logging.info("Splitting cuts into smaller chunks")
         cut_set = cut_set.trim_to_supervisions(
             keep_overlapping=False, min_duration=None
         )
@@ -153,22 +155,13 @@ def compute_fbank(args):
             storage_type=LilcomChunkyWriter,
             overwrite=True,
         )
-        cuts_path = f"{in_out_dir}/cuts_belle.{idx}.jsonl.gz"
+        cuts_path = f"{in_out_dir}/{args.prefix}_cuts.{idx}.jsonl.gz"
         logging.info(f"Saving to {cuts_path}")
-        # cut_set.to_file(cuts_path)
-        remove_recording_item(cut_set, cuts_path)
-
-def remove_recording_item(
-    cuts,
-    output_cuts,
-):
-    """
-    don't store recording item
-    """
-    with CutSet.open_writer(output_cuts) as writer:
-        for cut in cuts:
-            cut.recording.sources = None
-            writer.write(cut)
+        # see https://github.com/lhotse-speech/lhotse/issues/1125
+        cut_set.drop_recordings().to_file(cuts_path)
+        if i > 1:
+            break
+
 
 def main():
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
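
Note (illustrative sketch, not part of the commit): the two hunks above replace the hard-coded parquet file list with Hugging Face streaming shards. A condensed, self-contained version of the resulting flow is shown below. The dataset path, keys, and output locations mirror the stage-1 command later in this diff; the feature-storage call is assumed to be lhotse's compute_and_store_features_batch (the storage_type/overwrite context lines above suggest it, but the exact call is not shown in this excerpt), and the storage_path layout is an assumption.

# Condensed sketch of the new streaming-shard flow (assumed paths/keys;
# requires a `datasets` version where IterableDataset exposes num_shards/shard()).
from datasets import load_dataset
from lhotse import CutSet, LilcomChunkyWriter, WhisperFbank, WhisperFbankConfig

dataset = load_dataset(
    "/workspace/Belle_1.4M-SLAM-Omni", streaming=True, split="train"
)
num_shards = dataset.num_shards
extractor = WhisperFbank(WhisperFbankConfig(num_filters=80, device="cuda"))

for i in range(num_shards):
    shard = dataset.shard(num_shards, i)
    cut_set = CutSet.from_huggingface_dataset(
        shard, audio_key="question_audio", text_key="answer"
    )
    cut_set = cut_set.trim_to_supervisions(keep_overlapping=False, min_duration=None)
    cut_set = cut_set.resample(16000)  # mirrors --resample-to-16kHz True

    idx = f"{i}".zfill(5)
    cut_set = cut_set.compute_and_store_features_batch(
        extractor=extractor,
        storage_path=f"data/fbank_test/belle_feats_{idx}",  # assumed layout
        storage_type=LilcomChunkyWriter,
        overwrite=True,
    )
    # Drop in-memory recording sources before serializing the manifest,
    # see https://github.com/lhotse-speech/lhotse/issues/1125
    cut_set.drop_recordings().to_file(f"data/fbank_test/belle_cuts.{idx}.jsonl.gz")
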
@@ -20,10 +20,10 @@ log() {
 
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   log "stage 0: "
-  pip uninstall lhotse
-  cd /workspace/slam/lhotse
-  git config --global --add safe.directory /workspace/slam/lhotse
-  pip install -e '.[dev]'
+  #pip uninstall lhotse
+  #cd /workspace/slam/lhotse
+  #git config --global --add safe.directory /workspace/slam/lhotse
+  #pip install -e '.[dev]'
   cd -
   pip install -r slam_omni/requirements.txt
 fi
@@ -31,7 +31,12 @@ fi
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   log "stage 1: Download whisper-large-v2 multi-hans-zh fbank feature from huggingface"
 
-  python3 local/compute_whisper_fbank.py
+  python3 local/compute_whisper_fbank.py \
+    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
+    --out-dir data/fbank_test \
+    --huggingface-dataset-path-or-name /workspace/Belle_1.4M-SLAM-Omni \
+    --audio-key question_audio --text-key answer \
+    --prefix belle
 fi
 
 
@@ -52,16 +57,18 @@ fi
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "stage 3: "
   exp_dir=./slam_omni/exp_speech2speech_rerun
+  export PYTHONPATH=$PYTHONPATH:/workspace/CosyVoice
   python3 ./slam_omni/decode.py \
     --max-duration 1 \
     --exp-dir $exp_dir \
     --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
     --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
-    --epoch 997 --avg 1 \
+    --epoch 999 --avg 1 \
     --manifest-dir data/fbank \
     --use-flash-attn True \
-    --method small_test_speech2speech_rerun \
+    --method e2e-epoch10_speech2speech_rerun \
     --enable-speech-output True \
+    --token2wav-path /workspace/CosyVoice-300M-SFT \
     --use-lora True # --on-the-fly-feats True
 
 fi
@@ -24,7 +24,14 @@ from pathlib import Path
 from typing import Any, Dict, Optional
 
 import torch
-from lhotse import CutSet, WhisperFbank, WhisperFbankConfig, load_manifest, load_manifest_lazy
+from datasets import load_dataset
+from lhotse import (
+    CutSet,
+    WhisperFbank,
+    WhisperFbankConfig,
+    load_manifest,
+    load_manifest_lazy,
+)
 from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
     CutConcatenate,
     CutMix,
@@ -38,11 +45,11 @@ from lhotse.dataset.input_strategies import (  # noqa F401 For AudioSamples
     OnTheFlyFeatures,
 )
 from lhotse.utils import fix_random_seed
+from speech_dataset import K2SpeechRecognitionDataset
 from torch.utils.data import DataLoader
-from datasets import load_dataset
 
 from icefall.utils import str2bool
-from speech_dataset import K2SpeechRecognitionDataset
 
 
 class _SeedWorkers:
     def __init__(self, seed: int):
@@ -310,7 +317,9 @@ class AsrDataModule:
             # Drop feats to be on the safe side.
             train = K2SpeechRecognitionDataset(
                 cut_transforms=transforms,
-                input_strategy=OnTheFlyFeatures(WhisperFbank(WhisperFbankConfig(num_filters=80, device='cuda'))),
+                input_strategy=OnTheFlyFeatures(
+                    WhisperFbank(WhisperFbankConfig(num_filters=80, device="cuda"))
+                ),
                 input_transforms=input_transforms,
                 return_cuts=self.args.return_cuts,
             )
@@ -365,7 +374,9 @@ class AsrDataModule:
         logging.info("About to create dev dataset")
 
         validate = K2SpeechRecognitionDataset(
-            input_strategy=OnTheFlyFeatures(WhisperFbank(WhisperFbankConfig(num_filters=80, device='cuda')))
+            input_strategy=OnTheFlyFeatures(
+                WhisperFbank(WhisperFbankConfig(num_filters=80, device="cuda"))
+            )
             if self.args.on_the_fly_feats
             else eval(self.args.input_strategy)(),
             return_cuts=self.args.return_cuts,
@@ -390,7 +401,9 @@ class AsrDataModule:
     def test_dataloaders(self, cuts: CutSet) -> DataLoader:
         logging.debug("About to create test dataset")
         test = K2SpeechRecognitionDataset(
-            input_strategy=OnTheFlyFeatures(WhisperFbank(WhisperFbankConfig(num_filters=80, device='cpu')))
+            input_strategy=OnTheFlyFeatures(
+                WhisperFbank(WhisperFbankConfig(num_filters=80, device="cpu"))
+            )
             if self.args.on_the_fly_feats
             else eval(self.args.input_strategy)(),
             return_cuts=self.args.return_cuts,
@@ -419,16 +432,27 @@ class AsrDataModule:
             parquet_files = [
                 f"data/train-{idx}-of-01601.parquet",
             ]
-            parquet_files = [f"{self.args.huggingface_dataset_path_or_name}/{f}" for f in parquet_files]
+            parquet_files = [
+                f"{self.args.huggingface_dataset_path_or_name}/{f}"
+                for f in parquet_files
+            ]
             file_name = parquet_files[0]
             logging.info(f"Loading dataset from {file_name}")
-            dataset = load_dataset('parquet', data_files=parquet_files, streaming=True, split='train')
-            cut_set = CutSet.from_huggingface_dataset(dataset, audio_key=self.args.audio_key, text_key=self.args.text_key)
+            dataset = load_dataset(
+                "parquet", data_files=parquet_files, streaming=True, split="train"
+            )
+            cut_set = CutSet.from_huggingface_dataset(
+                dataset, audio_key=self.args.audio_key, text_key=self.args.text_key
+            )
             if self.args.resample_to_16kHz:
                 cut_set = cut_set.resample(16000)
-            return {'test':cut_set}
+            return {"test": cut_set}
         else:
-            return {'test':load_manifest_lazy(self.args.manifest_dir / "cuts_belle.00000.jsonl.gz")}
+            # return {'test':load_manifest_lazy(self.args.manifest_dir / "cuts_belle.00000.jsonl.gz")}
+            # return {'test':load_manifest_lazy(self.args.manifest_dir / "cuts_test_small.jsonl.gz")}
+            return {
+                "test": load_manifest_lazy("data/fbank_test/belle_cuts.00000.jsonl.gz")
+            }
 
     @lru_cache()
     def dev_cuts(self) -> CutSet:
@@ -436,8 +460,9 @@ class AsrDataModule:
         if self.args.on_the_fly_feats:
             pass
         else:
-            return load_manifest_lazy(self.args.manifest_dir / "cuts_belle.00000.jsonl.gz")
+            return load_manifest_lazy(
+                self.args.manifest_dir / "cuts_belle.00000.jsonl.gz"
+            )
 
     @lru_cache()
     def train_cuts(self) -> CutSet:
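
Note (illustrative sketch, not part of the commit): the data-module hunks above make test_cuts() return a dict mapping a split name to a CutSet, either streamed from parquet (when --on-the-fly-feats is set) or loaded from the manifest precomputed by local/compute_whisper_fbank.py. A quick way to inspect that precomputed manifest looks roughly like this; the path is the one hard-coded in the new test_cuts() branch.

# Minimal inspection sketch for the precomputed test manifest.
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank_test/belle_cuts.00000.jsonl.gz")
for cut in cuts:
    # Features are stored with each cut; the raw recording sources were
    # dropped before saving (lhotse issue 1125), so only features and
    # supervision text remain.
    print(cut.id, round(cut.duration, 2), cut.supervisions[0].text)
    break
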
@@ -48,32 +48,74 @@ python3 ./whisper_llm_zh/decode.py \
 
 import argparse
 import logging
+import sys
 from collections import defaultdict
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 
+import soundfile as sf
 import torch
 import torch.nn as nn
 import transformers
 import whisper
+from cosyvoice.cli.cosyvoice import CosyVoice
 from data_module import AsrDataModule
 from lhotse.cut import Cut
 from model import SPEECH_LLM, EncoderProjector
 
 from peft import LoraConfig, get_peft_model
-from train import DEFAULT_SPEECH_TOKEN
+from train import DEFAULT_SPEECH_TOKEN, add_model_arguments
 from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2Config
 from whisper_encoder_forward_monkey_patch import replace_whisper_encoder_forward
-from train import add_model_arguments
 
 from icefall.env import get_env_info
 from icefall.utils import (
     AttributeDict,
     setup_logger,
     store_transcripts,
     write_error_stats,
-    average_checkpoints,
 )
 
+sys.path.append("/workspace/CosyVoice/third_party/Matcha-TTS")
+
+
+def audio_decode_cosyvoice(audio_tokens, codec_decoder):
+    """
+    Generate audio from tokens with optional tone and prompt embedding.
+
+    Args:
+        audio_tokens (list): List of audio tokens to be processed.
+        codec_decoder: Codec decoder for generating audio.
+
+    Returns:
+        torch.Tensor: Generated audio waveform.
+    """
+    flow_embedding = codec_decoder.frontend.spk2info["中文女"]["embedding"]
+    flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int32)
+    prompt_speech_feat = torch.zeros(1, 0, 80)
+    tts_mel, _ = codec_decoder.model.flow.inference(
+        token=audio_tokens.to(codec_decoder.model.device),
+        token_len=torch.tensor([audio_tokens.shape[1]], dtype=torch.int32).to(
+            codec_decoder.model.device
+        ),
+        prompt_token=flow_prompt_speech_token.to(codec_decoder.model.device),
+        prompt_token_len=torch.tensor(
+            [flow_prompt_speech_token.shape[1]], dtype=torch.int32
+        ).to(codec_decoder.model.device),
+        prompt_feat=prompt_speech_feat.to(codec_decoder.model.device),
+        prompt_feat_len=torch.tensor(
+            [prompt_speech_feat.shape[1]], dtype=torch.int32
+        ).to(codec_decoder.model.device),
+        embedding=flow_embedding.to(codec_decoder.model.device),
+        flow_cache=torch.zeros(1, 80, 0, 2).to(codec_decoder.model.device),
+    )
+
+    audio_hat, _ = codec_decoder.model.hift.inference(
+        speech_feat=tts_mel, cache_source=torch.zeros(1, 1, 0)
+    )
+
+    return audio_hat
+
+
 def get_model(params, device):
     """Load and prepare the speech-to-speech model."""
     if params.remove_whisper_encoder_input_length_restriction:
@@ -136,7 +178,7 @@ def get_model(params, device):
     # Determine attn_implementation and torch_dtype based on use_flash_attn
     if params.use_flash_attn:
         attn_implementation = "flash_attention_2"
         torch_dtype = torch.float16  # Or torch.bfloat16 if needed/supported
     else:
         attn_implementation = "eager"
         torch_dtype = torch.float16
@@ -162,7 +204,7 @@ def get_model(params, device):
         codec_lm = AutoModelForCausalLM.from_config(
             config=config,
             attn_implementation=attn_implementation,
-            torch_dtype=torch_dtype
+            torch_dtype=torch_dtype,
         )
         # cosyvoice2_token_size = 6561
         codec_lm.resize_token_embeddings(codec_vocab_size)
@@ -197,7 +239,7 @@ def get_model(params, device):
         llm,
         encoder_projector,
         codec_lm,
-        codec_lm_padding_side= "left" if params.use_flash_attn else "right",
+        codec_lm_padding_side="left" if params.use_flash_attn else "right",
     )
 
     if params.avg > 1:
@@ -325,6 +367,12 @@ def get_parser():
         help="The experiment dir",
     )
 
+    parser.add_argument(
+        "--token2wav-path",
+        type=str,
+        default="/workspace/CosyVoice-300M-SFT",
+        help="The path to the token2wav model",
+    )
     # parser.add_argument(
     #     "--dataset",
     #     type=str,
@@ -350,6 +398,7 @@ def decode_one_batch(
     params: AttributeDict,
     model: nn.Module,
     tokenizer: AutoTokenizer,
+    token2wav_model: nn.Module,
    batch: dict,
 ) -> Dict[str, List[List[int]]]:
     """Decode one batch and return the result in a dict. The dict has the
@@ -431,26 +480,32 @@ def decode_one_batch(
     # {"role": "assistant", "content": ""},
     # ]
     # ] * len(feature)
-    questions_with_history = [cut.custom["question"] for cut in batch["supervisions"]["cut"]]
-    history_contexts = [question.rsplit('<USER>:', 1)[0].strip() for question in questions_with_history]
-    last_questions = [question.split('<USER>: ')[-1].strip() for question in questions_with_history]
+    questions_with_history = [
+        cut.custom["question"] for cut in batch["supervisions"]["cut"]
+    ]
+    history_contexts = [
+        question.rsplit("<USER>:", 1)[0].strip() for question in questions_with_history
+    ]
+    last_questions = [
+        question.split("<USER>: ")[-1].strip() for question in questions_with_history
+    ]
     messages = []
     for i, total_round in enumerate(chat_rounds):
         message = []
         if total_round > 1:
-            history_question_answer = history_contexts[i].split('USER:')
+            history_question_answer = history_contexts[i].split("USER:")
             history_question_answer = [item for item in history_question_answer if item]
             for j in range(total_round - 1):
                 # USER: 生成一个关于夏天的诗歌。 ASSISTANT: 夏日炎炎,万物生长,阳光明媚,享受着夏日的美好时光。 USER: 给我列举一些新闻头条。 ASSISTANT: 当今社会的新闻永远不会停。
-                question_answer = history_question_answer[j].split('ASSISTANT:')
+                question_answer = history_question_answer[j].split("ASSISTANT:")
                 message += [
                     {"role": "user", "content": question_answer[0].strip()},
-                    {"role": "assistant", "content": question_answer[1].strip()}
+                    {"role": "assistant", "content": question_answer[1].strip()},
                 ]
         message += [
             {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}"},
            # {"role": "user", "content": f"{last_questions[i]}"},
-            {"role": "assistant", "content": ""}
+            {"role": "assistant", "content": ""},
         ]
         print(f"message: {message}, batch_size {len(chat_rounds)}")
         messages.append(message)
@@ -461,16 +516,21 @@ def decode_one_batch(
             feature, input_ids.to(device, dtype=torch.long), attention_mask.to(device)
         )
         cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
-        for cut_id in cut_ids:
-            speech_token_file_name = (
-                params.log_dir / f"{cut_id}.txt"
-            )
-            with open(speech_token_file_name, 'w') as f:
-                # save_path = params.exp_dir / f"speech_output/{cut_id}.wav"
-                #torchaudio.save(save_path, speech_output.cpu(), 16000)
-                # print(f"speech_output: {generated_speech_output}, cut_id: {cut_id}")
-                save_str = " ".join([str(i) for i in generated_speech_output])
-                f.write(f"{cut_id}|{save_str}\n")
+        generated_speech_output = [
+            generated_speech_output
+        ]  # WAR: only support batch = 1 for now
+        for cut_id, audio_tokens in zip(cut_ids, generated_speech_output):
+            speech_file_name = params.log_dir / f"{cut_id}.wav"
+            audio_tokens = [token for token in audio_tokens if token < 4096]
+            audio_tokens = torch.tensor(audio_tokens, dtype=torch.int32).unsqueeze(0)
+            audio_hat = audio_decode_cosyvoice(audio_tokens, token2wav_model)
+            sf.write(speech_file_name, audio_hat.squeeze(0).cpu().numpy(), 22050)
+            # with open(speech_token_file_name, 'w') as f:
+            #     # save_path = params.exp_dir / f"speech_output/{cut_id}.wav"
+            #     #torchaudio.save(save_path, speech_output.cpu(), 16000)
+            #     # print(f"speech_output: {generated_speech_output}, cut_id: {cut_id}")
+            #     save_str = " ".join([str(i) for i in generated_speech_output])
+            #     f.write(f"{cut_id}|{save_str}\n")
 
     else:
         generated_ids = model.decode(
@@ -486,6 +546,7 @@ def decode_dataset(
     params: AttributeDict,
     model: nn.Module,
     tokenizer: AutoTokenizer,
+    token2wav_model: nn.Module,
 ) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
     """Decode dataset.
 
@@ -548,14 +609,23 @@ def decode_dataset(
     results = defaultdict(list)
     for batch_idx, batch in enumerate(dl):
         answers = batch["supervisions"]["text"]
-        questions_with_history = [cut.custom["question"] for cut in batch["supervisions"]["cut"]]
-        answer_cosyvoice_speech_token = [cut.custom["answer_cosyvoice_speech_token"] for cut in batch["supervisions"]["cut"]]
-        texts = [question.split('<USER>: ')[-1].strip() for question in questions_with_history]
+        questions_with_history = [
+            cut.custom["question"] for cut in batch["supervisions"]["cut"]
+        ]
+        answer_cosyvoice_speech_token = [
+            cut.custom["answer_cosyvoice_speech_token"]
+            for cut in batch["supervisions"]["cut"]
+        ]
+        texts = [
+            question.split("<USER>: ")[-1].strip()
+            for question in questions_with_history
+        ]
         cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
 
         hyps_dict = decode_one_batch(
             params=params,
             model=model,
+            token2wav_model=token2wav_model,
             batch=batch,
             tokenizer=tokenizer,
         )
@@ -643,9 +713,7 @@ def main():
     params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
     params.log_dir = Path(params.exp_dir) / f"log-{params.method}"
     params.log_dir.mkdir(parents=True, exist_ok=True)
-    setup_logger(
-        f"{params.exp_dir}/log-{params.method}/log-decode-{params.suffix}"
-    )
+    setup_logger(f"{params.exp_dir}/log-{params.method}/log-decode-{params.suffix}")
 
     logging.info("Decoding started")
     logging.info(params)
@@ -657,6 +725,9 @@ def main():
     logging.info(f"device: {device}")
 
     model, tokenizer = get_model(params, device)
+    token2wav_model = CosyVoice(
+        params.token2wav_path, load_jit=False, load_trt=False, fp16=False
+    )
 
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")
@@ -697,6 +768,7 @@ def main():
             dl=test_dl,
             params=params,
             model=model,
+            token2wav_model=token2wav_model,
             tokenizer=tokenizer,
         )
 
@@ -66,8 +66,9 @@ from lhotse.cut import Cut
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
 from model import IGNORE_TOKEN_ID, SPEECH_LLM, EncoderProjector
+
 # from multi_dataset import MultiDataset
-from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+from peft import LoraConfig, get_peft_model
 from torch import Tensor
 from torch.utils.tensorboard import SummaryWriter
 from transformers import (
@@ -146,6 +147,7 @@ def add_model_arguments(parser: argparse.ArgumentParser):
         help="Whether to enable speech codec output.",
     )
 
+
 def get_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -332,9 +334,7 @@ def compute_loss(
     # remove too long text
     # texts = [ text for text in texts if len(text) < 1024 ]
     if len(texts) != len(messages):
-        logging.warning(
-            f"Remove too long text, {messages} "
-        )
+        logging.warning(f"Remove too long text, {messages} ")
     max_len_texts = max([len(text) for text in texts])
     if tokenizer.padding_side == "right":
         texts = [
@@ -354,10 +354,10 @@ def compute_loss(
     # first get the indices of the tokens
     mask_prompt = True
     if mask_prompt:
-        default_speech_token_id = tokenizer.convert_tokens_to_ids(DEFAULT_SPEECH_TOKEN)
-        mask_indices = torch.where(
-            input_ids == default_speech_token_id
+        default_speech_token_id = tokenizer.convert_tokens_to_ids(
+            DEFAULT_SPEECH_TOKEN
         )
+        mask_indices = torch.where(input_ids == default_speech_token_id)
         for i in range(mask_indices[0].size(0)):
             row = mask_indices[0][i]
             col = mask_indices[1][i]
@@ -382,11 +382,20 @@ def compute_loss(
     batch_idx_train = params.batch_idx_train
 
     answers = batch["supervisions"]["text"]
-    questions_with_history = [cut.custom["question"] for cut in batch["supervisions"]["cut"]]
+    questions_with_history = [
+        cut.custom["question"] for cut in batch["supervisions"]["cut"]
+    ]
     chat_rounds = [cut.custom["round"] for cut in batch["supervisions"]["cut"]]
-    answer_cosyvoice_speech_token = [cut.custom["answer_cosyvoice_speech_token"] for cut in batch["supervisions"]["cut"]]
-    last_questions = [question.split('<USER>: ')[-1].strip() for question in questions_with_history]
-    history_contexts = [question.rsplit('<USER>:', 1)[0].strip() for question in questions_with_history]
+    answer_cosyvoice_speech_token = [
+        cut.custom["answer_cosyvoice_speech_token"]
+        for cut in batch["supervisions"]["cut"]
+    ]
+    last_questions = [
+        question.split("<USER>: ")[-1].strip() for question in questions_with_history
+    ]
+    history_contexts = [
+        question.rsplit("<USER>:", 1)[0].strip() for question in questions_with_history
+    ]
     # USER: 生成一个关于夏天的诗歌。 ASSISTANT: 夏日炎炎,万物生长,阳光明媚,享受着夏日的美好时光。 USER: 给我列举一些新闻头条。 ASSISTANT: 当今社会的新闻永远不会停。<USER>: 告诉我如何烹饪鸡肉
     # <USER>: 对以下句子进行鉴赏:他心地善良。输出结果为"他是一个有善心的人。
 
@@ -394,18 +403,18 @@ def compute_loss(
     for i, total_round in enumerate(chat_rounds):
         message = []
         if total_round > 1:
-            history_question_answer = history_contexts[i].split('USER:')
+            history_question_answer = history_contexts[i].split("USER:")
             history_question_answer = [item for item in history_question_answer if item]
             for j in range(total_round - 1):
                 # USER: 生成一个关于夏天的诗歌。 ASSISTANT: 夏日炎炎,万物生长,阳光明媚,享受着夏日的美好时光。 USER: 给我列举一些新闻头条。 ASSISTANT: 当今社会的新闻永远不会停。
-                question_answer = history_question_answer[j].split('ASSISTANT:')
+                question_answer = history_question_answer[j].split("ASSISTANT:")
                 message += [
                     {"role": "user", "content": question_answer[0].strip()},
-                    {"role": "assistant", "content": question_answer[1].strip()}
+                    {"role": "assistant", "content": question_answer[1].strip()},
                 ]
         message += [
             {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}"},
-            {"role": "assistant", "content": answers[i]}
+            {"role": "assistant", "content": answers[i]},
         ]
         messages.append(message)
 
@@ -423,7 +432,13 @@ def compute_loss(
             labels=target_ids.to(device),
         )
     else:
-        text_loss, acc, codec_loss, codec_acc, codec_topk_acc = model.forward_with_speech_output(
+        (
+            text_loss,
+            acc,
+            codec_loss,
+            codec_acc,
+            codec_topk_acc,
+        ) = model.forward_with_speech_output(
             fbank=feature,
             input_ids=input_ids.to(device),
             attention_mask=attention_mask.to(device),
@@ -445,12 +460,8 @@ def compute_loss(
             acc * info["frames"]
         )  # WAR: to avoid normalization by the number of frames
         if params.enable_speech_output:
-            info["codec_acc"] = (
-                codec_acc * info["frames"]
-            )
-            info["codec_topk_acc"] = (
-                codec_topk_acc * info["frames"]
-            )
+            info["codec_acc"] = codec_acc * info["frames"]
+            info["codec_topk_acc"] = codec_topk_acc * info["frames"]
             info["codec_loss"] = codec_loss.detach().cpu().item()
             info["text_loss"] = text_loss.detach().cpu().item()
     return loss, info
@@ -469,7 +480,7 @@ def compute_validation_loss(
     tot_loss = MetricsTracker()
 
     for batch_idx, batch in enumerate(valid_dl):
-        with torch.amp.autocast('cuda', enabled=params.use_fp16):
+        with torch.amp.autocast("cuda", enabled=params.use_fp16):
             loss, loss_info = compute_loss(
                 params=params,
                 tokenizer=tokenizer,
@@ -584,7 +595,7 @@ def train_one_epoch(
                     f"rm -rf {params.exp_dir}/epoch-{params.cur_epoch}-checkpoint-{batch_idx}"
                 )
         try:
-            with torch.amp.autocast('cuda', enabled=params.use_fp16):
+            with torch.amp.autocast("cuda", enabled=params.use_fp16):
                 loss, loss_info = compute_loss(
                     params=params,
                     tokenizer=tokenizer,
@@ -722,7 +733,6 @@ def run(rank, world_size, args):
     # model.resize_token_embeddings(len(tokenizer))
     # model.vocab_size = len(tokenizer)
 
-
     llm.config.pad_token_id = tokenizer.pad_token_id
     llm.config.default_speech_token_id = tokenizer.convert_tokens_to_ids(
         DEFAULT_SPEECH_TOKEN
@@ -736,12 +746,11 @@ def run(rank, world_size, args):
            param.requires_grad = False
        encoder_projector.eval()
 
-
    if params.enable_speech_output:
        # Determine attn_implementation and torch_dtype based on use_flash_attn
        if params.use_flash_attn:
            attn_implementation = "flash_attention_2"
            torch_dtype = torch.float16  # Or torch.bfloat16 if needed/supported
        else:
            attn_implementation = "eager"
            torch_dtype = torch.float16
@@ -768,7 +777,7 @@ def run(rank, world_size, args):
         codec_lm = AutoModelForCausalLM.from_config(
             config=config,
             attn_implementation=attn_implementation,
-            torch_dtype=torch_dtype
+            torch_dtype=torch_dtype,
         )
         # cosyvoice2_token_size = 6561
         codec_lm.resize_token_embeddings(codec_vocab_size)
@@ -803,7 +812,7 @@ def run(rank, world_size, args):
         llm,
         encoder_projector,
         codec_lm,
-        codec_lm_padding_side= "left" if params.use_flash_attn else "right",
+        codec_lm_padding_side="left" if params.use_flash_attn else "right",
     )
 
     if params.pretrained_model_path:
@@ -851,12 +860,11 @@ def run(rank, world_size, args):
         codec_len = len(c.custom["answer_cosyvoice_speech_token"])
         if codec_len > 2200:
             logging.warning(
                 f"Exclude cut with ID {c.id} from training. Duration: {c.duration}, lenth: {codec_len}"
             )
             return False
         return True
 
 
     train_cuts = data_module.train_cuts()
-
     train_cuts = train_cuts.filter(remove_short_and_long_utt)