add codec decode

Yuekai Zhang 2025-04-21 17:57:57 +08:00
parent 09d81b44a7
commit 23fdef2fd3
3 changed files with 250 additions and 18 deletions

View File

@@ -346,10 +346,19 @@ def decode_one_batch(
messages.append(message)
input_ids, attention_mask = preprocess(messages, tokenizer)
generated_ids = model.decode(
feature, input_ids.to(device, dtype=torch.long), attention_mask.to(device)
)
if params.enable_speech_output:
generated_ids, generated_speech_output = model.decode_with_speech_output(
feature, input_ids.to(device, dtype=torch.long), attention_mask.to(device)
)
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
for cut_id, speech_output in zip(cut_ids, generated_speech_output):
# save_path = params.exp_dir / f"speech_output/{cut_id}.wav"
# torchaudio.save(save_path, speech_output.cpu(), 16000)
print(f"speech_output: {speech_output}, cut_id: {cut_id}")
else:
generated_ids = model.decode(
feature, input_ids.to(device, dtype=torch.long), attention_mask.to(device)
)
hyps = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
return {"beam-search": hyps}
@@ -586,10 +595,71 @@ def main():
speech_encoder_dim, llm.config.hidden_size, params.encoder_projector_ds_rate
)
if params.enable_speech_output:
# Determine attn_implementation and torch_dtype based on use_flash_attn
if params.use_flash_attn:
attn_implementation = "flash_attention_2"
torch_dtype = torch.float16 # Or torch.bfloat16 if needed/supported
else:
attn_implementation = "eager"
torch_dtype = torch.float16
# codec_lm = AutoModelForCausalLM.from_pretrained(
# params.llm_path_or_name,
# attn_implementation=attn_implementation,
# torch_dtype=torch_dtype,
# )
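# Instead of loading a pretrained LM, the codec LM is built from scratch with the small Qwen2 config below.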
codec_vocab_size = 8192
config = Qwen2Config(
vocab_size=codec_vocab_size,
hidden_size=1024,
num_hidden_layers=12,
num_attention_heads=16,
num_key_value_heads=16,
intermediate_size=2048,
max_position_embeddings=4096,
)
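# A small decoder-only Transformer (12 layers, 1024 hidden size, 16 heads) over the 8192-entry codec vocabulary.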
# codec_lm = Qwen2ForCausalLM(config=config)
# Pass attn_implementation and torch_dtype to the constructor
# Use AutoModelForCausalLM.from_config for more generality
codec_lm = AutoModelForCausalLM.from_config(
config=config,
attn_implementation=attn_implementation,
torch_dtype=torch_dtype
)
# cosyvoice2_token_size = 6561
codec_lm.resize_token_embeddings(codec_vocab_size)
codec_lm.vocab_size = codec_vocab_size
codec_lm.config.pad_token_id = codec_vocab_size - 1
codec_lm.config.eos_token_id = codec_vocab_size - 2
codec_lm.config.bos_token_id = codec_vocab_size - 3
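# The last three entries of the codec vocabulary are reserved as pad / eos / bos for the codec LM.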
if params.use_lora:
lora_config = LoraConfig(
r=64,
lora_alpha=16,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"up_proj",
"gate_proj",
"down_proj",
],
lora_dropout=0.05,
task_type="CAUSAL_LM",
)
codec_lm = get_peft_model(codec_lm, lora_config)
codec_lm.print_trainable_parameters()
else:
codec_lm = None
model = SPEECH_LLM(
speech_encoder,
llm,
encoder_projector,
codec_lm,
codec_lm_padding_side="left" if params.use_flash_attn else "right",
)
if params.avg > 1:

View File

@@ -1,6 +1,7 @@
import torch
from torch import nn
from transformers.trainer_pt_utils import LabelSmoother
from typing import List, Tuple # Added for type hints
IGNORE_TOKEN_ID = LabelSmoother.ignore_index
@@ -444,6 +445,168 @@ class SPEECH_LLM(nn.Module):
# )
return generated_ids
def decode_with_speech_output(
self,
fbank: torch.Tensor = None,
input_ids: torch.LongTensor = None, # Prompt input_ids
attention_mask: torch.Tensor = None, # Prompt attention_mask
max_text_new_tokens: int = 1024,
max_speech_new_tokens: int = 1024, # Max length for speech tokens
llm_kwargs: dict = None, # Kwargs for text LLM generate
codec_lm_kwargs: dict = None # Kwargs for codec LM (e.g., temperature for sampling) - NOT IMPLEMENTED YET
) -> Tuple[torch.LongTensor, List[List[int]]]:
"""
Generates text and corresponding speech tokens using the revised logic.
Args:
fbank: Input audio features.
input_ids: Input token IDs for the text prompt.
attention_mask: Attention mask for the text prompt.
max_text_new_tokens: Max new tokens for text generation.
max_speech_new_tokens: Max new tokens for speech generation.
llm_kwargs: Additional arguments for self.llm.generate.
codec_lm_kwargs: Additional arguments for self.codec_lm.generate.
Returns:
Tuple[torch.LongTensor, List[List[int]]]:
- generated_text_ids: Tensor of generated text token IDs (including prompt).
- generated_speech_tokens: List of lists, where each inner list contains
the generated speech codec tokens for a batch item.
"""
if not self.codec_lm or not self.speech_token_projector or not self.codec_lm_head:
raise ValueError("codec_lm and associated layers must be initialized to generate speech output.")
device = next(self.parameters()).device # Use model's device
batch_size = fbank.shape[0]
# --- 1. Prepare Prompt Embeddings ---
encoder_outs = self.encoder(fbank)
speech_features = self.encoder_projector(encoder_outs)
speech_features = speech_features.to(self.llm.dtype) # Ensure matching dtype
prompt_embeds = self.llm.get_input_embeddings()(input_ids)
# Merge speech features with prompt embeddings
(
merged_prompt_inputs_embeds,
merged_prompt_attention_mask,
_,
_,
) = self._merge_input_ids_with_speech_features(
speech_features, prompt_embeds, input_ids, attention_mask
)
# --- 2. Generate Text using LLM ---
# Use merged embeds/mask as input to generate
# Ensure kwargs passed are suitable for llm.generate
# Note: Using default generation params from `decode` if not provided in kwargs
final_llm_kwargs = {
"bos_token_id": self.llm.config.bos_token_id,
"eos_token_id": self.llm.config.eos_token_id,
"pad_token_id": self.llm.config.pad_token_id,
"num_beams": 1,
"do_sample": True, # Typically false for S2ST/S2TT tasks unless exploration needed
"top_p": 0.5,
"top_k": 20,
"repetition_penalty": 1.1,
"temperature": 0.7,
**(llm_kwargs or {}) # User-provided kwargs override defaults
}
text_outputs = self.llm.generate(
inputs_embeds=merged_prompt_inputs_embeds,
attention_mask=merged_prompt_attention_mask,
max_new_tokens=max_text_new_tokens,
return_dict_in_generate=True,
**final_llm_kwargs
)
# With return_dict_in_generate=True, generate() returns a ModelOutput whose
# `sequences` field holds the generated token IDs, shape [B, S_full].
generated_text_ids = text_outputs.sequences
# --- 3. Get LLM Hidden States for the *Full* Generated Text Sequence ---
# Run a separate forward pass to reliably get hidden states for the complete sequence.
# This is simpler than parsing the complex output of generate with output_hidden_states=True.
full_text_embeds = self.llm.get_input_embeddings()(generated_text_ids) # [B, S_full, D_llm]
full_text_attention_mask = (generated_text_ids != self.llm.config.pad_token_id).long() # [B, S_full]
# --- 4. Project Hidden States ---
projected_text_embeds = self.speech_token_projector(full_text_embeds) # Shape [B, S_full, D_codec]
# --- 5. Generate Speech Tokens (Autoregressive Loop with Text Context) ---
self.codec_lm.to(device)
self.codec_lm_head.to(device)
# Initial input for the codec LM is the BOS token
current_speech_input_ids = torch.full(
(batch_size, 1), self.codec_lm.config.bos_token_id, dtype=torch.long, device=device
)
past_key_values = None
generated_speech_tokens_list = [[] for _ in range(batch_size)]
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=device)
text_context_len = projected_text_embeds.shape[1] # S_full
for t in range(max_speech_new_tokens):
# Get embedding for the *current* input token ID (initially BOS, then generated tokens)
current_speech_embeds = self.codec_lm.get_input_embeddings()(current_speech_input_ids) # [B, 1, D_codec]
# Add the projected text embedding corresponding to the current timestep `t`
if t < text_context_len:
# Text context from the full generated text sequence
current_text_context_embed = projected_text_embeds[:, t:t+1, :] # [B, 1, D_codec]
inputs_embeds = current_speech_embeds + current_text_context_embed
else:
# No more text context to add
inputs_embeds = current_speech_embeds
# Ensure inputs_embeds has the correct dtype for the codec_lm
inputs_embeds = inputs_embeds.to(next(self.codec_lm.parameters()).dtype)
# Forward pass through codec LM for one step
# We provide inputs_embeds directly, bypassing prepare_inputs_for_generation
codec_outputs = self.codec_lm(
inputs_embeds=inputs_embeds, # Combined embedding for this step
past_key_values=past_key_values,
use_cache=True,
return_dict=True,
output_hidden_states=True, # CausalLM outputs expose hidden states via `hidden_states`, not `last_hidden_state`
# No attention mask needed here when using past_key_values and single token input
)
# Take the final layer's hidden state for this step and project it to codec logits
next_token_logits = self.codec_lm_head(codec_outputs.hidden_states[-1][:, -1:, :])
# --- Process Output & Update State ---
# Greedy decoding (can be replaced with sampling based on codec_lm_kwargs)
# TODO: Implement sampling/beam search for codec LM if needed
next_token_ids = torch.argmax(next_token_logits, dim=-1) # Greedy [B, 1]
# Mask out finished sequences
next_token_ids = next_token_ids * unfinished_sequences.unsqueeze(-1) + \
self.codec_lm.config.pad_token_id * (1 - unfinished_sequences.unsqueeze(-1))
# Store generated tokens for unfinished sequences
for i in range(batch_size):
if unfinished_sequences[i]:
token_id = next_token_ids[i].item()
if token_id == self.codec_lm.config.eos_token_id:
unfinished_sequences[i] = 0 # Mark as finished
elif token_id != self.codec_lm.config.pad_token_id:
generated_speech_tokens_list[i].append(token_id)
# Prepare for next iteration
current_speech_input_ids = next_token_ids # Use the newly generated token ID as input for next step
past_key_values = codec_outputs.past_key_values # Update KV cache
# Stop if all sequences are finished
if unfinished_sequences.max() == 0:
break
# --- 6. Return Results ---
return generated_text_ids, generated_speech_tokens_list
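For reference, a hedged usage sketch of this method as invoked from the decode script in this commit (feature, input_ids, attention_mask, device, and tokenizer are assumed to be prepared as in decode_one_batch):

generated_ids, generated_speech_output = model.decode_with_speech_output(
    feature, input_ids.to(device, dtype=torch.long), attention_mask.to(device)
)
hyps = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
for hyp, speech_tokens in zip(hyps, generated_speech_output):
    # speech_tokens is a Python list of codec token IDs for one utterance
    print(hyp, len(speech_tokens))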
def compute_accuracy(pad_outputs, pad_targets, ignore_label):
"""Calculate accuracy.

View File

@@ -133,20 +133,6 @@ def add_model_arguments(parser: argparse.ArgumentParser):
help="Whether to use lora to fine-tune llm.",
)
parser.add_argument(
"--unfreeze-llm",
type=str2bool,
default=False,
help="Whether to unfreeze llm during training.",
)
parser.add_argument(
"--unfreeze-speech-projector",
type=str2bool,
default=False,
help="Whether to unfreeze speech adaptor during training.",
)
parser.add_argument(
"--enable-speech-output",
type=str2bool,
@@ -224,6 +210,19 @@ def get_parser():
help="Whether to use half precision training.",
)
parser.add_argument(
"--unfreeze-llm",
type=str2bool,
default=False,
help="Whether to unfreeze llm during training.",
)
parser.add_argument(
"--unfreeze-speech-projector",
type=str2bool,
default=False,
help="Whether to unfreeze speech adaptor during training.",
)
parser = deepspeed.add_config_arguments(parser)
add_model_arguments(parser)