From b70fe33528cd3cb12cc4aef8618572b657d3aaa0 Mon Sep 17 00:00:00 2001 From: sablin39 <1020030829@qq.com> Date: Sun, 23 Mar 2025 20:30:09 +0800 Subject: [PATCH 1/7] Initial commit after copy --- framefusion/interface.py | 145 + framefusion/main.py | 314 ++ .../modeling_llava_next_video.py | 236 ++ .../llava_video/modeling_llava_video.py | 339 +++ .../models/minicpmv/modeling_minicpmv.py | 109 + framefusion/models/qwen2/modeling_qwen2.py | 333 +++ .../models/qwen2/modeling_qwen2_baseline.py | 2562 +++++++++++++++++ framefusion/utils.py | 101 + 8 files changed, 4139 insertions(+) create mode 100644 framefusion/interface.py create mode 100644 framefusion/main.py create mode 100644 framefusion/models/llava_next_video/modeling_llava_next_video.py create mode 100644 framefusion/models/llava_video/modeling_llava_video.py create mode 100644 framefusion/models/minicpmv/modeling_minicpmv.py create mode 100644 framefusion/models/qwen2/modeling_qwen2.py create mode 100644 framefusion/models/qwen2/modeling_qwen2_baseline.py create mode 100644 framefusion/utils.py diff --git a/framefusion/interface.py b/framefusion/interface.py new file mode 100644 index 0000000..a39af80 --- /dev/null +++ b/framefusion/interface.py @@ -0,0 +1,145 @@ +# common imports +from types import MethodType +from typing import Callable +import torch +import torch.nn as nn +from accelerate.hooks import add_hook_to_module +from transformers import PreTrainedModel + +# framefusion methods +from framefusion.main import FrameFusion +from framefusion.utils import TEXT_TOKEN, IGNORE_TOKEN, get_attr_by_name + +# model types +from transformers import LlavaNextVideoForConditionalGeneration +from llava.model.language_model.llava_qwen import LlavaQwenForCausalLM + +# replace methods +from framefusion.models.llava_next_video.modeling_llava_next_video import _merge_input_ids_with_image_features_get_token_type +from framefusion.models.llava_video.modeling_llava_video import 
prepare_inputs_labels_for_multimodal_get_patch_type +from framefusion.models.minicpmv.modeling_minicpmv import get_vllm_embedding +from framefusion.models.qwen2.modeling_qwen2 import Qwen2Model_merge_then_fastv_cost_given_forward, Qwen2DecoderLayer_merge_then_prune_by_cost_forward, Qwen2SdpaAttention_merge_then_prune_by_cost_forward + + +def apply_framefusion(model, cost, similarity_lower_bound, ratio_lower_bound): + """ + Apply FrameFusion to the model + + Args: + model: the model to apply FrameFusion to + cost: the cost of the FrameFusion + similarity_lower_bound: the similarity lower bound of the FrameFusion + ratio_lower_bound: the ratio lower bound of the FrameFusion + """ + # LlavaNextVideo Model + if isinstance(model, LlavaNextVideoForConditionalGeneration): + model._merge_input_ids_with_image_features = MethodType(_merge_input_ids_with_image_features_get_token_type, model) + + llm_forward = Qwen2Model_merge_then_fastv_cost_given_forward + decoder_forward = Qwen2DecoderLayer_merge_then_prune_by_cost_forward + attention_forward = Qwen2SdpaAttention_merge_then_prune_by_cost_forward + llm_key = "model" + decoder_key = "layers" + attention_key = "self_attn" + + # LlavaVideo Model + elif isinstance(model, LlavaQwenForCausalLM): + model.prepare_inputs_labels_for_multimodal = MethodType(prepare_inputs_labels_for_multimodal_get_patch_type, model) + + llm_forward = Qwen2Model_merge_then_fastv_cost_given_forward + decoder_forward = Qwen2DecoderLayer_merge_then_prune_by_cost_forward + attention_forward = Qwen2SdpaAttention_merge_then_prune_by_cost_forward + llm_key = "model" + decoder_key = "layers" + attention_key = "self_attn" + + # MiniCPM Model + elif model.config.architectures[0] == "MiniCPMV": + + model.get_vllm_embedding = MethodType(get_vllm_embedding, model) + llm_forward = Qwen2Model_merge_then_fastv_cost_given_forward + decoder_forward = Qwen2DecoderLayer_merge_then_prune_by_cost_forward + attention_forward = 
Qwen2SdpaAttention_merge_then_prune_by_cost_forward + llm_key = "llm.model" + decoder_key = "layers" + attention_key = "self_attn" + + else: + raise NotImplementedError + + replace_framefusion_forward( + model, + cost=cost, + similarity_lower_bound=similarity_lower_bound, + ratio_lower_bound=ratio_lower_bound, + llm_forward=llm_forward, + decoder_forward=decoder_forward, + attention_forward=attention_forward, + llm_key=llm_key, + decoder_key=decoder_key, + attention_key=attention_key, + ) + + +def get_token_type(model): + # LlavaNextVideo Model + if isinstance(model, LlavaNextVideoForConditionalGeneration): + model._merge_input_ids_with_image_features = MethodType(_merge_input_ids_with_image_features_get_token_type, model) + + # LlavaVideo Model + elif isinstance(model, LlavaQwenForCausalLM): + model.prepare_inputs_labels_for_multimodal = MethodType(prepare_inputs_labels_for_multimodal_get_patch_type, model) + + # MiniCPM Model + elif model.config.architectures[0] == "MiniCPMV": + model.get_vllm_embedding = MethodType(get_vllm_embedding, model) + else: + raise NotImplementedError + + +def replace_framefusion_forward( + module: torch.nn.Module, + cost: float, + similarity_lower_bound: float, + ratio_lower_bound: float, + llm_forward: Callable, + decoder_forward: Callable, + attention_forward: Callable, + llm_key: str = "model", + decoder_key: str = "layers", + attention_key: str = "self_attn", +): + """ + Replace the forward method of the model with the framefusion forward method. + Make framefusion a property of the model. + + The keys are accessed in an hierarchical manner: llm_key -> decoder_key -> attention_key. Each key can have multiple hierarchies, e.g. 
"llm.model", which will be accessed by module.llm.model + """ + framefusion = FrameFusion(cost, similarity_lower_bound, ratio_lower_bound) + + module.framefusion = framefusion + + llm = get_attr_by_name(module, llm_key) + assert isinstance(llm, PreTrainedModel), f"{llm_key} is not a PreTrainedModel" + + llm.framefusion = framefusion + llm.forward = MethodType(llm_forward, llm) + + decoder_layers = get_attr_by_name(llm, decoder_key) + for i, decoder_layer in enumerate(decoder_layers): + assert isinstance(decoder_layer, nn.Module), f"{decoder_key}[{i}] is not a nn.Module" + + decoder_layer.framefusion = framefusion + decoder_layer.forward = MethodType(decoder_forward, decoder_layer) + + # ensure accelerate hooks are not removed + if hasattr(decoder_layer, "_hf_hook"): + decoder_layer._old_forward = MethodType(decoder_forward, decoder_layer) + add_hook_to_module(decoder_layer, decoder_layer._hf_hook) + + qwen2_attention_instance = get_attr_by_name(decoder_layer, attention_key) + assert isinstance(qwen2_attention_instance, nn.Module), f"{decoder_key}[{i}].self_attn is not a nn.Module" + + # replace the forward method of the attention layer + qwen2_attention_instance.framefusion = framefusion + qwen2_attention_instance.forward = MethodType(attention_forward, qwen2_attention_instance) diff --git a/framefusion/main.py b/framefusion/main.py new file mode 100644 index 0000000..2e4b520 --- /dev/null +++ b/framefusion/main.py @@ -0,0 +1,314 @@ +from typing import List +import torch +from torch import nn + +TEXT_TOKEN = -1 +IGNORE_TOKEN = -2 + +class FrameFusion(nn.Module): + def __init__(self, cost=0.3, similarity_lower_bound=0.6, ratio_lower_bound=0.1): + super(FrameFusion, self).__init__() + self.cost = cost + self.similarity_lower_bound = similarity_lower_bound + self.ratio_lower_bound = ratio_lower_bound + + def prepare(self, patch_type, patch_num, image_token_start_index, image_token_end_index, image_token_length, original_length, finish_merging = False, finish_pruning = 
False, sparsity_list: List = None): + self.patch_type = patch_type + self.patch_num = patch_num + self.image_token_start_index = image_token_start_index + self.image_token_end_index = image_token_end_index + self.image_token_length = image_token_length + self.original_length = original_length + self.finish_merging = finish_merging + self.finish_pruning = finish_pruning + if sparsity_list is None: + self.sparsity_list = [] + else: + self.sparsity_list = sparsity_list + + def forward(self, hidden_states, position_embeddings, attention_mask, self_attn_weights = None): + """ + This is the forward method of the FrameFusion class. + + Args: + hidden_states (torch.Tensor): A tensor of shape (batch_size, sequence_length, hidden_size). + position_embeddings (torch.Tensor): A tensor of shape (batch_size, sequence_length, hidden_size). + attention_mask (torch.Tensor): A tensor of shape (batch_size, sequence_length, sequence_length). + self_attn_weights (torch.Tensor): A tensor of shape (batch_size, sequence_length, sequence_length). + + Returns: + hidden_states (torch.Tensor): A tensor of shape (batch_size, sequence_length, hidden_size). + position_embeddings (torch.Tensor): A tensor of shape (batch_size, sequence_length, hidden_size). + attention_mask (torch.Tensor): A tensor of shape (batch_size, sequence_length, sequence_length). 
+ """ + bsz, q_len, hidden_size = hidden_states.size() + device = hidden_states.device + + # pruning + if q_len >1 and self.finish_merging == True and self.finish_pruning == False: + + image_token_pruning_start_index = self.image_token_start_index.item() + image_token_pruning_length = self.image_token_length + # update image_token_pruning_length + image_token_pruning_length = (self.image_token_length - (self.original_length - q_len)) + + last_layer_attention = self_attn_weights + last_layer_attention_avg = torch.mean(last_layer_attention, dim=(1,2))[0] + last_layer_attention_avg_image = last_layer_attention_avg[image_token_pruning_start_index:image_token_pruning_start_index+image_token_pruning_length] + + pruning_ratio = self._compute_pruning_ratio(self.sparsity_list, self.cost) + top_attention_rank_index = last_layer_attention_avg_image.topk(round(image_token_pruning_length*(1-pruning_ratio))).indices + image_token_pruning_start_index + + keep_indexs = torch.cat( (torch.arange(image_token_pruning_start_index,device=device), top_attention_rank_index, torch.arange(image_token_pruning_start_index+image_token_pruning_length, q_len, device=device))) + keep_indexs = keep_indexs.sort().values + + hidden_states = hidden_states[:,keep_indexs,:] + position_embeddings[0] = position_embeddings[0][:,keep_indexs,:] + position_embeddings[1] = position_embeddings[1][:,keep_indexs,:] + if attention_mask != None: + attention_mask = attention_mask[:,:,keep_indexs,:][:,:,:,keep_indexs] + self.finish_pruning = True + + # merging + if q_len >1 and (not self.finish_merging): + # align devices + self.patch_type = self.patch_type.to(device) + + # prefill + sparsity_upper_bound = self._compute_pruning_ratio(self.sparsity_list, self.cost) + similarity_by_patch, token_index_by_patch = self.compute_similarity_and_token_index_by_patch(hidden_states, self.patch_type, self.patch_num) # only support bsz = 1 + + frame_token_num = torch.sum(self.patch_type != TEXT_TOKEN).item() + 
merge_index_by_patch = torch.where(similarity_by_patch >= self.similarity_lower_bound)[1] + above_k_ratio = merge_index_by_patch.shape[0] / frame_token_num + + if above_k_ratio < sparsity_upper_bound: + self.sparsity_list.append(above_k_ratio) + + if above_k_ratio < self.ratio_lower_bound: + self.finish_merging = True + else: + topk_values, topk_indices = torch.topk(similarity_by_patch, int(sparsity_upper_bound*frame_token_num)) + topk_indices, _ = torch.sort(topk_indices) + merge_index_by_patch = topk_indices[0] + + self.finish_merging = True + self.finish_pruning = True + + + hidden_states, token_mask = self.merge_tokens_and_get_mask(hidden_states, similarity_by_patch, token_index_by_patch, merge_index_by_patch) + # here only bsz=1 + # update patch type + self.patch_type = self.patch_type.to(device)[token_mask].reshape(bsz, -1) + hidden_states = hidden_states[token_mask, :].reshape(bsz, -1, hidden_size) + position_embeddings[0] = position_embeddings[0][:,token_mask[0],:] + position_embeddings[1] = position_embeddings[1][:,token_mask[0],:] + if attention_mask is not None: + attention_mask = attention_mask[:,:,token_mask[0],:][:,:,:,token_mask[0]] + + return hidden_states, position_embeddings, attention_mask + + @staticmethod + def compute_similarity_and_token_index_by_patch(hidden_states, token_patch_type, patch_num): + """ + Compute the similarity between consecutive tokens of the same patch type and record the token index. + + Args: + hidden_states (torch.Tensor): A tensor of shape (batch_size, sequence_length, hidden_size). + token_patch_type (torch.Tensor): A tensor indicating the patch type of each token in the sequence. + patch_num (int): The total number of patches of one image in the model. + + Returns: + similarity_by_patch (torch.Tensor): A tensor of shape (batch_size, sequence_length) containing + the cosine similarity between consecutive tokens of the + same patch type. Tokens from different patches are set to -2. 
+ token_index_by_patch (torch.Tensor): A tensor of shape (batch_size, sequence_length) containing + the token index corresponding to the new order after + sorting by patch type. + + """ + + bsz, q_len, _ = hidden_states.size() + device = hidden_states.device + + assert bsz == 1, "Only support batch size 1" + + token_index_by_patch = [] + similarity_by_patch = [] + + + token_patch_type_by_patch, token_index_by_patch = torch.where( + token_patch_type == torch.arange(patch_num, device=device)[:, None] + ) + + # noqa: reshape to batch size = 1, with shape (batch_size, q_len), + token_patch_type_by_patch = token_patch_type_by_patch[None, :] + token_index_by_patch = token_index_by_patch[None, :] + + similarity_by_patch = cosine_similarity( + hidden_states[ + torch.arange(bsz, device=device), token_index_by_patch[:, :-1], : + ], + hidden_states[ + torch.arange(bsz, device=device), token_index_by_patch[:, 1:], : + ], + ) + + similarity_by_patch[token_patch_type_by_patch[:, :-1] != token_patch_type_by_patch[:, 1:]] = -2 + + similarity_by_patch = torch.cat( + ( + torch.full( + size=(bsz, 1), + fill_value=IGNORE_TOKEN, + dtype=hidden_states.dtype, + device=device, + ), + similarity_by_patch, + ), + dim=1, + ) + + assert similarity_by_patch.shape[1] == token_index_by_patch.shape[1] + return similarity_by_patch, token_index_by_patch + + + @staticmethod + def merge_tokens_and_get_mask(hidden_states: torch.Tensor, similarity_by_patch, token_index_by_patch, merge_index_by_patch): + """ + Merge tokens and get a mask indicating which tokens to keep. + + Args: + hidden_states (torch.Tensor): A tensor of shape (batch_size, sequence_length, hidden_size) + similarity_by_patch (torch.Tensor): A tensor of shape (batch_size, sequence_length) containing + the cosine similarity between consecutive tokens of the + same patch type. 
+ token_index_by_patch (torch.Tensor): A tensor of shape (batch_size, sequence_length) containing + the token indices corresponding to the new order after + sorting by patch type. + merge_index_by_patch (torch.Tensor): A tensor containing the indices of tokens to be merged, in the patch_type order. + + Returns: + hidden_states (torch.Tensor): A tensor containing the hidden states of the tokens after merging. + keep_mask (torch.Tensor): A boolean tensor of shape (batch_size, sequence_length) indicating + which tokens in the original sequence should be kept after merging. + """ + device = hidden_states.device + if merge_index_by_patch.shape[0] == 0: + keep_mask = torch.ones(hidden_states.shape[:-1], dtype=torch.bool, device=device) + return hidden_states, keep_mask + bsz, q_len, _ = hidden_states.size() + bsz_index = torch.arange(bsz, device=hidden_states.device)[:, None] + merge_mask_by_patch: torch.LongTensor = torch.zeros( + bsz, + similarity_by_patch.shape[1], + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + merge_mask_by_patch[bsz_index, merge_index_by_patch] = 1 + last_merge_token_by_patch = find_contigious_latter_index(merge_mask_by_patch) + + keep_mask = torch.ones(hidden_states.shape[:-1], dtype=torch.bool, device=device) + keep_mask[bsz_index, token_index_by_patch[bsz_index, merge_index_by_patch]] = False + + # noqa: batch size = 1 + unique_merge_nums = torch.sort(torch.unique(last_merge_token_by_patch.to(torch.long))).values + unique_merge_nums = (unique_merge_nums[1:] if (unique_merge_nums[0] == 0).item() else unique_merge_nums) + + merge_num_indices, token_merge_index_in_patch = torch.where( + last_merge_token_by_patch == unique_merge_nums[:, None] + ) + + merge_nums = unique_merge_nums[merge_num_indices] + token_merge_start_index_in_patch = token_merge_index_in_patch - merge_nums + token_merge_member_start_index_in_patch = torch.repeat_interleave(token_merge_start_index_in_patch, merge_nums) + + merge_member_length = 
torch.sum(merge_nums) + merge_member_contigious_sequence = torch.arange(1, merge_member_length + 1, device = device) + + merge_nums_cumulative_counts = torch.cumsum(merge_nums, dim=0) + merge_nums_start = torch.cat((torch.tensor([0], device = device), merge_nums_cumulative_counts[:-1])) + + contigious_sequence_by_merge_nums = merge_member_contigious_sequence - torch.repeat_interleave(merge_nums_start, merge_nums) + + token_merge_member_index_in_patch = token_merge_member_start_index_in_patch + contigious_sequence_by_merge_nums + + # noqa: this function may have numerical instability + hidden_states.index_add_( + dim = 1, + index = token_index_by_patch[0, token_merge_member_start_index_in_patch], + source = hidden_states[ + bsz_index, + token_index_by_patch[bsz_index, token_merge_member_index_in_patch], + ] + ) + + # divide to get average + hidden_states[ + bsz_index, + token_index_by_patch[bsz_index, token_merge_start_index_in_patch], + ] /= (merge_nums[None, :, None] + 1) + + + return hidden_states, keep_mask + + @staticmethod + def _compute_pruning_ratio(sparsity_list, cost, num_layers = 28): + """ + Args: + sparsity_list (list): A list containing the sparsity values of the model's first few layers. + cost (float): The total computation budget given by the user. + num_layers (int, optional): The number of layers in the model. 
+ + Returns: + float: the required sparsity for the next layer to achieve the given cost + """ + list_length = len(sparsity_list) + s = 1 + total_calcution =0 + for i in range(list_length): + s *= (1 - sparsity_list[i]) + total_calcution += s + remain_calcution = num_layers * cost - total_calcution + if remain_calcution < 0: + raise ValueError("The cost is too small") + if remain_calcution/((num_layers-list_length)*s) > 1: + return 0 + return 1 - (remain_calcution/((num_layers-list_length)*s)) + +def cosine_similarity(mat1, mat2): + dot_product = torch.sum(mat1*mat2, dim=-1) + norm_vec1 = torch.norm(mat1, dim=-1) + norm_vec2 = torch.norm(mat2, dim=-1) + return dot_product / (norm_vec1 * norm_vec2) + +def find_contigious_latter_index(index_tensor: torch.LongTensor) -> torch.Tensor: + """ + Args: + index_tensor (torch.LongTensor): A binary tensor containing sequences of ones and zeros. + + Returns: + torch.Tensor: A tensor where each contiguous sequence of ones in the input tensor + is replaced by zeros, except for the last element of each sequence, + which is replaced by the length of that sequence. 
+ + Example: + Input: torch.tensor([0, 1, 1, 1, 0, 0, 1, 1]) + Output: torch.tensor([0, 0, 0, 3, 0, 0, 0, 2]) + """ + bsz, n = index_tensor.shape + t_prev = torch.cat([torch.zeros((bsz, 1), dtype=index_tensor.dtype, device=index_tensor.device), index_tensor[:, :-1]], dim=1) + t_next = torch.cat([index_tensor[:, 1:], torch.zeros((bsz, 1), dtype=index_tensor.dtype, device=index_tensor.device)], dim=1) + + # Identify the starts and ends of runs of ones + run_starts = (index_tensor == 1) & (t_prev == 0) + run_ends = (index_tensor == 1) & (t_next == 0) + + start_indices = torch.nonzero(run_starts, as_tuple=True) + end_indices = torch.nonzero(run_ends, as_tuple=True) + run_lengths = (end_indices[1] - start_indices[1] + 1).to(index_tensor.dtype) + + output = torch.zeros_like(index_tensor, dtype=index_tensor.dtype) + output[end_indices[0], end_indices[1]] = run_lengths + + return output \ No newline at end of file diff --git a/framefusion/models/llava_next_video/modeling_llava_next_video.py b/framefusion/models/llava_next_video/modeling_llava_next_video.py new file mode 100644 index 0000000..476eb27 --- /dev/null +++ b/framefusion/models/llava_next_video/modeling_llava_next_video.py @@ -0,0 +1,236 @@ +import torch +from transformers.models.llava_next_video.modeling_llava_next_video import logger + + +def _merge_input_ids_with_image_features_get_token_type( + self, + image_features, + feature_lens, + inputs_embeds, + input_ids, + attention_mask, + position_ids=None, + labels=None, + image_token_index=None, + ignore_index=-100, +): + """ + Merge input_ids with with image features into final embeddings + + Args: + image_features (`torch.Tensor` of shape `(all_feature_lens, embed_dim)`): + All vision vectors of all images in the batch + feature_lens (`torch.LongTensor` of shape `(num_images)`): + The length of visual embeddings of each image as stacked in `image_features` + inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, embed_dim)`): + Token embeddings 
before merging with visual embeddings + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Input_ids of tokens, possibly filled with image token + attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Mask to avoid performing attention on padding token indices. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*) + :abels need to be recalculated to support training (if provided) + image_token_index (`int`, *optional*) + Token id used to indicate the special "image" token. Defaults to `config.image_token_index` + ignore_index (`int`, *optional*) + Value that is used to pad `labels` and will be ignored when calculated loss. Default: -100. + Returns: + final_embedding, final_attention_mask, position_ids, final_labels + + Explanation: + each image has variable length embeddings, with length specified by feature_lens + image_features is concatenation of all visual embed vectors + task: fill each with the correct number of visual embeddings + Example: + X (5 patches), Y (3 patches), Z (8) + X, Y are in the same sequence (in-context learning) + if right padding + input_ids: [ + a b c d e f X g h i j k Y l m + o p q r Z s t u v _ _ _ _ _ _ + ] + input_ids should be: [ + a b c d e f X X X X X g h i j k Y Y Y l m + o p q r Z Z Z Z Z Z Z Z s t u v _ _ _ _ _ + ] + labels should be: [ + a b c d e f _ _ _ _ _ g h i j k _ _ _ l m + o p q r _ _ _ _ _ _ _ _ s t u v _ _ _ _ _ + ] + elif left padding + input_ids: [ + a b c d e f X g h i j k Y l m + _ _ _ _ _ _ o p q r Z s t u v + ] + input_ids should be: [ + a b c d e f X X X X X g h i j k Y Y Y l m + _ _ _ _ _ o p q r Z Z Z Z Z Z Z Z s t u v + ] + labels should be: [ + a b c d e f _ _ _ _ _ g h i j k _ _ _ l m + _ _ _ _ _ o p q r _ _ _ _ _ _ _ _ s 
t u v + ] + Edge cases: + * If tokens are same but image token sizes are different, then cannot infer left or right padding + ```python + cat_img = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + chart_img = Image.open(requests.get("https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true", stream=True).raw) + prompts = [ + "[INST] \nWhat is shown in this image? [/INST]", + "[INST] \nWhat is shown in this image? [/INST]", + ] + inputs = processor(prompts, [chart_img, cat_img], return_tensors='pt', padding=True).to("cuda") + chart_img has 2634 tokens, while cat_img has 2340 tokens + ``` + + input_ids: [ + a b c d X g h + i j Y k l m n + ] + where X is 3 tokens while Y is 5, this mean after merge + if left-padding (batched generation) + input_ids should be: [ + _ _ a b c d X X X g h + i j Y Y Y Y Y k l m n + ] + elif (right padding) (training) + input_ids should be: [ + a b c d X X X g h _ _ + i j Y Y Y Y Y k l m n + ] + """ + image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index + ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index + + if self.training and self.padding_side == "left": + logger.warning_once( + "Padding side is set to 'left' but the model is in training mode. For training " "it is recommended to set `model.padding_side='right' and `processor.tokenizer.padding_side='right'`. " "If that's intended, ignore this warning" + ) + if not self.training and self.padding_side == "right": + logger.warning_once( + "Padding side is set to 'right' but the model is in inference mode. For correct " "generation results, please set `model.padding_side='left'` and `processor.tokenizer.padding_side='left'`. " "If that's intended, ignore this warning" + ) + + with torch.no_grad(): + # ! 
in llava 1.6, number of patches is variable + num_images = feature_lens.size(0) + num_image_features, embed_dim = image_features.shape + if feature_lens.sum() != num_image_features: + raise ValueError(f"{feature_lens=} / {feature_lens.sum()} != {image_features.shape=}") + batch_size = input_ids.shape[0] + _left_padding = torch.any(attention_mask[:, 0] == 0) + _right_padding = torch.any(attention_mask[:, -1] == 0) + + left_padding = self.padding_side == "left" + if batch_size > 1: + if _left_padding and _right_padding: + raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}") + elif _right_padding and left_padding: + left_padding = False + elif _left_padding and not left_padding: + left_padding = True + + # Whether to turn off right padding + # 1. Create a mask to know where special image tokens are + special_image_token_mask = input_ids == image_token_index + # special_image_token_mask: [bsz, seqlen] + num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1) + # num_special_image_tokens: [bsz] + # Reserve for padding of num_images + total_num_special_image_tokens = torch.sum(special_image_token_mask) + if total_num_special_image_tokens != num_images: + raise ValueError(f"Number of image tokens in input_ids ({total_num_special_image_tokens}) different from num_images ({num_images}).") + # Compute the maximum embed dimension + # max_image_feature_lens is max_feature_lens per batch + feature_lens = feature_lens.to(input_ids.device) + feature_lens_batch = feature_lens.split(num_special_image_tokens.tolist(), dim=0) + feature_lens_batch_sum = torch.tensor([x.sum() for x in feature_lens_batch], device=input_ids.device) + embed_sequence_lengths = (attention_mask == 1).long().sum(-1) - num_special_image_tokens + feature_lens_batch_sum + max_embed_dim = embed_sequence_lengths.max() + + batch_indices, non_image_indices = torch.where((input_ids != image_token_index) & (attention_mask == 1)) + # 2. 
Compute the positions where text should be written + # Calculate new positions for text tokens in merged image-text sequence. + # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images` text tokens. + # `torch.cumsum` computes how each image token shifts subsequent text token positions. + # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one. + # ! instead of special_image_token_mask * (num_image_patches - 1) + # special_image_token_mask * (num_feature_len - 1) + special_image_token_mask = special_image_token_mask.long() + special_image_token_mask[special_image_token_mask == 1] = feature_lens - 1 + new_token_positions = torch.cumsum((special_image_token_mask + 1), -1) - 1 + if left_padding: + # shift right token positions so that they are ending at the same number + # the below here was incorrect? new_token_positions += new_token_positions[:, -1].max() - new_token_positions[:, -1:] + new_token_positions += max_embed_dim - 1 - new_token_positions[:, -1:] + + text_to_overwrite = new_token_positions[batch_indices, non_image_indices] + + # 3. Create the full embedding, already padded to the maximum position + final_embedding = torch.zeros(batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device) + final_attention_mask = torch.zeros(batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device) + final_input_ids = torch.full((batch_size, max_embed_dim), self.pad_token_id, dtype=input_ids.dtype, device=inputs_embeds.device) + # In case the Vision model or the Language model has been offloaded to CPU, we need to manually + # set the corresponding tensors into their correct target device. 
+ target_device = inputs_embeds.device + batch_indices, non_image_indices, text_to_overwrite = ( + batch_indices.to(target_device), + non_image_indices.to(target_device), + text_to_overwrite.to(target_device), + ) + attention_mask = attention_mask.to(target_device) + input_ids = input_ids.to(target_device) + + # 4. Fill the embeddings based on the mask. If we have ["hey" "", "how", "are"] + # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features + final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices] + final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices] + final_input_ids[batch_indices, text_to_overwrite] = input_ids[batch_indices, non_image_indices] + final_labels = None + if labels is not None: + labels = labels.to(target_device) + final_labels = torch.full_like(final_attention_mask, ignore_index).to(torch.long) + final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices] + + # 5. Fill the embeddings corresponding to the images. 
Anything that is not `text_positions` needs filling (#29835) + with torch.no_grad(): + image_to_overwrite = torch.full((batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device) + image_to_overwrite[batch_indices, text_to_overwrite] = False + embed_indices = torch.arange(max_embed_dim).unsqueeze(0).to(target_device) + embed_indices = embed_indices.expand(batch_size, max_embed_dim) + embed_seq_lens = embed_sequence_lengths[:, None].to(target_device) + + if left_padding: + # exclude padding on the left + max_embed_dim = max_embed_dim.to(target_device) + val = (max_embed_dim - embed_indices) <= embed_seq_lens + else: + # exclude padding on the right + val = embed_indices < embed_seq_lens + image_to_overwrite &= val + + if image_to_overwrite.sum() != num_image_features: + raise ValueError( + f"{image_to_overwrite.sum()=} != {num_image_features=} The input provided to the model are wrong. " + f"The number of image tokens is {torch.sum(special_image_token_mask)} while" + f" the number of image given to the model is {num_images}. " + f"This prevents correct indexing and breaks batch generation." 
import math
import re
import torch
import torch.nn as nn

from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX

from llava.mm_utils import get_anyres_image_grid_shape
from llava.utils import rank0_print
import random

# Sentinel values used in the FrameFusion patch-type map:
# SPECIAL_TOKEN marks special tokens, IGNORE_TOKEN marks tokens FrameFusion
# must never merge/prune, TEXT_TOKEN marks ordinary text tokens.
SPECIAL_TOKEN = -9
IGNORE_TOKEN = -2
TEXT_TOKEN = -1


def prepare_inputs_labels_for_multimodal_get_patch_type(self, input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities=["image"], image_sizes=None):
    """LLaVA-Video multimodal input preparation, extended to record per-token
    patch types for FrameFusion.

    This is a copy of LLaVA's `prepare_inputs_labels_for_multimodal` with a
    FRAMEFUSION section appended: after text/image embeddings are merged, it
    builds a `patch_type` map (TEXT_TOKEN for text positions, a repeating
    0..patch_num-1 pattern for video-frame positions) and hands it to
    `self.framefusion.prepare(...)`.

    NOTE(review): `modalities=["image"]` is a mutable default argument — the
    list is shared across calls. Harmless as long as it is never mutated, but
    worth confirming.
    """
    vision_tower = self.get_vision_tower()
    # rank_print(modalities)
    if vision_tower is None or images is None or input_ids.shape[1] == 1:
        # Decode step (single token) or no vision input: pass through unchanged.
        return input_ids, position_ids, attention_mask, past_key_values, None, labels

    if isinstance(modalities, str):
        modalities = [modalities]

    # import pdb; pdb.set_trace()
    if type(images) is list or images.ndim == 5:
        if type(images) is list:
            images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]

        # Indices of batch entries that are videos (vs. still images).
        video_idx_in_batch = []
        for _ in range(len(modalities)):
            if modalities[_] == "video":
                video_idx_in_batch.append(_)

        images_list = []
        for image in images:
            if image.ndim == 4:
                images_list.append(image)
            else:
                images_list.append(image.unsqueeze(0))

        concat_images = torch.cat([image for image in images_list], dim=0)
        split_sizes = [image.shape[0] for image in images_list]
        encoded_image_features = self.encode_images(concat_images)
        # image_features,all_faster_video_features = self.encode_multimodals(concat_images, video_idx_in_batch, split_sizes)

        # This is a list, each element is [num_images, patch * patch, dim]
        # rank_print(f"Concat images : {concat_images.shape}")
        encoded_image_features = torch.split(encoded_image_features, split_sizes)
        image_features = []
        for idx, image_feat in enumerate(encoded_image_features):
            if idx in video_idx_in_batch:
                # Videos get 2x2 spatial pooling to cut tokens per frame.
                image_features.append(self.get_2dPool(image_feat))
            else:
                image_features.append(image_feat)
        # image_features = self.encode_multimodals(concat_images, video_idx_in_batch, split_sizes)
        # rank_print(f"Encoded image feats : {[x.shape for x in image_features]}")
        # image_features = torch.split(image_features, split_sizes, dim=0)
        mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
        image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
        mm_newline_position = getattr(self.config, "mm_newline_position", "one_token")

        if mm_patch_merge_type == "flat":
            image_features = [x.flatten(0, 1) for x in image_features]

        elif mm_patch_merge_type.startswith("spatial"):
            new_image_features = []
            for image_idx, image_feature in enumerate(image_features):
                # FIXME: now assume the image is square, and split to 2x2 patches
                # num_patches = h * w, where h = w = sqrt(num_patches)
                # currently image_feature is a tensor of shape (4, num_patches, hidden_size)
                # we want to first unflatten it to (2, 2, h, w, hidden_size)
                # rank0_print("At least we are reaching here")
                # import pdb; pdb.set_trace()
                if image_idx in video_idx_in_batch:  # video operations
                    # rank0_print("Video")
                    if mm_newline_position == "grid":
                        # Grid-wise
                        image_feature = self.add_token_per_grid(image_feature)
                        if getattr(self.config, "add_faster_video", False):
                            # NOTE(review): `all_faster_video_features` is never
                            # assigned in this copy (the producing call above is
                            # commented out) — this branch would raise NameError
                            # if `add_faster_video` is enabled. Confirm upstream.
                            faster_video_feature = self.add_token_per_grid(all_faster_video_features[image_idx])
                            # Add a token for each frame
                            concat_slow_fater_token = []
                            # import pdb; pdb.set_trace()
                            for _ in range(image_feature.shape[0]):
                                if _ % self.config.faster_token_stride == 0:
                                    concat_slow_fater_token.append(torch.cat((image_feature[_], self.model.faster_token[None].to(image_feature.device)), dim=0))
                                else:
                                    concat_slow_fater_token.append(torch.cat((faster_video_feature[_], self.model.faster_token[None].to(image_feature.device)), dim=0))
                            # import pdb; pdb.set_trace()
                            image_feature = torch.cat(concat_slow_fater_token)

                        # print("!!!!!!!!!!!!")

                        new_image_features.append(image_feature)
                    elif mm_newline_position == "frame":
                        # Frame-wise
                        image_feature = self.add_token_per_frame(image_feature)

                        new_image_features.append(image_feature.flatten(0, 1))

                    elif mm_newline_position == "one_token":
                        # one-token
                        image_feature = image_feature.flatten(0, 1)
                        if "unpad" in mm_patch_merge_type:
                            image_feature = torch.cat((image_feature, self.model.image_newline[None].to(image_feature.device)), dim=0)
                        new_image_features.append(image_feature)
                    elif mm_newline_position == "no_token":
                        new_image_features.append(image_feature.flatten(0, 1))
                    else:
                        raise ValueError(f"Unexpected mm_newline_position: {mm_newline_position}")
                elif image_feature.shape[0] > 1:  # multi patches and multi images operations
                    # rank0_print("Single-images")
                    base_image_feature = image_feature[0]
                    image_feature = image_feature[1:]
                    height = width = self.get_vision_tower().num_patches_per_side
                    assert height * width == base_image_feature.shape[0]

                    if "anyres_max" in image_aspect_ratio:
                        matched_anyres_max_num_patches = re.match(r"anyres_max_(\d+)", image_aspect_ratio)
                        if matched_anyres_max_num_patches:
                            max_num_patches = int(matched_anyres_max_num_patches.group(1))

                    if image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
                        if hasattr(self.get_vision_tower(), "image_size"):
                            vision_tower_image_size = self.get_vision_tower().image_size
                        else:
                            raise ValueError("vision_tower_image_size is not found in the vision tower.")
                        try:
                            num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, vision_tower_image_size)
                        except Exception as e:
                            rank0_print(f"Error: {e}")
                            num_patch_width, num_patch_height = 2, 2
                        image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
                    else:
                        image_feature = image_feature.view(2, 2, height, width, -1)

                    if "maxpool2x2" in mm_patch_merge_type:
                        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                        image_feature = nn.functional.max_pool2d(image_feature, 2)
                        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
                    elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
                        # NOTE(review): `unpad_image` is not imported in this
                        # module — the "unpad" branches would raise NameError.
                        # Confirm it should come from llava.mm_utils.
                        unit = image_feature.shape[2]
                        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                        image_feature = unpad_image(image_feature, image_sizes[image_idx])
                        c, h, w = image_feature.shape
                        times = math.sqrt(h * w / (max_num_patches * unit**2))
                        if times > 1.1:
                            image_feature = image_feature[None]
                            image_feature = nn.functional.interpolate(image_feature, [int(h // times), int(w // times)], mode="bilinear")[0]
                        image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1)
                        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
                    elif "unpad" in mm_patch_merge_type:
                        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                        image_feature = unpad_image(image_feature, image_sizes[image_idx])
                        image_feature = torch.cat((image_feature, self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)), dim=-1)
                        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
                    else:
                        image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
                        image_feature = image_feature.flatten(0, 3)
                    if "nobase" in mm_patch_merge_type:
                        pass
                    else:
                        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
                    new_image_features.append(image_feature)
                else:  # single image operations
                    image_feature = image_feature[0]
                    if "unpad" in mm_patch_merge_type:
                        image_feature = torch.cat((image_feature, self.model.image_newline[None]), dim=0)

                    new_image_features.append(image_feature)
            image_features = new_image_features
        else:
            raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
    else:
        image_features = self.encode_images(images)

    # TODO: image start / end is not implemented here to support pretraining.
    if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(self.config, "mm_use_im_start_end", False):
        raise NotImplementedError
    # rank_print(f"Total images : {len(image_features)}")

    # Let's just add dummy tensors if they do not exist,
    # it is a headache to deal with None all the time.
    # But it is not ideal, and if you have a better idea,
    # please open an issue / submit a PR, thanks.
    _labels = labels
    _position_ids = position_ids
    _attention_mask = attention_mask
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
    else:
        attention_mask = attention_mask.bool()
    if position_ids is None:
        position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
    if labels is None:
        labels = torch.full_like(input_ids, IGNORE_INDEX)

    # remove the padding using attention_mask -- FIXME
    _input_ids = input_ids
    input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
    labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]

    new_input_embeds = []
    new_labels = []
    cur_image_idx = 0
    # rank_print("Inserting Images embedding")
    for batch_idx, cur_input_ids in enumerate(input_ids):
        num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
        # rank0_print(num_images)
        if num_images == 0:
            # No image placeholders: keep text embeddings, but consume one
            # (empty) image feature so multi-batch indices stay aligned.
            cur_image_features = image_features[cur_image_idx]
            cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
            cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
            new_input_embeds.append(cur_input_embeds)
            new_labels.append(labels[batch_idx])
            cur_image_idx += 1
            continue

        # Split the sequence around each IMAGE_TOKEN_INDEX placeholder.
        image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
        cur_input_ids_noim = []
        cur_labels = labels[batch_idx]
        cur_labels_noim = []
        for i in range(len(image_token_indices) - 1):
            cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
            cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
        split_sizes = [x.shape[0] for x in cur_labels_noim]
        cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
        cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)

        cur_new_input_embeds = []
        cur_new_labels = []

        # Interleave text spans with image features; image positions get
        # IGNORE_INDEX labels so they never contribute to the loss.
        for i in range(num_images + 1):
            cur_new_input_embeds.append(cur_input_embeds_no_im[i])
            cur_new_labels.append(cur_labels_noim[i])
            if i < num_images:
                try:
                    cur_image_features = image_features[cur_image_idx]
                except IndexError:
                    cur_image_features = image_features[cur_image_idx - 1]
                cur_image_idx += 1
                cur_new_input_embeds.append(cur_image_features)
                cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))

        cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]

        # import pdb; pdb.set_trace()
        cur_new_input_embeds = torch.cat(cur_new_input_embeds)
        cur_new_labels = torch.cat(cur_new_labels)

        new_input_embeds.append(cur_new_input_embeds)
        new_labels.append(cur_new_labels)

    # Truncate sequences to max length as image embeddings can make the sequence longer
    tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None)
    # rank_print("Finishing Inserting")

    new_input_embeds = [x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)]
    new_labels = [x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)]
    # TODO: Hard code for control loss spike
    # if tokenizer_model_max_length is not None:
    #     new_input_embeds = [x[:4096] if modality != "video" else x[:tokenizer_model_max_length] for x, modality in zip(new_input_embeds, modalities)]
    #     new_labels = [x[:4096] if modality != "video" else x[:tokenizer_model_max_length] for x, modality in zip(new_labels, modalities)]

    # Combine them
    max_len = max(x.shape[0] for x in new_input_embeds)
    batch_size = len(new_input_embeds)

    new_input_embeds_padded = []
    new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
    attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
    position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
    # rank0_print("Prepare pos id")

    for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
        cur_len = cur_new_embed.shape[0]
        if getattr(self.config, "tokenizer_padding_side", "right") == "left":
            new_input_embeds_padded.append(torch.cat((torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device), cur_new_embed), dim=0))
            if cur_len > 0:
                new_labels_padded[i, -cur_len:] = cur_new_labels
                attention_mask[i, -cur_len:] = True
                position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
        else:
            new_input_embeds_padded.append(torch.cat((cur_new_embed, torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0))
            if cur_len > 0:
                new_labels_padded[i, :cur_len] = cur_new_labels
                attention_mask[i, :cur_len] = True
                position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)

    new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
    # rank0_print("tokenizer padding")

    if _labels is None:
        new_labels = None
    else:
        new_labels = new_labels_padded

    if _attention_mask is None:
        attention_mask = None
    else:
        attention_mask = attention_mask.to(dtype=_attention_mask.dtype)

    if _position_ids is None:
        position_ids = None
    if getattr(self.config, "use_pos_skipping", False) and self.training:
        position_ids = torch.arange(new_input_embeds.size(1), device=new_input_embeds.device).unsqueeze(0).to(new_input_embeds.device)
        split_position = random.randint(0, new_input_embeds.size(1))
        left_add = random.randint(0, self.config.pos_skipping_range)
        right_add = random.randint(left_add, self.config.pos_skipping_range)
        position_ids[:, :split_position] += left_add
        position_ids[:, split_position:] += right_add
    # import pdb; pdb.set_trace()
    # rank0_print("Finish preparing")

    ### FRAMEFUSION START ###
    # Recompute tokens-per-frame after 2x2 pooling; the "bilinear" pool mode
    # rounds up, the default mode rounds down, matching get_2dPool's output.
    # patch_num includes the extra per-row newline token (patch_size + 1).
    if self.config.mm_spatial_pool_mode == "bilinear":
        patch_size = math.ceil(self.get_vision_tower().num_patches_per_side / 2)
    else:
        patch_size = self.get_vision_tower().num_patches_per_side // 2
    patch_num = patch_size * (patch_size + 1)

    # FrameFusion currently only supports a single sequence with exactly one
    # video placeholder.
    assert batch_size == 1
    # NOTE(review): `num_images` here is the value left over from the last
    # iteration of the embedding loop above, not a per-batch value — it only
    # works because batch_size == 1 is asserted.
    assert num_images == 1
    image_token_length = image_features[0].shape[0]
    n_frames = image_token_length // patch_num
    image_token_start_index = torch.where(input_ids[0] == IMAGE_TOKEN_INDEX)[0]
    image_token_end_index = image_token_start_index + image_token_length - 1
    # The single placeholder token expands into image_token_length embeddings.
    original_length = input_ids[0].shape[0] + image_token_length - 1
    patch_type = [TEXT_TOKEN] * image_token_start_index + list(range(patch_num)) * n_frames + [TEXT_TOKEN] * (original_length - image_token_end_index - 1)
    patch_type = torch.tensor([patch_type], device=new_input_embeds.device)

    self.framefusion.prepare(patch_type, patch_num, image_token_start_index, image_token_end_index, image_token_length, original_length)
    ### FRAMEFUSION END ###

    return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
def get_vllm_embedding(self, data):
    """MiniCPM-V embedding builder, extended to register FrameFusion metadata.

    Computes vision features with the vision tower + resampler (unless
    precomputed `vision_hidden_states` are supplied in `data`), scatters them
    into the token-embedding sequence at the positions given by
    `data["image_bound"]`, then builds a per-token `patch_type` map and calls
    `self.framefusion.prepare(...)`.

    Args:
        data: dict with at least "input_ids", "pixel_values", "tgt_sizes",
            "image_bound"; optionally "vision_hidden_states".

    Returns:
        (vllm_embedding, vision_hidden_states) — the fused input embeddings
        and the per-sample vision features.
    """
    if "vision_hidden_states" not in data:
        dtype = self.llm.model.embed_tokens.weight.dtype
        device = self.llm.model.embed_tokens.weight.device
        tgt_sizes = data["tgt_sizes"]
        pixel_values_list = data["pixel_values"]
        vision_hidden_states = []
        all_pixel_values = []
        img_cnt = []
        for pixel_values in pixel_values_list:
            img_cnt.append(len(pixel_values))
            all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_values])

        # exist image
        if all_pixel_values:
            tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)]
            tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32)

            max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])

            # Pad all slices to the longest patch count, then reshape to the
            # (B, 3, -1, L) layout the vision tower expects.
            all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0)
            B, L, _ = all_pixel_values.shape
            all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)

            patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool, device=device)
            for i in range(B):
                patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True

            # Chunk the vision forward pass to bound peak memory.
            vision_batch_size = self.config.vision_batch_size
            all_pixel_values = all_pixel_values.type(dtype)
            if B > vision_batch_size:
                hs = []
                for i in range(0, B, vision_batch_size):
                    start_idx = i
                    end_idx = i + vision_batch_size
                    tmp_hs = self.vpm(all_pixel_values[start_idx:end_idx], patch_attention_mask=patch_attn_mask[start_idx:end_idx], tgt_sizes=tgt_sizes[start_idx:end_idx]).last_hidden_state
                    hs.append(tmp_hs)
                vision_embedding = torch.cat(hs, dim=0)
            else:
                vision_embedding = self.vpm(all_pixel_values, patch_attention_mask=patch_attn_mask, tgt_sizes=tgt_sizes).last_hidden_state
            vision_embedding = self.resampler(vision_embedding, tgt_sizes)

            # Redistribute the flat vision embeddings back per input sample.
            start = 0
            for pixel_values in pixel_values_list:
                # NOTE(review): this rebinds `img_cnt` (the per-sample count
                # list built above) to an int — confusing shadowing, though
                # the list is no longer read afterwards.
                img_cnt = len(pixel_values)
                if img_cnt > 0:
                    vision_hidden_states.append(vision_embedding[start : start + img_cnt])
                    start += img_cnt
                else:
                    vision_hidden_states.append([])
        else:  # no image
            if self.training:
                # Dummy forward keeps gradients flowing for the vision stack
                # in image-free training batches.
                dummy_image = torch.zeros((1, 3, 224, 224), device=device, dtype=dtype)
                tgt_sizes = torch.Tensor([[(224 // self.config.patch_size), math.ceil(224 / self.config.patch_size)]]).type(torch.int32)
                dummy_feature = self.resampler(self.vpm(dummy_image).last_hidden_state, tgt_sizes)
            else:
                dummy_feature = []
            for _ in range(len(pixel_values_list)):
                vision_hidden_states.append(dummy_feature)

    else:
        vision_hidden_states = data["vision_hidden_states"]

    if hasattr(self.llm.config, "scale_emb"):
        vllm_embedding = self.llm.model.embed_tokens(data["input_ids"]) * self.llm.config.scale_emb
    else:
        vllm_embedding = self.llm.model.embed_tokens(data["input_ids"])

    vision_hidden_states = [i.type(vllm_embedding.dtype) if isinstance(i, torch.Tensor) else i for i in vision_hidden_states]

    bs = len(data["input_ids"])
    for i in range(bs):
        cur_vs_hs = vision_hidden_states[i]
        if len(cur_vs_hs) > 0:
            cur_vllm_emb = vllm_embedding[i]
            cur_image_bound = data["image_bound"][i]
            if len(cur_image_bound) > 0:
                image_indices = torch.stack([torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound]).to(vllm_embedding.device)

                # In-place scatter: overwrite text-embedding rows at image
                # positions with the vision features.
                cur_vllm_emb.scatter_(0, image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]), cur_vs_hs.view(-1, cur_vs_hs.shape[-1]))
            elif self.training:
                # Zero-valued touch keeps the vision path in the autograd graph.
                cur_vllm_emb += cur_vs_hs[0].mean() * 0

    ### FRAMEFUSION START ###
    # Single-sequence assumption: the patch-type map below is only built for
    # batch index 0.
    assert bs == 1
    patch_type = torch.full((bs, vllm_embedding.shape[1]), TEXT_TOKEN, dtype=torch.long, device=vllm_embedding.device)
    num_frames = self.num_frames

    image_bound = data["image_bound"][0]
    patch_per_frame = image_bound.shape[0] // num_frames
    token_per_frame = image_bound[patch_per_frame, 0] - image_bound[0, 0]
    # NOTE(review): `i` here is the leftover index from the `for i in
    # range(bs)` loop above (0, given the assert) — prefer an explicit 0.
    # The `+ 2` slack presumably covers frame-delimiter tokens after the last
    # bound — TODO confirm against the chat template.
    patch_type[i, image_bound[0, 0] : (image_bound[-1, 1] + 2)] = torch.arange(0, image_bound[-1, 1] - image_bound[0, 0] + 2, device=patch_type.device) % token_per_frame

    patch_num = token_per_frame
    # First / last positions whose patch type is non-negative (i.e. image).
    image_token_start_index = torch.argmax((patch_type >= 0).int(), dim=1)
    image_token_end_index = patch_type.shape[1] - 1 - torch.argmax((torch.flip(patch_type, dims=[1]) >= 0).int(), dim=1)
    original_length = patch_type.shape[1]
    image_token_length = image_token_end_index - image_token_start_index + 1

    self.framefusion.prepare(patch_type, patch_num, image_token_start_index, image_token_end_index, image_token_length, original_length)
    ### FRAMEFUSION END ###

    return vllm_embedding, vision_hidden_states
def Qwen2DecoderLayer_merge_then_prune_by_cost_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: Optional[bool] = False,
    use_cache: Optional[bool] = False,
    cache_position: Optional[torch.LongTensor] = None,
    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
    **kwargs,
) -> Tuple[Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]], torch.Tensor, Optional[torch.Tensor]]:
    """Qwen2 decoder-layer forward with FrameFusion token merging/pruning.

    Identical to the stock Qwen2DecoderLayer.forward except that
    `self.framefusion` is invoked to shrink the token sequence (before
    attention on layer 0 only, and after attention on every layer), and the
    possibly-shortened `position_embeddings` and `attention_mask` are appended
    to the returned tuple so the model loop can propagate them.

    Args:
        hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
        attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
            `(batch, sequence_length)` where padding elements are indicated by 0.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
            (see `past_key_values`).
        past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence.
        position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
            Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
            with `head_dim` being the embedding dimension of each attention head.
        kwargs (`dict`, *optional*):
            Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
            into the model
    """
    ### start token merging at layer 0 before attention
    if self.self_attn.layer_idx == 0:
        hidden_states, position_embeddings, attention_mask = self.framefusion(hidden_states, position_embeddings, attention_mask)
    ### end token merging at layer 0 before attention

    residual = hidden_states

    hidden_states = self.input_layernorm(hidden_states)

    # Self Attention
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
        hidden_states=hidden_states,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=output_attentions,
        use_cache=use_cache,
        cache_position=cache_position,
        position_embeddings=position_embeddings,
    )
    hidden_states = residual + hidden_states

    ### start token merging or fastv after attention
    # self_attn_weights may be None here; framefusion decides merge vs. prune
    # from its own state plus these weights when available.
    hidden_states, position_embeddings, attention_mask = self.framefusion(hidden_states, position_embeddings, attention_mask, self_attn_weights)
    ### end token merging or fastv after attention

    # Fully Connected
    residual = hidden_states
    hidden_states = self.post_attention_layernorm(hidden_states)
    hidden_states = self.mlp(hidden_states)
    hidden_states = residual + hidden_states

    outputs = (hidden_states,)

    if output_attentions:
        outputs += (self_attn_weights,)

    if use_cache:
        outputs += (present_key_value,)

    ### start return the updated position embeddings and attention mask
    # Extra trailing entries beyond the stock contract; the patched
    # Qwen2Model forward reads them back via layer_outputs[-2] / [-1].
    outputs += (position_embeddings, attention_mask)
    return outputs
    ### end return the updated position embeddings and attention mask
def Qwen2SdpaAttention_merge_then_prune_by_cost_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    cache_position: Optional[torch.LongTensor] = None,
    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Qwen2 SDPA attention forward, extended for FrameFusion.

    Identical to the stock Qwen2SdpaAttention.forward except that, during
    prefill (q_len > 1) after merging has finished but before pruning, it
    additionally computes last-query attention weights via the custom
    `scaled_dot_product_attention(..., num=1, ...)` helper and returns them so
    the decoder layer can pass them to framefusion for cost-based pruning.
    """
    if output_attentions:
        # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
        logger.warning_once(
            "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
            'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
        )
        # NOTE(review): zero-argument super() only works inside a class body;
        # this function is bound via MethodType, so this fallback would raise
        # RuntimeError if output_attentions=True — confirm it is unreachable.
        return super().forward(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )

    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    query_states = query_states.view(
        bsz, q_len, self.num_heads, self.head_dim
    ).transpose(1, 2)
    key_states = key_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)

    if position_embeddings is None:
        logger.warning_once(
            "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
            "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
            "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
            "removed and `position_embeddings` will be mandatory."
        )
        cos, sin = self.rotary_emb(value_states, position_ids)
    else:
        cos, sin = position_embeddings
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    causal_mask = attention_mask
    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]

    # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
    # Reference: https://github.com/pytorch/pytorch/issues/112577.
    if query_states.device.type == "cuda" and attention_mask is not None:
        query_states = query_states.contiguous()
        key_states = key_states.contiguous()
        value_states = value_states.contiguous()

    # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
    # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
    # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
    is_causal = True if causal_mask is None and q_len > 1 else False

    ### start storing attn_weights if needed
    # Only computed in the window between "merging finished" and "pruning
    # finished"; `num=1` restricts the helper to the last query position.
    attn_weights = None
    if (q_len > 1) and (self.framefusion.finish_merging) and (not self.framefusion.finish_pruning):
        attn_weights = scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            num=1,
            attn_mask=None,
            dropout_p=self.attention_dropout if self.training else 0.0,
            is_causal=is_causal,
        )
    ### end storing attn_weights if needed

    attn_output = torch.nn.functional.scaled_dot_product_attention(
        query_states,
        key_states,
        value_states,
        attn_mask=causal_mask,
        dropout_p=self.attention_dropout if self.training else 0.0,
        is_causal=is_causal,
    )

    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.view(bsz, q_len, self.hidden_size)

    attn_output = self.o_proj(attn_output)

    return attn_output, attn_weights, past_key_value
@add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
def Qwen2Model_merge_then_fastv_cost_given_forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    """Qwen2Model forward patched for FrameFusion.

    Identical to the stock Qwen2Model.forward except that (a)
    `position_embeddings` is converted to a mutable list so layers can shorten
    it when tokens are merged/pruned, and (b) after each decoder layer the
    (possibly shortened) position embeddings and causal mask are read back
    from the layer's extended output tuple and fed to the next layer.
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if (input_ids is None) ^ (inputs_embeds is not None):
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
        )

    if self.gradient_checkpointing and self.training:
        if use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

    # kept for BC (non `Cache` `past_key_values` inputs)
    return_legacy_cache = False
    if use_cache and not isinstance(past_key_values, Cache):
        return_legacy_cache = True
        if past_key_values is None:
            past_key_values = DynamicCache()
        else:
            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            logger.warning_once(
                "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
                "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
                "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
            )

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    if cache_position is None:
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        cache_position = torch.arange(
            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
        )
    if position_ids is None:
        position_ids = cache_position.unsqueeze(0)

    causal_mask = self._update_causal_mask(
        attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
    )

    hidden_states = inputs_embeds

    # create position embeddings to be shared across the decoder layers
    position_embeddings = self.rotary_emb(hidden_states, position_ids)

    ### change position_embeddings into a list for future pruning
    # Tuples are immutable; framefusion rewrites the cos/sin entries in place
    # when it drops tokens.
    position_embeddings = list(position_embeddings)
    ### end changing

    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = None

    for decoder_layer in self.layers:
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if self.gradient_checkpointing and self.training:
            layer_outputs = self._gradient_checkpointing_func(
                decoder_layer.__call__,
                hidden_states,
                causal_mask,
                position_ids,
                past_key_values,
                output_attentions,
                use_cache,
                cache_position,
                position_embeddings,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
            )

        ### start update the attention mask and position embeddings modified by framefusion
        # The patched decoder layer appends (position_embeddings, attention_mask)
        # at the end of its output tuple; feed the shortened versions forward.
        position_embeddings = layer_outputs[-2]
        causal_mask = layer_outputs[-1]
        ### end changing position embedding

        hidden_states = layer_outputs[0]

        if use_cache:
            # Indices 1/2 still hold attn weights / cache because the extra
            # framefusion entries were appended after them.
            next_decoder_cache = layer_outputs[2 if output_attentions else 1]

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None
    if return_legacy_cache:
        next_cache = next_cache.to_legacy_cache()

    if not return_dict:
        return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
    )
import List, Optional, Tuple, Union +import torch +import torch.utils.checkpoint +from transformers.cache_utils import Cache, DynamicCache,DynamicCache, SinkCache +from transformers.models.qwen2.modeling_qwen2 import repeat_kv,apply_rotary_pos_emb, logger, QWEN2_INPUTS_DOCSTRING +from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.utils.doc import add_start_docstrings_to_model_forward +from transformers.models.qwen2.modeling_qwen2 import Qwen2SdpaAttention, Qwen2DecoderLayer, Qwen2Model +from functools import partial +try: + from minference import streaming_forward +except ImportError: + # minference is not needed if streamingllm is not used + streaming_forward = None + +from framefusion.utils import TEXT_TOKEN, IGNORE_TOKEN +from framefusion.main import find_contigious_latter_index + +""" +Utils +""" + +def compute_density_overhead(sparsity_list) -> tuple: + """ + Compute the average cumulative product and total product of the sparsity list. + """ + density_list = [1-s for s in sparsity_list] + + cost = 0.0 + remaining_density = 1.0 + for density in density_list: + remaining_density *= density + cost += remaining_density + + norm_cost = cost / len(density_list) + return norm_cost, remaining_density + +""" +Meta Interface +""" + +def replace_Qwen2_forward(model, mode="merge_then_fastv_cost_given", **kwargs): + print(f"replace_Qwen2_forward mode: {mode} and kwargs: {kwargs}") + + if mode=="prefill_merge": + prefill_merge_kwargs = { + "sparsity": kwargs.get("sparsity", [0.0] * 28), + } + + print(f"Config\n{prefill_merge_kwargs}") + + cost, remaining_density=compute_density_overhead(prefill_merge_kwargs['sparsity']) + print(f"Computational cost: {cost:.3f}, Remaining density: {remaining_density:.3f}") + + replace_Qwen2_merging( + model, + **prefill_merge_kwargs + ) + elif mode=="fastv": + fastv_kwargs = { + "fastv_k": kwargs.get("fastv_k", 3), + "fastv_r": kwargs.get("fastv_r", 0.5) + } + print(f"Config\n{fastv_kwargs}") + + 
replace_Qwen2_fastv( + model, + **fastv_kwargs + ) + elif mode=="merge_then_fastv": + merge_then_fastv_kwargs = { + "sparsity": kwargs.get("sparsity", [0.1] * 28), + "fastv_k": kwargs.get("fastv_k", 3), + "fastv_r": kwargs.get("fastv_r", 0.5) + } + print(f"Config\n{merge_then_fastv_kwargs}") + + replace_Qwen2_merge_then_fastv( + model, + **merge_then_fastv_kwargs + ) + elif mode=="streamingllm": + streamingllm_kwargs = { + "init_num": kwargs.get("init_num", 8), + "length_rate": kwargs.get("length_rate", 0.3), + } + print(f"Config\n{streamingllm_kwargs}") + + replace_Qwen2_streamingllm( + model, + **streamingllm_kwargs + ) + elif mode=="fastv_then_merge": + fastv_then_merge_kwargs = { + "fastv_k": kwargs.get("fastv_k", 2), + "fastv_r": kwargs.get("fastv_r", 0.75), + "merging_sparsity": kwargs.get("merging_sparsity", 0.3) + } + print(f"Config\n{fastv_then_merge_kwargs}") + + replace_Qwen2_fastv_then_merge( + model, + **fastv_then_merge_kwargs + ) + else: + raise NotImplementedError(f"Mode {mode} is not implemented yet.") + +def replace_minicpmv_forward(model, mode="fastv", **kwargs): + print(f"replace_minicpmv_forward mode: {mode} and kwargs: {kwargs}") + if mode=="fastv": + fastv_kwargs = { + "fastv_k": kwargs.get("fastv_k", 3), + "fastv_r": kwargs.get("fastv_r", 0.5) + } + print(f"Config\n{fastv_kwargs}") + + replace_minicpmv_fastv( + model, + **fastv_kwargs + ) + elif mode=="streamingllm": + streamingllm_kwargs = { + "init_num": kwargs.get("init_num", 8), + "length_rate": kwargs.get("length_rate", 0.3), + } + print(f"Config\n{streamingllm_kwargs}") + + replace_minicpmv_streamingllm( + model, + **streamingllm_kwargs + ) + else: + raise NotImplementedError(f"Mode {mode} is not implemented yet.") + + +""" +Forward functions +""" + +""" +Fastv forward functions +""" + +def replace_Qwen2_fastv(model, fastv_k = 3, fastv_r = 0.5): + model.fastv_k = fastv_k + model.fastv_r = fastv_r + + if isinstance(model.model, Qwen2Model): + model.model.forward = 
MethodType(partial(Qwen2Model_fastv_forward, model=model), model.model) + for i, decoder_layer in enumerate(model.model.layers): + if isinstance(decoder_layer, Qwen2DecoderLayer): + decoder_layer.forward=MethodType(Qwen2DecoderLayer_fastv_forward, decoder_layer) + qwen2_attention_instance = decoder_layer.self_attn + if isinstance(qwen2_attention_instance, Qwen2SdpaAttention): + qwen2_attention_instance.forward = MethodType(partial(Qwen2SdpaAttention_fastv_forward, model=model), qwen2_attention_instance) + else: + raise TypeError("language model is not Qwen2.") + +def replace_minicpmv_fastv(model, fastv_k = 3, fastv_r = 0.5): + model.fastv_k = fastv_k + model.fastv_r = fastv_r + + if isinstance(model.llm.model, Qwen2Model): + model.llm.model.forward = MethodType(partial(Qwen2Model_fastv_forward, model=model), model.llm.model) + for i, decoder_layer in enumerate(model.llm.model.layers): + if isinstance(decoder_layer, Qwen2DecoderLayer): + decoder_layer.forward=MethodType(Qwen2DecoderLayer_fastv_forward, decoder_layer) + qwen2_attention_instance = decoder_layer.self_attn + if isinstance(qwen2_attention_instance, Qwen2SdpaAttention): + qwen2_attention_instance.forward = MethodType(partial(Qwen2SdpaAttention_fastv_forward, model=model), qwen2_attention_instance) + else: + raise TypeError("language model is not Qwen2.") + +@add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) +def Qwen2Model_fastv_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + model = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + + output_attentions = 
output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + ### change position_embeddings into a list for future pruning + position_embeddings = list(position_embeddings) + + ### end changing + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + ### implement fastv + FASTV_k = model.fastv_k # the layer_idx to prune + FASTV_r = model.fastv_r # the pruning ratio + FASTV_image_token_start_index = model.image_token_start_index.item() + FASTV_image_token_length = model.image_token_length.item() + device = self.device + #seq_length_with_past = past_seen_tokens + inputs_embeds.shape[1] (here because cache position in minicpmv is not none,so past_seen_tokens is not defined ) + for layer_idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + # pruning hidden states, no kv cache + + if use_cache: + if hidden_states.shape[1] != 1: + if layer_idx