[model] video data SP handling inconsistence between Qwen3VL and Qwen2.5VL #369

@Yangr116

Description


This is quite unclear: the video SP (sequence parallel) handling differs between the two models.

Qwen3VL code:

```python
if pixel_values_videos is not None:
    video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
    # Modification: sequence parallel patch for video embeds
    if self.training and get_parallel_state().sp_enabled:
        # (seq_len // sp_size, hidden_size) to (seq_len, hidden_size // sp_size)
        video_embeds = gather_seq_scatter_heads(
            video_embeds, seq_dim=0, head_dim=-1, group=get_parallel_state().sp_group
        )
    # _, video_mask = self.get_placeholder_mask(
    #     input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
    # )
    # Modification: Get the num of video tokens from the pre-computed video_mask
    # And reshape the masks to match the shape of inputs_embeds
    n_video_tokens = video_mask.sum().long().item()
    video_mask = video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device, non_blocking=True)
    # Modification: Slice tensor to drop any padded video tokens
    video_embeds = video_embeds[:n_video_tokens]
    deepstack_video_embeds = [embed[:n_video_tokens] for embed in deepstack_video_embeds]
    n_video_features = video_embeds.shape[0]
    if n_video_tokens != n_video_features:
        raise ValueError(
            f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
        )
    video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
```
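For reference, the inline comment above describes the transform that `gather_seq_scatter_heads` is expected to perform across the SP group: each rank starts with a sequence shard and ends with the full sequence but only a slice of the hidden dimension. The snippet below is a minimal single-process sketch of that shape bookkeeping only; the real helper presumably does this with collective communication over `sp_group`, and `sp_size`, `seq_len`, and `hidden` here are made-up toy values.

```python
# Single-process simulation of the per-rank shapes around gather_seq_scatter_heads.
# This is NOT the actual implementation, just the shape contract from the comment:
# (seq_len // sp_size, hidden_size) -> (seq_len, hidden_size // sp_size) per rank.
import torch

sp_size = 4                 # assumed number of sequence-parallel ranks
seq_len, hidden = 32, 16    # toy sizes; seq_len is divisible by sp_size

# Before the call: each SP rank holds a sequence shard of the video embeddings.
per_rank_in = [torch.randn(seq_len // sp_size, hidden) for _ in range(sp_size)]

# After the call: each rank sees the full sequence, but only a hidden-dim slice.
full_seq = torch.cat(per_rank_in, dim=0)              # (seq_len, hidden)
per_rank_out = list(full_seq.chunk(sp_size, dim=-1))  # sp_size x (seq_len, hidden // sp_size)

assert per_rank_out[0].shape == (seq_len, hidden // sp_size)
```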

Qwen2.5VL code:

```python
if pixel_values_videos is not None:
    pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
    video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
    n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
    n_video_features = video_embeds.shape[0]
    if n_video_tokens != n_video_features:
        raise ValueError(
            f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
        )
    mask = input_ids == self.config.video_token_id
    mask_unsqueezed = mask.unsqueeze(-1)
    mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
    video_mask = mask_expanded.to(inputs_embeds.device)
    video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
elif get_parallel_state().fsdp_enabled:
```

Could these two be made consistent? 😓
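For context, a rough sketch of one way the Qwen2.5VL branch could mirror the Qwen3VL handling is shown below. This is not a tested patch: it simply combines the two quoted snippets, reusing `get_parallel_state` and `gather_seq_scatter_heads` with the same arguments as in the Qwen3VL code, and assumes the mask-based counting and padded-token slicing carry over unchanged.

```python
# Sketch only: applying the Qwen3VL-style SP patch to the Qwen2.5VL video branch.
if pixel_values_videos is not None:
    pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
    video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)

    # Same sequence-parallel patch as in the Qwen3VL snippet:
    # (seq_len // sp_size, hidden_size) -> (seq_len, hidden_size // sp_size)
    if self.training and get_parallel_state().sp_enabled:
        video_embeds = gather_seq_scatter_heads(
            video_embeds, seq_dim=0, head_dim=-1, group=get_parallel_state().sp_group
        )

    # Count video tokens from the placeholder mask and drop any padded rows,
    # as the Qwen3VL snippet does, instead of comparing before slicing.
    video_mask = input_ids == self.config.video_token_id
    n_video_tokens = video_mask.sum().long().item()
    video_embeds = video_embeds[:n_video_tokens]
    n_video_features = video_embeds.shape[0]
    if n_video_tokens != n_video_features:
        raise ValueError(
            f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
        )

    video_mask = video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
    video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
```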
