[model] video data SP handling inconsistence between Qwen3VL and Qwen2.5VL #369

@Yangr116

Description


This is quite unclear: the video SP (sequence parallel) handling differs between the two models.

Qwen3VL code:

```python
if pixel_values_videos is not None:
    video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
    # Modification: sequence parallel patch for video embeds
    if self.training and get_parallel_state().sp_enabled:
        # (seq_len // sp_size, hidden_size) to (seq_len, hidden_size // sp_size)
        video_embeds = gather_seq_scatter_heads(
            video_embeds, seq_dim=0, head_dim=-1, group=get_parallel_state().sp_group
        )
    # _, video_mask = self.get_placeholder_mask(
    #     input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
    # )
    # Modification: Get the num of video tokens from the pre-computed video_mask
    # And reshape the masks to match the shape of inputs_embeds
    n_video_tokens = video_mask.sum().long().item()
    video_mask = video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device, non_blocking=True)
    # Modification: Slice tensor to drop any padded video tokens
    video_embeds = video_embeds[:n_video_tokens]
    deepstack_video_embeds = [embed[:n_video_tokens] for embed in deepstack_video_embeds]
    n_video_features = video_embeds.shape[0]
    if n_video_tokens != n_video_features:
        raise ValueError(
            f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
        )
    video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
```
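For reference, the inline comment above describes the transform that `gather_seq_scatter_heads` is expected to perform across the SP group: each rank starts with a sequence shard and ends with the full sequence but only a slice of the hidden dimension. The snippet below is a minimal single-process sketch of that shape bookkeeping only; the real helper presumably does this with collective communication over `sp_group`, and `sp_size`, `seq_len`, and `hidden` here are made-up toy values.

```python
# Single-process simulation of the per-rank shapes around gather_seq_scatter_heads.
# This is NOT the actual implementation, just the shape contract from the comment:
# (seq_len // sp_size, hidden_size) -> (seq_len, hidden_size // sp_size) per rank.
import torch

sp_size = 4                 # assumed number of sequence-parallel ranks
seq_len, hidden = 32, 16    # toy sizes; seq_len is divisible by sp_size

# Before the call: each SP rank holds a sequence shard of the video embeddings.
per_rank_in = [torch.randn(seq_len // sp_size, hidden) for _ in range(sp_size)]

# After the call: each rank sees the full sequence, but only a hidden-dim slice.
full_seq = torch.cat(per_rank_in, dim=0)              # (seq_len, hidden)
per_rank_out = list(full_seq.chunk(sp_size, dim=-1))  # sp_size x (seq_len, hidden // sp_size)

assert per_rank_out[0].shape == (seq_len, hidden // sp_size)
```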

Qwen2.5VL code:

```python
if pixel_values_videos is not None:
    pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
    video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
    n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
    n_video_features = video_embeds.shape[0]
    if n_video_tokens != n_video_features:
        raise ValueError(
            f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
        )
    mask = input_ids == self.config.video_token_id
    mask_unsqueezed = mask.unsqueeze(-1)
    mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
    video_mask = mask_expanded.to(inputs_embeds.device)
    video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
elif get_parallel_state().fsdp_enabled:
```

Could these two be made consistent? 😓
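For context, a rough sketch of one way the Qwen2.5VL branch could mirror the Qwen3VL handling is shown below. This is not a tested patch: it simply combines the two quoted snippets, reusing `get_parallel_state` and `gather_seq_scatter_heads` with the same arguments as in the Qwen3VL code, and assumes the mask-based counting and padded-token slicing carry over unchanged.

```python
# Sketch only: applying the Qwen3VL-style SP patch to the Qwen2.5VL video branch.
if pixel_values_videos is not None:
    pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
    video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)

    # Same sequence-parallel patch as in the Qwen3VL snippet:
    # (seq_len // sp_size, hidden_size) -> (seq_len, hidden_size // sp_size)
    if self.training and get_parallel_state().sp_enabled:
        video_embeds = gather_seq_scatter_heads(
            video_embeds, seq_dim=0, head_dim=-1, group=get_parallel_state().sp_group
        )

    # Count video tokens from the placeholder mask and drop any padded rows,
    # as the Qwen3VL snippet does, instead of comparing before slicing.
    video_mask = input_ids == self.config.video_token_id
    n_video_tokens = video_mask.sum().long().item()
    video_embeds = video_embeds[:n_video_tokens]
    n_video_features = video_embeds.shape[0]
    if n_video_tokens != n_video_features:
        raise ValueError(
            f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
        )

    video_mask = video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
    video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
```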
