The handling of video embeddings is very unclear and inconsistent between the two implementations:
Qwen3vl code:
VeOmni/veomni/models/transformers/qwen3_vl/modeling_qwen3_vl.py
Lines 1370 to 1396 in b6fa6cc
```python
if pixel_values_videos is not None:
    video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
    # Modification: sequence parallel patch for video embeds
    if self.training and get_parallel_state().sp_enabled:
        # (seq_len // sp_size, hidden_size) to (seq_len, hidden_size // sp_size)
        video_embeds = gather_seq_scatter_heads(
            video_embeds, seq_dim=0, head_dim=-1, group=get_parallel_state().sp_group
        )
    # _, video_mask = self.get_placeholder_mask(
    #     input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
    # )
    # Modification: Get the num of video tokens from the pre-computed video_mask
    # And reshape the masks to match the shape of inputs_embeds
    n_video_tokens = video_mask.sum().long().item()
    video_mask = video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device, non_blocking=True)
    # Modification: Slice tensor to drop any padded video tokens
    video_embeds = video_embeds[:n_video_tokens]
    deepstack_video_embeds = [embed[:n_video_tokens] for embed in deepstack_video_embeds]
    n_video_features = video_embeds.shape[0]
    if n_video_tokens != n_video_features:
        raise ValueError(
            f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
        )
    video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
```
Qwen2.5vl code:
VeOmni/veomni/models/transformers/qwen2_5vl/modeling_qwen2_5_vl.py
Lines 2022 to 2039 in b6fa6cc
```python
if pixel_values_videos is not None:
    pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
    video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
    n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
    n_video_features = video_embeds.shape[0]
    if n_video_tokens != n_video_features:
        raise ValueError(
            f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
        )
    mask = input_ids == self.config.video_token_id
    mask_unsqueezed = mask.unsqueeze(-1)
    mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
    video_mask = mask_expanded.to(inputs_embeds.device)
    video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
elif get_parallel_state().fsdp_enabled:
```
Could these two be kept consistent? 😓
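For reference, a minimal sketch of how both paths could share one helper for the masked-scatter step (the name `scatter_video_embeds` and its signature are my own suggestion, not an existing VeOmni API; the Qwen3vl-specific sequence-parallel gather and deepstack handling are omitted):

```python
import torch


def scatter_video_embeds(
    input_ids: torch.Tensor,      # (batch, seq_len) token ids
    inputs_embeds: torch.Tensor,  # (batch, seq_len, hidden) text embeddings
    video_embeds: torch.Tensor,   # (n_video_features, hidden) vision features
    video_token_id: int,
) -> torch.Tensor:
    """Hypothetical shared helper: scatter video features into placeholder positions."""
    video_mask = input_ids == video_token_id
    n_video_tokens = int(video_mask.sum().item())

    # Drop any padded rows so the feature count matches the placeholder count,
    # as the Qwen3vl path already does.
    video_embeds = video_embeds[:n_video_tokens]
    if n_video_tokens != video_embeds.shape[0]:
        raise ValueError(
            f"Video features and video tokens do not match: "
            f"tokens: {n_video_tokens}, features {video_embeds.shape[0]}"
        )

    expanded_mask = video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
    video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
    return inputs_embeds.masked_scatter(expanded_mask, video_embeds)
```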