Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions lightx2v/models/runners/default_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from lightx2v.utils.global_paras import CALIB
from lightx2v.utils.memory_profiler import peak_memory_decorator
from lightx2v.utils.profiler import *
from lightx2v.utils.utils import get_optimal_patched_size_with_sp, isotropic_crop_resize, save_to_video, vae_to_comfyui_image
from lightx2v.utils.utils import get_optimal_patched_size_with_sp, isotropic_crop_resize, save_to_video, wan_vae_to_comfy
from lightx2v_platform.base.global_var import AI_DEVICE

torch_device_module = getattr(torch, AI_DEVICE)
Expand Down Expand Up @@ -433,7 +433,7 @@ def post_prompt_enhancer(self):
return enhanced_prompt

def process_images_after_vae_decoder(self):
self.gen_video_final = vae_to_comfyui_image(self.gen_video_final)
self.gen_video_final = wan_vae_to_comfy(self.gen_video_final)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The change from vae_to_comfyui_image (which was not in-place) to wan_vae_to_comfy (which performs in-place normalization) represents a behavioral change for self.gen_video_final. While the assignment immediately overwrites the reference, it's important to be aware that self.gen_video_final is now modified in-place before being reassigned. This is generally acceptable for efficiency, but it's a subtle change from the previous non-in-place function.


if "video_frame_interpolation" in self.config:
assert self.vfi_model is not None and self.config["video_frame_interpolation"].get("target_fps", None) is not None
Expand Down
8 changes: 5 additions & 3 deletions lightx2v/models/runners/longcat_image/longcat_image_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,12 +399,14 @@ def run_pipeline(self, input_info):

if not input_info.return_result_tensor:
image = images[0]
image.save(f"{input_info.save_result_path}")
image.save(input_info.save_result_path)
logger.info(f"Image saved: {input_info.save_result_path}")

del latents, generator
torch_device_module.empty_cache()
gc.collect()

# Return (images, audio) - audio is None for default runner
return images, None
if input_info.return_result_tensor:
return {"images": images}
elif input_info.save_result_path is not None:
return {"images": None}
Comment on lines +409 to +412
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The return type of the run_pipeline method has changed from a tuple (images, None) to a dictionary {"images": images} or {"images": None}. This is a breaking API change that external callers might need to adapt to. Please ensure this change is documented and communicated appropriately.

17 changes: 10 additions & 7 deletions lightx2v/models/runners/qwen_image/qwen_image_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,19 +380,22 @@ def run_pipeline(self, input_info):
self.end_run()

if not input_info.return_result_tensor:
image_prefix = input_info.save_result_path.rsplit(".", 1)[0]
image_suffix = input_info.save_result_path.rsplit(".", 1)[1] if len(input_info.save_result_path.rsplit(".", 1)) > 1 else "png"
if isinstance(images[0], list) and len(images[0]) > 1:
image_prefix = f"{input_info.save_result_path}".split(".")[0]
for idx, image in enumerate(images[0]):
image.save(f"{image_prefix}_{idx}.png")
logger.info(f"Image saved: {image_prefix}_{idx}.png")
image.save(f"{image_prefix}_{idx:05d}.{image_suffix}")
logger.info(f"Image saved: {image_prefix}_{idx:05d}.{image_suffix}")
else:
image = images[0]
image.save(f"{input_info.save_result_path}")
logger.info(f"Image saved: {input_info.save_result_path}")
image.save(f"{image_prefix}.{image_suffix}")
logger.info(f"Image saved: {image_prefix}.{image_suffix}")

del latents, generator
torch_device_module.empty_cache()
gc.collect()

# Return (images, audio) - audio is None for default runner
return images, None
if input_info.return_result_tensor:
return {"images": images}
elif input_info.save_result_path is not None:
return {"images": None}
6 changes: 3 additions & 3 deletions lightx2v/models/runners/wan/wan_audio_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from lightx2v.utils.envs import *
from lightx2v.utils.profiler import *
from lightx2v.utils.registry_factory import RUNNER_REGISTER
from lightx2v.utils.utils import find_torch_model_path, fixed_shape_resize, get_optimal_patched_size_with_sp, isotropic_crop_resize, load_weights, vae_to_comfyui_image_inplace
from lightx2v.utils.utils import find_torch_model_path, fixed_shape_resize, get_optimal_patched_size_with_sp, isotropic_crop_resize, load_weights, wan_vae_to_comfy
from lightx2v_platform.base.global_var import AI_DEVICE

warnings.filterwarnings("ignore", category=UserWarning, module="torchaudio")
Expand Down Expand Up @@ -594,7 +594,7 @@ def end_run_segment(self, segment_idx, valid_duration=1e9):
video_seg = self.gen_video[:, :, :useful_length].cpu()
audio_seg = self.segment.audio_array[:, : useful_length * self._audio_processor.audio_frame_rate]
audio_seg = audio_seg.sum(dim=0) # Multiple audio tracks, mixed into one track
video_seg = vae_to_comfyui_image_inplace(video_seg)
video_seg = wan_vae_to_comfy(video_seg)

# [Warning] Need check whether video segment interpolation works...
if "video_frame_interpolation" in self.config and self.vfi_model is not None:
Expand Down Expand Up @@ -642,7 +642,7 @@ def end_run_segment_stream(self, latents, valid_duration=1e9):
origin_seg = torch.clamp(origin_seg, -1, 1).to(torch.float)
valid_T = min(valid_length - frame_idx, origin_seg.shape[2])

video_seg = vae_to_comfyui_image_inplace(origin_seg[:, :, :valid_T].cpu())
video_seg = wan_vae_to_comfy(origin_seg[:, :, :valid_T].cpu())
audio_start = frame_idx * self._audio_processor.audio_frame_rate
audio_end = (frame_idx + valid_T) * self._audio_processor.audio_frame_rate
audio_seg = self.segment.audio_array[:, audio_start:audio_end].sum(dim=0)
Expand Down
4 changes: 2 additions & 2 deletions lightx2v/models/runners/wan/wan_sf_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from lightx2v.utils.memory_profiler import peak_memory_decorator
from lightx2v.utils.profiler import *
from lightx2v.utils.registry_factory import RUNNER_REGISTER
from lightx2v.utils.utils import vae_to_comfyui_image_inplace
from lightx2v.utils.utils import wan_vae_to_comfy


@RUNNER_REGISTER("wan2.1_sf")
Expand Down Expand Up @@ -121,7 +121,7 @@ def end_run_segment(self, segment_idx=None):
self.gen_video_final = torch.cat([self.gen_video_final, self.gen_video], dim=0) if self.gen_video_final is not None else self.gen_video
if self.is_live:
if self.video_recorder:
stream_video = vae_to_comfyui_image_inplace(self.gen_video)
stream_video = wan_vae_to_comfy(self.gen_video)
self.video_recorder.pub_video(stream_video)

torch.cuda.empty_cache()
Expand Down
8 changes: 5 additions & 3 deletions lightx2v/models/runners/z_image/z_image_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,12 +344,14 @@ def run_pipeline(self, input_info):

if not input_info.return_result_tensor:
image = images[0]
image.save(f"{input_info.save_result_path}")
image.save(input_info.save_result_path)
logger.info(f"Image saved: {input_info.save_result_path}")

del latents, generator
torch_device_module.empty_cache()
gc.collect()

# Return (images, audio) - audio is None for default runner
return images, None
if input_info.return_result_tensor:
return {"images": images}
elif input_info.save_result_path is not None:
return {"images": None}
Comment on lines +354 to +357
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Similar to other image runners, the return type of the run_pipeline method has changed from a tuple (images, None) to a dictionary {"images": images} or {"images": None}. This is a breaking API change that external callers might need to adapt to. Please ensure this change is documented and communicated appropriately.

62 changes: 18 additions & 44 deletions lightx2v/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,68 +106,42 @@ def cache_video(
return None


def vae_to_comfyui_image(vae_output: torch.Tensor) -> torch.Tensor:
def wan_vae_to_comfy(vae_output: torch.Tensor) -> torch.Tensor:
"""
Convert VAE decoder output to ComfyUI Image format
Convert VAE decoder output to ComfyUI Image format (inplace operation)

Args:
vae_output: VAE decoder output tensor, typically in range [-1, 1]
Shape: [B, C, T, H, W] or [B, C, H, W]
WARNING: This tensor will be modified in-place!

Returns:
ComfyUI Image tensor in range [0, 1]
Shape: [B, H, W, C] for single frame or [B*T, H, W, C] for video
Note: The returned tensor is the same object as input (modified in-place)
Comment on lines +111 to +121
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The docstring for wan_vae_to_comfy states that it performs an "inplace operation" and that "The returned tensor is the same object as input (modified in-place)". While the normalization steps (add_, mul_, clamp_) are indeed in-place, the subsequent permute and flatten operations create new tensors (or views that are then copied to CPU). Therefore, the returned tensor is not the same object as the input vae_output. This contradiction can be misleading and potentially lead to unexpected behavior if callers rely on the returned object being the original input tensor. Please clarify the docstring to accurately reflect that the input tensor is modified in-place for normalization, but a new tensor is returned after permutation and flattening.

Suggested change
Convert VAE decoder output to ComfyUI Image format (inplace operation)
Args:
vae_output: VAE decoder output tensor, typically in range [-1, 1]
Shape: [B, C, T, H, W] or [B, C, H, W]
WARNING: This tensor will be modified in-place!
Returns:
ComfyUI Image tensor in range [0, 1]
Shape: [B, H, W, C] for single frame or [B*T, H, W, C] for video
Note: The returned tensor is the same object as input (modified in-place)
Convert VAE decoder output to ComfyUI Image format
Args:
vae_output: VAE decoder output tensor, typically in range [-1, 1]
Shape: [B, C, T, H, W] or [B, C, H, W]
WARNING: The input tensor `vae_output` will be modified in-place for normalization.
Returns:
A new ComfyUI Image tensor in range [0, 1]
Shape: [B, H, W, C] for single frame or [B*T, H, W, C] for video

"""
# Handle video tensor (5D) vs image tensor (4D)
if vae_output.dim() == 5:
# Video tensor: [B, C, T, H, W]
B, C, T, H, W = vae_output.shape
# Reshape to [B*T, C, H, W] for processing
vae_output = vae_output.permute(0, 2, 1, 3, 4).reshape(B * T, C, H, W)

# Normalize from [-1, 1] to [0, 1]
images = (vae_output + 1) / 2

# Clamp values to [0, 1]
images = torch.clamp(images, 0, 1)

# Convert from [B, C, H, W] to [B, H, W, C]
images = images.permute(0, 2, 3, 1).cpu()
vae_output.add_(1.0).mul_(0.5).clamp_(0.0, 1.0)

return images
if vae_output.ndim == 5:
# Video: [B, C, T, H, W] -> [B, T, H, W, C]
vae_output = vae_output.permute(0, 2, 3, 4, 1)
# -> [B*T, H, W, C]
return vae_output.cpu().flatten(0, 1)
else:
# Image: [B, C, H, W] -> [B, H, W, C]
return vae_output.permute(0, 2, 3, 1).cpu()


def vae_to_comfyui_image_inplace(vae_output: torch.Tensor) -> torch.Tensor:
def diffusers_vae_to_comfy(vae_output: torch.Tensor) -> torch.Tensor:
"""
Convert VAE decoder output to ComfyUI Image format (inplace operation)
Convert Diffusers VAE decoder output to ComfyUI Image format
Image processor for VAE, return tensor in range [0, 1] when do_denormalize is True.

Args:
vae_output: VAE decoder output tensor, typically in range [-1, 1]
Shape: [B, C, T, H, W] or [B, C, H, W]
WARNING: This tensor will be modified in-place!
ref: https://github.com/huggingface/diffusers/blob/main/src/diffusers/image_processor.py#L744

Returns:
ComfyUI Image tensor in range [0, 1]
Shape: [B, H, W, C] for single frame or [B*T, H, W, C] for video
Note: The returned tensor is the same object as input (modified in-place)
"""
# Handle video tensor (5D) vs image tensor (4D)
if vae_output.dim() == 5:
# Video tensor: [B, C, T, H, W]
B, C, T, H, W = vae_output.shape
# Reshape to [B*T, C, H, W] for processing (inplace view)
vae_output = vae_output.permute(0, 2, 1, 3, 4).contiguous().view(B * T, C, H, W)

# Normalize from [-1, 1] to [0, 1] (inplace)
vae_output.add_(1).div_(2)

# Clamp values to [0, 1] (inplace)
vae_output.clamp_(0, 1)

# Convert from [B, C, H, W] to [B, H, W, C] and move to CPU
vae_output = vae_output.permute(0, 2, 3, 1).cpu()

return vae_output
return vae_output.permute(0, 2, 3, 1).cpu()


def save_to_video(
Expand Down