From dbf8c5e96cc6027b15702c7c2afd3d052db2885e Mon Sep 17 00:00:00 2001 From: gaclove Date: Mon, 26 Jan 2026 10:22:47 +0000 Subject: [PATCH 1/3] Refactor VAE image conversion to use wan_vae_to_comfy function --- lightx2v/models/runners/default_runner.py | 4 +- .../models/runners/wan/wan_audio_runner.py | 6 +- lightx2v/models/runners/wan/wan_sf_runner.py | 4 +- lightx2v/utils/utils.py | 62 ++++--------------- 4 files changed, 19 insertions(+), 57 deletions(-) diff --git a/lightx2v/models/runners/default_runner.py b/lightx2v/models/runners/default_runner.py index 98e42a72..b629c4a8 100755 --- a/lightx2v/models/runners/default_runner.py +++ b/lightx2v/models/runners/default_runner.py @@ -16,7 +16,7 @@ from lightx2v.utils.global_paras import CALIB from lightx2v.utils.memory_profiler import peak_memory_decorator from lightx2v.utils.profiler import * -from lightx2v.utils.utils import get_optimal_patched_size_with_sp, isotropic_crop_resize, save_to_video, vae_to_comfyui_image +from lightx2v.utils.utils import get_optimal_patched_size_with_sp, isotropic_crop_resize, save_to_video, wan_vae_to_comfy from lightx2v_platform.base.global_var import AI_DEVICE torch_device_module = getattr(torch, AI_DEVICE) @@ -433,7 +433,7 @@ def post_prompt_enhancer(self): return enhanced_prompt def process_images_after_vae_decoder(self): - self.gen_video_final = vae_to_comfyui_image(self.gen_video_final) + self.gen_video_final = wan_vae_to_comfy(self.gen_video_final) if "video_frame_interpolation" in self.config: assert self.vfi_model is not None and self.config["video_frame_interpolation"].get("target_fps", None) is not None diff --git a/lightx2v/models/runners/wan/wan_audio_runner.py b/lightx2v/models/runners/wan/wan_audio_runner.py index 86029de7..c38fe074 100755 --- a/lightx2v/models/runners/wan/wan_audio_runner.py +++ b/lightx2v/models/runners/wan/wan_audio_runner.py @@ -26,7 +26,7 @@ from lightx2v.utils.envs import * from lightx2v.utils.profiler import * from lightx2v.utils.registry_factory import RUNNER_REGISTER -from lightx2v.utils.utils import find_torch_model_path, fixed_shape_resize, get_optimal_patched_size_with_sp, isotropic_crop_resize, load_weights, vae_to_comfyui_image_inplace +from lightx2v.utils.utils import find_torch_model_path, fixed_shape_resize, get_optimal_patched_size_with_sp, isotropic_crop_resize, load_weights, wan_vae_to_comfy from lightx2v_platform.base.global_var import AI_DEVICE warnings.filterwarnings("ignore", category=UserWarning, module="torchaudio") @@ -594,7 +594,7 @@ def end_run_segment(self, segment_idx, valid_duration=1e9): video_seg = self.gen_video[:, :, :useful_length].cpu() audio_seg = self.segment.audio_array[:, : useful_length * self._audio_processor.audio_frame_rate] audio_seg = audio_seg.sum(dim=0) # Multiple audio tracks, mixed into one track - video_seg = vae_to_comfyui_image_inplace(video_seg) + video_seg = wan_vae_to_comfy(video_seg) # [Warning] Need check whether video segment interpolation works... 
if "video_frame_interpolation" in self.config and self.vfi_model is not None: @@ -642,7 +642,7 @@ def end_run_segment_stream(self, latents, valid_duration=1e9): origin_seg = torch.clamp(origin_seg, -1, 1).to(torch.float) valid_T = min(valid_length - frame_idx, origin_seg.shape[2]) - video_seg = vae_to_comfyui_image_inplace(origin_seg[:, :, :valid_T].cpu()) + video_seg = wan_vae_to_comfy(origin_seg[:, :, :valid_T].cpu()) audio_start = frame_idx * self._audio_processor.audio_frame_rate audio_end = (frame_idx + valid_T) * self._audio_processor.audio_frame_rate audio_seg = self.segment.audio_array[:, audio_start:audio_end].sum(dim=0) diff --git a/lightx2v/models/runners/wan/wan_sf_runner.py b/lightx2v/models/runners/wan/wan_sf_runner.py index 4e5e91f5..62091dc7 100755 --- a/lightx2v/models/runners/wan/wan_sf_runner.py +++ b/lightx2v/models/runners/wan/wan_sf_runner.py @@ -13,7 +13,7 @@ from lightx2v.utils.memory_profiler import peak_memory_decorator from lightx2v.utils.profiler import * from lightx2v.utils.registry_factory import RUNNER_REGISTER -from lightx2v.utils.utils import vae_to_comfyui_image_inplace +from lightx2v.utils.utils import wan_vae_to_comfy @RUNNER_REGISTER("wan2.1_sf") @@ -121,7 +121,7 @@ def end_run_segment(self, segment_idx=None): self.gen_video_final = torch.cat([self.gen_video_final, self.gen_video], dim=0) if self.gen_video_final is not None else self.gen_video if self.is_live: if self.video_recorder: - stream_video = vae_to_comfyui_image_inplace(self.gen_video) + stream_video = wan_vae_to_comfy(self.gen_video) self.video_recorder.pub_video(stream_video) torch.cuda.empty_cache() diff --git a/lightx2v/utils/utils.py b/lightx2v/utils/utils.py index 2ed9d0bd..79e40acd 100755 --- a/lightx2v/utils/utils.py +++ b/lightx2v/utils/utils.py @@ -106,38 +106,7 @@ def cache_video( return None -def vae_to_comfyui_image(vae_output: torch.Tensor) -> torch.Tensor: - """ - Convert VAE decoder output to ComfyUI Image format - - Args: - vae_output: VAE decoder output tensor, typically in range [-1, 1] - Shape: [B, C, T, H, W] or [B, C, H, W] - - Returns: - ComfyUI Image tensor in range [0, 1] - Shape: [B, H, W, C] for single frame or [B*T, H, W, C] for video - """ - # Handle video tensor (5D) vs image tensor (4D) - if vae_output.dim() == 5: - # Video tensor: [B, C, T, H, W] - B, C, T, H, W = vae_output.shape - # Reshape to [B*T, C, H, W] for processing - vae_output = vae_output.permute(0, 2, 1, 3, 4).reshape(B * T, C, H, W) - - # Normalize from [-1, 1] to [0, 1] - images = (vae_output + 1) / 2 - - # Clamp values to [0, 1] - images = torch.clamp(images, 0, 1) - - # Convert from [B, C, H, W] to [B, H, W, C] - images = images.permute(0, 2, 3, 1).cpu() - - return images - - -def vae_to_comfyui_image_inplace(vae_output: torch.Tensor) -> torch.Tensor: +def wan_vae_to_comfy(vae_output: torch.Tensor) -> torch.Tensor: """ Convert VAE decoder output to ComfyUI Image format (inplace operation) @@ -151,24 +120,17 @@ def vae_to_comfyui_image_inplace(vae_output: torch.Tensor) -> torch.Tensor: Shape: [B, H, W, C] for single frame or [B*T, H, W, C] for video Note: The returned tensor is the same object as input (modified in-place) """ - # Handle video tensor (5D) vs image tensor (4D) - if vae_output.dim() == 5: - # Video tensor: [B, C, T, H, W] - B, C, T, H, W = vae_output.shape - # Reshape to [B*T, C, H, W] for processing (inplace view) - vae_output = vae_output.permute(0, 2, 1, 3, 4).contiguous().view(B * T, C, H, W) - - # Normalize from [-1, 1] to [0, 1] (inplace) - vae_output.add_(1).div_(2) - - # 
Clamp values to [0, 1] (inplace) - vae_output.clamp_(0, 1) - - # Convert from [B, C, H, W] to [B, H, W, C] and move to CPU - vae_output = vae_output.permute(0, 2, 3, 1).cpu() - - return vae_output - + + vae_output.add_(1.0).mul_(0.5).clamp_(0.0, 1.0) + + if vae_output.ndim == 5: + # Video: [B, C, T, H, W] -> [B, T, H, W, C] + vae_output = vae_output.permute(0, 2, 3, 4, 1) + # -> [B*T, H, W, C] + return vae_output.cpu().flatten(0, 1) + else: + # Image: [B, C, H, W] -> [B, H, W, C] + return vae_output.permute(0, 2, 3, 1).cpu() def save_to_video( images: torch.Tensor, From b2896c0f6ea04f4477b1b95508bd016c492d8132 Mon Sep 17 00:00:00 2001 From: gaclove Date: Mon, 26 Jan 2026 12:00:14 +0000 Subject: [PATCH 2/3] Refactor image saving logic in LongCat, Qwen, and Z image runners to streamline path handling and improve consistency --- .../longcat_image/longcat_image_runner.py | 2 +- .../runners/qwen_image/qwen_image_runner.py | 11 ++++++----- .../models/runners/z_image/z_image_runner.py | 2 +- lightx2v/utils/utils.py | 16 ++++++++++++++-- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/lightx2v/models/runners/longcat_image/longcat_image_runner.py b/lightx2v/models/runners/longcat_image/longcat_image_runner.py index 55462159..f5082d31 100755 --- a/lightx2v/models/runners/longcat_image/longcat_image_runner.py +++ b/lightx2v/models/runners/longcat_image/longcat_image_runner.py @@ -399,7 +399,7 @@ def run_pipeline(self, input_info): if not input_info.return_result_tensor: image = images[0] - image.save(f"{input_info.save_result_path}") + image.save(input_info.save_result_path) logger.info(f"Image saved: {input_info.save_result_path}") del latents, generator diff --git a/lightx2v/models/runners/qwen_image/qwen_image_runner.py b/lightx2v/models/runners/qwen_image/qwen_image_runner.py index 663ab1c0..b1342565 100755 --- a/lightx2v/models/runners/qwen_image/qwen_image_runner.py +++ b/lightx2v/models/runners/qwen_image/qwen_image_runner.py @@ -380,15 +380,16 @@ def run_pipeline(self, input_info): self.end_run() if not input_info.return_result_tensor: + image_prefix = input_info.save_result_path.rsplit(".", 1)[0] + image_suffix = input_info.save_result_path.rsplit(".", 1)[1] if len(input_info.save_result_path.rsplit(".", 1)) > 1 else "png" if isinstance(images[0], list) and len(images[0]) > 1: - image_prefix = f"{input_info.save_result_path}".split(".")[0] for idx, image in enumerate(images[0]): - image.save(f"{image_prefix}_{idx}.png") - logger.info(f"Image saved: {image_prefix}_{idx}.png") + image.save(f"{image_prefix}_{idx:05d}.{image_suffix}") + logger.info(f"Image saved: {image_prefix}_{idx:05d}.{image_suffix}") else: image = images[0] - image.save(f"{input_info.save_result_path}") - logger.info(f"Image saved: {input_info.save_result_path}") + image.save(f"{image_prefix}.{image_suffix}") + logger.info(f"Image saved: {image_prefix}.{image_suffix}") del latents, generator torch_device_module.empty_cache() diff --git a/lightx2v/models/runners/z_image/z_image_runner.py b/lightx2v/models/runners/z_image/z_image_runner.py index 8959486d..ed83bcee 100755 --- a/lightx2v/models/runners/z_image/z_image_runner.py +++ b/lightx2v/models/runners/z_image/z_image_runner.py @@ -344,7 +344,7 @@ def run_pipeline(self, input_info): if not input_info.return_result_tensor: image = images[0] - image.save(f"{input_info.save_result_path}") + image.save(input_info.save_result_path) logger.info(f"Image saved: {input_info.save_result_path}") del latents, generator diff --git a/lightx2v/utils/utils.py 
b/lightx2v/utils/utils.py index 79e40acd..bc387410 100755 --- a/lightx2v/utils/utils.py +++ b/lightx2v/utils/utils.py @@ -120,9 +120,9 @@ def wan_vae_to_comfy(vae_output: torch.Tensor) -> torch.Tensor: Shape: [B, H, W, C] for single frame or [B*T, H, W, C] for video Note: The returned tensor is the same object as input (modified in-place) """ - + vae_output.add_(1.0).mul_(0.5).clamp_(0.0, 1.0) - + if vae_output.ndim == 5: # Video: [B, C, T, H, W] -> [B, T, H, W, C] vae_output = vae_output.permute(0, 2, 3, 4, 1) @@ -132,6 +132,18 @@ def wan_vae_to_comfy(vae_output: torch.Tensor) -> torch.Tensor: # Image: [B, C, H, W] -> [B, H, W, C] return vae_output.permute(0, 2, 3, 1).cpu() + +def diffusers_vae_to_comfy(vae_output: torch.Tensor) -> torch.Tensor: + """ + Convert Diffusers VAE decoder output to ComfyUI Image format + Image processor for VAE, return tensor in range [0, 1] when do_denormalize is True. + + ref: https://github.com/huggingface/diffusers/blob/main/src/diffusers/image_processor.py#L744 + + """ + return vae_output.permute(0, 2, 3, 1).cpu() + + def save_to_video( images: torch.Tensor, output_path: str, From 212feb6c776843f03d51a324a9718d8f2a19c451 Mon Sep 17 00:00:00 2001 From: gaclove Date: Mon, 26 Jan 2026 12:19:11 +0000 Subject: [PATCH 3/3] Update image return logic in LongCat, Qwen, and Z image runners to support conditional output based on input_info settings --- .../models/runners/longcat_image/longcat_image_runner.py | 6 ++++-- lightx2v/models/runners/qwen_image/qwen_image_runner.py | 6 ++++-- lightx2v/models/runners/z_image/z_image_runner.py | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/lightx2v/models/runners/longcat_image/longcat_image_runner.py b/lightx2v/models/runners/longcat_image/longcat_image_runner.py index f5082d31..bc275175 100755 --- a/lightx2v/models/runners/longcat_image/longcat_image_runner.py +++ b/lightx2v/models/runners/longcat_image/longcat_image_runner.py @@ -406,5 +406,7 @@ def run_pipeline(self, input_info): torch_device_module.empty_cache() gc.collect() - # Return (images, audio) - audio is None for default runner - return images, None + if input_info.return_result_tensor: + return {"images": images} + elif input_info.save_result_path is not None: + return {"images": None} diff --git a/lightx2v/models/runners/qwen_image/qwen_image_runner.py b/lightx2v/models/runners/qwen_image/qwen_image_runner.py index b1342565..e722fc5b 100755 --- a/lightx2v/models/runners/qwen_image/qwen_image_runner.py +++ b/lightx2v/models/runners/qwen_image/qwen_image_runner.py @@ -395,5 +395,7 @@ def run_pipeline(self, input_info): torch_device_module.empty_cache() gc.collect() - # Return (images, audio) - audio is None for default runner - return images, None + if input_info.return_result_tensor: + return {"images": images} + elif input_info.save_result_path is not None: + return {"images": None} diff --git a/lightx2v/models/runners/z_image/z_image_runner.py b/lightx2v/models/runners/z_image/z_image_runner.py index ed83bcee..b19b127d 100755 --- a/lightx2v/models/runners/z_image/z_image_runner.py +++ b/lightx2v/models/runners/z_image/z_image_runner.py @@ -351,5 +351,7 @@ def run_pipeline(self, input_info): torch_device_module.empty_cache() gc.collect() - # Return (images, audio) - audio is None for default runner - return images, None + if input_info.return_result_tensor: + return {"images": images} + elif input_info.save_result_path is not None: + return {"images": None}
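
A minimal usage sketch of the two conversion helpers introduced in this series, assuming a WAN decoder output in [-1, 1] with shape [B, C, T, H, W] and a Diffusers decoder output already denormalized to [0, 1] with shape [B, C, H, W]; the example tensors and shapes below are illustrative, not taken from the runners.

    import torch

    from lightx2v.utils.utils import diffusers_vae_to_comfy, wan_vae_to_comfy

    # WAN path: wan_vae_to_comfy normalizes in place (add_/mul_/clamp_) and
    # returns a CPU tensor of shape [B*T, H, W, C].
    wan_decoded = torch.rand(1, 3, 8, 64, 64) * 2 - 1  # stand-in for VAE output in [-1, 1]
    comfy_frames = wan_vae_to_comfy(wan_decoded)
    print(comfy_frames.shape)  # torch.Size([8, 64, 64, 3])

    # Diffusers path: the image processor is assumed to have already mapped values
    # to [0, 1] (do_denormalize=True), so only the layout changes: [B, C, H, W] -> [B, H, W, C].
    diffusers_decoded = torch.rand(1, 3, 64, 64)
    comfy_image = diffusers_vae_to_comfy(diffusers_decoded)
    print(comfy_image.shape)  # torch.Size([1, 64, 64, 3])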