From 585defff47a1812e5ee25fe36b9d3705c66a7aca Mon Sep 17 00:00:00 2001 From: PreethamNoelP Date: Fri, 12 Jun 2026 20:20:25 +0530 Subject: [PATCH 1/2] [Anima] Add img2img pipeline blocks --- docs/source/en/api/pipelines/anima.md | 4 + src/diffusers/__init__.py | 2 + src/diffusers/modular_pipelines/__init__.py | 3 +- .../modular_pipelines/anima/__init__.py | 4 +- .../modular_pipelines/anima/before_denoise.py | 87 +++++++ .../modular_pipelines/anima/encoders.py | 239 ++++++++++++++++++ .../anima/modular_blocks_anima.py | 155 +++++++++++- .../dummy_torch_and_transformers_objects.py | 15 ++ .../test_modular_pipeline_anima_img2img.py | 193 ++++++++++++++ 9 files changed, 693 insertions(+), 9 deletions(-) create mode 100644 tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py diff --git a/docs/source/en/api/pipelines/anima.md b/docs/source/en/api/pipelines/anima.md index b66eeb2a29b7..22eff138a027 100644 --- a/docs/source/en/api/pipelines/anima.md +++ b/docs/source/en/api/pipelines/anima.md @@ -35,6 +35,10 @@ image = pipe(prompt="masterpiece, best quality, 1girl, solo, city lights").image [[autodoc]] AnimaAutoBlocks +## AnimaImg2ImgAutoBlocks + +[[autodoc]] AnimaImg2ImgAutoBlocks + ## AnimaTextConditioner [[autodoc]] AnimaTextConditioner diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 4a2c3bca5bcc..46f511477e1a 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -455,6 +455,7 @@ _import_structure["modular_pipelines"].extend( [ "AnimaAutoBlocks", + "AnimaImg2ImgAutoBlocks", "AnimaModularPipeline", "ErnieImageAutoBlocks", "ErnieImageModularPipeline", @@ -1280,6 +1281,7 @@ else: from .modular_pipelines import ( AnimaAutoBlocks, + AnimaImg2ImgAutoBlocks, AnimaModularPipeline, ErnieImageAutoBlocks, ErnieImageModularPipeline, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 4b36994aef07..335b49b451fc 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -95,6 +95,7 @@ ] _import_structure["anima"] = [ "AnimaAutoBlocks", + "AnimaImg2ImgAutoBlocks", "AnimaModularPipeline", ] _import_structure["ernie_image"] = [ @@ -122,7 +123,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_pt_objects import * # noqa F403 else: - from .anima import AnimaAutoBlocks, AnimaModularPipeline + from .anima import AnimaAutoBlocks, AnimaImg2ImgAutoBlocks, AnimaModularPipeline from .components_manager import ComponentsManager from .ernie_image import ErnieImageAutoBlocks, ErnieImageModularPipeline from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline diff --git a/src/diffusers/modular_pipelines/anima/__init__.py b/src/diffusers/modular_pipelines/anima/__init__.py index 4772d906e03b..1cbb2d741bfb 100644 --- a/src/diffusers/modular_pipelines/anima/__init__.py +++ b/src/diffusers/modular_pipelines/anima/__init__.py @@ -21,7 +21,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks"] + _import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks", "AnimaImg2ImgAutoBlocks"] _import_structure["modular_pipeline"] = ["AnimaModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -31,7 +31,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .modular_blocks_anima import AnimaAutoBlocks + from .modular_blocks_anima import AnimaAutoBlocks, AnimaImg2ImgAutoBlocks from .modular_pipeline import AnimaModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/anima/before_denoise.py b/src/diffusers/modular_pipelines/anima/before_denoise.py index 25f38cd0cb65..9147047f5f9a 100644 --- a/src/diffusers/modular_pipelines/anima/before_denoise.py +++ b/src/diffusers/modular_pipelines/anima/before_denoise.py @@ -370,6 +370,19 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi return components, state +# Copied from diffusers.modular_pipelines.qwenimage.before_denoise.get_timesteps +def get_timesteps(scheduler, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(num_inference_steps * strength, num_inference_steps) + + t_start = int(max(num_inference_steps - init_timestep, 0)) + timesteps = scheduler.timesteps[t_start * scheduler.order :] + if hasattr(scheduler, "set_begin_index"): + scheduler.set_begin_index(t_start * scheduler.order) + + return timesteps, num_inference_steps - t_start + + class AnimaSetTimestepsStep(ModularPipelineBlocks): model_name = "anima" @@ -414,3 +427,77 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi self.set_block_state(state, block_state) return components, state + + +# Copied from diffusers.modular_pipelines.anima.before_denoise.AnimaSetTimestepsStep +class AnimaImg2ImgSetTimestepsStep(ModularPipelineBlocks): + """Set the scheduler timesteps for Anima image-to-image inference. + + This step computes the full timestep schedule and stores it in state. It does **not** set + ``scheduler.set_begin_index`` — that is handled downstream by + ``AnimaImg2ImgVaeEncoderStep``, which slices the schedule based on ``strength``. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + + Outputs: + timesteps (`Tensor`): + Full timestep schedule for the denoising loop. + num_inference_steps (`int`): + Number of denoising steps (may be updated by ``retrieve_timesteps``). + """ + + model_name = "anima" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)] + + @property + def description(self) -> str: + return "Set the scheduler timesteps for Anima image-to-image inference." + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam( + "timesteps", + type_hint=torch.Tensor, + description="Full timestep schedule for the denoising loop.", + ), + OutputParam("num_inference_steps", type_hint=int, description="Number of denoising steps."), + ] + + @torch.no_grad() + def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + + sigmas = ( + np.linspace(1.0, 1 / block_state.num_inference_steps, block_state.num_inference_steps) + if block_state.sigmas is None + else block_state.sigmas + ) + block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps( + components.scheduler, + device=device, + sigmas=sigmas, + ) + # set_begin_index is omitted: get_timesteps() in AnimaImg2ImgVaeEncoderStep + # slices the schedule and sets the correct offset based on strength. + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/anima/encoders.py b/src/diffusers/modular_pipelines/anima/encoders.py index bdeecd28737b..5e59c9e57cb2 100644 --- a/src/diffusers/modular_pipelines/anima/encoders.py +++ b/src/diffusers/modular_pipelines/anima/encoders.py @@ -17,8 +17,13 @@ from ...configuration_utils import FrozenDict from ...guiders import ClassifierFreeGuidance +from ...image_processor import VaeImageProcessor +from ...models import AutoencoderKLQwenImage +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils.torch_utils import randn_tensor from ..modular_pipeline import ModularPipelineBlocks, PipelineState from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .before_denoise import get_timesteps from .modular_pipeline import AnimaModularPipeline @@ -251,3 +256,237 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi self.set_block_state(state, block_state) return components, state + + +# Copied from diffusers.modular_pipelines.qwenimage.encoders.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.modular_pipelines.qwenimage.encoders.encode_vae_image +def encode_vae_image( + image: torch.Tensor, + vae: AutoencoderKLQwenImage, + generator: torch.Generator, + device: torch.device, + dtype: torch.dtype, + latent_channels: int = 16, + sample_mode: str = "argmax", +): + if not isinstance(image, torch.Tensor): + raise ValueError(f"Expected image to be a tensor, got {type(image)}.") + + # preprocessed image should be a 4D tensor: batch_size, num_channels, height, width + if image.dim() == 4: + image = image.unsqueeze(2) + elif image.dim() != 5: + raise ValueError(f"Expected image dims 4 or 5, got {image.dim()}.") + + image = image.to(device=device, dtype=dtype) + + if isinstance(generator, list): + image_latents = [ + retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i], sample_mode=sample_mode) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(vae.encode(image), generator=generator, sample_mode=sample_mode) + latents_mean = ( + torch.tensor(vae.config.latents_mean) + .view(1, latent_channels, 1, 1, 1) + .to(image_latents.device, image_latents.dtype) + ) + latents_std = ( + torch.tensor(vae.config.latents_std) + .view(1, latent_channels, 1, 1, 1) + .to(image_latents.device, image_latents.dtype) + ) + image_latents = (image_latents - latents_mean) / latents_std + + return image_latents + + +class AnimaImg2ImgVaeEncoderStep(ModularPipelineBlocks): + """VAE Encoder step for Anima image-to-image generation. + + Preprocesses the input image, encodes it with the VAE, generates noise, slices the + timestep schedule based on ``strength``, and adds noise to the image latents using + ``scheduler.scale_noise()``. + + Components: + vae (`AutoencoderKLQwenImage`) + scheduler (`FlowMatchEulerDiscreteScheduler`) + image_processor (`VaeImageProcessor`) + + Inputs: + image (`PIL.Image.Image`): + Input image to use as starting point. + height (`int`, *optional*): + Height of the output image. Defaults to pipeline default. + width (`int`, *optional*): + Width of the output image. Defaults to pipeline default. + strength (`float`, *optional*, defaults to 0.9): + How much to transform the reference image. ``0`` means no change; ``1`` means + fully denoise from random noise. + num_images_per_prompt (`int`, *optional*, defaults to 1): + Number of images to generate per prompt. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + latents (`Tensor`, *optional*): + Pre-computed noise tensor. Generated randomly if ``None``. + timesteps (`Tensor`): + Full timestep schedule produced by ``AnimaImg2ImgSetTimestepsStep``. + num_inference_steps (`int`): + Total number of inference steps from ``AnimaImg2ImgSetTimestepsStep``. + + Outputs: + latents (`Tensor`): + Noisy image latents to use as the starting point for denoising. + timesteps (`Tensor`): + Timestep schedule sliced by ``strength``. + num_inference_steps (`int`): + Number of denoising steps after strength-based slicing. + padding_mask (`Tensor`): + Cosmos padding mask for the image latents. + height (`int`): + Output image height (updated to pipeline default if not provided). + width (`int`): + Output image width (updated to pipeline default if not provided). + """ + + model_name = "anima" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLQwenImage), + ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler), + ComponentSpec( + "image_processor", + VaeImageProcessor, + config=FrozenDict({"vae_scale_factor": 8}), + default_creation_method="from_config", + ), + ] + + @property + def description(self) -> str: + return ( + "VAE Encoder step for Anima image-to-image generation. Encodes the input image, " + "slices the timestep schedule by strength, and adds noise via scheduler.scale_noise()." + ) + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("image"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("strength"), + InputParam.template("num_images_per_prompt"), + InputParam.template("generator"), + InputParam.template("latents"), + InputParam.template("timesteps", required=True), + InputParam( + "num_inference_steps", + required=True, + type_hint=int, + description="Total number of inference steps from AnimaImg2ImgSetTimestepsStep.", + ), + InputParam( + "batch_size", + required=True, + type_hint=int, + description="Number of prompts, provided by AnimaTextInputStep.", + ), + InputParam("dtype", type_hint=torch.dtype, description="Dtype used by the Anima denoiser."), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam( + "latents", type_hint=torch.Tensor, description="Noisy image latents for the denoising process." + ), + OutputParam("timesteps", type_hint=torch.Tensor, description="Timestep schedule sliced by strength."), + OutputParam( + "num_inference_steps", type_hint=int, description="Number of denoising steps after strength slicing." + ), + OutputParam("padding_mask", type_hint=torch.Tensor, description="Cosmos padding mask for image latents."), + OutputParam("height", type_hint=int, description="Image height used for generation."), + OutputParam("width", type_hint=int, description="Image width used for generation."), + ] + + @torch.no_grad() + def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + device = components._execution_device + # dtype is provided by AnimaTextInputStep; fall back to vae dtype if not yet in state + dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype + + block_state.height = block_state.height or components.default_height + block_state.width = block_state.width or components.default_width + + block_state.timesteps, block_state.num_inference_steps = get_timesteps( + components.scheduler, block_state.num_inference_steps, block_state.strength + ) + + # Total batch = prompt batch × images per prompt + total_batch = block_state.batch_size * block_state.num_images_per_prompt + + # Preprocess PIL image(s) to tensor + processed_image = components.image_processor.preprocess( + image=block_state.image, height=block_state.height, width=block_state.width + ) + + # Encode to image latents; use VAE dtype for encoding + image_latents = encode_vae_image( + image=processed_image, + vae=components.vae, + generator=block_state.generator, + device=device, + dtype=components.vae.dtype, + latent_channels=components.num_channels_latents, + ) + + # Expand image_latents to total_batch (handles single image with multiple prompts) + if image_latents.shape[0] < total_batch: + repeats = total_batch // image_latents.shape[0] + image_latents = image_latents.repeat(repeats, 1, 1, 1, 1) + + # Generate initial noise (or use pre-provided latents as noise) + if block_state.latents is None: + noise = randn_tensor( + image_latents.shape, + generator=block_state.generator, + device=device, + dtype=torch.float32, + ) + else: + noise = block_state.latents.to(device=device, dtype=torch.float32) + + # Add noise to image latents at the appropriate noise level for this strength + latent_timestep = block_state.timesteps[:1].repeat(total_batch) + block_state.latents = components.scheduler.scale_noise( + image_latents.to(dtype=torch.float32), + latent_timestep, + noise, + ) + + block_state.padding_mask = block_state.latents.new_zeros( + 1, 1, block_state.height, block_state.width, dtype=dtype + ) + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py b/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py index fc71b87f62d8..69bc722a630c 100644 --- a/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py +++ b/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py @@ -15,6 +15,7 @@ from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import OutputParam from .before_denoise import ( + AnimaImg2ImgSetTimestepsStep, AnimaPrepareLatentsStep, AnimaSetTimestepsStep, AnimaTextConditioningStep, @@ -22,7 +23,7 @@ ) from .decoders import AnimaProcessImagesOutputStep, AnimaVaeDecoderStep from .denoise import AnimaDenoiseStep -from .encoders import AnimaTextEncoderStep +from .encoders import AnimaImg2ImgVaeEncoderStep, AnimaTextEncoderStep # auto_docstring @@ -35,8 +36,6 @@ class AnimaCoreDenoiseStep(SequentialPipelineBlocks): (`FlowMatchEulerDiscreteScheduler`) guider (`ClassifierFreeGuidance`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): - The number of images to generate per prompt. qwen_prompt_embeds (`Tensor`): Qwen prompt embeddings generated by the text encoder step. qwen_attention_mask (`Tensor`): @@ -53,6 +52,8 @@ class AnimaCoreDenoiseStep(SequentialPipelineBlocks): Negative T5 prompt token ids generated by the text encoder step. negative_t5_attention_mask (`Tensor`, *optional*): Negative T5 prompt attention mask generated by the text encoder step. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -131,9 +132,10 @@ class AnimaAutoBlocks(SequentialPipelineBlocks): - `text2image`: requires `prompt` Components: - text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) t5_tokenizer (`T5TokenizerFast`) text_conditioner - (`AnimaTextConditioner`) guider (`ClassifierFreeGuidance`) transformer (`CosmosTransformer3DModel`) scheduler - (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) + text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) t5_tokenizer (`T5Tokenizer`) guider + (`ClassifierFreeGuidance`) text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLQwenImage`) image_processor + (`VaeImageProcessor`) Inputs: prompt (`str`): @@ -181,3 +183,144 @@ def description(self) -> str: @property def outputs(self): return [OutputParam.template("images")] + + +# auto_docstring +class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): + """ + Denoise block for Anima image-to-image generation. VAE encoding runs after AnimaTextInputStep so batch_size is + available in state. + + Components: + text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae + (`AutoencoderKLQwenImage`) scheduler (`FlowMatchEulerDiscreteScheduler`) image_processor + (`VaeImageProcessor`) guider (`ClassifierFreeGuidance`) + + Inputs: + qwen_prompt_embeds (`Tensor`): + Qwen prompt embeddings generated by the text encoder step. + qwen_attention_mask (`Tensor`): + Qwen prompt attention mask generated by the text encoder step. + t5_input_ids (`Tensor`): + T5 prompt token ids generated by the text encoder step. + t5_attention_mask (`Tensor`): + T5 prompt attention mask generated by the text encoder step. + negative_qwen_prompt_embeds (`Tensor`, *optional*): + Negative Qwen prompt embeddings generated by the text encoder step. + negative_qwen_attention_mask (`Tensor`, *optional*): + Negative Qwen prompt attention mask generated by the text encoder step. + negative_t5_input_ids (`Tensor`, *optional*): + Negative T5 prompt token ids generated by the text encoder step. + negative_t5_attention_mask (`Tensor`, *optional*): + Negative T5 prompt attention mask generated by the text encoder step. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + image (`Image | list`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + timesteps (`Tensor`): + Timesteps for the denoising process. + num_inference_steps (`int`): + Total number of inference steps from AnimaImg2ImgSetTimestepsStep. + **denoiser_input_fields (`None`, *optional*): + The conditional model inputs for the Anima denoiser. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + block_classes = [ + AnimaTextConditioningStep, + AnimaTextInputStep, + AnimaImg2ImgVaeEncoderStep, + AnimaDenoiseStep, + ] + block_names = ["text_conditioning", "input", "vae_encoder", "denoise"] + + @property + def description(self) -> str: + return ( + "Denoise block for Anima image-to-image generation. " + "VAE encoding runs after AnimaTextInputStep so batch_size is available in state." + ) + + @property + def outputs(self): + return [OutputParam.template("latents")] + + +# auto_docstring +class AnimaImg2ImgAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for image-to-image generation using Anima. + + Supported workflows: + - `img2img`: requires `prompt`, `image` + + Components: + text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) t5_tokenizer (`T5Tokenizer`) guider + (`ClassifierFreeGuidance`) scheduler (`FlowMatchEulerDiscreteScheduler`) text_conditioner + (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae (`AutoencoderKLQwenImage`) + image_processor (`VaeImageProcessor`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + max_sequence_length (`int`, *optional*, defaults to 512): + Maximum sequence length for prompt encoding. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + image (`Image | list`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + **denoiser_input_fields (`None`, *optional*): + The conditional model inputs for the Anima denoiser. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`list`): + Generated images. + """ + + block_classes = [ + AnimaTextEncoderStep, + AnimaImg2ImgSetTimestepsStep, + AnimaImg2ImgCoreDenoiseStep, + AnimaDecodeStep, + ] + block_names = ["text_encoder", "set_timesteps", "denoise", "decode"] + _workflow_map = {"img2img": {"prompt": True, "image": True}} + + @property + def description(self) -> str: + return "Auto Modular pipeline for image-to-image generation using Anima." + + @property + def outputs(self): + return [OutputParam.template("images")] diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index fa977ee07bbe..1284f58a456c 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -17,6 +17,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class AnimaImg2ImgAutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class AnimaModularPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py b/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py new file mode 100644 index 000000000000..c25a79739f9f --- /dev/null +++ b/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py @@ -0,0 +1,193 @@ +# Copyright 2026 The HuggingFace Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import PIL.Image +import torch +from transformers import Qwen2Tokenizer, Qwen3Config, Qwen3Model, T5TokenizerFast + +from diffusers import ( + AnimaImg2ImgAutoBlocks, + AnimaModularPipeline, + AnimaTextConditioner, + AutoencoderKLQwenImage, + CosmosTransformer3DModel, + FlowMatchEulerDiscreteScheduler, +) + +from ...testing_utils import enable_full_determinism +from ..test_modular_pipelines_common import ModularPipelineTesterMixin + + +enable_full_determinism() + + +ANIMA_IMG2IMG_WORKFLOWS = { + "img2img": [ + ("text_encoder", "AnimaTextEncoderStep"), + ("set_timesteps", "AnimaImg2ImgSetTimestepsStep"), + ("denoise.text_conditioning", "AnimaTextConditioningStep"), + ("denoise.input", "AnimaTextInputStep"), + ("denoise.vae_encoder", "AnimaImg2ImgVaeEncoderStep"), + ("denoise.denoise", "AnimaDenoiseStep"), + ("decode.decode", "AnimaVaeDecoderStep"), + ("decode.postprocess", "AnimaProcessImagesOutputStep"), + ], +} + + +def get_dummy_components(): + torch.manual_seed(0) + transformer = CosmosTransformer3DModel( + in_channels=4, + out_channels=4, + num_attention_heads=2, + attention_head_dim=16, + num_layers=2, + mlp_ratio=2, + text_embed_dim=16, + adaln_lora_dim=4, + max_size=(4, 32, 32), + patch_size=(1, 2, 2), + rope_scale=(1.0, 4.0, 4.0), + concat_padding_mask=True, + extra_pos_embed_type=None, + ) + + torch.manual_seed(0) + vae = AutoencoderKLQwenImage( + base_dim=24, + z_dim=4, + dim_mult=[1, 2, 4], + num_res_blocks=1, + temperal_downsample=[False, True], + latents_mean=[0.0] * 4, + latents_std=[1.0] * 4, + ) + + torch.manual_seed(0) + text_conditioner = AnimaTextConditioner( + source_dim=16, + target_dim=16, + model_dim=16, + num_layers=2, + num_attention_heads=4, + target_vocab_size=32128, + min_sequence_length=16, + ) + + torch.manual_seed(0) + text_encoder_config = Qwen3Config( + vocab_size=152064, + hidden_size=16, + intermediate_size=32, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + max_position_embeddings=128, + rms_norm_eps=1e-6, + rope_theta=1000000.0, + head_dim=4, + attention_bias=False, + ) + text_encoder = Qwen3Model(text_encoder_config).eval() + tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration") + t5_tokenizer = T5TokenizerFast.from_pretrained("hf-internal-testing/tiny-random-t5") + scheduler = FlowMatchEulerDiscreteScheduler(shift=3.0) + + return { + "transformer": transformer, + "vae": vae, + "scheduler": scheduler, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "t5_tokenizer": t5_tokenizer, + "text_conditioner": text_conditioner, + } + + +def get_dummy_image(height=32, width=32): + image_array = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8) + return PIL.Image.fromarray(image_array) + + +class TestAnimaImg2ImgModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = AnimaModularPipeline + pipeline_blocks_class = AnimaImg2ImgAutoBlocks + pretrained_model_name_or_path = "hf-internal-testing/tiny-anima-modular-pipe" + params = frozenset(["prompt", "image", "strength", "height", "width", "negative_prompt"]) + batch_params = frozenset(["prompt", "negative_prompt"]) + expected_workflow_blocks = ANIMA_IMG2IMG_WORKFLOWS + + def get_pipeline(self, components_manager=None, torch_dtype=torch.float32): + pipe = self.pipeline_blocks_class().init_pipeline(components_manager=components_manager) + pipe.update_components(**get_dummy_components()) + pipe.to(dtype=torch_dtype) + pipe.set_progress_bar_config(disable=None) + return pipe + + def get_dummy_inputs(self, seed=0): + generator = torch.Generator(device="cpu").manual_seed(seed) + return { + "prompt": "dance monkey", + "negative_prompt": "bad quality", + "image": get_dummy_image(32, 32), + "strength": 0.8, + "generator": generator, + "num_inference_steps": 2, + "height": 32, + "width": 32, + "max_sequence_length": 16, + "output_type": "pt", + } + + def test_inference_basic(self): + pipe = self.get_pipeline() + inputs = self.get_dummy_inputs() + output = pipe(**inputs).images + + assert output.shape == (1, 3, 32, 32) + assert not torch.isnan(output).any() + + def test_inference_strength_low(self): + pipe = self.get_pipeline() + inputs = self.get_dummy_inputs() + inputs["strength"] = 0.3 + output = pipe(**inputs).images + + assert output.shape == (1, 3, 32, 32) + assert not torch.isnan(output).any() + + def test_inference_strength_high(self): + pipe = self.get_pipeline() + inputs = self.get_dummy_inputs() + inputs["strength"] = 0.95 + output = pipe(**inputs).images + + assert output.shape == (1, 3, 32, 32) + assert not torch.isnan(output).any() + + def test_inference_empty_negative_prompt(self): + pipe = self.get_pipeline() + inputs = self.get_dummy_inputs() + inputs["negative_prompt"] = "" + output = pipe(**inputs).images + + assert output.shape == (1, 3, 32, 32) + assert not torch.isnan(output).any() + + def test_inference_batch_single_identical(self): + super().test_inference_batch_single_identical(expected_max_diff=5e-4) \ No newline at end of file From def773308fef93b7f9bff18536045411d89bdc6b Mon Sep 17 00:00:00 2001 From: PreethamNoelP Date: Fri, 19 Jun 2026 09:02:31 +0530 Subject: [PATCH 2/2] [Anima] Address review feedback: remove # Copied from, fold img2img into AnimaAutoBlocks - Remove incorrect `# Copied from` comment above AnimaImg2ImgSetTimestepsStep - Delete AnimaImg2ImgAutoBlocks; introduce AnimaAutoDenoiseStep (AutoPipelineBlocks) and AnimaImg2ImgDenoiseStep (SequentialPipelineBlocks) so img2img lives as a workflow inside AnimaAutoBlocks, following the z_image pattern - Update __init__.py, dummy_objects, and docs to remove AnimaImg2ImgAutoBlocks - Update img2img test to use AnimaAutoBlocks with updated workflow block paths --- docs/source/en/api/pipelines/anima.md | 4 - src/diffusers/__init__.py | 2 - src/diffusers/modular_pipelines/__init__.py | 3 +- .../modular_pipelines/anima/__init__.py | 4 +- .../modular_pipelines/anima/before_denoise.py | 1 - .../anima/modular_blocks_anima.py | 217 ++++++++++++------ .../dummy_torch_and_transformers_objects.py | 15 -- .../test_modular_pipeline_anima_img2img.py | 14 +- 8 files changed, 156 insertions(+), 104 deletions(-) diff --git a/docs/source/en/api/pipelines/anima.md b/docs/source/en/api/pipelines/anima.md index 22eff138a027..b66eeb2a29b7 100644 --- a/docs/source/en/api/pipelines/anima.md +++ b/docs/source/en/api/pipelines/anima.md @@ -35,10 +35,6 @@ image = pipe(prompt="masterpiece, best quality, 1girl, solo, city lights").image [[autodoc]] AnimaAutoBlocks -## AnimaImg2ImgAutoBlocks - -[[autodoc]] AnimaImg2ImgAutoBlocks - ## AnimaTextConditioner [[autodoc]] AnimaTextConditioner diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index bcb28cd2507d..da77fa67df52 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -470,7 +470,6 @@ _import_structure["modular_pipelines"].extend( [ "AnimaAutoBlocks", - "AnimaImg2ImgAutoBlocks", "AnimaModularPipeline", "ErnieImageAutoBlocks", "ErnieImageModularPipeline", @@ -1309,7 +1308,6 @@ else: from .modular_pipelines import ( AnimaAutoBlocks, - AnimaImg2ImgAutoBlocks, AnimaModularPipeline, ErnieImageAutoBlocks, ErnieImageModularPipeline, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index 335b49b451fc..4b36994aef07 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -95,7 +95,6 @@ ] _import_structure["anima"] = [ "AnimaAutoBlocks", - "AnimaImg2ImgAutoBlocks", "AnimaModularPipeline", ] _import_structure["ernie_image"] = [ @@ -123,7 +122,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_pt_objects import * # noqa F403 else: - from .anima import AnimaAutoBlocks, AnimaImg2ImgAutoBlocks, AnimaModularPipeline + from .anima import AnimaAutoBlocks, AnimaModularPipeline from .components_manager import ComponentsManager from .ernie_image import ErnieImageAutoBlocks, ErnieImageModularPipeline from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline diff --git a/src/diffusers/modular_pipelines/anima/__init__.py b/src/diffusers/modular_pipelines/anima/__init__.py index 1cbb2d741bfb..4772d906e03b 100644 --- a/src/diffusers/modular_pipelines/anima/__init__.py +++ b/src/diffusers/modular_pipelines/anima/__init__.py @@ -21,7 +21,7 @@ _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: - _import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks", "AnimaImg2ImgAutoBlocks"] + _import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks"] _import_structure["modular_pipeline"] = ["AnimaModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: @@ -31,7 +31,7 @@ except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .modular_blocks_anima import AnimaAutoBlocks, AnimaImg2ImgAutoBlocks + from .modular_blocks_anima import AnimaAutoBlocks from .modular_pipeline import AnimaModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/anima/before_denoise.py b/src/diffusers/modular_pipelines/anima/before_denoise.py index 9147047f5f9a..1b25688054e7 100644 --- a/src/diffusers/modular_pipelines/anima/before_denoise.py +++ b/src/diffusers/modular_pipelines/anima/before_denoise.py @@ -429,7 +429,6 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi return components, state -# Copied from diffusers.modular_pipelines.anima.before_denoise.AnimaSetTimestepsStep class AnimaImg2ImgSetTimestepsStep(ModularPipelineBlocks): """Set the scheduler timesteps for Anima image-to-image inference. diff --git a/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py b/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py index 69bc722a630c..f5aa5e6253a8 100644 --- a/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py +++ b/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..modular_pipeline import SequentialPipelineBlocks +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks from ..modular_pipeline_utils import OutputParam from .before_denoise import ( AnimaImg2ImgSetTimestepsStep, @@ -124,72 +124,83 @@ def outputs(self): # auto_docstring -class AnimaAutoBlocks(SequentialPipelineBlocks): +class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Auto Modular pipeline for text-to-image generation using Anima. - - Supported workflows: - - `text2image`: requires `prompt` + Denoise block for Anima image-to-image generation. VAE encoding runs after AnimaTextInputStep so batch_size is + available in state. Components: - text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) t5_tokenizer (`T5Tokenizer`) guider - (`ClassifierFreeGuidance`) text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLQwenImage`) image_processor - (`VaeImageProcessor`) + text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae + (`AutoencoderKLQwenImage`) scheduler (`FlowMatchEulerDiscreteScheduler`) image_processor + (`VaeImageProcessor`) guider (`ClassifierFreeGuidance`) Inputs: - prompt (`str`): - The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): - The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 512): - Maximum sequence length for prompt encoding. + qwen_prompt_embeds (`Tensor`): + Qwen prompt embeddings generated by the text encoder step. + qwen_attention_mask (`Tensor`): + Qwen prompt attention mask generated by the text encoder step. + t5_input_ids (`Tensor`): + T5 prompt token ids generated by the text encoder step. + t5_attention_mask (`Tensor`): + T5 prompt attention mask generated by the text encoder step. + negative_qwen_prompt_embeds (`Tensor`, *optional*): + Negative Qwen prompt embeddings generated by the text encoder step. + negative_qwen_attention_mask (`Tensor`, *optional*): + Negative Qwen prompt attention mask generated by the text encoder step. + negative_t5_input_ids (`Tensor`, *optional*): + Negative T5 prompt token ids generated by the text encoder step. + negative_t5_attention_mask (`Tensor`, *optional*): + Negative T5 prompt attention mask generated by the text encoder step. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. + image (`Image | list`): + Reference image(s) for denoising. Can be a single image or list of images. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): - Pre-generated noisy latents for image generation. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. - sigmas (`list`, *optional*): - Custom sigmas for the denoising process. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + timesteps (`Tensor`): + Timesteps for the denoising process. + num_inference_steps (`int`): + Total number of inference steps from AnimaImg2ImgSetTimestepsStep. **denoiser_input_fields (`None`, *optional*): The conditional model inputs for the Anima denoiser. - output_type (`str`, *optional*, defaults to pil): - Output format: 'pil', 'np', 'pt'. Outputs: - images (`list`): - Generated images. + latents (`Tensor`): + Denoised latents. """ block_classes = [ - AnimaTextEncoderStep, - AnimaCoreDenoiseStep, - AnimaDecodeStep, + AnimaTextConditioningStep, + AnimaTextInputStep, + AnimaImg2ImgVaeEncoderStep, + AnimaDenoiseStep, ] - block_names = ["text_encoder", "denoise", "decode"] - _workflow_map = {"text2image": {"prompt": True}} + block_names = ["text_conditioning", "input", "vae_encoder", "denoise"] @property def description(self) -> str: - return "Auto Modular pipeline for text-to-image generation using Anima." + return ( + "Denoise block for Anima image-to-image generation. " + "VAE encoding runs after AnimaTextInputStep so batch_size is available in state." + ) @property def outputs(self): - return [OutputParam.template("images")] + return [OutputParam.template("latents")] # auto_docstring -class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): +class AnimaImg2ImgDenoiseStep(SequentialPipelineBlocks): """ - Denoise block for Anima image-to-image generation. VAE encoding runs after AnimaTextInputStep so batch_size is - available in state. + Combined set-timesteps + denoise block for Anima image-to-image generation. Components: text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae @@ -213,6 +224,10 @@ class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Negative T5 prompt token ids generated by the text encoder step. negative_t5_attention_mask (`Tensor`, *optional*): Negative T5 prompt attention mask generated by the text encoder step. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. image (`Image | list`): @@ -227,10 +242,6 @@ class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Torch generator for deterministic generation. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - timesteps (`Tensor`): - Timesteps for the denoising process. - num_inference_steps (`int`): - Total number of inference steps from AnimaImg2ImgSetTimestepsStep. **denoiser_input_fields (`None`, *optional*): The conditional model inputs for the Anima denoiser. @@ -239,20 +250,12 @@ class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): Denoised latents. """ - block_classes = [ - AnimaTextConditioningStep, - AnimaTextInputStep, - AnimaImg2ImgVaeEncoderStep, - AnimaDenoiseStep, - ] - block_names = ["text_conditioning", "input", "vae_encoder", "denoise"] + block_classes = [AnimaImg2ImgSetTimestepsStep, AnimaImg2ImgCoreDenoiseStep] + block_names = ["set_timesteps", "denoise"] @property def description(self) -> str: - return ( - "Denoise block for Anima image-to-image generation. " - "VAE encoding runs after AnimaTextInputStep so batch_size is available in state." - ) + return "Combined set-timesteps and denoise block for Anima image-to-image generation." @property def outputs(self): @@ -260,18 +263,88 @@ def outputs(self): # auto_docstring -class AnimaImg2ImgAutoBlocks(SequentialPipelineBlocks): +class AnimaAutoDenoiseStep(AutoPipelineBlocks): + """ + Denoise step that selects between text-to-image and image-to-image denoising based on whether an input image is + provided. - `AnimaCoreDenoiseStep` (text2image) is used when no image is provided. - + `AnimaImg2ImgDenoiseStep` (img2img) is used when an image is provided. + + Components: + text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae + (`AutoencoderKLQwenImage`) scheduler (`FlowMatchEulerDiscreteScheduler`) image_processor + (`VaeImageProcessor`) guider (`ClassifierFreeGuidance`) + + Inputs: + qwen_prompt_embeds (`Tensor`): + Qwen prompt embeddings generated by the text encoder step. + qwen_attention_mask (`Tensor`): + Qwen prompt attention mask generated by the text encoder step. + t5_input_ids (`Tensor`): + T5 prompt token ids generated by the text encoder step. + t5_attention_mask (`Tensor`): + T5 prompt attention mask generated by the text encoder step. + negative_qwen_prompt_embeds (`Tensor`, *optional*): + Negative Qwen prompt embeddings generated by the text encoder step. + negative_qwen_attention_mask (`Tensor`, *optional*): + Negative Qwen prompt attention mask generated by the text encoder step. + negative_t5_input_ids (`Tensor`, *optional*): + Negative T5 prompt token ids generated by the text encoder step. + negative_t5_attention_mask (`Tensor`, *optional*): + Negative T5 prompt attention mask generated by the text encoder step. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + image (`Image | list`, *optional*): + Reference image(s). When provided, img2img denoising is used. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img transformation. + **denoiser_input_fields (`None`, *optional*): + The conditional model inputs for the Anima denoiser. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + + block_classes = [AnimaImg2ImgDenoiseStep, AnimaCoreDenoiseStep] + block_names = ["img2img", "text2image"] + block_trigger_inputs = ["image", None] + + @property + def description(self) -> str: + return ( + "Denoise step that selects between text-to-image and image-to-image denoising based on whether " + "an input image is provided." + " - `AnimaCoreDenoiseStep` (text2image) is used when no image is provided." + " - `AnimaImg2ImgDenoiseStep` (img2img) is used when an image is provided." + ) + + +# auto_docstring +class AnimaAutoBlocks(SequentialPipelineBlocks): """ - Auto Modular pipeline for image-to-image generation using Anima. + Auto Modular pipeline for text-to-image and image-to-image generation using Anima. Supported workflows: + - `text2image`: requires `prompt` - `img2img`: requires `prompt`, `image` Components: text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) t5_tokenizer (`T5Tokenizer`) guider - (`ClassifierFreeGuidance`) scheduler (`FlowMatchEulerDiscreteScheduler`) text_conditioner - (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + (`ClassifierFreeGuidance`) text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) + scheduler (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLQwenImage`) image_processor + (`VaeImageProcessor`) Inputs: prompt (`str`): @@ -280,24 +353,24 @@ class AnimaImg2ImgAutoBlocks(SequentialPipelineBlocks): The prompt or prompts not to guide the image generation. max_sequence_length (`int`, *optional*, defaults to 512): Maximum sequence length for prompt encoding. - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. - sigmas (`list`, *optional*): - Custom sigmas for the denoising process. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - image (`Image | list`): - Reference image(s) for denoising. Can be a single image or list of images. + image (`Image | list`, *optional*): + Reference image(s) for image-to-image generation. When provided, img2img workflow is used. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. - strength (`float`, *optional*, defaults to 0.9): - Strength for img2img/inpainting. - generator (`Generator`, *optional*): - Torch generator for deterministic generation. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + strength (`float`, *optional*, defaults to 0.9): + How much to transform the reference image (img2img only). **denoiser_input_fields (`None`, *optional*): The conditional model inputs for the Anima denoiser. output_type (`str`, *optional*, defaults to pil): @@ -310,16 +383,18 @@ class AnimaImg2ImgAutoBlocks(SequentialPipelineBlocks): block_classes = [ AnimaTextEncoderStep, - AnimaImg2ImgSetTimestepsStep, - AnimaImg2ImgCoreDenoiseStep, + AnimaAutoDenoiseStep, AnimaDecodeStep, ] - block_names = ["text_encoder", "set_timesteps", "denoise", "decode"] - _workflow_map = {"img2img": {"prompt": True, "image": True}} + block_names = ["text_encoder", "denoise", "decode"] + _workflow_map = { + "text2image": {"prompt": True}, + "img2img": {"image": True, "prompt": True}, + } @property def description(self) -> str: - return "Auto Modular pipeline for image-to-image generation using Anima." + return "Auto Modular pipeline for text-to-image and image-to-image generation using Anima." @property def outputs(self): diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index bc6a47ef9d83..0786186dff53 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -17,21 +17,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) -class AnimaImg2ImgAutoBlocks(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - class AnimaModularPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py b/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py index c25a79739f9f..2abe9dadae56 100644 --- a/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py +++ b/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py @@ -20,7 +20,7 @@ from transformers import Qwen2Tokenizer, Qwen3Config, Qwen3Model, T5TokenizerFast from diffusers import ( - AnimaImg2ImgAutoBlocks, + AnimaAutoBlocks, AnimaModularPipeline, AnimaTextConditioner, AutoencoderKLQwenImage, @@ -38,11 +38,11 @@ ANIMA_IMG2IMG_WORKFLOWS = { "img2img": [ ("text_encoder", "AnimaTextEncoderStep"), - ("set_timesteps", "AnimaImg2ImgSetTimestepsStep"), - ("denoise.text_conditioning", "AnimaTextConditioningStep"), - ("denoise.input", "AnimaTextInputStep"), - ("denoise.vae_encoder", "AnimaImg2ImgVaeEncoderStep"), - ("denoise.denoise", "AnimaDenoiseStep"), + ("denoise.set_timesteps", "AnimaImg2ImgSetTimestepsStep"), + ("denoise.denoise.text_conditioning", "AnimaTextConditioningStep"), + ("denoise.denoise.input", "AnimaTextInputStep"), + ("denoise.denoise.vae_encoder", "AnimaImg2ImgVaeEncoderStep"), + ("denoise.denoise.denoise", "AnimaDenoiseStep"), ("decode.decode", "AnimaVaeDecoderStep"), ("decode.postprocess", "AnimaProcessImagesOutputStep"), ], @@ -126,7 +126,7 @@ def get_dummy_image(height=32, width=32): class TestAnimaImg2ImgModularPipelineFast(ModularPipelineTesterMixin): pipeline_class = AnimaModularPipeline - pipeline_blocks_class = AnimaImg2ImgAutoBlocks + pipeline_blocks_class = AnimaAutoBlocks pretrained_model_name_or_path = "hf-internal-testing/tiny-anima-modular-pipe" params = frozenset(["prompt", "image", "strength", "height", "width", "negative_prompt"]) batch_params = frozenset(["prompt", "negative_prompt"])