From 585defff47a1812e5ee25fe36b9d3705c66a7aca Mon Sep 17 00:00:00 2001
From: PreethamNoelP <preethamnoel.05@gmail.com>
Date: Fri, 12 Jun 2026 20:20:25 +0530
Subject: [PATCH 1/2] [Anima] Add img2img pipeline blocks

---
 docs/source/en/api/pipelines/anima.md         |   4 +
 src/diffusers/__init__.py                     |   2 +
 src/diffusers/modular_pipelines/__init__.py   |   3 +-
 .../modular_pipelines/anima/__init__.py       |   4 +-
 .../modular_pipelines/anima/before_denoise.py |  87 +++++++
 .../modular_pipelines/anima/encoders.py       | 239 ++++++++++++++++++
 .../anima/modular_blocks_anima.py             | 155 +++++++++++-
 .../dummy_torch_and_transformers_objects.py   |  15 ++
 .../test_modular_pipeline_anima_img2img.py    | 193 ++++++++++++++
 9 files changed, 693 insertions(+), 9 deletions(-)
 create mode 100644 tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py

diff --git a/docs/source/en/api/pipelines/anima.md b/docs/source/en/api/pipelines/anima.md
index b66eeb2a29b7..22eff138a027 100644
--- a/docs/source/en/api/pipelines/anima.md
+++ b/docs/source/en/api/pipelines/anima.md
@@ -35,6 +35,10 @@ image = pipe(prompt="masterpiece, best quality, 1girl, solo, city lights").image
 
 [[autodoc]] AnimaAutoBlocks
 
+## AnimaImg2ImgAutoBlocks
+
+[[autodoc]] AnimaImg2ImgAutoBlocks
+
 ## AnimaTextConditioner
 
 [[autodoc]] AnimaTextConditioner
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 4a2c3bca5bcc..46f511477e1a 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -455,6 +455,7 @@
     _import_structure["modular_pipelines"].extend(
         [
             "AnimaAutoBlocks",
+            "AnimaImg2ImgAutoBlocks",
             "AnimaModularPipeline",
             "ErnieImageAutoBlocks",
             "ErnieImageModularPipeline",
@@ -1280,6 +1281,7 @@
     else:
         from .modular_pipelines import (
             AnimaAutoBlocks,
+            AnimaImg2ImgAutoBlocks,
             AnimaModularPipeline,
             ErnieImageAutoBlocks,
             ErnieImageModularPipeline,
diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py
index 4b36994aef07..335b49b451fc 100644
--- a/src/diffusers/modular_pipelines/__init__.py
+++ b/src/diffusers/modular_pipelines/__init__.py
@@ -95,6 +95,7 @@
     ]
     _import_structure["anima"] = [
         "AnimaAutoBlocks",
+        "AnimaImg2ImgAutoBlocks",
         "AnimaModularPipeline",
     ]
     _import_structure["ernie_image"] = [
@@ -122,7 +123,7 @@
     except OptionalDependencyNotAvailable:
         from ..utils.dummy_pt_objects import *  # noqa F403
     else:
-        from .anima import AnimaAutoBlocks, AnimaModularPipeline
+        from .anima import AnimaAutoBlocks, AnimaImg2ImgAutoBlocks, AnimaModularPipeline
         from .components_manager import ComponentsManager
         from .ernie_image import ErnieImageAutoBlocks, ErnieImageModularPipeline
         from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline
diff --git a/src/diffusers/modular_pipelines/anima/__init__.py b/src/diffusers/modular_pipelines/anima/__init__.py
index 4772d906e03b..1cbb2d741bfb 100644
--- a/src/diffusers/modular_pipelines/anima/__init__.py
+++ b/src/diffusers/modular_pipelines/anima/__init__.py
@@ -21,7 +21,7 @@
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
-    _import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks"]
+    _import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks", "AnimaImg2ImgAutoBlocks"]
     _import_structure["modular_pipeline"] = ["AnimaModularPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -31,7 +31,7 @@
     except OptionalDependencyNotAvailable:
         from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
     else:
-        from .modular_blocks_anima import AnimaAutoBlocks
+        from .modular_blocks_anima import AnimaAutoBlocks, AnimaImg2ImgAutoBlocks
         from .modular_pipeline import AnimaModularPipeline
 else:
     import sys
diff --git a/src/diffusers/modular_pipelines/anima/before_denoise.py b/src/diffusers/modular_pipelines/anima/before_denoise.py
index 25f38cd0cb65..9147047f5f9a 100644
--- a/src/diffusers/modular_pipelines/anima/before_denoise.py
+++ b/src/diffusers/modular_pipelines/anima/before_denoise.py
@@ -370,6 +370,19 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi
         return components, state
 
 
+# Copied from diffusers.modular_pipelines.qwenimage.before_denoise.get_timesteps
+def get_timesteps(scheduler, num_inference_steps, strength):
+    # get the original timestep using init_timestep
+    init_timestep = min(num_inference_steps * strength, num_inference_steps)
+
+    t_start = int(max(num_inference_steps - init_timestep, 0))
+    timesteps = scheduler.timesteps[t_start * scheduler.order :]
+    if hasattr(scheduler, "set_begin_index"):
+        scheduler.set_begin_index(t_start * scheduler.order)
+
+    return timesteps, num_inference_steps - t_start
+
+
 class AnimaSetTimestepsStep(ModularPipelineBlocks):
     model_name = "anima"
 
@@ -414,3 +427,77 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi
 
         self.set_block_state(state, block_state)
         return components, state
+
+
+# Adapted from AnimaSetTimestepsStep; not marked "Copied from" because the body intentionally differs.
+class AnimaImg2ImgSetTimestepsStep(ModularPipelineBlocks):
+    """Set the scheduler timesteps for Anima image-to-image inference.
+
+    This step computes the full timestep schedule and stores it in state. It does **not** call
+    ``scheduler.set_begin_index`` — that is handled downstream by ``AnimaImg2ImgVaeEncoderStep``,
+    which re-slices the schedule based on ``strength`` via ``get_timesteps``.
+
+    Components:
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+
+    Inputs:
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`list`, *optional*):
+            Custom sigmas for the denoising process.
+
+    Outputs:
+        timesteps (`Tensor`):
+            Full timestep schedule for the denoising loop.
+        num_inference_steps (`int`):
+            Number of denoising steps (may be updated by ``retrieve_timesteps``).
+    """
+
+    model_name = "anima"
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+
+    @property
+    def description(self) -> str:
+        return "Set the scheduler timesteps for Anima image-to-image inference."
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("num_inference_steps"),
+            InputParam.template("sigmas"),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                "timesteps",
+                type_hint=torch.Tensor,
+                description="Full timestep schedule for the denoising loop.",
+            ),
+            OutputParam("num_inference_steps", type_hint=int, description="Number of denoising steps."),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+
+        sigmas = (
+            np.linspace(1.0, 1 / block_state.num_inference_steps, block_state.num_inference_steps)
+            if block_state.sigmas is None
+            else block_state.sigmas
+        )
+        block_state.timesteps, block_state.num_inference_steps = retrieve_timesteps(
+            components.scheduler,
+            device=device,
+            sigmas=sigmas,
+        )
+        # set_begin_index is omitted: get_timesteps() in AnimaImg2ImgVaeEncoderStep
+        # slices the schedule and sets the correct offset based on strength.
+
+        self.set_block_state(state, block_state)
+        return components, state
diff --git a/src/diffusers/modular_pipelines/anima/encoders.py b/src/diffusers/modular_pipelines/anima/encoders.py
index bdeecd28737b..5e59c9e57cb2 100644
--- a/src/diffusers/modular_pipelines/anima/encoders.py
+++ b/src/diffusers/modular_pipelines/anima/encoders.py
@@ -17,8 +17,13 @@
 
 from ...configuration_utils import FrozenDict
 from ...guiders import ClassifierFreeGuidance
+from ...image_processor import VaeImageProcessor
+from ...models import AutoencoderKLQwenImage
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils.torch_utils import randn_tensor
 from ..modular_pipeline import ModularPipelineBlocks, PipelineState
 from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .before_denoise import get_timesteps
 from .modular_pipeline import AnimaModularPipeline
 
 
@@ -251,3 +256,237 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi
 
         self.set_block_state(state, block_state)
         return components, state
+
+
+# Copied from diffusers.modular_pipelines.qwenimage.encoders.retrieve_latents
+def retrieve_latents(
+    encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample"
+):
+    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
+        return encoder_output.latent_dist.sample(generator)
+    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
+        return encoder_output.latent_dist.mode()
+    elif hasattr(encoder_output, "latents"):
+        return encoder_output.latents
+    else:
+        raise AttributeError("Could not access latents of provided encoder_output")
+
+
+# Copied from diffusers.modular_pipelines.qwenimage.encoders.encode_vae_image
+def encode_vae_image(
+    image: torch.Tensor,
+    vae: AutoencoderKLQwenImage,
+    generator: torch.Generator,
+    device: torch.device,
+    dtype: torch.dtype,
+    latent_channels: int = 16,
+    sample_mode: str = "argmax",
+):
+    if not isinstance(image, torch.Tensor):
+        raise ValueError(f"Expected image to be a tensor, got {type(image)}.")
+
+    # preprocessed image should be a 4D tensor: batch_size, num_channels, height, width
+    if image.dim() == 4:
+        image = image.unsqueeze(2)
+    elif image.dim() != 5:
+        raise ValueError(f"Expected image dims 4 or 5, got {image.dim()}.")
+
+    image = image.to(device=device, dtype=dtype)
+
+    if isinstance(generator, list):
+        image_latents = [
+            retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i], sample_mode=sample_mode)
+            for i in range(image.shape[0])
+        ]
+        image_latents = torch.cat(image_latents, dim=0)
+    else:
+        image_latents = retrieve_latents(vae.encode(image), generator=generator, sample_mode=sample_mode)
+    latents_mean = (
+        torch.tensor(vae.config.latents_mean)
+        .view(1, latent_channels, 1, 1, 1)
+        .to(image_latents.device, image_latents.dtype)
+    )
+    latents_std = (
+        torch.tensor(vae.config.latents_std)
+        .view(1, latent_channels, 1, 1, 1)
+        .to(image_latents.device, image_latents.dtype)
+    )
+    image_latents = (image_latents - latents_mean) / latents_std
+
+    return image_latents
+
+
+class AnimaImg2ImgVaeEncoderStep(ModularPipelineBlocks):
+    """VAE Encoder step for Anima image-to-image generation.
+
+    Preprocesses the input image, encodes it with the VAE, generates noise, slices the
+    timestep schedule based on ``strength``, and adds noise to the image latents using
+    ``scheduler.scale_noise()``.
+
+    Components:
+        vae (`AutoencoderKLQwenImage`)
+        scheduler (`FlowMatchEulerDiscreteScheduler`)
+        image_processor (`VaeImageProcessor`)
+
+    Inputs:
+        image (`PIL.Image.Image`):
+            Input image to use as starting point.
+        height (`int`, *optional*):
+            Height of the output image. Defaults to pipeline default.
+        width (`int`, *optional*):
+            Width of the output image. Defaults to pipeline default.
+        strength (`float`, *optional*, defaults to 0.9):
+            How much to transform the reference image. Lower values stay closer to the input image and ``1``
+            fully denoises from random noise; ``0`` is not usable because it leaves no timesteps to run.
+        num_images_per_prompt (`int`, *optional*, defaults to 1):
+            Number of images to generate per prompt.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        latents (`Tensor`, *optional*):
+            Pre-computed noise tensor. Generated randomly if ``None``.
+        timesteps (`Tensor`):
+            Full timestep schedule produced by ``AnimaImg2ImgSetTimestepsStep``.
+        num_inference_steps (`int`):
+            Total number of inference steps from ``AnimaImg2ImgSetTimestepsStep``.
+
+    Outputs:
+        latents (`Tensor`):
+            Noisy image latents to use as the starting point for denoising.
+        timesteps (`Tensor`):
+            Timestep schedule sliced by ``strength``.
+        num_inference_steps (`int`):
+            Number of denoising steps after strength-based slicing.
+        padding_mask (`Tensor`):
+            Cosmos padding mask for the image latents.
+        height (`int`):
+            Output image height (updated to pipeline default if not provided).
+        width (`int`):
+            Output image width (updated to pipeline default if not provided).
+    """
+
+    model_name = "anima"
+
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("vae", AutoencoderKLQwenImage),
+            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
+            ComponentSpec(
+                "image_processor",
+                VaeImageProcessor,
+                config=FrozenDict({"vae_scale_factor": 8}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def description(self) -> str:
+        return (
+            "VAE Encoder step for Anima image-to-image generation. Encodes the input image, "
+            "slices the timestep schedule by strength, and adds noise via scheduler.scale_noise()."
+        )
+
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("image"),
+            InputParam.template("height"),
+            InputParam.template("width"),
+            InputParam.template("strength"),
+            InputParam.template("num_images_per_prompt"),
+            InputParam.template("generator"),
+            InputParam.template("latents"),
+            InputParam.template("timesteps", required=True),
+            InputParam(
+                "num_inference_steps",
+                required=True,
+                type_hint=int,
+                description="Total number of inference steps from AnimaImg2ImgSetTimestepsStep.",
+            ),
+            InputParam(
+                "batch_size",
+                required=True,
+                type_hint=int,
+                description="Number of prompts, provided by AnimaTextInputStep.",
+            ),
+            InputParam("dtype", type_hint=torch.dtype, description="Dtype used by the Anima denoiser."),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                "latents", type_hint=torch.Tensor, description="Noisy image latents for the denoising process."
+            ),
+            OutputParam("timesteps", type_hint=torch.Tensor, description="Timestep schedule sliced by strength."),
+            OutputParam(
+                "num_inference_steps", type_hint=int, description="Number of denoising steps after strength slicing."
+            ),
+            OutputParam("padding_mask", type_hint=torch.Tensor, description="Cosmos padding mask for image latents."),
+            OutputParam("height", type_hint=int, description="Image height used for generation."),
+            OutputParam("width", type_hint=int, description="Image width used for generation."),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> PipelineState:
+        """Encode ``image``, slice the schedule by ``strength`` and noise the latents to the first kept timestep.
+
+        Raises:
+            ValueError: If ``strength`` leaves no timesteps, or the image batch does not evenly divide the
+                total batch (``batch_size * num_images_per_prompt``).
+        """
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+        # dtype is provided by AnimaTextInputStep; fall back to vae dtype if not yet in state
+        dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
+        block_state.height = block_state.height or components.default_height
+        block_state.width = block_state.width or components.default_width
+
+        block_state.timesteps, block_state.num_inference_steps = get_timesteps(
+            components.scheduler, block_state.num_inference_steps, block_state.strength
+        )
+        # With strength <= 0 the slice is empty, so timesteps[:1] below has no timestep to noise the latents to.
+        if len(block_state.timesteps) == 0:
+            raise ValueError(f"`strength`={block_state.strength} leaves no denoising steps; use a larger value.")
+
+        # Preprocess PIL image(s) to tensor, then encode to image latents using the VAE dtype
+        processed_image = components.image_processor.preprocess(
+            image=block_state.image, height=block_state.height, width=block_state.width
+        )
+        image_latents = encode_vae_image(
+            image=processed_image,
+            vae=components.vae,
+            generator=block_state.generator,
+            device=device,
+            dtype=components.vae.dtype,
+            latent_channels=components.num_channels_latents,
+        )
+
+        # Expand image_latents to total batch (prompt batch x images per prompt); floor division on an uneven
+        # split would leave fewer latents than the `total_batch` timesteps built below, so reject it.
+        total_batch = block_state.batch_size * block_state.num_images_per_prompt
+        num_image_latents = image_latents.shape[0]
+        if num_image_latents < total_batch:
+            if total_batch % num_image_latents != 0:
+                raise ValueError(f"Cannot tile {num_image_latents} image latent(s) to a batch of {total_batch}.")
+            image_latents = image_latents.repeat(total_batch // num_image_latents, 1, 1, 1, 1)
+
+        # Generate initial noise (or use pre-provided latents as noise)
+        if block_state.latents is None:
+            noise = randn_tensor(
+                image_latents.shape, generator=block_state.generator, device=device, dtype=torch.float32
+            )
+        else:
+            noise = block_state.latents.to(device=device, dtype=torch.float32)
+
+        # Add noise to image latents at the appropriate noise level for this strength
+        latent_timestep = block_state.timesteps[:1].repeat(total_batch)
+        block_state.latents = components.scheduler.scale_noise(
+            image_latents.to(dtype=torch.float32), latent_timestep, noise
+        )
+        block_state.padding_mask = block_state.latents.new_zeros(
+            1, 1, block_state.height, block_state.width, dtype=dtype
+        )
+
+        self.set_block_state(state, block_state)
+        return components, state
diff --git a/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py b/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py
index fc71b87f62d8..69bc722a630c 100644
--- a/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py
+++ b/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py
@@ -15,6 +15,7 @@
 from ..modular_pipeline import SequentialPipelineBlocks
 from ..modular_pipeline_utils import OutputParam
 from .before_denoise import (
+    AnimaImg2ImgSetTimestepsStep,
     AnimaPrepareLatentsStep,
     AnimaSetTimestepsStep,
     AnimaTextConditioningStep,
@@ -22,7 +23,7 @@
 )
 from .decoders import AnimaProcessImagesOutputStep, AnimaVaeDecoderStep
 from .denoise import AnimaDenoiseStep
-from .encoders import AnimaTextEncoderStep
+from .encoders import AnimaImg2ImgVaeEncoderStep, AnimaTextEncoderStep
 
 
 # auto_docstring
@@ -35,8 +36,6 @@ class AnimaCoreDenoiseStep(SequentialPipelineBlocks):
           (`FlowMatchEulerDiscreteScheduler`) guider (`ClassifierFreeGuidance`)
 
       Inputs:
-          num_images_per_prompt (`int`, *optional*, defaults to 1):
-              The number of images to generate per prompt.
           qwen_prompt_embeds (`Tensor`):
               Qwen prompt embeddings generated by the text encoder step.
           qwen_attention_mask (`Tensor`):
@@ -53,6 +52,8 @@ class AnimaCoreDenoiseStep(SequentialPipelineBlocks):
               Negative T5 prompt token ids generated by the text encoder step.
           negative_t5_attention_mask (`Tensor`, *optional*):
               Negative T5 prompt attention mask generated by the text encoder step.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
@@ -131,9 +132,10 @@ class AnimaAutoBlocks(SequentialPipelineBlocks):
         - `text2image`: requires `prompt`
 
       Components:
-          text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) t5_tokenizer (`T5TokenizerFast`) text_conditioner
-          (`AnimaTextConditioner`) guider (`ClassifierFreeGuidance`) transformer (`CosmosTransformer3DModel`) scheduler
-          (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
+          text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) t5_tokenizer (`T5Tokenizer`) guider
+          (`ClassifierFreeGuidance`) text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`)
+          scheduler (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLQwenImage`) image_processor
+          (`VaeImageProcessor`)
 
       Inputs:
           prompt (`str`):
@@ -181,3 +183,144 @@ def description(self) -> str:
     @property
     def outputs(self):
         return [OutputParam.template("images")]
+
+
+# auto_docstring
+class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Denoise block for Anima image-to-image generation. VAE encoding runs after AnimaTextInputStep so batch_size is
+    available in state.
+
+      Components:
+          text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae
+          (`AutoencoderKLQwenImage`) scheduler (`FlowMatchEulerDiscreteScheduler`) image_processor
+          (`VaeImageProcessor`) guider (`ClassifierFreeGuidance`)
+
+      Inputs:
+          qwen_prompt_embeds (`Tensor`):
+              Qwen prompt embeddings generated by the text encoder step.
+          qwen_attention_mask (`Tensor`):
+              Qwen prompt attention mask generated by the text encoder step.
+          t5_input_ids (`Tensor`):
+              T5 prompt token ids generated by the text encoder step.
+          t5_attention_mask (`Tensor`):
+              T5 prompt attention mask generated by the text encoder step.
+          negative_qwen_prompt_embeds (`Tensor`, *optional*):
+              Negative Qwen prompt embeddings generated by the text encoder step.
+          negative_qwen_attention_mask (`Tensor`, *optional*):
+              Negative Qwen prompt attention mask generated by the text encoder step.
+          negative_t5_input_ids (`Tensor`, *optional*):
+              Negative T5 prompt token ids generated by the text encoder step.
+          negative_t5_attention_mask (`Tensor`, *optional*):
+              Negative T5 prompt attention mask generated by the text encoder step.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          image (`Image | list`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          timesteps (`Tensor`):
+              Timesteps for the denoising process.
+          num_inference_steps (`int`):
+              Total number of inference steps from AnimaImg2ImgSetTimestepsStep.
+          **denoiser_input_fields (`None`, *optional*):
+              The conditional model inputs for the Anima denoiser.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
+    block_classes = [
+        AnimaTextConditioningStep,
+        AnimaTextInputStep,
+        AnimaImg2ImgVaeEncoderStep,
+        AnimaDenoiseStep,
+    ]
+    block_names = ["text_conditioning", "input", "vae_encoder", "denoise"]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise block for Anima image-to-image generation. "
+            "VAE encoding runs after AnimaTextInputStep so batch_size is available in state."
+        )
+
+    @property
+    def outputs(self):
+        return [OutputParam.template("latents")]
+
+
+# auto_docstring
+class AnimaImg2ImgAutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for image-to-image generation using Anima.
+
+      Supported workflows:
+        - `img2img`: requires `prompt`, `image`
+
+      Components:
+          text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) t5_tokenizer (`T5Tokenizer`) guider
+          (`ClassifierFreeGuidance`) scheduler (`FlowMatchEulerDiscreteScheduler`) text_conditioner
+          (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae (`AutoencoderKLQwenImage`)
+          image_processor (`VaeImageProcessor`)
+
+      Inputs:
+          prompt (`str`):
+              The prompt or prompts to guide image generation.
+          negative_prompt (`str`, *optional*):
+              The prompt or prompts not to guide the image generation.
+          max_sequence_length (`int`, *optional*, defaults to 512):
+              Maximum sequence length for prompt encoding.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`list`, *optional*):
+              Custom sigmas for the denoising process.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          image (`Image | list`):
+              Reference image(s) for denoising. Can be a single image or list of images.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          **denoiser_input_fields (`None`, *optional*):
+              The conditional model inputs for the Anima denoiser.
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+
+      Outputs:
+          images (`list`):
+              Generated images.
+    """
+
+    block_classes = [
+        AnimaTextEncoderStep,
+        AnimaImg2ImgSetTimestepsStep,
+        AnimaImg2ImgCoreDenoiseStep,
+        AnimaDecodeStep,
+    ]
+    block_names = ["text_encoder", "set_timesteps", "denoise", "decode"]
+    _workflow_map = {"img2img": {"prompt": True, "image": True}}
+
+    @property
+    def description(self) -> str:
+        return "Auto Modular pipeline for image-to-image generation using Anima."
+
+    @property
+    def outputs(self):
+        return [OutputParam.template("images")]
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index fa977ee07bbe..1284f58a456c 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -17,6 +17,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class AnimaImg2ImgAutoBlocks(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class AnimaModularPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
diff --git a/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py b/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py
new file mode 100644
index 000000000000..c25a79739f9f
--- /dev/null
+++ b/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py
@@ -0,0 +1,193 @@
+# Copyright 2026 The HuggingFace Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import PIL.Image
+import torch
+from transformers import Qwen2Tokenizer, Qwen3Config, Qwen3Model, T5TokenizerFast
+
+from diffusers import (
+    AnimaImg2ImgAutoBlocks,
+    AnimaModularPipeline,
+    AnimaTextConditioner,
+    AutoencoderKLQwenImage,
+    CosmosTransformer3DModel,
+    FlowMatchEulerDiscreteScheduler,
+)
+
+from ...testing_utils import enable_full_determinism
+from ..test_modular_pipelines_common import ModularPipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+ANIMA_IMG2IMG_WORKFLOWS = {
+    "img2img": [
+        ("text_encoder", "AnimaTextEncoderStep"),
+        ("set_timesteps", "AnimaImg2ImgSetTimestepsStep"),
+        ("denoise.text_conditioning", "AnimaTextConditioningStep"),
+        ("denoise.input", "AnimaTextInputStep"),
+        ("denoise.vae_encoder", "AnimaImg2ImgVaeEncoderStep"),
+        ("denoise.denoise", "AnimaDenoiseStep"),
+        ("decode.decode", "AnimaVaeDecoderStep"),
+        ("decode.postprocess", "AnimaProcessImagesOutputStep"),
+    ],
+}
+
+
+def get_dummy_components():
+    torch.manual_seed(0)
+    transformer = CosmosTransformer3DModel(
+        in_channels=4,
+        out_channels=4,
+        num_attention_heads=2,
+        attention_head_dim=16,
+        num_layers=2,
+        mlp_ratio=2,
+        text_embed_dim=16,
+        adaln_lora_dim=4,
+        max_size=(4, 32, 32),
+        patch_size=(1, 2, 2),
+        rope_scale=(1.0, 4.0, 4.0),
+        concat_padding_mask=True,
+        extra_pos_embed_type=None,
+    )
+
+    torch.manual_seed(0)
+    vae = AutoencoderKLQwenImage(
+        base_dim=24,
+        z_dim=4,
+        dim_mult=[1, 2, 4],
+        num_res_blocks=1,
+        temperal_downsample=[False, True],
+        latents_mean=[0.0] * 4,
+        latents_std=[1.0] * 4,
+    )
+
+    torch.manual_seed(0)
+    text_conditioner = AnimaTextConditioner(
+        source_dim=16,
+        target_dim=16,
+        model_dim=16,
+        num_layers=2,
+        num_attention_heads=4,
+        target_vocab_size=32128,
+        min_sequence_length=16,
+    )
+
+    torch.manual_seed(0)
+    text_encoder_config = Qwen3Config(
+        vocab_size=152064,
+        hidden_size=16,
+        intermediate_size=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        max_position_embeddings=128,
+        rms_norm_eps=1e-6,
+        rope_theta=1000000.0,
+        head_dim=4,
+        attention_bias=False,
+    )
+    text_encoder = Qwen3Model(text_encoder_config).eval()
+    tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")
+    t5_tokenizer = T5TokenizerFast.from_pretrained("hf-internal-testing/tiny-random-t5")
+    scheduler = FlowMatchEulerDiscreteScheduler(shift=3.0)
+
+    return {
+        "transformer": transformer,
+        "vae": vae,
+        "scheduler": scheduler,
+        "text_encoder": text_encoder,
+        "tokenizer": tokenizer,
+        "t5_tokenizer": t5_tokenizer,
+        "text_conditioner": text_conditioner,
+    }
+
+
+def get_dummy_image(height=32, width=32, seed=0):  # seeded RNG keeps the test image reproducible across runs
+    image_array = np.random.RandomState(seed).randint(0, 256, (height, width, 3), dtype=np.uint8)
+    return PIL.Image.fromarray(image_array)
+
+
+class TestAnimaImg2ImgModularPipelineFast(ModularPipelineTesterMixin):
+    pipeline_class = AnimaModularPipeline
+    pipeline_blocks_class = AnimaImg2ImgAutoBlocks
+    pretrained_model_name_or_path = "hf-internal-testing/tiny-anima-modular-pipe"
+    params = frozenset(["prompt", "image", "strength", "height", "width", "negative_prompt"])
+    batch_params = frozenset(["prompt", "negative_prompt"])
+    expected_workflow_blocks = ANIMA_IMG2IMG_WORKFLOWS
+
+    def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
+        pipe = self.pipeline_blocks_class().init_pipeline(components_manager=components_manager)
+        pipe.update_components(**get_dummy_components())
+        pipe.to(dtype=torch_dtype)
+        pipe.set_progress_bar_config(disable=None)
+        return pipe
+
+    def get_dummy_inputs(self, seed=0):
+        generator = torch.Generator(device="cpu").manual_seed(seed)
+        return {
+            "prompt": "dance monkey",
+            "negative_prompt": "bad quality",
+            "image": get_dummy_image(32, 32),
+            "strength": 0.8,
+            "generator": generator,
+            "num_inference_steps": 2,
+            "height": 32,
+            "width": 32,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+        }
+
+    def test_inference_basic(self):
+        pipe = self.get_pipeline()
+        inputs = self.get_dummy_inputs()
+        output = pipe(**inputs).images
+
+        assert output.shape == (1, 3, 32, 32)
+        assert not torch.isnan(output).any()
+
+    def test_inference_strength_low(self):
+        pipe = self.get_pipeline()
+        inputs = self.get_dummy_inputs()
+        inputs["strength"] = 0.3
+        output = pipe(**inputs).images
+
+        assert output.shape == (1, 3, 32, 32)
+        assert not torch.isnan(output).any()
+
+    def test_inference_strength_high(self):
+        pipe = self.get_pipeline()
+        inputs = self.get_dummy_inputs()
+        inputs["strength"] = 0.95
+        output = pipe(**inputs).images
+
+        assert output.shape == (1, 3, 32, 32)
+        assert not torch.isnan(output).any()
+
+    def test_inference_empty_negative_prompt(self):
+        pipe = self.get_pipeline()
+        inputs = self.get_dummy_inputs()
+        inputs["negative_prompt"] = ""
+        output = pipe(**inputs).images
+
+        assert output.shape == (1, 3, 32, 32)
+        assert not torch.isnan(output).any()
+
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=5e-4)

From def773308fef93b7f9bff18536045411d89bdc6b Mon Sep 17 00:00:00 2001
From: PreethamNoelP <preethamnoel.05@gmail.com>
Date: Fri, 19 Jun 2026 09:02:31 +0530
Subject: [PATCH 2/2] [Anima] Address review feedback: remove # Copied from,
 fold img2img into AnimaAutoBlocks

- Remove incorrect `# Copied from` comment above AnimaImg2ImgSetTimestepsStep
- Delete AnimaImg2ImgAutoBlocks; introduce AnimaAutoDenoiseStep (AutoPipelineBlocks)
  and AnimaImg2ImgDenoiseStep (SequentialPipelineBlocks) so img2img lives as a
  workflow inside AnimaAutoBlocks, following the z_image pattern
- Update __init__.py, dummy_objects, and docs to remove AnimaImg2ImgAutoBlocks
- Update img2img test to use AnimaAutoBlocks with updated workflow block paths
---
 docs/source/en/api/pipelines/anima.md         |   4 -
 src/diffusers/__init__.py                     |   2 -
 src/diffusers/modular_pipelines/__init__.py   |   3 +-
 .../modular_pipelines/anima/__init__.py       |   4 +-
 .../modular_pipelines/anima/before_denoise.py |   1 -
 .../anima/modular_blocks_anima.py             | 217 ++++++++++++------
 .../dummy_torch_and_transformers_objects.py   |  15 --
 .../test_modular_pipeline_anima_img2img.py    |  14 +-
 8 files changed, 156 insertions(+), 104 deletions(-)

diff --git a/docs/source/en/api/pipelines/anima.md b/docs/source/en/api/pipelines/anima.md
index 22eff138a027..b66eeb2a29b7 100644
--- a/docs/source/en/api/pipelines/anima.md
+++ b/docs/source/en/api/pipelines/anima.md
@@ -35,10 +35,6 @@ image = pipe(prompt="masterpiece, best quality, 1girl, solo, city lights").image
 
 [[autodoc]] AnimaAutoBlocks
 
-## AnimaImg2ImgAutoBlocks
-
-[[autodoc]] AnimaImg2ImgAutoBlocks
-
 ## AnimaTextConditioner
 
 [[autodoc]] AnimaTextConditioner
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index bcb28cd2507d..da77fa67df52 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -470,7 +470,6 @@
     _import_structure["modular_pipelines"].extend(
         [
             "AnimaAutoBlocks",
-            "AnimaImg2ImgAutoBlocks",
             "AnimaModularPipeline",
             "ErnieImageAutoBlocks",
             "ErnieImageModularPipeline",
@@ -1309,7 +1308,6 @@
     else:
         from .modular_pipelines import (
             AnimaAutoBlocks,
-            AnimaImg2ImgAutoBlocks,
             AnimaModularPipeline,
             ErnieImageAutoBlocks,
             ErnieImageModularPipeline,
diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py
index 335b49b451fc..4b36994aef07 100644
--- a/src/diffusers/modular_pipelines/__init__.py
+++ b/src/diffusers/modular_pipelines/__init__.py
@@ -95,7 +95,6 @@
     ]
     _import_structure["anima"] = [
         "AnimaAutoBlocks",
-        "AnimaImg2ImgAutoBlocks",
         "AnimaModularPipeline",
     ]
     _import_structure["ernie_image"] = [
@@ -123,7 +122,7 @@
     except OptionalDependencyNotAvailable:
         from ..utils.dummy_pt_objects import *  # noqa F403
     else:
-        from .anima import AnimaAutoBlocks, AnimaImg2ImgAutoBlocks, AnimaModularPipeline
+        from .anima import AnimaAutoBlocks, AnimaModularPipeline
         from .components_manager import ComponentsManager
         from .ernie_image import ErnieImageAutoBlocks, ErnieImageModularPipeline
         from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline
diff --git a/src/diffusers/modular_pipelines/anima/__init__.py b/src/diffusers/modular_pipelines/anima/__init__.py
index 1cbb2d741bfb..4772d906e03b 100644
--- a/src/diffusers/modular_pipelines/anima/__init__.py
+++ b/src/diffusers/modular_pipelines/anima/__init__.py
@@ -21,7 +21,7 @@
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
-    _import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks", "AnimaImg2ImgAutoBlocks"]
+    _import_structure["modular_blocks_anima"] = ["AnimaAutoBlocks"]
     _import_structure["modular_pipeline"] = ["AnimaModularPipeline"]
 
 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -31,7 +31,7 @@
     except OptionalDependencyNotAvailable:
         from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
     else:
-        from .modular_blocks_anima import AnimaAutoBlocks, AnimaImg2ImgAutoBlocks
+        from .modular_blocks_anima import AnimaAutoBlocks
         from .modular_pipeline import AnimaModularPipeline
 else:
     import sys
diff --git a/src/diffusers/modular_pipelines/anima/before_denoise.py b/src/diffusers/modular_pipelines/anima/before_denoise.py
index 9147047f5f9a..1b25688054e7 100644
--- a/src/diffusers/modular_pipelines/anima/before_denoise.py
+++ b/src/diffusers/modular_pipelines/anima/before_denoise.py
@@ -429,7 +429,6 @@ def __call__(self, components: AnimaModularPipeline, state: PipelineState) -> Pi
         return components, state
 
 
-# Copied from diffusers.modular_pipelines.anima.before_denoise.AnimaSetTimestepsStep
 class AnimaImg2ImgSetTimestepsStep(ModularPipelineBlocks):
     """Set the scheduler timesteps for Anima image-to-image inference.
 
diff --git a/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py b/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py
index 69bc722a630c..f5aa5e6253a8 100644
--- a/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py
+++ b/src/diffusers/modular_pipelines/anima/modular_blocks_anima.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ..modular_pipeline import SequentialPipelineBlocks
+from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
 from ..modular_pipeline_utils import OutputParam
 from .before_denoise import (
     AnimaImg2ImgSetTimestepsStep,
@@ -124,72 +124,83 @@ def outputs(self):
 
 
 # auto_docstring
-class AnimaAutoBlocks(SequentialPipelineBlocks):
+class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    Auto Modular pipeline for text-to-image generation using Anima.
-
-      Supported workflows:
-        - `text2image`: requires `prompt`
+    Denoise block for Anima image-to-image generation. VAE encoding runs after AnimaTextInputStep so batch_size is
+    available in state.
 
       Components:
-          text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) t5_tokenizer (`T5Tokenizer`) guider
-          (`ClassifierFreeGuidance`) text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`)
-          scheduler (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLQwenImage`) image_processor
-          (`VaeImageProcessor`)
+          text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae
+          (`AutoencoderKLQwenImage`) scheduler (`FlowMatchEulerDiscreteScheduler`) image_processor
+          (`VaeImageProcessor`) guider (`ClassifierFreeGuidance`)
 
       Inputs:
-          prompt (`str`):
-              The prompt or prompts to guide image generation.
-          negative_prompt (`str`, *optional*):
-              The prompt or prompts not to guide the image generation.
-          max_sequence_length (`int`, *optional*, defaults to 512):
-              Maximum sequence length for prompt encoding.
+          qwen_prompt_embeds (`Tensor`):
+              Qwen prompt embeddings generated by the text encoder step.
+          qwen_attention_mask (`Tensor`):
+              Qwen prompt attention mask generated by the text encoder step.
+          t5_input_ids (`Tensor`):
+              T5 prompt token ids generated by the text encoder step.
+          t5_attention_mask (`Tensor`):
+              T5 prompt attention mask generated by the text encoder step.
+          negative_qwen_prompt_embeds (`Tensor`, *optional*):
+              Negative Qwen prompt embeddings generated by the text encoder step.
+          negative_qwen_attention_mask (`Tensor`, *optional*):
+              Negative Qwen prompt attention mask generated by the text encoder step.
+          negative_t5_input_ids (`Tensor`, *optional*):
+              Negative T5 prompt token ids generated by the text encoder step.
+          negative_t5_attention_mask (`Tensor`, *optional*):
+              Negative T5 prompt attention mask generated by the text encoder step.
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
+          image (`Image | list`):
+              Reference image(s) for denoising. Can be a single image or list of images.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
-          latents (`Tensor`, *optional*):
-              Pre-generated noisy latents for image generation.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
           generator (`Generator`, *optional*):
               Torch generator for deterministic generation.
-          num_inference_steps (`int`, *optional*, defaults to 50):
-              The number of denoising steps.
-          sigmas (`list`, *optional*):
-              Custom sigmas for the denoising process.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          timesteps (`Tensor`):
+              Timesteps for the denoising process.
+          num_inference_steps (`int`):
+              Total number of inference steps from AnimaImg2ImgSetTimestepsStep.
           **denoiser_input_fields (`None`, *optional*):
               The conditional model inputs for the Anima denoiser.
-          output_type (`str`, *optional*, defaults to pil):
-              Output format: 'pil', 'np', 'pt'.
 
       Outputs:
-          images (`list`):
-              Generated images.
+          latents (`Tensor`):
+              Denoised latents.
     """
 
     block_classes = [
-        AnimaTextEncoderStep,
-        AnimaCoreDenoiseStep,
-        AnimaDecodeStep,
+        AnimaTextConditioningStep,
+        AnimaTextInputStep,
+        AnimaImg2ImgVaeEncoderStep,
+        AnimaDenoiseStep,
     ]
-    block_names = ["text_encoder", "denoise", "decode"]
-    _workflow_map = {"text2image": {"prompt": True}}
+    block_names = ["text_conditioning", "input", "vae_encoder", "denoise"]
 
     @property
     def description(self) -> str:
-        return "Auto Modular pipeline for text-to-image generation using Anima."
+        return (
+            "Denoise block for Anima image-to-image generation. "
+            "VAE encoding runs after AnimaTextInputStep so batch_size is available in state."
+        )
 
     @property
     def outputs(self):
-        return [OutputParam.template("images")]
+        return [OutputParam.template("latents")]
 
 
 # auto_docstring
-class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
+class AnimaImg2ImgDenoiseStep(SequentialPipelineBlocks):
     """
-    Denoise block for Anima image-to-image generation. VAE encoding runs after AnimaTextInputStep so batch_size is
-    available in state.
+    Combined set-timesteps + denoise block for Anima image-to-image generation.
 
       Components:
           text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae
@@ -213,6 +224,10 @@ class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
               Negative T5 prompt token ids generated by the text encoder step.
           negative_t5_attention_mask (`Tensor`, *optional*):
               Negative T5 prompt attention mask generated by the text encoder step.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`list`, *optional*):
+              Custom sigmas for the denoising process.
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           image (`Image | list`):
@@ -227,10 +242,6 @@ class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
               Torch generator for deterministic generation.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
-          timesteps (`Tensor`):
-              Timesteps for the denoising process.
-          num_inference_steps (`int`):
-              Total number of inference steps from AnimaImg2ImgSetTimestepsStep.
           **denoiser_input_fields (`None`, *optional*):
               The conditional model inputs for the Anima denoiser.
 
@@ -239,20 +250,12 @@ class AnimaImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
               Denoised latents.
     """
 
-    block_classes = [
-        AnimaTextConditioningStep,
-        AnimaTextInputStep,
-        AnimaImg2ImgVaeEncoderStep,
-        AnimaDenoiseStep,
-    ]
-    block_names = ["text_conditioning", "input", "vae_encoder", "denoise"]
+    block_classes = [AnimaImg2ImgSetTimestepsStep, AnimaImg2ImgCoreDenoiseStep]
+    block_names = ["set_timesteps", "denoise"]
 
     @property
     def description(self) -> str:
-        return (
-            "Denoise block for Anima image-to-image generation. "
-            "VAE encoding runs after AnimaTextInputStep so batch_size is available in state."
-        )
+        return "Combined set-timesteps and denoise block for Anima image-to-image generation."
 
     @property
     def outputs(self):
@@ -260,18 +263,88 @@ def outputs(self):
 
 
 # auto_docstring
-class AnimaImg2ImgAutoBlocks(SequentialPipelineBlocks):
+class AnimaAutoDenoiseStep(AutoPipelineBlocks):
+    """
+    Denoise step that selects between text-to-image and image-to-image denoising based on whether an input image is
+    provided. - `AnimaCoreDenoiseStep` (text2image) is used when no image is provided. -
+    `AnimaImg2ImgDenoiseStep` (img2img) is used when an image is provided.
+
+      Components:
+          text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae
+          (`AutoencoderKLQwenImage`) scheduler (`FlowMatchEulerDiscreteScheduler`) image_processor
+          (`VaeImageProcessor`) guider (`ClassifierFreeGuidance`)
+
+      Inputs:
+          qwen_prompt_embeds (`Tensor`):
+              Qwen prompt embeddings generated by the text encoder step.
+          qwen_attention_mask (`Tensor`):
+              Qwen prompt attention mask generated by the text encoder step.
+          t5_input_ids (`Tensor`):
+              T5 prompt token ids generated by the text encoder step.
+          t5_attention_mask (`Tensor`):
+              T5 prompt attention mask generated by the text encoder step.
+          negative_qwen_prompt_embeds (`Tensor`, *optional*):
+              Negative Qwen prompt embeddings generated by the text encoder step.
+          negative_qwen_attention_mask (`Tensor`, *optional*):
+              Negative Qwen prompt attention mask generated by the text encoder step.
+          negative_t5_input_ids (`Tensor`, *optional*):
+              Negative T5 prompt token ids generated by the text encoder step.
+          negative_t5_attention_mask (`Tensor`, *optional*):
+              Negative T5 prompt attention mask generated by the text encoder step.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          image (`Image | list`, *optional*):
+              Reference image(s). When provided, img2img denoising is used.
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`list`, *optional*):
+              Custom sigmas for the denoising process.
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img transformation.
+          **denoiser_input_fields (`None`, *optional*):
+              The conditional model inputs for the Anima denoiser.
+
+      Outputs:
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
+    block_classes = [AnimaImg2ImgDenoiseStep, AnimaCoreDenoiseStep]
+    block_names = ["img2img", "text2image"]
+    block_trigger_inputs = ["image", None]
+
+    @property
+    def description(self) -> str:
+        return (
+            "Denoise step that selects between text-to-image and image-to-image denoising based on whether "
+            "an input image is provided."
+            " - `AnimaCoreDenoiseStep` (text2image) is used when no image is provided."
+            " - `AnimaImg2ImgDenoiseStep` (img2img) is used when an image is provided."
+        )
+
+
+# auto_docstring
+class AnimaAutoBlocks(SequentialPipelineBlocks):
     """
-    Auto Modular pipeline for image-to-image generation using Anima.
+    Auto Modular pipeline for text-to-image and image-to-image generation using Anima.
 
       Supported workflows:
+        - `text2image`: requires `prompt`
         - `img2img`: requires `prompt`, `image`
 
       Components:
           text_encoder (`Qwen3Model`) tokenizer (`Qwen2Tokenizer`) t5_tokenizer (`T5Tokenizer`) guider
-          (`ClassifierFreeGuidance`) scheduler (`FlowMatchEulerDiscreteScheduler`) text_conditioner
-          (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`) vae (`AutoencoderKLQwenImage`)
-          image_processor (`VaeImageProcessor`)
+          (`ClassifierFreeGuidance`) text_conditioner (`AnimaTextConditioner`) transformer (`CosmosTransformer3DModel`)
+          scheduler (`FlowMatchEulerDiscreteScheduler`) vae (`AutoencoderKLQwenImage`) image_processor
+          (`VaeImageProcessor`)
 
       Inputs:
           prompt (`str`):
@@ -280,24 +353,24 @@ class AnimaImg2ImgAutoBlocks(SequentialPipelineBlocks):
               The prompt or prompts not to guide the image generation.
           max_sequence_length (`int`, *optional*, defaults to 512):
               Maximum sequence length for prompt encoding.
-          num_inference_steps (`int`, *optional*, defaults to 50):
-              The number of denoising steps.
-          sigmas (`list`, *optional*):
-              Custom sigmas for the denoising process.
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
-          image (`Image | list`):
-              Reference image(s) for denoising. Can be a single image or list of images.
+          image (`Image | list`, *optional*):
+              Reference image(s) for image-to-image generation. When provided, img2img workflow is used.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
-          strength (`float`, *optional*, defaults to 0.9):
-              Strength for img2img/inpainting.
-          generator (`Generator`, *optional*):
-              Torch generator for deterministic generation.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+          sigmas (`list`, *optional*):
+              Custom sigmas for the denoising process.
+          strength (`float`, *optional*, defaults to 0.9):
+              How much to transform the reference image (img2img only).
           **denoiser_input_fields (`None`, *optional*):
               The conditional model inputs for the Anima denoiser.
           output_type (`str`, *optional*, defaults to pil):
@@ -310,16 +383,18 @@ class AnimaImg2ImgAutoBlocks(SequentialPipelineBlocks):
 
     block_classes = [
         AnimaTextEncoderStep,
-        AnimaImg2ImgSetTimestepsStep,
-        AnimaImg2ImgCoreDenoiseStep,
+        AnimaAutoDenoiseStep,
         AnimaDecodeStep,
     ]
-    block_names = ["text_encoder", "set_timesteps", "denoise", "decode"]
-    _workflow_map = {"img2img": {"prompt": True, "image": True}}
+    block_names = ["text_encoder", "denoise", "decode"]
+    _workflow_map = {
+        "text2image": {"prompt": True},
+        "img2img": {"image": True, "prompt": True},
+    }
 
     @property
     def description(self) -> str:
-        return "Auto Modular pipeline for image-to-image generation using Anima."
+        return "Auto Modular pipeline for text-to-image and image-to-image generation using Anima."
 
     @property
     def outputs(self):
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index bc6a47ef9d83..0786186dff53 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -17,21 +17,6 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class AnimaImg2ImgAutoBlocks(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class AnimaModularPipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
diff --git a/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py b/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py
index c25a79739f9f..2abe9dadae56 100644
--- a/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py
+++ b/tests/modular_pipelines/anima/test_modular_pipeline_anima_img2img.py
@@ -20,7 +20,7 @@
 from transformers import Qwen2Tokenizer, Qwen3Config, Qwen3Model, T5TokenizerFast
 
 from diffusers import (
-    AnimaImg2ImgAutoBlocks,
+    AnimaAutoBlocks,
     AnimaModularPipeline,
     AnimaTextConditioner,
     AutoencoderKLQwenImage,
@@ -38,11 +38,11 @@
 ANIMA_IMG2IMG_WORKFLOWS = {
     "img2img": [
         ("text_encoder", "AnimaTextEncoderStep"),
-        ("set_timesteps", "AnimaImg2ImgSetTimestepsStep"),
-        ("denoise.text_conditioning", "AnimaTextConditioningStep"),
-        ("denoise.input", "AnimaTextInputStep"),
-        ("denoise.vae_encoder", "AnimaImg2ImgVaeEncoderStep"),
-        ("denoise.denoise", "AnimaDenoiseStep"),
+        ("denoise.set_timesteps", "AnimaImg2ImgSetTimestepsStep"),
+        ("denoise.denoise.text_conditioning", "AnimaTextConditioningStep"),
+        ("denoise.denoise.input", "AnimaTextInputStep"),
+        ("denoise.denoise.vae_encoder", "AnimaImg2ImgVaeEncoderStep"),
+        ("denoise.denoise.denoise", "AnimaDenoiseStep"),
         ("decode.decode", "AnimaVaeDecoderStep"),
         ("decode.postprocess", "AnimaProcessImagesOutputStep"),
     ],
@@ -126,7 +126,7 @@ def get_dummy_image(height=32, width=32):
 
 class TestAnimaImg2ImgModularPipelineFast(ModularPipelineTesterMixin):
     pipeline_class = AnimaModularPipeline
-    pipeline_blocks_class = AnimaImg2ImgAutoBlocks
+    pipeline_blocks_class = AnimaAutoBlocks
     pretrained_model_name_or_path = "hf-internal-testing/tiny-anima-modular-pipe"
     params = frozenset(["prompt", "image", "strength", "height", "width", "negative_prompt"])
     batch_params = frozenset(["prompt", "negative_prompt"])