Fix CUDA tests for models.

sayakpaul · sayakpaul · commit e54fc90fce5d · 2026-06-17T07:45:27.000Z
diff --git a/src/diffusers/models/downsampling.py b/src/diffusers/models/downsampling.py
@@ -227,15 +227,15 @@ def _downsample_2d(
             stride_value = [factor, factor]
             upfirdn_input = upfirdn2d_native(
                 hidden_states,
-                torch.tensor(kernel, device=hidden_states.device),
+                kernel.to(device=hidden_states.device, dtype=hidden_states.dtype),
                 pad=((pad_value + 1) // 2, pad_value // 2),
             )
             output = F.conv2d(upfirdn_input, weight, stride=stride_value, padding=0)
         else:
             pad_value = kernel.shape[0] - factor
             output = upfirdn2d_native(
                 hidden_states,
-                torch.tensor(kernel, device=hidden_states.device),
+                kernel.to(device=hidden_states.device, dtype=hidden_states.dtype),
                 down=factor,
                 pad=((pad_value + 1) // 2, pad_value // 2),
             )
@@ -392,7 +392,7 @@ def downsample_2d(
     pad_value = kernel.shape[0] - factor
     output = upfirdn2d_native(
         hidden_states,
-        kernel.to(device=hidden_states.device),
+        kernel.to(device=hidden_states.device, dtype=hidden_states.dtype),
         down=factor,
         pad=((pad_value + 1) // 2, pad_value // 2),
     )
diff --git a/src/diffusers/models/upsampling.py b/src/diffusers/models/upsampling.py
@@ -300,14 +300,14 @@ def _upsample_2d(
 
             output = upfirdn2d_native(
                 inverse_conv,
-                torch.tensor(kernel, device=inverse_conv.device),
+                kernel.to(device=inverse_conv.device, dtype=inverse_conv.dtype),
                 pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1),
             )
         else:
             pad_value = kernel.shape[0] - factor
             output = upfirdn2d_native(
                 hidden_states,
-                torch.tensor(kernel, device=hidden_states.device),
+                kernel.to(device=hidden_states.device, dtype=hidden_states.dtype),
                 up=factor,
                 pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
             )
@@ -508,7 +508,7 @@ def upsample_2d(
     pad_value = kernel.shape[0] - factor
     output = upfirdn2d_native(
         hidden_states,
-        kernel.to(device=hidden_states.device),
+        kernel.to(device=hidden_states.device, dtype=hidden_states.dtype),
         up=factor,
         pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2),
     )
diff --git a/tests/models/autoencoders/test_models_autoencoder_tiny.py b/tests/models/autoencoders/test_models_autoencoder_tiny.py
@@ -76,7 +76,12 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestAutoencoderTiny(AutoencoderTinyTesterConfig, ModelTesterMixin):
-    pass
+    @pytest.mark.skip(
+        "`forward` round-trips the latents through a uint8 byte tensor (`.byte()` / `/ 255.0`), which upcasts to "
+        "float32 regardless of the model dtype, so full fp16/bf16 forward inference is not possible."
+    )
+    def test_from_save_pretrained_dtype_inference(self):
+        pass
 
 
 class TestAutoencoderTinyTraining(AutoencoderTinyTesterConfig, TrainingTesterMixin):
diff --git a/tests/models/autoencoders/test_models_consistency_decoder_vae.py b/tests/models/autoencoders/test_models_consistency_decoder_vae.py
@@ -16,6 +16,7 @@
 import gc
 
 import numpy as np
+import pytest
 import torch
 
 from diffusers import ConsistencyDecoderVAE, StableDiffusionPipeline
@@ -86,7 +87,13 @@ def get_dummy_inputs(self) -> dict:
 
 
 class TestConsistencyDecoderVAE(ConsistencyDecoderVAETesterConfig, ModelTesterMixin):
-    pass
+    @pytest.mark.skip(
+        "`forward` decodes through an iterative, RNG-driven consistency-decoding loop whose output is not "
+        "reproducible across two model instances and amplifies fp16/bf16 nondeterminism, so a low-precision "
+        "output-equivalence check is not meaningful."
+    )
+    def test_from_save_pretrained_dtype_inference(self):
+        pass
 
 
 class TestConsistencyDecoderVAETraining(ConsistencyDecoderVAETesterConfig, TrainingTesterMixin):
diff --git a/tests/models/controlnets/test_models_controlnet_cosmos.py b/tests/models/controlnets/test_models_controlnet_cosmos.py
@@ -283,6 +283,10 @@ def test_training(self):
     def test_training_with_ema(self):
         super().test_training_with_ema()
 
+    @pytest.mark.skip("ControlNet outputs list of control blocks, not single tensor for MSE loss.")
+    def test_mixed_precision_training(self):
+        super().test_mixed_precision_training()
+
     @pytest.mark.skip("ControlNet output doesn't have .sample attribute.")
     def test_gradient_checkpointing_equivalence(self):
         super().test_gradient_checkpointing_equivalence()
diff --git a/tests/models/testing_utils/common.py b/tests/models/testing_utils/common.py
@@ -135,8 +135,9 @@ def cast_inputs_to_dtype(inputs, current_dtype, target_dtype):
         return inputs.to(target_dtype) if inputs.dtype == current_dtype else inputs
     if isinstance(inputs, dict):
         return {k: cast_inputs_to_dtype(v, current_dtype, target_dtype) for k, v in inputs.items()}
-    if isinstance(inputs, list):
-        return [cast_inputs_to_dtype(v, current_dtype, target_dtype) for v in inputs]
+    if isinstance(inputs, (list, tuple)):
+        # Preserve the container type so models that branch on it (e.g. `isinstance(..., tuple)`) still see a tuple.
+        return type(inputs)(cast_inputs_to_dtype(v, current_dtype, target_dtype) for v in inputs)
 
     return inputs
 
@@ -479,7 +480,11 @@ def test_keep_in_fp32_modules(self, tmp_path):
     )
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16], ids=["fp16", "bf16"])
     @torch.no_grad()
-    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype, atol=1e-4, rtol=0):
+    def test_from_save_pretrained_dtype_inference(self, tmp_path, dtype):
+        # Low-precision inference is inherently lossy, and models that keep some modules in fp32 diverge further from
+        # the fully-cast reference. Tolerances reflect the dtype's precision rather than a tight fp32-style threshold.
+        atol = 3e-2 if dtype == torch.bfloat16 else 1e-2
+        rtol = 0
         model = self.model_class(**self.get_init_dict())
         model.to(torch_device)
         fp32_modules = model._keep_in_fp32_modules or []
diff --git a/tests/models/transformers/test_models_transformer_z_image.py b/tests/models/transformers/test_models_transformer_z_image.py
@@ -250,6 +250,10 @@ def test_training(self):
     def test_training_with_ema(self):
         pass
 
+    @pytest.mark.skip("Model output `sample` is a list of tensors; mixed-precision training computes MSE loss on it.")
+    def test_mixed_precision_training(self):
+        pass
+
     @pytest.mark.skip("Test is not supported for handling main inputs that are lists.")
     def test_gradient_checkpointing_equivalence(self, loss_tolerance=1e-5, param_grad_tol=5e-5, skip=None):
         pass
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
@@ -165,6 +165,17 @@ def assert_tensors_close(
     if not is_torch_available():
         raise ValueError("PyTorch needs to be installed to use this function.")
 
+    # Some models (e.g. Z-Image, Cosmos ControlNet) return a list/tuple of tensors as their output. Compare these
+    # element-wise so the same helper works regardless of whether the output is a single tensor or a sequence.
+    if isinstance(actual, (list, tuple)) or isinstance(expected, (list, tuple)):
+        if not (isinstance(actual, (list, tuple)) and isinstance(expected, (list, tuple))):
+            raise AssertionError(f"{msg} Type mismatch: actual {type(actual)} vs expected {type(expected)}")
+        if len(actual) != len(expected):
+            raise AssertionError(f"{msg} Length mismatch: actual {len(actual)} vs expected {len(expected)}")
+        for i, (a, e) in enumerate(zip(actual, expected)):
+            assert_tensors_close(a, e, atol=atol, rtol=rtol, msg=f"{msg} [element {i}]")
+        return
+
     if actual.shape != expected.shape:
         raise AssertionError(f"{msg} Shape mismatch: actual {actual.shape} vs expected {expected.shape}")