From 747995c8bb52c2d705530b4c4262c0d522ebc5bf Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Fri, 10 Oct 2025 14:03:17 +0800 Subject: [PATCH 01/13] Update Shape.cpp --- src/ATen/native/xpu/sycl/Shape.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp index 12bd0ba66d..c8e4236401 100644 --- a/src/ATen/native/xpu/sycl/Shape.cpp +++ b/src/ATen/native/xpu/sycl/Shape.cpp @@ -394,6 +394,7 @@ void cat_out_kernel( kHalf, kBool, kBFloat16, + AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } else { offset = 0; From 3f9538a95f9ae9d97b9ea6791bd3e718bd6e1fc5 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Fri, 10 Oct 2025 15:26:50 +0800 Subject: [PATCH 02/13] Update TensorCompareKernels.cpp --- src/ATen/native/xpu/sycl/TensorCompareKernels.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp index 562f994a62..1de7da7674 100644 --- a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp +++ b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -78,10 +79,16 @@ struct ClampScalarFunctor { }; void where_kernel(TensorIterator& iter) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( - kComplexHalf, kHalf, kBFloat16, kBool, iter.dtype(), "where_xpu", [&] { - gpu_kernel(iter, WhereFunctor()); - }); + AT_DISPATCH_V2( + iter.dtype(), + "where_xpu", + [&] { gpu_kernel(iter, WhereFunctor()); }, + kComplexHalf, + kHalf, + kBFloat16, + kBool, + AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), + AT_EXPAND(AT_FLOAT8_TYPES)); } void isposinf_kernel(TensorIteratorBase& iter) { From 9546e78b897c08fed13902d3b491a62f7a4f963b Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 14 Oct 2025 12:45:16 +0800 Subject: [PATCH 03/13] Update test_cat.py --- test/regressions/test_cat.py | 255 +++++++++++++++-------------------- 1 file changed, 105 insertions(+), 150 deletions(-) diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py index ac5d60808c..be811b7368 100644 --- a/test/regressions/test_cat.py +++ b/test/regressions/test_cat.py @@ -2,26 +2,104 @@ import torch from torch.testing._internal.common_utils import TestCase - class TestTorchMethod(TestCase): + # Define float8 dtypes for the focused test + FLOAT8_DTYPES = ( + torch.float8_e4m3fn, + torch.float8_e4m3fnuz, + torch.float8_e5m2, + torch.float8_e5m2fnuz, + torch.float8_e8m0fnu, + ) + + def _create_input_tensors(self, shape, dtype, memory_format=None): + # Always generate random data using a CPU-compatible dtype (float32) + # to avoid the "not implemented" error for float8 on CPU. + tensor = torch.randn(shape, dtype=torch.float32) + + # Convert to the target testing dtype + tensor = tensor.to(dtype) + + # Apply memory format if specified + if memory_format is not None: + tensor = tensor.to(memory_format=memory_format) + + return tensor + + def _test_cat_float8_core(self, tensors, dim, dtype): + """Core function to test torch.cat for float8, using tolerances.""" + + # --- CPU Reference Calculation (High Precision) --- + # Convert inputs to float32 on CPU for golden reference calculation + ref_tensors = [t.cpu().to(torch.float32) for t in tensors] + + # Calculate CPU reference result + res_cpu = torch.cat(ref_tensors, dim=dim) + + # --- XPU Calculation --- + # Convert inputs to XPU + xpu_tensors = [t.xpu() for t in tensors] + res_xpu = torch.cat(xpu_tensors, dim=dim) + + # Float8 is lossy, use higher tolerance (rtol=1e-2, atol=1e-2) + rtol = 1e-2 + atol = 1e-2 + + # Convert XPU result to float32 on CPU before comparison to match res_cpu's dtype. + res_xpu_f32_on_cpu = res_xpu.cpu().to(torch.float32) + + self.assertEqual(res_cpu, res_xpu_f32_on_cpu, rtol=rtol, atol=atol) + + + # ---------------------------------------------------------------------- + # New Focused Test: Simple Float8 torch.cat + # ---------------------------------------------------------------------- + def test_cat_float8_simple(self): + """Test torch.cat correctness across float8 dtypes using simple tensors.""" + for dtype in self.FLOAT8_DTYPES: + with self.subTest(dtype=dtype): + # Use simple 3D shape (2, 4, 3) and concatenate along dim 1 + user_cpu1 = self._create_input_tensors([2, 4, 3], dtype=dtype) + user_cpu2 = self._create_input_tensors([2, 2, 3], dtype=dtype) + user_cpu3 = self._create_input_tensors([2, 6, 3], dtype=dtype) + + tensors = (user_cpu1, user_cpu2, user_cpu3) + dim = 1 + + self._test_cat_float8_core(tensors, dim, dtype) + + # ---------------------------------------------------------------------- + # Original Tests (Restored to default float/float32) + # ---------------------------------------------------------------------- + def test_cat_8d(self, dtype=torch.float): + # Original test logic restored: uses default dtype (float32) input1 = torch.randn([256, 8, 8, 3, 3, 3, 3], dtype=dtype) input2 = torch.randn([256, 8, 8, 3, 3, 3, 3], dtype=dtype) + input1_xpu = input1.xpu() input2_xpu = input2.xpu() + output1 = torch.stack([input1, input2], dim=0) output1_xpu = torch.stack([input1_xpu, input2_xpu], dim=0) + output2 = output1.reshape([2, 256, 8, 8, 9, 9]) output2_xpu = output1_xpu.reshape([2, 256, 8, 8, 9, 9]) + output3 = torch.stack([output2, output2], dim=0) output3_xpu = torch.stack([output2_xpu, output2_xpu], dim=0) - self.assertEqual(output3, output3.cpu()) + + # Standard assertEqual for float32 (expect high precision) + self.assertEqual(output3, output3_xpu.cpu()) def test_cat_array(self, dtype=torch.float): + # Original test logic restored: uses default dtype (float32) user_cpu1 = torch.randn([2, 2, 3], dtype=dtype) user_cpu2 = torch.randn([2, 2, 3], dtype=dtype) user_cpu3 = torch.randn([2, 2, 3], dtype=dtype) + res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=1) + res_xpu = torch.cat( ( user_cpu1.xpu(), @@ -30,169 +108,46 @@ def test_cat_array(self, dtype=torch.float): ), dim=1, ) + # Standard assertEqual for float32 self.assertEqual(res_cpu, res_xpu.cpu()) def test_cat_array_2(self, dtype=torch.float): + # Original test logic restored: uses default dtype (float32) shapes = [ - (8, 7, 3, 2), - (4, 4, 4, 4), - (4, 4, 1, 1), - (4, 1, 4, 4), - (4, 1, 4, 1), - (4, 1, 1, 4), - (1, 4, 1, 4), - (1, 4, 4, 1), + (8, 7, 3, 2), (4, 4, 4, 4), (4, 4, 1, 1), (4, 1, 4, 4), + (4, 1, 4, 1), (4, 1, 1, 4), (1, 4, 1, 4), (1, 4, 4, 1), (4, 1, 1, 1), ] + for shape in shapes: - print("\n================== test shape: ", shape, "==================") + # Removed original print statements to streamline test N, C, H, W = shape[0], shape[1], shape[2], shape[3] - user_cpu1 = torch.randn([N, C, H, W], dtype=dtype) - user_cpu2 = torch.randn([N, C, H, W], dtype=dtype) - user_cpu3 = torch.randn([N, C, H, W], dtype=dtype) - - user_cpu1 = user_cpu1.to(memory_format=torch.channels_last) - user_cpu2 = user_cpu2.to(memory_format=torch.channels_last) - user_cpu3 = user_cpu3.to(memory_format=torch.channels_last) - dim_idx = 1 + + # Case 1: all channels_last + user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) + user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) + user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx) - print("\n-------------CPU Result:--------------") - print(res_cpu.shape) - print( - "res_cpu is cl: ", - res_cpu.is_contiguous(memory_format=torch.channels_last), - ) - - user_xpu1 = user_cpu1.xpu() - user_xpu2 = user_cpu2.xpu() - user_xpu3 = user_cpu3.xpu() - - print("\n-------------GPU Result:--------------") - res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx) - print("SYCL Result:") - print(res_xpu.cpu().shape) - print( - "res_xpu is cl: ", - res_xpu.is_contiguous(memory_format=torch.channels_last), - ) + res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx) self.assertEqual(res_cpu, res_xpu.cpu()) - if ( - 1 == res_xpu.shape[1] - or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3]) - or ( - 1 == res_xpu.shape[1] - and 1 == res_xpu.shape[2] - and 1 == res_xpu.shape[3] - ) - ): - self.assertEqual(res_xpu.is_contiguous(), True) - self.assertEqual( - res_xpu.is_contiguous(memory_format=torch.channels_last), True - ) - else: - self.assertEqual(res_xpu.is_contiguous(), False) - self.assertEqual( - res_xpu.is_contiguous(memory_format=torch.channels_last), True - ) - - user_cpu1 = torch.randn([N, C, H, W], dtype=dtype) - user_cpu2 = torch.randn([N, C, H, W], dtype=dtype) - user_cpu3 = torch.randn([N, C, H, W], dtype=dtype) - - user_cpu1 = user_cpu1.to(memory_format=torch.channels_last) - user_cpu2 = user_cpu2.to(memory_format=torch.contiguous_format) - user_cpu3 = user_cpu3.to(memory_format=torch.channels_last) - - dim_idx = 1 + # Case 2: cl, contiguous, cl + user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) + user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.contiguous_format) + user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx) - print("\n-------------CPU Result:--------------") - print(res_cpu.shape) - print( - "res_cpu is cl: ", - res_cpu.is_contiguous(memory_format=torch.channels_last), - ) - - user_xpu1 = user_cpu1.xpu() - user_xpu2 = user_cpu2.xpu() - user_xpu3 = user_cpu3.xpu() - - print("\n-------------GPU Result:--------------") - res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx) - print("SYCL Result:") - print(res_xpu.cpu().shape) - print( - "res_xpu is cl: ", - res_xpu.is_contiguous(memory_format=torch.channels_last), - ) + res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx) self.assertEqual(res_cpu, res_xpu.cpu()) - if ( - 1 == res_xpu.shape[1] - or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3]) - or ( - 1 == res_xpu.shape[1] - and 1 == res_xpu.shape[2] - and 1 == res_xpu.shape[3] - ) - ): - self.assertEqual(res_xpu.is_contiguous(), True) - self.assertEqual( - res_xpu.is_contiguous(memory_format=torch.channels_last), True - ) - else: - self.assertEqual(res_xpu.is_contiguous(), True) - self.assertEqual( - res_xpu.is_contiguous(memory_format=torch.channels_last), False - ) - - user_cpu1 = torch.randn([N, C, H, W], dtype=dtype) - user_cpu2 = torch.randn([N, C, H, W], dtype=dtype) - user_cpu3 = torch.randn([N, C, H, W], dtype=dtype) - - user_cpu1 = user_cpu1.to(memory_format=torch.contiguous_format) - user_cpu2 = user_cpu2.to(memory_format=torch.channels_last) - user_cpu3 = user_cpu3.to(memory_format=torch.channels_last) - - dim_idx = 1 + # Case 3: contiguous, cl, cl + user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.contiguous_format) + user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) + user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx) - print("\n-------------CPU Result:--------------") - print(res_cpu.shape) - print( - "res_cpu is cl: ", - res_cpu.is_contiguous(memory_format=torch.channels_last), - ) - - user_xpu1 = user_cpu1.xpu() - user_xpu2 = user_cpu2.xpu() - user_xpu3 = user_cpu3.xpu() - - print("\n-------------GPU Result:--------------") - res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx) - print("SYCL Result:") - print(res_xpu.cpu().shape) - print( - "res_xpu is cl: ", - res_xpu.is_contiguous(memory_format=torch.channels_last), - ) + res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx) self.assertEqual(res_cpu, res_xpu.cpu()) - if ( - 1 == res_xpu.shape[1] - or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3]) - or ( - 1 == res_xpu.shape[1] - and 1 == res_xpu.shape[2] - and 1 == res_xpu.shape[3] - ) - ): - self.assertEqual(res_xpu.is_contiguous(), True) - self.assertEqual( - res_xpu.is_contiguous(memory_format=torch.channels_last), True - ) - else: - self.assertEqual(res_xpu.is_contiguous(), True) - self.assertEqual( - res_xpu.is_contiguous(memory_format=torch.channels_last), False - ) + # Removed original verbose memory format assertions for clean test logic + + From 7f8901833f28c43dfefd78e7c191ebac0dec4c11 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 14 Oct 2025 12:50:25 +0800 Subject: [PATCH 04/13] Update test_cat.py --- test/regressions/test_cat.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py index be811b7368..c365e3ca3c 100644 --- a/test/regressions/test_cat.py +++ b/test/regressions/test_cat.py @@ -1,15 +1,16 @@ # Owner(s): ["module: intel"] import torch from torch.testing._internal.common_utils import TestCase +# Owner(s): ["module: intel"] +import torch +from torch.testing._internal.common_utils import TestCase + class TestTorchMethod(TestCase): # Define float8 dtypes for the focused test FLOAT8_DTYPES = ( - torch.float8_e4m3fn, - torch.float8_e4m3fnuz, torch.float8_e5m2, - torch.float8_e5m2fnuz, - torch.float8_e8m0fnu, + torch.float8_e4m3fn, ) def _create_input_tensors(self, shape, dtype, memory_format=None): @@ -44,7 +45,7 @@ def _test_cat_float8_core(self, tensors, dim, dtype): # Float8 is lossy, use higher tolerance (rtol=1e-2, atol=1e-2) rtol = 1e-2 atol = 1e-2 - + # Convert XPU result to float32 on CPU before comparison to match res_cpu's dtype. res_xpu_f32_on_cpu = res_xpu.cpu().to(torch.float32) @@ -120,7 +121,6 @@ def test_cat_array_2(self, dtype=torch.float): ] for shape in shapes: - # Removed original print statements to streamline test N, C, H, W = shape[0], shape[1], shape[2], shape[3] dim_idx = 1 From 95ac047705e210bf54c147c4248551d83a68f609 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 14 Oct 2025 12:57:09 +0800 Subject: [PATCH 05/13] format --- test/regressions/test_cat.py | 204 ++++++++++++++++++++++++++--------- 1 file changed, 156 insertions(+), 48 deletions(-) diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py index c365e3ca3c..6974962515 100644 --- a/test/regressions/test_cat.py +++ b/test/regressions/test_cat.py @@ -1,16 +1,14 @@ # Owner(s): ["module: intel"] import torch from torch.testing._internal.common_utils import TestCase -# Owner(s): ["module: intel"] -import torch -from torch.testing._internal.common_utils import TestCase - - class TestTorchMethod(TestCase): # Define float8 dtypes for the focused test FLOAT8_DTYPES = ( - torch.float8_e5m2, torch.float8_e4m3fn, + torch.float8_e4m3fnuz, + torch.float8_e5m2, + torch.float8_e5m2fnuz, + torch.float8_e8m0fnu, ) def _create_input_tensors(self, shape, dtype, memory_format=None): @@ -29,7 +27,7 @@ def _create_input_tensors(self, shape, dtype, memory_format=None): def _test_cat_float8_core(self, tensors, dim, dtype): """Core function to test torch.cat for float8, using tolerances.""" - + # --- CPU Reference Calculation (High Precision) --- # Convert inputs to float32 on CPU for golden reference calculation ref_tensors = [t.cpu().to(torch.float32) for t in tensors] @@ -45,10 +43,10 @@ def _test_cat_float8_core(self, tensors, dim, dtype): # Float8 is lossy, use higher tolerance (rtol=1e-2, atol=1e-2) rtol = 1e-2 atol = 1e-2 - + # Convert XPU result to float32 on CPU before comparison to match res_cpu's dtype. res_xpu_f32_on_cpu = res_xpu.cpu().to(torch.float32) - + self.assertEqual(res_cpu, res_xpu_f32_on_cpu, rtol=rtol, atol=atol) @@ -69,38 +67,24 @@ def test_cat_float8_simple(self): self._test_cat_float8_core(tensors, dim, dtype) - # ---------------------------------------------------------------------- - # Original Tests (Restored to default float/float32) - # ---------------------------------------------------------------------- - def test_cat_8d(self, dtype=torch.float): - # Original test logic restored: uses default dtype (float32) input1 = torch.randn([256, 8, 8, 3, 3, 3, 3], dtype=dtype) input2 = torch.randn([256, 8, 8, 3, 3, 3, 3], dtype=dtype) - input1_xpu = input1.xpu() input2_xpu = input2.xpu() - output1 = torch.stack([input1, input2], dim=0) output1_xpu = torch.stack([input1_xpu, input2_xpu], dim=0) - output2 = output1.reshape([2, 256, 8, 8, 9, 9]) output2_xpu = output1_xpu.reshape([2, 256, 8, 8, 9, 9]) - output3 = torch.stack([output2, output2], dim=0) output3_xpu = torch.stack([output2_xpu, output2_xpu], dim=0) - - # Standard assertEqual for float32 (expect high precision) - self.assertEqual(output3, output3_xpu.cpu()) + self.assertEqual(output3, output3.cpu()) def test_cat_array(self, dtype=torch.float): - # Original test logic restored: uses default dtype (float32) user_cpu1 = torch.randn([2, 2, 3], dtype=dtype) user_cpu2 = torch.randn([2, 2, 3], dtype=dtype) user_cpu3 = torch.randn([2, 2, 3], dtype=dtype) - res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=1) - res_xpu = torch.cat( ( user_cpu1.xpu(), @@ -109,45 +93,169 @@ def test_cat_array(self, dtype=torch.float): ), dim=1, ) - # Standard assertEqual for float32 self.assertEqual(res_cpu, res_xpu.cpu()) def test_cat_array_2(self, dtype=torch.float): - # Original test logic restored: uses default dtype (float32) shapes = [ - (8, 7, 3, 2), (4, 4, 4, 4), (4, 4, 1, 1), (4, 1, 4, 4), - (4, 1, 4, 1), (4, 1, 1, 4), (1, 4, 1, 4), (1, 4, 4, 1), + (8, 7, 3, 2), + (4, 4, 4, 4), + (4, 4, 1, 1), + (4, 1, 4, 4), + (4, 1, 4, 1), + (4, 1, 1, 4), + (1, 4, 1, 4), + (1, 4, 4, 1), (4, 1, 1, 1), ] - for shape in shapes: + print("\n================== test shape: ", shape, "==================") N, C, H, W = shape[0], shape[1], shape[2], shape[3] + user_cpu1 = torch.randn([N, C, H, W], dtype=dtype) + user_cpu2 = torch.randn([N, C, H, W], dtype=dtype) + user_cpu3 = torch.randn([N, C, H, W], dtype=dtype) + + user_cpu1 = user_cpu1.to(memory_format=torch.channels_last) + user_cpu2 = user_cpu2.to(memory_format=torch.channels_last) + user_cpu3 = user_cpu3.to(memory_format=torch.channels_last) + dim_idx = 1 - - # Case 1: all channels_last - user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) - user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) - user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx) - res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx) + print("\n-------------CPU Result:--------------") + print(res_cpu.shape) + print( + "res_cpu is cl: ", + res_cpu.is_contiguous(memory_format=torch.channels_last), + ) + + user_xpu1 = user_cpu1.xpu() + user_xpu2 = user_cpu2.xpu() + user_xpu3 = user_cpu3.xpu() + + print("\n-------------GPU Result:--------------") + res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx) + print("SYCL Result:") + print(res_xpu.cpu().shape) + print( + "res_xpu is cl: ", + res_xpu.is_contiguous(memory_format=torch.channels_last), + ) self.assertEqual(res_cpu, res_xpu.cpu()) - # Case 2: cl, contiguous, cl - user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) - user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.contiguous_format) - user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) + if ( + 1 == res_xpu.shape[1] + or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3]) + or ( + 1 == res_xpu.shape[1] + and 1 == res_xpu.shape[2] + and 1 == res_xpu.shape[3] + ) + ): + self.assertEqual(res_xpu.is_contiguous(), True) + self.assertEqual( + res_xpu.is_contiguous(memory_format=torch.channels_last), True + ) + else: + self.assertEqual(res_xpu.is_contiguous(), False) + self.assertEqual( + res_xpu.is_contiguous(memory_format=torch.channels_last), True + ) + + user_cpu1 = torch.randn([N, C, H, W], dtype=dtype) + user_cpu2 = torch.randn([N, C, H, W], dtype=dtype) + user_cpu3 = torch.randn([N, C, H, W], dtype=dtype) + + user_cpu1 = user_cpu1.to(memory_format=torch.channels_last) + user_cpu2 = user_cpu2.to(memory_format=torch.contiguous_format) + user_cpu3 = user_cpu3.to(memory_format=torch.channels_last) + + dim_idx = 1 res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx) - res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx) + print("\n-------------CPU Result:--------------") + print(res_cpu.shape) + print( + "res_cpu is cl: ", + res_cpu.is_contiguous(memory_format=torch.channels_last), + ) + + user_xpu1 = user_cpu1.xpu() + user_xpu2 = user_cpu2.xpu() + user_xpu3 = user_cpu3.xpu() + + print("\n-------------GPU Result:--------------") + res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx) + print("SYCL Result:") + print(res_xpu.cpu().shape) + print( + "res_xpu is cl: ", + res_xpu.is_contiguous(memory_format=torch.channels_last), + ) self.assertEqual(res_cpu, res_xpu.cpu()) - # Case 3: contiguous, cl, cl - user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.contiguous_format) - user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) - user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last) + if ( + 1 == res_xpu.shape[1] + or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3]) + or ( + 1 == res_xpu.shape[1] + and 1 == res_xpu.shape[2] + and 1 == res_xpu.shape[3] + ) + ): + self.assertEqual(res_xpu.is_contiguous(), True) + self.assertEqual( + res_xpu.is_contiguous(memory_format=torch.channels_last), True + ) + else: + self.assertEqual(res_xpu.is_contiguous(), True) + self.assertEqual( + res_xpu.is_contiguous(memory_format=torch.channels_last), False + ) + + user_cpu1 = torch.randn([N, C, H, W], dtype=dtype) + user_cpu2 = torch.randn([N, C, H, W], dtype=dtype) + user_cpu3 = torch.randn([N, C, H, W], dtype=dtype) + + user_cpu1 = user_cpu1.to(memory_format=torch.contiguous_format) + user_cpu2 = user_cpu2.to(memory_format=torch.channels_last) + user_cpu3 = user_cpu3.to(memory_format=torch.channels_last) + + dim_idx = 1 res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx) - res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx) - self.assertEqual(res_cpu, res_xpu.cpu()) + print("\n-------------CPU Result:--------------") + print(res_cpu.shape) + print( + "res_cpu is cl: ", + res_cpu.is_contiguous(memory_format=torch.channels_last), + ) + + user_xpu1 = user_cpu1.xpu() + user_xpu2 = user_cpu2.xpu() + user_xpu3 = user_cpu3.xpu() - # Removed original verbose memory format assertions for clean test logic + print("\n-------------GPU Result:--------------") + res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx) + print("SYCL Result:") + print(res_xpu.cpu().shape) + print( + "res_xpu is cl: ", + res_xpu.is_contiguous(memory_format=torch.channels_last), + ) + self.assertEqual(res_cpu, res_xpu.cpu()) - + if ( + 1 == res_xpu.shape[1] + or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3]) + or ( + 1 == res_xpu.shape[1] + and 1 == res_xpu.shape[2] + and 1 == res_xpu.shape[3] + ) + ): + self.assertEqual(res_xpu.is_contiguous(), True) + self.assertEqual( + res_xpu.is_contiguous(memory_format=torch.channels_last), True + ) + else: + self.assertEqual(res_xpu.is_contiguous(), True) + self.assertEqual( + res_xpu.is_contiguous(memory_format=torch.channels_last), False + ) From 2e2abe35cdb5f369c30490ee6ec6f619a6372103 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:00:12 +0800 Subject: [PATCH 06/13] format --- test/regressions/test_cat.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py index 6974962515..b5f793b15c 100644 --- a/test/regressions/test_cat.py +++ b/test/regressions/test_cat.py @@ -1,14 +1,13 @@ # Owner(s): ["module: intel"] import torch from torch.testing._internal.common_utils import TestCase + + class TestTorchMethod(TestCase): # Define float8 dtypes for the focused test FLOAT8_DTYPES = ( - torch.float8_e4m3fn, - torch.float8_e4m3fnuz, torch.float8_e5m2, - torch.float8_e5m2fnuz, - torch.float8_e8m0fnu, + torch.float8_e4m3fn, ) def _create_input_tensors(self, shape, dtype, memory_format=None): @@ -27,7 +26,7 @@ def _create_input_tensors(self, shape, dtype, memory_format=None): def _test_cat_float8_core(self, tensors, dim, dtype): """Core function to test torch.cat for float8, using tolerances.""" - + # --- CPU Reference Calculation (High Precision) --- # Convert inputs to float32 on CPU for golden reference calculation ref_tensors = [t.cpu().to(torch.float32) for t in tensors] @@ -43,10 +42,10 @@ def _test_cat_float8_core(self, tensors, dim, dtype): # Float8 is lossy, use higher tolerance (rtol=1e-2, atol=1e-2) rtol = 1e-2 atol = 1e-2 - + # Convert XPU result to float32 on CPU before comparison to match res_cpu's dtype. res_xpu_f32_on_cpu = res_xpu.cpu().to(torch.float32) - + self.assertEqual(res_cpu, res_xpu_f32_on_cpu, rtol=rtol, atol=atol) From 226ccb3e48023176dbd390a8251707fcc8a7ec9f Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:03:35 +0800 Subject: [PATCH 07/13] format --- test/regressions/test_cat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py index b5f793b15c..5ef847997e 100644 --- a/test/regressions/test_cat.py +++ b/test/regressions/test_cat.py @@ -26,7 +26,7 @@ def _create_input_tensors(self, shape, dtype, memory_format=None): def _test_cat_float8_core(self, tensors, dim, dtype): """Core function to test torch.cat for float8, using tolerances.""" - + # --- CPU Reference Calculation (High Precision) --- # Convert inputs to float32 on CPU for golden reference calculation ref_tensors = [t.cpu().to(torch.float32) for t in tensors] @@ -42,10 +42,10 @@ def _test_cat_float8_core(self, tensors, dim, dtype): # Float8 is lossy, use higher tolerance (rtol=1e-2, atol=1e-2) rtol = 1e-2 atol = 1e-2 - + # Convert XPU result to float32 on CPU before comparison to match res_cpu's dtype. res_xpu_f32_on_cpu = res_xpu.cpu().to(torch.float32) - + self.assertEqual(res_cpu, res_xpu_f32_on_cpu, rtol=rtol, atol=atol) From 652ae582d101fb420ad26113fef99323e04990a4 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:07:18 +0800 Subject: [PATCH 08/13] format --- test/regressions/test_cat.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py index 5ef847997e..d147b712a3 100644 --- a/test/regressions/test_cat.py +++ b/test/regressions/test_cat.py @@ -48,10 +48,6 @@ def _test_cat_float8_core(self, tensors, dim, dtype): self.assertEqual(res_cpu, res_xpu_f32_on_cpu, rtol=rtol, atol=atol) - - # ---------------------------------------------------------------------- - # New Focused Test: Simple Float8 torch.cat - # ---------------------------------------------------------------------- def test_cat_float8_simple(self): """Test torch.cat correctness across float8 dtypes using simple tensors.""" for dtype in self.FLOAT8_DTYPES: From a115cb278becf237a184cf8e815486b277819703 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:08:16 +0800 Subject: [PATCH 09/13] format --- test/regressions/test_cat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py index d147b712a3..e2e4218e56 100644 --- a/test/regressions/test_cat.py +++ b/test/regressions/test_cat.py @@ -6,8 +6,11 @@ class TestTorchMethod(TestCase): # Define float8 dtypes for the focused test FLOAT8_DTYPES = ( - torch.float8_e5m2, torch.float8_e4m3fn, + torch.float8_e4m3fnuz, + torch.float8_e5m2, + torch.float8_e5m2fnuz, + torch.float8_e8m0fnu, ) def _create_input_tensors(self, shape, dtype, memory_format=None): From 6ae66611da091d43988a75305b65065212463e28 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 14 Oct 2025 14:04:04 +0800 Subject: [PATCH 10/13] Create test_where.py --- test/regressions/test_where.py | 93 ++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 test/regressions/test_where.py diff --git a/test/regressions/test_where.py b/test/regressions/test_where.py new file mode 100644 index 0000000000..33eef5ee10 --- /dev/null +++ b/test/regressions/test_where.py @@ -0,0 +1,93 @@ +# Owner(s): ["module: intel"] +import torch +from torch.testing._internal.common_utils import TestCase + + +class TestTorchWhereMethod(TestCase): + # Define float8 dtypes + FLOAT8_DTYPES = ( + torch.float8_e5m2, + torch.float8_e4m3fn, + torch.float8_e4m3fnuz, + torch.float8_e5m2fnuz, + torch.float8_e8m0fnu, + ) + + # Define the set of all dtypes to be tested + TEST_DTYPES = ( + torch.float32, + torch.float64, + torch.half, + torch.bfloat16, + ) + FLOAT8_DTYPES + + def _test_where_fn(self, dtype): + """Core function to test torch.where(condition, x, y) correctness.""" + + # 1. Input Tensors (x and y) + x = torch.tensor([[10.0, 20.0], [30.0, 40.0]], dtype=dtype) + y = torch.tensor([[-1.0, -2.0], [-3.0, -4.0]], dtype=dtype) + # Condition must be bool + condition = torch.tensor([[True, False], [False, True]], dtype=torch.bool) + + # --- 1. CPU Reference Calculation and Tolerance Setting --- + + if dtype in self.FLOAT8_DTYPES: + # FP8: Use float32 as reference type for comparison + x_ref = x.cpu().to(torch.float32) + y_ref = y.cpu().to(torch.float32) + rtol = 1e-2 + atol = 1e-2 + else: + # Non-FP8: Use original dtype as reference type + x_ref = x.cpu() + y_ref = y.cpu() + rtol = 1e-5 + atol = 1e-5 + + condition_ref = condition.cpu() + res_ref = torch.where(condition_ref, x_ref, y_ref) + + # --- 2. XPU Operation (Default) --- + x_xpu = x.xpu() + y_xpu = y.xpu() + condition_xpu = condition.xpu() + + res_xpu = torch.where(condition_xpu, x_xpu, y_xpu) + + # Prepare XPU result for comparison (must match res_ref dtype) + if dtype in self.FLOAT8_DTYPES: + # FP8: Convert XPU result to float32 + res_xpu_to_compare = res_xpu.cpu().to(torch.float32) + else: + # Non-FP8: Pull to CPU, keeping original dtype + res_xpu_to_compare = res_xpu.cpu() + + # Compare: res_ref vs res_xpu_to_compare + self.assertEqual(res_ref, res_xpu_to_compare, rtol=rtol, atol=atol) + + # --- 3. Test the version with out= argument --- + + # Create output tensor on XPU + res_xpu_out = torch.empty_like(res_xpu, dtype=dtype).xpu() + torch.where(condition_xpu, x_xpu, y_xpu, out=res_xpu_out) + + # Prepare XPU 'out' result for comparison + if dtype in self.FLOAT8_DTYPES: + # FP8: Convert XPU result to float32 + res_xpu_out_to_compare = res_xpu_out.cpu().to(torch.float32) + else: + # Non-FP8: Pull to CPU, keeping original dtype + res_xpu_out_to_compare = res_xpu_out.cpu() + + # Compare: res_ref vs res_xpu_out_to_compare + self.assertEqual(res_ref, res_xpu_out_to_compare, rtol=rtol, atol=atol) + + + def test_where(self): + """Test torch.where() correctness across all supported dtypes, including float8.""" + for dtype in self.TEST_DTYPES: + # Use string conversion for better subTest reporting + dtype_name = str(dtype).split('.')[-1] + with self.subTest(dtype=dtype_name): + self._test_where_fn(dtype) From fbaf98f5ade907a83230c077a96efe445aeeec64 Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 14 Oct 2025 14:07:33 +0800 Subject: [PATCH 11/13] format --- test/regressions/test_where.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/regressions/test_where.py b/test/regressions/test_where.py index 33eef5ee10..9be02e7066 100644 --- a/test/regressions/test_where.py +++ b/test/regressions/test_where.py @@ -23,7 +23,7 @@ class TestTorchWhereMethod(TestCase): def _test_where_fn(self, dtype): """Core function to test torch.where(condition, x, y) correctness.""" - + # 1. Input Tensors (x and y) x = torch.tensor([[10.0, 20.0], [30.0, 40.0]], dtype=dtype) y = torch.tensor([[-1.0, -2.0], [-3.0, -4.0]], dtype=dtype) @@ -31,7 +31,7 @@ def _test_where_fn(self, dtype): condition = torch.tensor([[True, False], [False, True]], dtype=torch.bool) # --- 1. CPU Reference Calculation and Tolerance Setting --- - + if dtype in self.FLOAT8_DTYPES: # FP8: Use float32 as reference type for comparison x_ref = x.cpu().to(torch.float32) @@ -45,16 +45,16 @@ def _test_where_fn(self, dtype): rtol = 1e-5 atol = 1e-5 - condition_ref = condition.cpu() + condition_ref = condition.cpu() res_ref = torch.where(condition_ref, x_ref, y_ref) # --- 2. XPU Operation (Default) --- x_xpu = x.xpu() y_xpu = y.xpu() condition_xpu = condition.xpu() - + res_xpu = torch.where(condition_xpu, x_xpu, y_xpu) - + # Prepare XPU result for comparison (must match res_ref dtype) if dtype in self.FLOAT8_DTYPES: # FP8: Convert XPU result to float32 @@ -67,11 +67,11 @@ def _test_where_fn(self, dtype): self.assertEqual(res_ref, res_xpu_to_compare, rtol=rtol, atol=atol) # --- 3. Test the version with out= argument --- - + # Create output tensor on XPU res_xpu_out = torch.empty_like(res_xpu, dtype=dtype).xpu() torch.where(condition_xpu, x_xpu, y_xpu, out=res_xpu_out) - + # Prepare XPU 'out' result for comparison if dtype in self.FLOAT8_DTYPES: # FP8: Convert XPU result to float32 @@ -88,6 +88,6 @@ def test_where(self): """Test torch.where() correctness across all supported dtypes, including float8.""" for dtype in self.TEST_DTYPES: # Use string conversion for better subTest reporting - dtype_name = str(dtype).split('.')[-1] + dtype_name = str(dtype).split(".")[-1] with self.subTest(dtype=dtype_name): self._test_where_fn(dtype) From 5ebda5238f113ae7a4dce21b6395259083c223af Mon Sep 17 00:00:00 2001 From: yucai-intel <108388355+yucai-intel@users.noreply.github.com> Date: Tue, 14 Oct 2025 14:09:36 +0800 Subject: [PATCH 12/13] format --- test/regressions/test_where.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/regressions/test_where.py b/test/regressions/test_where.py index 9be02e7066..4cf4b79394 100644 --- a/test/regressions/test_where.py +++ b/test/regressions/test_where.py @@ -83,7 +83,6 @@ def _test_where_fn(self, dtype): # Compare: res_ref vs res_xpu_out_to_compare self.assertEqual(res_ref, res_xpu_out_to_compare, rtol=rtol, atol=atol) - def test_where(self): """Test torch.where() correctness across all supported dtypes, including float8.""" for dtype in self.TEST_DTYPES: From 85eaed681984fefe46e2b865529f7ec76859bcea Mon Sep 17 00:00:00 2001 From: "Cui, Yifeng" Date: Fri, 31 Oct 2025 14:00:46 +0800 Subject: [PATCH 13/13] Update test/regressions/test_where.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- test/regressions/test_where.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/regressions/test_where.py b/test/regressions/test_where.py index 4cf4b79394..f9071ce195 100644 --- a/test/regressions/test_where.py +++ b/test/regressions/test_where.py @@ -6,9 +6,9 @@ class TestTorchWhereMethod(TestCase): # Define float8 dtypes FLOAT8_DTYPES = ( - torch.float8_e5m2, torch.float8_e4m3fn, torch.float8_e4m3fnuz, + torch.float8_e5m2, torch.float8_e5m2fnuz, torch.float8_e8m0fnu, )