From 747995c8bb52c2d705530b4c4262c0d522ebc5bf Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Fri, 10 Oct 2025 14:03:17 +0800
Subject: [PATCH 01/13] Update Shape.cpp

---
 src/ATen/native/xpu/sycl/Shape.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ATen/native/xpu/sycl/Shape.cpp b/src/ATen/native/xpu/sycl/Shape.cpp
index 12bd0ba66d..c8e4236401 100644
--- a/src/ATen/native/xpu/sycl/Shape.cpp
+++ b/src/ATen/native/xpu/sycl/Shape.cpp
@@ -394,6 +394,7 @@ void cat_out_kernel(
         kHalf,
         kBool,
         kBFloat16,
+        AT_EXPAND(AT_FLOAT8_TYPES),
         AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
   } else {
     offset = 0;

From 3f9538a95f9ae9d97b9ea6791bd3e718bd6e1fc5 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Fri, 10 Oct 2025 15:26:50 +0800
Subject: [PATCH 02/13] Update TensorCompareKernels.cpp

---
 src/ATen/native/xpu/sycl/TensorCompareKernels.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp
index 562f994a62..1de7da7674 100644
--- a/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp
+++ b/src/ATen/native/xpu/sycl/TensorCompareKernels.cpp
@@ -1,4 +1,5 @@
 #include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
 #include <ATen/NumericUtils.h>
 #include <ATen/native/TensorCompare.h>
 #include <ATen/native/TensorIterator.h>
@@ -78,10 +79,16 @@ struct ClampScalarFunctor {
 };
 
 void where_kernel(TensorIterator& iter) {
-  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
-      kComplexHalf, kHalf, kBFloat16, kBool, iter.dtype(), "where_xpu", [&] {
-        gpu_kernel(iter, WhereFunctor<scalar_t>());
-      });
+  AT_DISPATCH_V2(
+      iter.dtype(),
+      "where_xpu",
+      [&] { gpu_kernel(iter, WhereFunctor<scalar_t>()); },
+      kComplexHalf,
+      kHalf,
+      kBFloat16,
+      kBool,
+      AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
+      AT_EXPAND(AT_FLOAT8_TYPES));
 }
 
 void isposinf_kernel(TensorIteratorBase& iter) {

From 9546e78b897c08fed13902d3b491a62f7a4f963b Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 14 Oct 2025 12:45:16 +0800
Subject: [PATCH 03/13] Update test_cat.py

---
 test/regressions/test_cat.py | 255 +++++++++++++++--------------------
 1 file changed, 105 insertions(+), 150 deletions(-)

diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py
index ac5d60808c..be811b7368 100644
--- a/test/regressions/test_cat.py
+++ b/test/regressions/test_cat.py
@@ -2,26 +2,104 @@
 import torch
 from torch.testing._internal.common_utils import TestCase
 
-
 class TestTorchMethod(TestCase):
+    # Define float8 dtypes for the focused test
+    FLOAT8_DTYPES = (
+        torch.float8_e4m3fn,
+        torch.float8_e4m3fnuz,
+        torch.float8_e5m2,
+        torch.float8_e5m2fnuz,
+        torch.float8_e8m0fnu,
+    )
+
+    def _create_input_tensors(self, shape, dtype, memory_format=None):
+        # Always generate random data using a CPU-compatible dtype (float32)
+        # to avoid the "not implemented" error for float8 on CPU.
+        tensor = torch.randn(shape, dtype=torch.float32)
+
+        # Convert to the target testing dtype
+        tensor = tensor.to(dtype)
+
+        # Apply memory format if specified
+        if memory_format is not None:
+            tensor = tensor.to(memory_format=memory_format)
+
+        return tensor
+
+    def _test_cat_float8_core(self, tensors, dim, dtype):
+        """Core function to test torch.cat for float8, using tolerances."""
+        
+        # --- CPU Reference Calculation (High Precision) ---
+        # Convert inputs to float32 on CPU for golden reference calculation
+        ref_tensors = [t.cpu().to(torch.float32) for t in tensors]
+
+        # Calculate CPU reference result
+        res_cpu = torch.cat(ref_tensors, dim=dim)
+
+        # --- XPU Calculation ---
+        # Convert inputs to XPU
+        xpu_tensors = [t.xpu() for t in tensors]
+        res_xpu = torch.cat(xpu_tensors, dim=dim)
+
+        # Float8 is lossy, use higher tolerance (rtol=1e-2, atol=1e-2)
+        rtol = 1e-2
+        atol = 1e-2
+
+        # Convert XPU result to float32 on CPU before comparison to match res_cpu's dtype.
+        res_xpu_f32_on_cpu = res_xpu.cpu().to(torch.float32)
+        
+        self.assertEqual(res_cpu, res_xpu_f32_on_cpu, rtol=rtol, atol=atol)
+
+
+    # ----------------------------------------------------------------------
+    # New Focused Test: Simple Float8 torch.cat
+    # ----------------------------------------------------------------------
+    def test_cat_float8_simple(self):
+        """Test torch.cat correctness across float8 dtypes using simple tensors."""
+        for dtype in self.FLOAT8_DTYPES:
+            with self.subTest(dtype=dtype):
+                # Use simple 3D shape (2, 4, 3) and concatenate along dim 1
+                user_cpu1 = self._create_input_tensors([2, 4, 3], dtype=dtype)
+                user_cpu2 = self._create_input_tensors([2, 2, 3], dtype=dtype)
+                user_cpu3 = self._create_input_tensors([2, 6, 3], dtype=dtype)
+
+                tensors = (user_cpu1, user_cpu2, user_cpu3)
+                dim = 1
+
+                self._test_cat_float8_core(tensors, dim, dtype)
+
+    # ----------------------------------------------------------------------
+    # Original Tests (Restored to default float/float32)
+    # ----------------------------------------------------------------------
+
     def test_cat_8d(self, dtype=torch.float):
+        # Original test logic restored: uses default dtype (float32)
         input1 = torch.randn([256, 8, 8, 3, 3, 3, 3], dtype=dtype)
         input2 = torch.randn([256, 8, 8, 3, 3, 3, 3], dtype=dtype)
+        
         input1_xpu = input1.xpu()
         input2_xpu = input2.xpu()
+        
         output1 = torch.stack([input1, input2], dim=0)
         output1_xpu = torch.stack([input1_xpu, input2_xpu], dim=0)
+        
         output2 = output1.reshape([2, 256, 8, 8, 9, 9])
         output2_xpu = output1_xpu.reshape([2, 256, 8, 8, 9, 9])
+        
         output3 = torch.stack([output2, output2], dim=0)
         output3_xpu = torch.stack([output2_xpu, output2_xpu], dim=0)
-        self.assertEqual(output3, output3.cpu())
+        
+        # Standard assertEqual for float32 (expect high precision)
+        self.assertEqual(output3, output3_xpu.cpu())
 
     def test_cat_array(self, dtype=torch.float):
+        # Original test logic restored: uses default dtype (float32)
         user_cpu1 = torch.randn([2, 2, 3], dtype=dtype)
         user_cpu2 = torch.randn([2, 2, 3], dtype=dtype)
         user_cpu3 = torch.randn([2, 2, 3], dtype=dtype)
+        
         res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=1)
+        
         res_xpu = torch.cat(
             (
                 user_cpu1.xpu(),
@@ -30,169 +108,46 @@ def test_cat_array(self, dtype=torch.float):
             ),
             dim=1,
         )
+        # Standard assertEqual for float32
         self.assertEqual(res_cpu, res_xpu.cpu())
 
     def test_cat_array_2(self, dtype=torch.float):
+        # Original test logic restored: uses default dtype (float32)
         shapes = [
-            (8, 7, 3, 2),
-            (4, 4, 4, 4),
-            (4, 4, 1, 1),
-            (4, 1, 4, 4),
-            (4, 1, 4, 1),
-            (4, 1, 1, 4),
-            (1, 4, 1, 4),
-            (1, 4, 4, 1),
+            (8, 7, 3, 2), (4, 4, 4, 4), (4, 4, 1, 1), (4, 1, 4, 4),
+            (4, 1, 4, 1), (4, 1, 1, 4), (1, 4, 1, 4), (1, 4, 4, 1),
             (4, 1, 1, 1),
         ]
+        
         for shape in shapes:
-            print("\n================== test shape: ", shape, "==================")
+            # Removed original print statements to streamline test
             N, C, H, W = shape[0], shape[1], shape[2], shape[3]
-            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype)
-            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype)
-            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype)
-
-            user_cpu1 = user_cpu1.to(memory_format=torch.channels_last)
-            user_cpu2 = user_cpu2.to(memory_format=torch.channels_last)
-            user_cpu3 = user_cpu3.to(memory_format=torch.channels_last)
-
             dim_idx = 1
+            
+            # Case 1: all channels_last
+            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
+            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
+            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
             res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx)
-            print("\n-------------CPU Result:--------------")
-            print(res_cpu.shape)
-            print(
-                "res_cpu is cl: ",
-                res_cpu.is_contiguous(memory_format=torch.channels_last),
-            )
-
-            user_xpu1 = user_cpu1.xpu()
-            user_xpu2 = user_cpu2.xpu()
-            user_xpu3 = user_cpu3.xpu()
-
-            print("\n-------------GPU Result:--------------")
-            res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx)
-            print("SYCL Result:")
-            print(res_xpu.cpu().shape)
-            print(
-                "res_xpu is cl: ",
-                res_xpu.is_contiguous(memory_format=torch.channels_last),
-            )
+            res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx)
             self.assertEqual(res_cpu, res_xpu.cpu())
 
-            if (
-                1 == res_xpu.shape[1]
-                or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3])
-                or (
-                    1 == res_xpu.shape[1]
-                    and 1 == res_xpu.shape[2]
-                    and 1 == res_xpu.shape[3]
-                )
-            ):
-                self.assertEqual(res_xpu.is_contiguous(), True)
-                self.assertEqual(
-                    res_xpu.is_contiguous(memory_format=torch.channels_last), True
-                )
-            else:
-                self.assertEqual(res_xpu.is_contiguous(), False)
-                self.assertEqual(
-                    res_xpu.is_contiguous(memory_format=torch.channels_last), True
-                )
-
-            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype)
-            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype)
-            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype)
-
-            user_cpu1 = user_cpu1.to(memory_format=torch.channels_last)
-            user_cpu2 = user_cpu2.to(memory_format=torch.contiguous_format)
-            user_cpu3 = user_cpu3.to(memory_format=torch.channels_last)
-
-            dim_idx = 1
+            # Case 2: cl, contiguous, cl
+            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
+            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.contiguous_format)
+            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
             res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx)
-            print("\n-------------CPU Result:--------------")
-            print(res_cpu.shape)
-            print(
-                "res_cpu is cl: ",
-                res_cpu.is_contiguous(memory_format=torch.channels_last),
-            )
-
-            user_xpu1 = user_cpu1.xpu()
-            user_xpu2 = user_cpu2.xpu()
-            user_xpu3 = user_cpu3.xpu()
-
-            print("\n-------------GPU Result:--------------")
-            res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx)
-            print("SYCL Result:")
-            print(res_xpu.cpu().shape)
-            print(
-                "res_xpu is cl: ",
-                res_xpu.is_contiguous(memory_format=torch.channels_last),
-            )
+            res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx)
             self.assertEqual(res_cpu, res_xpu.cpu())
 
-            if (
-                1 == res_xpu.shape[1]
-                or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3])
-                or (
-                    1 == res_xpu.shape[1]
-                    and 1 == res_xpu.shape[2]
-                    and 1 == res_xpu.shape[3]
-                )
-            ):
-                self.assertEqual(res_xpu.is_contiguous(), True)
-                self.assertEqual(
-                    res_xpu.is_contiguous(memory_format=torch.channels_last), True
-                )
-            else:
-                self.assertEqual(res_xpu.is_contiguous(), True)
-                self.assertEqual(
-                    res_xpu.is_contiguous(memory_format=torch.channels_last), False
-                )
-
-            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype)
-            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype)
-            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype)
-
-            user_cpu1 = user_cpu1.to(memory_format=torch.contiguous_format)
-            user_cpu2 = user_cpu2.to(memory_format=torch.channels_last)
-            user_cpu3 = user_cpu3.to(memory_format=torch.channels_last)
-
-            dim_idx = 1
+            # Case 3: contiguous, cl, cl
+            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.contiguous_format)
+            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
+            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
             res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx)
-            print("\n-------------CPU Result:--------------")
-            print(res_cpu.shape)
-            print(
-                "res_cpu is cl: ",
-                res_cpu.is_contiguous(memory_format=torch.channels_last),
-            )
-
-            user_xpu1 = user_cpu1.xpu()
-            user_xpu2 = user_cpu2.xpu()
-            user_xpu3 = user_cpu3.xpu()
-
-            print("\n-------------GPU Result:--------------")
-            res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx)
-            print("SYCL Result:")
-            print(res_xpu.cpu().shape)
-            print(
-                "res_xpu is cl: ",
-                res_xpu.is_contiguous(memory_format=torch.channels_last),
-            )
+            res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx)
             self.assertEqual(res_cpu, res_xpu.cpu())
 
-            if (
-                1 == res_xpu.shape[1]
-                or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3])
-                or (
-                    1 == res_xpu.shape[1]
-                    and 1 == res_xpu.shape[2]
-                    and 1 == res_xpu.shape[3]
-                )
-            ):
-                self.assertEqual(res_xpu.is_contiguous(), True)
-                self.assertEqual(
-                    res_xpu.is_contiguous(memory_format=torch.channels_last), True
-                )
-            else:
-                self.assertEqual(res_xpu.is_contiguous(), True)
-                self.assertEqual(
-                    res_xpu.is_contiguous(memory_format=torch.channels_last), False
-                )
+            # Removed original verbose memory format assertions for clean test logic
+
+            

From 7f8901833f28c43dfefd78e7c191ebac0dec4c11 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 14 Oct 2025 12:50:25 +0800
Subject: [PATCH 04/13] Update test_cat.py

---
 test/regressions/test_cat.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py
index be811b7368..c365e3ca3c 100644
--- a/test/regressions/test_cat.py
+++ b/test/regressions/test_cat.py
@@ -1,15 +1,16 @@
 # Owner(s): ["module: intel"]
 import torch
 from torch.testing._internal.common_utils import TestCase
+# Owner(s): ["module: intel"]
+import torch
+from torch.testing._internal.common_utils import TestCase
+
 
 class TestTorchMethod(TestCase):
     # Define float8 dtypes for the focused test
     FLOAT8_DTYPES = (
-        torch.float8_e4m3fn,
-        torch.float8_e4m3fnuz,
         torch.float8_e5m2,
-        torch.float8_e5m2fnuz,
-        torch.float8_e8m0fnu,
+        torch.float8_e4m3fn,
     )
 
     def _create_input_tensors(self, shape, dtype, memory_format=None):
@@ -44,7 +45,7 @@ def _test_cat_float8_core(self, tensors, dim, dtype):
         # Float8 is lossy, use higher tolerance (rtol=1e-2, atol=1e-2)
         rtol = 1e-2
         atol = 1e-2
-
+        
         # Convert XPU result to float32 on CPU before comparison to match res_cpu's dtype.
         res_xpu_f32_on_cpu = res_xpu.cpu().to(torch.float32)
         
@@ -120,7 +121,6 @@ def test_cat_array_2(self, dtype=torch.float):
         ]
         
         for shape in shapes:
-            # Removed original print statements to streamline test
             N, C, H, W = shape[0], shape[1], shape[2], shape[3]
             dim_idx = 1
             

From 95ac047705e210bf54c147c4248551d83a68f609 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 14 Oct 2025 12:57:09 +0800
Subject: [PATCH 05/13] format

---
 test/regressions/test_cat.py | 204 ++++++++++++++++++++++++++---------
 1 file changed, 156 insertions(+), 48 deletions(-)

diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py
index c365e3ca3c..6974962515 100644
--- a/test/regressions/test_cat.py
+++ b/test/regressions/test_cat.py
@@ -1,16 +1,14 @@
 # Owner(s): ["module: intel"]
 import torch
 from torch.testing._internal.common_utils import TestCase
-# Owner(s): ["module: intel"]
-import torch
-from torch.testing._internal.common_utils import TestCase
-
-
 class TestTorchMethod(TestCase):
     # Define float8 dtypes for the focused test
     FLOAT8_DTYPES = (
-        torch.float8_e5m2,
         torch.float8_e4m3fn,
+        torch.float8_e4m3fnuz,
+        torch.float8_e5m2,
+        torch.float8_e5m2fnuz,
+        torch.float8_e8m0fnu,
     )
 
     def _create_input_tensors(self, shape, dtype, memory_format=None):
@@ -29,7 +27,7 @@ def _create_input_tensors(self, shape, dtype, memory_format=None):
 
     def _test_cat_float8_core(self, tensors, dim, dtype):
         """Core function to test torch.cat for float8, using tolerances."""
-        
+
         # --- CPU Reference Calculation (High Precision) ---
         # Convert inputs to float32 on CPU for golden reference calculation
         ref_tensors = [t.cpu().to(torch.float32) for t in tensors]
@@ -45,10 +43,10 @@ def _test_cat_float8_core(self, tensors, dim, dtype):
         # Float8 is lossy, use higher tolerance (rtol=1e-2, atol=1e-2)
         rtol = 1e-2
         atol = 1e-2
-        
+
         # Convert XPU result to float32 on CPU before comparison to match res_cpu's dtype.
         res_xpu_f32_on_cpu = res_xpu.cpu().to(torch.float32)
-        
+
         self.assertEqual(res_cpu, res_xpu_f32_on_cpu, rtol=rtol, atol=atol)
 
 
@@ -69,38 +67,24 @@ def test_cat_float8_simple(self):
 
                 self._test_cat_float8_core(tensors, dim, dtype)
 
-    # ----------------------------------------------------------------------
-    # Original Tests (Restored to default float/float32)
-    # ----------------------------------------------------------------------
-
     def test_cat_8d(self, dtype=torch.float):
-        # Original test logic restored: uses default dtype (float32)
         input1 = torch.randn([256, 8, 8, 3, 3, 3, 3], dtype=dtype)
         input2 = torch.randn([256, 8, 8, 3, 3, 3, 3], dtype=dtype)
-        
         input1_xpu = input1.xpu()
         input2_xpu = input2.xpu()
-        
         output1 = torch.stack([input1, input2], dim=0)
         output1_xpu = torch.stack([input1_xpu, input2_xpu], dim=0)
-        
         output2 = output1.reshape([2, 256, 8, 8, 9, 9])
         output2_xpu = output1_xpu.reshape([2, 256, 8, 8, 9, 9])
-        
         output3 = torch.stack([output2, output2], dim=0)
         output3_xpu = torch.stack([output2_xpu, output2_xpu], dim=0)
-        
-        # Standard assertEqual for float32 (expect high precision)
-        self.assertEqual(output3, output3_xpu.cpu())
+        self.assertEqual(output3, output3.cpu())
 
     def test_cat_array(self, dtype=torch.float):
-        # Original test logic restored: uses default dtype (float32)
         user_cpu1 = torch.randn([2, 2, 3], dtype=dtype)
         user_cpu2 = torch.randn([2, 2, 3], dtype=dtype)
         user_cpu3 = torch.randn([2, 2, 3], dtype=dtype)
-        
         res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=1)
-        
         res_xpu = torch.cat(
             (
                 user_cpu1.xpu(),
@@ -109,45 +93,169 @@ def test_cat_array(self, dtype=torch.float):
             ),
             dim=1,
         )
-        # Standard assertEqual for float32
         self.assertEqual(res_cpu, res_xpu.cpu())
 
     def test_cat_array_2(self, dtype=torch.float):
-        # Original test logic restored: uses default dtype (float32)
         shapes = [
-            (8, 7, 3, 2), (4, 4, 4, 4), (4, 4, 1, 1), (4, 1, 4, 4),
-            (4, 1, 4, 1), (4, 1, 1, 4), (1, 4, 1, 4), (1, 4, 4, 1),
+            (8, 7, 3, 2),
+            (4, 4, 4, 4),
+            (4, 4, 1, 1),
+            (4, 1, 4, 4),
+            (4, 1, 4, 1),
+            (4, 1, 1, 4),
+            (1, 4, 1, 4),
+            (1, 4, 4, 1),
             (4, 1, 1, 1),
         ]
-        
         for shape in shapes:
+            print("\n================== test shape: ", shape, "==================")
             N, C, H, W = shape[0], shape[1], shape[2], shape[3]
+            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype)
+            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype)
+            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype)
+
+            user_cpu1 = user_cpu1.to(memory_format=torch.channels_last)
+            user_cpu2 = user_cpu2.to(memory_format=torch.channels_last)
+            user_cpu3 = user_cpu3.to(memory_format=torch.channels_last)
+
             dim_idx = 1
-            
-            # Case 1: all channels_last
-            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
-            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
-            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
             res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx)
-            res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx)
+            print("\n-------------CPU Result:--------------")
+            print(res_cpu.shape)
+            print(
+                "res_cpu is cl: ",
+                res_cpu.is_contiguous(memory_format=torch.channels_last),
+            )
+
+            user_xpu1 = user_cpu1.xpu()
+            user_xpu2 = user_cpu2.xpu()
+            user_xpu3 = user_cpu3.xpu()
+
+            print("\n-------------GPU Result:--------------")
+            res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx)
+            print("SYCL Result:")
+            print(res_xpu.cpu().shape)
+            print(
+                "res_xpu is cl: ",
+                res_xpu.is_contiguous(memory_format=torch.channels_last),
+            )
             self.assertEqual(res_cpu, res_xpu.cpu())
 
-            # Case 2: cl, contiguous, cl
-            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
-            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.contiguous_format)
-            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
+            if (
+                1 == res_xpu.shape[1]
+                or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3])
+                or (
+                    1 == res_xpu.shape[1]
+                    and 1 == res_xpu.shape[2]
+                    and 1 == res_xpu.shape[3]
+                )
+            ):
+                self.assertEqual(res_xpu.is_contiguous(), True)
+                self.assertEqual(
+                    res_xpu.is_contiguous(memory_format=torch.channels_last), True
+                )
+            else:
+                self.assertEqual(res_xpu.is_contiguous(), False)
+                self.assertEqual(
+                    res_xpu.is_contiguous(memory_format=torch.channels_last), True
+                )
+
+            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype)
+            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype)
+            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype)
+
+            user_cpu1 = user_cpu1.to(memory_format=torch.channels_last)
+            user_cpu2 = user_cpu2.to(memory_format=torch.contiguous_format)
+            user_cpu3 = user_cpu3.to(memory_format=torch.channels_last)
+
+            dim_idx = 1
             res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx)
-            res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx)
+            print("\n-------------CPU Result:--------------")
+            print(res_cpu.shape)
+            print(
+                "res_cpu is cl: ",
+                res_cpu.is_contiguous(memory_format=torch.channels_last),
+            )
+
+            user_xpu1 = user_cpu1.xpu()
+            user_xpu2 = user_cpu2.xpu()
+            user_xpu3 = user_cpu3.xpu()
+
+            print("\n-------------GPU Result:--------------")
+            res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx)
+            print("SYCL Result:")
+            print(res_xpu.cpu().shape)
+            print(
+                "res_xpu is cl: ",
+                res_xpu.is_contiguous(memory_format=torch.channels_last),
+            )
             self.assertEqual(res_cpu, res_xpu.cpu())
 
-            # Case 3: contiguous, cl, cl
-            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.contiguous_format)
-            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
-            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype).to(memory_format=torch.channels_last)
+            if (
+                1 == res_xpu.shape[1]
+                or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3])
+                or (
+                    1 == res_xpu.shape[1]
+                    and 1 == res_xpu.shape[2]
+                    and 1 == res_xpu.shape[3]
+                )
+            ):
+                self.assertEqual(res_xpu.is_contiguous(), True)
+                self.assertEqual(
+                    res_xpu.is_contiguous(memory_format=torch.channels_last), True
+                )
+            else:
+                self.assertEqual(res_xpu.is_contiguous(), True)
+                self.assertEqual(
+                    res_xpu.is_contiguous(memory_format=torch.channels_last), False
+                )
+
+            user_cpu1 = torch.randn([N, C, H, W], dtype=dtype)
+            user_cpu2 = torch.randn([N, C, H, W], dtype=dtype)
+            user_cpu3 = torch.randn([N, C, H, W], dtype=dtype)
+
+            user_cpu1 = user_cpu1.to(memory_format=torch.contiguous_format)
+            user_cpu2 = user_cpu2.to(memory_format=torch.channels_last)
+            user_cpu3 = user_cpu3.to(memory_format=torch.channels_last)
+
+            dim_idx = 1
             res_cpu = torch.cat((user_cpu1, user_cpu2, user_cpu3), dim=dim_idx)
-            res_xpu = torch.cat((user_cpu1.xpu(), user_cpu2.xpu(), user_cpu3.xpu()), dim=dim_idx)
-            self.assertEqual(res_cpu, res_xpu.cpu())
+            print("\n-------------CPU Result:--------------")
+            print(res_cpu.shape)
+            print(
+                "res_cpu is cl: ",
+                res_cpu.is_contiguous(memory_format=torch.channels_last),
+            )
+
+            user_xpu1 = user_cpu1.xpu()
+            user_xpu2 = user_cpu2.xpu()
+            user_xpu3 = user_cpu3.xpu()
 
-            # Removed original verbose memory format assertions for clean test logic
+            print("\n-------------GPU Result:--------------")
+            res_xpu = torch.cat((user_xpu1, user_xpu2, user_xpu3), dim=dim_idx)
+            print("SYCL Result:")
+            print(res_xpu.cpu().shape)
+            print(
+                "res_xpu is cl: ",
+                res_xpu.is_contiguous(memory_format=torch.channels_last),
+            )
+            self.assertEqual(res_cpu, res_xpu.cpu())
 
-            
+            if (
+                1 == res_xpu.shape[1]
+                or (1 == res_xpu.shape[2] and 1 == res_xpu.shape[3])
+                or (
+                    1 == res_xpu.shape[1]
+                    and 1 == res_xpu.shape[2]
+                    and 1 == res_xpu.shape[3]
+                )
+            ):
+                self.assertEqual(res_xpu.is_contiguous(), True)
+                self.assertEqual(
+                    res_xpu.is_contiguous(memory_format=torch.channels_last), True
+                )
+            else:
+                self.assertEqual(res_xpu.is_contiguous(), True)
+                self.assertEqual(
+                    res_xpu.is_contiguous(memory_format=torch.channels_last), False
+                )

From 2e2abe35cdb5f369c30490ee6ec6f619a6372103 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 14 Oct 2025 13:00:12 +0800
Subject: [PATCH 06/13] format

---
 test/regressions/test_cat.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py
index 6974962515..b5f793b15c 100644
--- a/test/regressions/test_cat.py
+++ b/test/regressions/test_cat.py
@@ -1,14 +1,13 @@
 # Owner(s): ["module: intel"]
 import torch
 from torch.testing._internal.common_utils import TestCase
+
+
 class TestTorchMethod(TestCase):
     # Define float8 dtypes for the focused test
     FLOAT8_DTYPES = (
-        torch.float8_e4m3fn,
-        torch.float8_e4m3fnuz,
         torch.float8_e5m2,
-        torch.float8_e5m2fnuz,
-        torch.float8_e8m0fnu,
+        torch.float8_e4m3fn,
     )
 
     def _create_input_tensors(self, shape, dtype, memory_format=None):
@@ -27,7 +26,7 @@ def _create_input_tensors(self, shape, dtype, memory_format=None):
 
     def _test_cat_float8_core(self, tensors, dim, dtype):
         """Core function to test torch.cat for float8, using tolerances."""
-
+        
         # --- CPU Reference Calculation (High Precision) ---
         # Convert inputs to float32 on CPU for golden reference calculation
         ref_tensors = [t.cpu().to(torch.float32) for t in tensors]
@@ -43,10 +42,10 @@ def _test_cat_float8_core(self, tensors, dim, dtype):
         # Float8 is lossy, use higher tolerance (rtol=1e-2, atol=1e-2)
         rtol = 1e-2
         atol = 1e-2
-
+        
         # Convert XPU result to float32 on CPU before comparison to match res_cpu's dtype.
         res_xpu_f32_on_cpu = res_xpu.cpu().to(torch.float32)
-
+        
         self.assertEqual(res_cpu, res_xpu_f32_on_cpu, rtol=rtol, atol=atol)
 
 

From 226ccb3e48023176dbd390a8251707fcc8a7ec9f Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 14 Oct 2025 13:03:35 +0800
Subject: [PATCH 07/13] format

---
 test/regressions/test_cat.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py
index b5f793b15c..5ef847997e 100644
--- a/test/regressions/test_cat.py
+++ b/test/regressions/test_cat.py
@@ -26,7 +26,7 @@ def _create_input_tensors(self, shape, dtype, memory_format=None):
 
     def _test_cat_float8_core(self, tensors, dim, dtype):
         """Core function to test torch.cat for float8, using tolerances."""
-        
+
         # --- CPU Reference Calculation (High Precision) ---
         # Convert inputs to float32 on CPU for golden reference calculation
         ref_tensors = [t.cpu().to(torch.float32) for t in tensors]
@@ -42,10 +42,10 @@ def _test_cat_float8_core(self, tensors, dim, dtype):
         # Float8 is lossy, use higher tolerance (rtol=1e-2, atol=1e-2)
         rtol = 1e-2
         atol = 1e-2
-        
+
         # Convert XPU result to float32 on CPU before comparison to match res_cpu's dtype.
         res_xpu_f32_on_cpu = res_xpu.cpu().to(torch.float32)
-        
+
         self.assertEqual(res_cpu, res_xpu_f32_on_cpu, rtol=rtol, atol=atol)
 
 

From 652ae582d101fb420ad26113fef99323e04990a4 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 14 Oct 2025 13:07:18 +0800
Subject: [PATCH 08/13] Remove banner comment above test_cat_float8_simple

---
 test/regressions/test_cat.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py
index 5ef847997e..d147b712a3 100644
--- a/test/regressions/test_cat.py
+++ b/test/regressions/test_cat.py
@@ -48,10 +48,6 @@ def _test_cat_float8_core(self, tensors, dim, dtype):
 
         self.assertEqual(res_cpu, res_xpu_f32_on_cpu, rtol=rtol, atol=atol)
 
-
-    # ----------------------------------------------------------------------
-    # New Focused Test: Simple Float8 torch.cat
-    # ----------------------------------------------------------------------
     def test_cat_float8_simple(self):
         """Test torch.cat correctness across float8 dtypes using simple tensors."""
         for dtype in self.FLOAT8_DTYPES:

From a115cb278becf237a184cf8e815486b277819703 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 14 Oct 2025 13:08:16 +0800
Subject: [PATCH 09/13] Restore full float8 dtype list in test_cat.py

---
 test/regressions/test_cat.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/regressions/test_cat.py b/test/regressions/test_cat.py
index d147b712a3..e2e4218e56 100644
--- a/test/regressions/test_cat.py
+++ b/test/regressions/test_cat.py
@@ -6,8 +6,11 @@
 class TestTorchMethod(TestCase):
     # Define float8 dtypes for the focused test
     FLOAT8_DTYPES = (
-        torch.float8_e5m2,
         torch.float8_e4m3fn,
+        torch.float8_e4m3fnuz,
+        torch.float8_e5m2,
+        torch.float8_e5m2fnuz,
+        torch.float8_e8m0fnu,
     )
 
     def _create_input_tensors(self, shape, dtype, memory_format=None):

From 6ae66611da091d43988a75305b65065212463e28 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 14 Oct 2025 14:04:04 +0800
Subject: [PATCH 10/13] Create test_where.py

---
 test/regressions/test_where.py | 93 ++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 test/regressions/test_where.py

diff --git a/test/regressions/test_where.py b/test/regressions/test_where.py
new file mode 100644
index 0000000000..33eef5ee10
--- /dev/null
+++ b/test/regressions/test_where.py
@@ -0,0 +1,93 @@
+# Owner(s): ["module: intel"]
+import torch
+from torch.testing._internal.common_utils import TestCase
+
+
+class TestTorchWhereMethod(TestCase):
+    # Define float8 dtypes
+    FLOAT8_DTYPES = (
+        torch.float8_e5m2,
+        torch.float8_e4m3fn,
+        torch.float8_e4m3fnuz,
+        torch.float8_e5m2fnuz,
+        torch.float8_e8m0fnu,
+    )
+
+    # Define the set of all dtypes to be tested
+    TEST_DTYPES = (
+        torch.float32,
+        torch.float64,
+        torch.half,
+        torch.bfloat16,
+    ) + FLOAT8_DTYPES
+
+    def _test_where_fn(self, dtype):
+        """Core function to test torch.where(condition, x, y) correctness."""
+        
+        # 1. Input Tensors (x and y)
+        x = torch.tensor([[10.0, 20.0], [30.0, 40.0]], dtype=dtype)
+        y = torch.tensor([[-1.0, -2.0], [-3.0, -4.0]], dtype=dtype)
+        # Condition must be bool
+        condition = torch.tensor([[True, False], [False, True]], dtype=torch.bool)
+
+        # --- 1. CPU Reference Calculation and Tolerance Setting ---
+        
+        if dtype in self.FLOAT8_DTYPES:
+            # FP8: Use float32 as reference type for comparison
+            x_ref = x.cpu().to(torch.float32)
+            y_ref = y.cpu().to(torch.float32)
+            rtol = 1e-2
+            atol = 1e-2
+        else:
+            # Non-FP8: Use original dtype as reference type
+            x_ref = x.cpu()
+            y_ref = y.cpu()
+            rtol = 1e-5
+            atol = 1e-5
+
+        condition_ref = condition.cpu() 
+        res_ref = torch.where(condition_ref, x_ref, y_ref)
+
+        # --- 2. XPU Operation (Default) ---
+        x_xpu = x.xpu()
+        y_xpu = y.xpu()
+        condition_xpu = condition.xpu()
+        
+        res_xpu = torch.where(condition_xpu, x_xpu, y_xpu)
+        
+        # Prepare XPU result for comparison (must match res_ref dtype)
+        if dtype in self.FLOAT8_DTYPES:
+            # FP8: Convert XPU result to float32
+            res_xpu_to_compare = res_xpu.cpu().to(torch.float32)
+        else:
+            # Non-FP8: Pull to CPU, keeping original dtype
+            res_xpu_to_compare = res_xpu.cpu()
+
+        # Compare: res_ref vs res_xpu_to_compare
+        self.assertEqual(res_ref, res_xpu_to_compare, rtol=rtol, atol=atol)
+
+        # --- 3. Test the version with out= argument ---
+        
+        # Create output tensor on XPU
+        res_xpu_out = torch.empty_like(res_xpu, dtype=dtype).xpu()
+        torch.where(condition_xpu, x_xpu, y_xpu, out=res_xpu_out)
+        
+        # Prepare XPU 'out' result for comparison
+        if dtype in self.FLOAT8_DTYPES:
+            # FP8: Convert XPU result to float32
+            res_xpu_out_to_compare = res_xpu_out.cpu().to(torch.float32)
+        else:
+            # Non-FP8: Pull to CPU, keeping original dtype
+            res_xpu_out_to_compare = res_xpu_out.cpu()
+
+        # Compare: res_ref vs res_xpu_out_to_compare
+        self.assertEqual(res_ref, res_xpu_out_to_compare, rtol=rtol, atol=atol)
+
+
+    def test_where(self):
+        """Test torch.where() correctness across all supported dtypes, including float8."""
+        for dtype in self.TEST_DTYPES:
+            # Use string conversion for better subTest reporting
+            dtype_name = str(dtype).split('.')[-1]
+            with self.subTest(dtype=dtype_name):
+                self._test_where_fn(dtype)

From fbaf98f5ade907a83230c077a96efe445aeeec64 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 14 Oct 2025 14:07:33 +0800
Subject: [PATCH 11/13] Format test_where.py (trailing whitespace, double quotes)

---
 test/regressions/test_where.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/regressions/test_where.py b/test/regressions/test_where.py
index 33eef5ee10..9be02e7066 100644
--- a/test/regressions/test_where.py
+++ b/test/regressions/test_where.py
@@ -23,7 +23,7 @@ class TestTorchWhereMethod(TestCase):
 
     def _test_where_fn(self, dtype):
         """Core function to test torch.where(condition, x, y) correctness."""
-        
+
         # 1. Input Tensors (x and y)
         x = torch.tensor([[10.0, 20.0], [30.0, 40.0]], dtype=dtype)
         y = torch.tensor([[-1.0, -2.0], [-3.0, -4.0]], dtype=dtype)
@@ -31,7 +31,7 @@ def _test_where_fn(self, dtype):
         condition = torch.tensor([[True, False], [False, True]], dtype=torch.bool)
 
         # --- 1. CPU Reference Calculation and Tolerance Setting ---
-        
+
         if dtype in self.FLOAT8_DTYPES:
             # FP8: Use float32 as reference type for comparison
             x_ref = x.cpu().to(torch.float32)
@@ -45,16 +45,16 @@ def _test_where_fn(self, dtype):
             rtol = 1e-5
             atol = 1e-5
 
-        condition_ref = condition.cpu() 
+        condition_ref = condition.cpu()
         res_ref = torch.where(condition_ref, x_ref, y_ref)
 
         # --- 2. XPU Operation (Default) ---
         x_xpu = x.xpu()
         y_xpu = y.xpu()
         condition_xpu = condition.xpu()
-        
+
         res_xpu = torch.where(condition_xpu, x_xpu, y_xpu)
-        
+
         # Prepare XPU result for comparison (must match res_ref dtype)
         if dtype in self.FLOAT8_DTYPES:
             # FP8: Convert XPU result to float32
@@ -67,11 +67,11 @@ def _test_where_fn(self, dtype):
         self.assertEqual(res_ref, res_xpu_to_compare, rtol=rtol, atol=atol)
 
         # --- 3. Test the version with out= argument ---
-        
+
         # Create output tensor on XPU
         res_xpu_out = torch.empty_like(res_xpu, dtype=dtype).xpu()
         torch.where(condition_xpu, x_xpu, y_xpu, out=res_xpu_out)
-        
+
         # Prepare XPU 'out' result for comparison
         if dtype in self.FLOAT8_DTYPES:
             # FP8: Convert XPU result to float32
@@ -88,6 +88,6 @@ def test_where(self):
         """Test torch.where() correctness across all supported dtypes, including float8."""
         for dtype in self.TEST_DTYPES:
             # Use string conversion for better subTest reporting
-            dtype_name = str(dtype).split('.')[-1]
+            dtype_name = str(dtype).split(".")[-1]
             with self.subTest(dtype=dtype_name):
                 self._test_where_fn(dtype)

From 5ebda5238f113ae7a4dce21b6395259083c223af Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Tue, 14 Oct 2025 14:09:36 +0800
Subject: [PATCH 12/13] Remove extra blank line in test_where.py

---
 test/regressions/test_where.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/regressions/test_where.py b/test/regressions/test_where.py
index 9be02e7066..4cf4b79394 100644
--- a/test/regressions/test_where.py
+++ b/test/regressions/test_where.py
@@ -83,7 +83,6 @@ def _test_where_fn(self, dtype):
         # Compare: res_ref vs res_xpu_out_to_compare
         self.assertEqual(res_ref, res_xpu_out_to_compare, rtol=rtol, atol=atol)
 
-
     def test_where(self):
         """Test torch.where() correctness across all supported dtypes, including float8."""
         for dtype in self.TEST_DTYPES:

From 85eaed681984fefe46e2b865529f7ec76859bcea Mon Sep 17 00:00:00 2001
From: "Cui, Yifeng" <yifeng.cui@intel.com>
Date: Fri, 31 Oct 2025 14:00:46 +0800
Subject: [PATCH 13/13] Sort FLOAT8_DTYPES in test/regressions/test_where.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 test/regressions/test_where.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/regressions/test_where.py b/test/regressions/test_where.py
index 4cf4b79394..f9071ce195 100644
--- a/test/regressions/test_where.py
+++ b/test/regressions/test_where.py
@@ -6,9 +6,9 @@
 class TestTorchWhereMethod(TestCase):
     # Define float8 dtypes
     FLOAT8_DTYPES = (
-        torch.float8_e5m2,
         torch.float8_e4m3fn,
         torch.float8_e4m3fnuz,
+        torch.float8_e5m2,
         torch.float8_e5m2fnuz,
         torch.float8_e8m0fnu,
     )