diff --git a/.github/workflows/oneapi_githubactions_build.yml b/.github/workflows/oneapi_githubactions_build.yml
new file mode 100644
index 0000000000..abbb84d839
--- /dev/null
+++ b/.github/workflows/oneapi_githubactions_build.yml
@@ -0,0 +1,82 @@
+name: oneapi_ghactions_buildrun
+
+on:
+  push:
+    branches: [ "feature/sycl" ]
+
+defaults:
+  run:
+    shell: bash
+
+env:
+  BUILD_TYPE: RELEASE
+
+jobs:
+  buildrun:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Install software
+        run: |
+          sudo apt update
+          sudo apt install -y gpg-agent wget
+          # download the key to system keyring
+          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+          # add signed entry to apt sources and configure the APT client to use Intel repository:
+          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+          sudo apt update
+          sudo apt install -y intel-oneapi-hpc-toolkit
+
+      - name: Setup oneAPI
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          printenv >> $GITHUB_ENV
+          which icpx
+          icpx -v
+          cat /proc/cpuinfo
+
+      - uses: actions/checkout@v4
+
+      - name: Ccache for gh actions
+        uses: hendrikmuhs/ccache-action@v1.2.16
+        with:
+          key: ${{ github.job }}
+          max-size: 2000M
+
+      - name: Configure CMake
+        run: >
+          cmake
+          -B ${{github.workspace}}/build
+          -GNinja
+          -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+          -DCMAKE_C_COMPILER=icx
+          -DCMAKE_CXX_COMPILER=icpx
+          -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+          -DQUDA_TARGET_TYPE=SYCL
+          -DQUDA_SYCL_TARGETS=spir64_x86_64
+          -DCMAKE_CXX_FLAGS="-Wno-unsupported-floating-point-opt"
+          -DCMAKE_SYCL_FLAGS="-Xs -march=avx512 -Wno-unsupported-floating-point-opt"
+          -DSYCL_LINK_FLAGS="-Xs -march=avx512 -fsycl-device-code-split=per_kernel -fsycl-max-parallel-link-jobs=4 -flink-huge-device-code"
+          -DQUDA_DIRAC_COVDEV=OFF
+          -DQUDA_DIRAC_DISTANCE_PRECONDITIONING=OFF
+          -DQUDA_MULTIGRID=ON
+          -DQUDA_INTERFACE_QDPJIT=ON
+          -DQUDA_FAST_COMPILE_REDUCE=ON
+          -DQUDA_FAST_COMPILE_DSLASH=ON
+          -DQUDA_OPENMP=OFF
+          -DQUDA_MPI=ON
+          -DQUDA_PRECISION=12
+          -DQUDA_DIRAC_DEFAULT_OFF=ON
+          -DQUDA_DIRAC_STAGGERED=ON
+          -DQUDA_DIRAC_WILSON=ON
+
+      - name: Build
+        run: cmake --build ${{github.workspace}}/build
+
+      - name: Install
+        run: cmake --install ${{github.workspace}}/build
+
+      - name: Run
+        run: |
+          cd ${{github.workspace}}/build
+          #ctest
+          ctest -E 'invert_test_asqtad_single|invert_test_splitgrid_asqtad_single|unitarize_link_single'
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3fa2a949c..d91b1aa8bd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -211,7 +211,11 @@ if(QUDA_MAX_MULTI_RHS_TILE GREATER QUDA_MAX_MULTI_RHS)
   message(SEND_ERROR "QUDA_MAX_MULTI_RHS_TILE is greater than QUDA_MAX_MULTI_RHS")
 endif()
 
-set(QUDA_MAX_KERNEL_ARG_SIZE "4096" CACHE STRING "maximum static size of the kernel arguments in bytes passed to a kernel on the target architecture")
+if(${QUDA_TARGET_TYPE} STREQUAL "SYCL")
+  set(QUDA_MAX_KERNEL_ARG_SIZE "2048" CACHE STRING "maximum static size of the kernel arguments in bytes passed to a kernel on the target architecture")
+else()
+  set(QUDA_MAX_KERNEL_ARG_SIZE "4096" CACHE STRING "maximum static size of the kernel arguments in bytes passed to a kernel on the target architecture")
+endif()
 if(QUDA_MAX_KERNEL_ARG_SIZE GREATER 32764)
   message(SEND_ERROR "Maximum QUDA_MAX_KERNEL_ARG_SIZE is 32764")
 endif()
diff --git a/cmake/CMakeDetermineSYCLCompiler.cmake
b/cmake/CMakeDetermineSYCLCompiler.cmake
new file mode 100644
index 0000000000..144b288e92
--- /dev/null
+++ b/cmake/CMakeDetermineSYCLCompiler.cmake
@@ -0,0 +1,36 @@
+if(NOT CMAKE_SYCL_COMPILER)
+  set(CMAKE_SYCL_COMPILER ${CMAKE_CXX_COMPILER})
+endif()
+mark_as_advanced(CMAKE_SYCL_COMPILER)
+message(STATUS "The SYCL compiler is " ${CMAKE_SYCL_COMPILER})
+
+if(NOT CMAKE_SYCL_COMPILER_ID_RUN)
+  set(CMAKE_SYCL_COMPILER_ID_RUN 1)
+
+  # Try to identify the compiler.
+  set(CMAKE_SYCL_COMPILER_ID)
+  set(CMAKE_SYCL_PLATFORM_ID)
+  file(READ ${CMAKE_ROOT}/Modules/CMakePlatformId.h.in CMAKE_SYCL_COMPILER_ID_PLATFORM_CONTENT)
+
+  set(CMAKE_SYCL_COMPILER_ID_TEST_FLAGS_FIRST)
+  set(CMAKE_SYCL_COMPILER_ID_TEST_FLAGS)
+
+  set(CMAKE_CXX_COMPILER_ID_CONTENT "#if defined(__INTEL_LLVM_COMPILER)\n# define COMPILER_ID \"IntelLLVM\"\n")
+  string(APPEND CMAKE_CXX_COMPILER_ID_CONTENT "#elif defined(__clang__)\n# define COMPILER_ID \"Clang\"\n")
+  string(APPEND CMAKE_CXX_COMPILER_ID_CONTENT "#endif\n")
+  include(${CMAKE_ROOT}/Modules/CMakeDetermineCompilerId.cmake)
+  CMAKE_DETERMINE_COMPILER_ID(SYCL SYCLFLAGS CMakeCXXCompilerId.cpp)
+
+  _cmake_find_compiler_sysroot(SYCL)
+endif()
+
+
+#set(CMAKE_SYCL_COMPILER_ID_TEST_FLAGS_FIRST)
+#set(CMAKE_SYCL_COMPILER_ID_TEST_FLAGS "-c")
+#include(${CMAKE_ROOT}/Modules/CMakeDetermineCompilerId.cmake)
+#CMAKE_DETERMINE_COMPILER_ID(SYCL SYCLFLAGS CMakeCXXCompilerId.cpp)
+
+configure_file(${CMAKE_CURRENT_LIST_DIR}/CMakeSYCLCompiler.cmake.in
+  ${CMAKE_PLATFORM_INFO_DIR}/CMakeSYCLCompiler.cmake)
+
+set(CMAKE_SYCL_COMPILER_ENV_VAR "SYCL")
diff --git a/cmake/CMakeSYCLCompiler.cmake.in b/cmake/CMakeSYCLCompiler.cmake.in
new file mode 100644
index 0000000000..2dc0b7acd2
--- /dev/null
+++ b/cmake/CMakeSYCLCompiler.cmake.in
@@ -0,0 +1,3 @@
+set(CMAKE_SYCL_COMPILER "@CMAKE_SYCL_COMPILER@")
+set(CMAKE_SYCL_COMPILER_LOADED 1)
+set(CMAKE_SYCL_COMPILER_ENV_VAR "SYCL")
diff --git a/cmake/CMakeSYCLInformation.cmake b/cmake/CMakeSYCLInformation.cmake
new file mode 100644
index 0000000000..6572616fbf
--- /dev/null
+++ b/cmake/CMakeSYCLInformation.cmake
@@ -0,0 +1,47 @@
+if(NOT CMAKE_SYCL_COMPILE_OPTIONS_PIC)
+  set(CMAKE_SYCL_COMPILE_OPTIONS_PIC ${CMAKE_CXX_COMPILE_OPTIONS_PIC})
+endif()
+
+if(NOT CMAKE_SYCL_COMPILE_OPTIONS_PIE)
+  set(CMAKE_SYCL_COMPILE_OPTIONS_PIE ${CMAKE_CXX_COMPILE_OPTIONS_PIE})
+endif()
+if(NOT CMAKE_SYCL_LINK_OPTIONS_PIE)
+  set(CMAKE_SYCL_LINK_OPTIONS_PIE ${CMAKE_CXX_LINK_OPTIONS_PIE})
+endif()
+if(NOT CMAKE_SYCL_LINK_OPTIONS_NO_PIE)
+  set(CMAKE_SYCL_LINK_OPTIONS_NO_PIE ${CMAKE_CXX_LINK_OPTIONS_NO_PIE})
+endif()
+
+if(NOT CMAKE_SYCL_OUTPUT_EXTENSION)
+  set(CMAKE_SYCL_OUTPUT_EXTENSION ${CMAKE_CXX_OUTPUT_EXTENSION})
+endif()
+
+if(NOT CMAKE_INCLUDE_FLAG_SYCL)
+  set(CMAKE_INCLUDE_FLAG_SYCL ${CMAKE_INCLUDE_FLAG_CXX})
+endif()
+
+if(NOT CMAKE_SYCL_COMPILE_OPTIONS_EXPLICIT_LANGUAGE)
+  set(CMAKE_SYCL_COMPILE_OPTIONS_EXPLICIT_LANGUAGE ${CMAKE_CXX_COMPILE_OPTIONS_EXPLICIT_LANGUAGE})
+endif()
+
+if(NOT CMAKE_SYCL_DEPENDS_USE_COMPILER)
+  set(CMAKE_SYCL_DEPENDS_USE_COMPILER ${CMAKE_CXX_DEPENDS_USE_COMPILER})
+endif()
+
+if(NOT CMAKE_DEPFILE_FLAGS_SYCL)
+  set(CMAKE_DEPFILE_FLAGS_SYCL ${CMAKE_DEPFILE_FLAGS_CXX})
+endif()
+
+if(NOT CMAKE_SYCL_DEPFILE_FORMAT)
+  set(CMAKE_SYCL_DEPFILE_FORMAT ${CMAKE_CXX_DEPFILE_FORMAT})
+endif()
+
+if(NOT CMAKE_SYCL_COMPILE_OBJECT)
+  set(CMAKE_SYCL_COMPILE_OBJECT "<CMAKE_SYCL_COMPILER> <DEFINES> <INCLUDES> <FLAGS> -o <OBJECT> -c <SOURCE>")
+endif()
+
+if(NOT CMAKE_SYCL_LINK_EXECUTABLE)
+  set(CMAKE_SYCL_LINK_EXECUTABLE "<CMAKE_SYCL_COMPILER> <FLAGS> <CMAKE_SYCL_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+endif()
+
+set(CMAKE_SYCL_INFORMATION_LOADED 1)
diff --git a/cmake/CMakeTestSYCLCompiler.cmake
b/cmake/CMakeTestSYCLCompiler.cmake
new file mode 100644
index 0000000000..e7c7219631
--- /dev/null
+++ b/cmake/CMakeTestSYCLCompiler.cmake
@@ -0,0 +1 @@
+set(CMAKE_SYCL_COMPILER_WORKS 1 CACHE INTERNAL "")
diff --git a/include/array.h b/include/array.h
index 3005087c85..e5e65b1493 100644
--- a/include/array.h
+++ b/include/array.h
@@ -34,6 +34,8 @@ namespace quda
     return output;
   }
 
+  template <typename T, int n> constexpr T &elem(array<T, n> &a, int i) { return a[i]; }
+
   /**
    * @brief Element-wise maximum of two arrays
    * @param a first array
diff --git a/include/blas_helper.cuh b/include/blas_helper.cuh
index 806eef5f5e..fa92ec024b 100644
--- a/include/blas_helper.cuh
+++ b/include/blas_helper.cuh
@@ -193,10 +193,10 @@ namespace quda
       norm_t max_[n];
       // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued)
 #pragma unroll
-      for (int i = 0; i < n; i++) max_[i] = fmaxf(fabsf((norm_t)v[i].real()), fabsf((norm_t)v[i].imag()));
+      for (int i = 0; i < n; i++) max_[i] = quda::max(quda::abs((norm_t)v[i].real()), quda::abs((norm_t)v[i].imag()));
       norm_t scale = 0.0;
 #pragma unroll
-      for (int i = 0; i < n; i++) scale = fmaxf(max_[i], scale);
+      for (int i = 0; i < n; i++) scale = quda::max(max_[i], scale);
       norm = scale * fixedInvMaxValue<store_t>::value;
       return fdividef(fixedMaxValue<store_t>::value, scale);
     }
@@ -309,7 +309,7 @@ namespace quda
       memcpy(&vecTmp[6], &norm, sizeof(norm_t)); // pack the norm
       array vecTmp2;
       copy_and_scale(vecTmp2, &v_[0], scale_inv);
-      std::memcpy(&vecTmp, &vecTmp2, sizeof(vecTmp2));
+      memcpy(&vecTmp, &vecTmp2, sizeof(vecTmp2));
       // second do vectorized copy into memory
       vector_store(data.spinor, parity * cb_offset + x, vecTmp);
     }
diff --git a/include/clover_field_order.h b/include/clover_field_order.h
index af1a8d73c8..63949971e3 100644
--- a/include/clover_field_order.h
+++ b/include/clover_field_order.h
@@ -860,8 +860,8 @@ namespace quda {
       if (clover.Order() != QUDA_QDPJIT_CLOVER_ORDER) { errorQuda("Invalid clover order %d for this accessor", clover.Order()); }
-      offdiag = clover_ ? ((Float **)clover_)[0] : clover.data<Float **>(inverse)[0];
-      diag = clover_ ? ((Float **)clover_)[1] : clover.data<Float **>(inverse)[1];
+      offdiag = clover_ ? reinterpret_cast<Float **>(clover_)[0] : clover.data<Float **>(inverse)[0];
+      diag = clover_ ? reinterpret_cast<Float **>(clover_)[1] : clover.data<Float **>(inverse)[1];
     }
 
     QudaTwistFlavorType TwistFlavor() const { return twist_flavor; }
diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 6362b4677b..add4b30fc6 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -488,7 +488,7 @@ namespace quda
     template <typename T = void *> auto data() const
     {
       if (ghost_only) errorQuda("Not defined for ghost-only field");
-      return reinterpret_cast<T>(v.data());
+      return static_cast<T>(v.data());
     }
 
     /**
@@ -635,6 +635,7 @@
       @param[in] gdr_recv Whether we are using GDR on the receive side
     */
     int commsQuery(int d, const qudaStream_t &stream, bool gdr_send = false, bool gdr_recv = false) const;
+    void commsQuery(int n, int d[], bool done[], bool gdr_send, bool gdr_recv) const;
 
     /**
       @brief Wait on halo communication to complete
@@ -872,7 +873,6 @@ namespace quda
     /**
      * @brief Print the site vector
-     * @param[in] a The field we are printing from
      * @param[in] parity Parity index
      * @param[in] x_cb Checkerboard space-time index
      * @param[in] rank The rank we are requesting from (default is rank = 0)
diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h
index 46ad849079..ee935fadfe 100644
--- a/include/color_spinor_field_order.h
+++ b/include/color_spinor_field_order.h
@@ -241,8 +241,9 @@ namespace quda
       constexpr int M = nSpinBlock * nColor * nVec;
 #pragma unroll
       for (int i = 0; i < M; i++) {
-        vec_t tmp
-          = vector_load<vec_t>(reinterpret_cast<const vec_t *>(in + parity * offset_cb), x_cb * N + chi * M + i);
+        // vec_t tmp
+        //   = vector_load<vec_t>(reinterpret_cast<const vec_t *>(in + parity * offset_cb), x_cb * N + chi * M + i);
+        vec_t tmp = vector_load<vec_t>(in + parity * offset_cb, x_cb * N + chi * M + i);
         memcpy(&out[i], &tmp, sizeof(vec_t));
       }
     }
@@ -1061,7 +1062,7 @@
         for (int i = 0; i < length_ghost / 2; i++)
           max_[i] = fmaxf((norm_type)fabsf((norm_type)v[i]), (norm_type)fabsf((norm_type)v[i + length_ghost / 2]));
 #pragma unroll
-        for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale);
+        for (int i = 0; i < length_ghost / 2; i++) scale = max(max_[i], scale);
         ghost_norm[2 * dim + dir][parity * faceVolumeCB[dim] + x] = scale * fixedInvMaxValue<storeFloat>::value;
         scale_inv = fdividef(fixedMaxValue<storeFloat>::value, scale);
       }
@@ -1203,7 +1204,7 @@
         for (int i = 0; i < length / 2; i++)
           max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length / 2]));
 #pragma unroll
-        for (int i = 0; i < length / 2; i++) scale = fmaxf(max_[i], scale);
+        for (int i = 0; i < length / 2; i++) scale = max(max_[i], scale);
         norm[x + parity * norm_offset] = scale * fixedInvMaxValue<storeFloat>::value;
         scale_inv = fdividef(fixedMaxValue<storeFloat>::value, scale);
       }
@@ -1306,10 +1307,10 @@
         // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued)
 #pragma unroll
         for (int i = 0; i < length_ghost / 2; i++)
-          max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length_ghost / 2]));
+          max_[i] = max(abs((norm_type)v[i]), abs((norm_type)v[i + length_ghost / 2]));
         norm_type scale = 0.0;
 #pragma unroll
-        for (int i = 0; i < length_ghost / 2; i++) scale = fmaxf(max_[i], scale);
+        for (int i = 0; i < length_ghost / 2; i++) scale = max(max_[i], scale);
         norm_type nrm = scale * fixedInvMaxValue<storeFloat>::value;
         real scale_inv = fdividef(fixedMaxValue<storeFloat>::value, scale);
@@ -1411,11 +1412,10 @@
         norm_type max_[length / 2];
         // two-pass to increase ILP (assumes length divisible by two, e.g.
complex-valued)
 #pragma unroll
-        for (int i = 0; i < length / 2; i++)
-          max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length / 2]));
+        for (int i = 0; i < length / 2; i++) max_[i] = max(abs((norm_type)v[i]), abs((norm_type)v[i + length / 2]));
         norm_type scale = 0.0;
 #pragma unroll
-        for (int i = 0; i < length / 2; i++) scale = fmaxf(max_[i], scale);
+        for (int i = 0; i < length / 2; i++) scale = max(max_[i], scale);
         norm_type nrm = scale * fixedInvMaxValue<storeFloat>::value;
         real scale_inv = fdividef(fixedMaxValue<storeFloat>::value, scale);
diff --git a/include/comm_quda.h b/include/comm_quda.h
index 8496efab5a..66c5dba0e4 100644
--- a/include/comm_quda.h
+++ b/include/comm_quda.h
@@ -415,6 +415,7 @@ namespace quda
   void comm_start(MsgHandle *mh);
   void comm_wait(MsgHandle *mh);
   int comm_query(MsgHandle *mh);
+  // void comm_query(int n, MsgHandle *mh[], int *outcount, int array_of_indices[]);
 
   template <typename T> void comm_allreduce_sum(T &v);
   template <typename T> void comm_allreduce_max(T &v);
diff --git a/include/communicator_quda.h b/include/communicator_quda.h
index bf0e9ffba5..cdca2655a7 100644
--- a/include/communicator_quda.h
+++ b/include/communicator_quda.h
@@ -747,6 +747,8 @@ namespace quda
   int comm_query(MsgHandle *mh);
 
+  // void comm_query(int n, MsgHandle *mh[], int *outcount, int array_of_indices[]);
+
   template <typename T> T deterministic_reduce(T *array, int n)
   {
     std::sort(array, array + n); // sort reduction into ascending order for deterministic reduction
diff --git a/include/convert.h b/include/convert.h
index f56751873c..2d1026cb31 100644
--- a/include/convert.h
+++ b/include/convert.h
@@ -128,6 +128,7 @@ namespace quda
     }
   };
 
+#if 0
   /**
      @brief Fast float-to-integer round used on the device
   */
@@ -148,6 +149,7 @@ namespace quda
       return i;
     }
   };
+#endif
 
   /**
      @brief Regular double-to-integer round used on the host
@@ -156,6 +158,7 @@ namespace quda
   */
     constexpr int operator()(double d) { return static_cast<int>(rint(d)); }
   };
 
+#if 0
   /**
      @brief Fast double-to-integer round used on the device
   */
@@ -166,6 +169,7 @@ namespace quda
       return reinterpret_cast<int &>(d);
     }
   };
+#endif
 
   /**
     @brief Copy function which is trival between floating point
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 834b59425c..41883d7b1a 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -500,6 +500,12 @@ namespace quda
     dslash.template operator()(x_cb, s, parity);
   }
 
+  template
+  __forceinline__ __device__ void apply_dslash(D &dslash, int x_cb, int s, int parity, bool alive)
+  {
+    dslash.template operator()(x_cb, s, parity, alive);
+  }
+
 #ifdef NVSHMEM_COMMS
   /**
    * @brief helper function for nvshmem uber kernel to signal that the interior kernel has completed.
@@ -678,7 +684,8 @@ namespace quda { } - __forceinline__ __device__ void operator()(int, int s, int parity) + template + __forceinline__ __device__ void operator()(int, int s, int parity, bool alive = true) { typename Arg::D dslash(*this); // for full fields set parity from z thread index else use arg setting @@ -686,10 +693,11 @@ namespace quda if ((kernel_type == INTERIOR_KERNEL || kernel_type == UBER_KERNEL) && target::block_idx().x < static_cast(arg.pack_blocks)) { - // first few blocks do packing kernel - typename Arg::template P packer; - packer(arg, s, 1 - parity, dslash.twist_pack()); // flip parity since pack is on input - + if (!allthreads || alive) { + // first few blocks do packing kernel + typename Arg::template P packer; + packer(arg, s, 1 - parity, dslash.twist_pack()); // flip parity since pack is on input + } // we use that when running the exterior -- this is either // * an explicit call to the exterior when not merged with the interior or // * the interior with exterior_blocks > 0 @@ -731,9 +739,19 @@ namespace quda } #endif } else { - if (x_cb >= arg.threads) return; + if (x_cb >= arg.threads) { + if constexpr (allthreads) + alive = false; + else + return; + } + + if constexpr (allthreads) { + apply_dslash(dslash, x_cb, s, parity, alive); + } else { + apply_dslash(dslash, x_cb, s, parity); + } - apply_dslash(dslash, x_cb, s, parity); if constexpr (use_nvshmem_comms && kernel_type == UBER_KERNEL) { __syncthreads(); if (target::thread_idx().x == 0 && target::thread_idx().y == 0 && target::thread_idx().z == 0) diff --git a/include/externals/json.hpp b/include/externals/json.hpp index cb27e05811..443aa9a665 100644 --- a/include/externals/json.hpp +++ b/include/externals/json.hpp @@ -21895,7 +21895,7 @@ inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC /// @brief user-defined string literal for JSON values /// @sa https://json.nlohmann.me/api/basic_json/operator_literal_json/ JSON_HEDLEY_NON_NULL(1) -inline nlohmann::json operator "" _json(const char* s, std::size_t n) +inline nlohmann::json operator""_json(const char* s, std::size_t n) { return nlohmann::json::parse(s, s + n); } @@ -21903,7 +21903,7 @@ inline nlohmann::json operator "" _json(const char* s, std::size_t n) /// @brief user-defined string literal for JSON pointer /// @sa https://json.nlohmann.me/api/basic_json/operator_literal_json_pointer/ JSON_HEDLEY_NON_NULL(1) -inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n) +inline nlohmann::json::json_pointer operator""_json_pointer(const char* s, std::size_t n) { return nlohmann::json::json_pointer(std::string(s, n)); } diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 4561f1f21f..a851569746 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -1945,6 +1945,7 @@ namespace quda { LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) { for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data(i); + // for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? gauge_[i] : u.data(i); } __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const @@ -1991,6 +1992,7 @@ namespace quda { LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) { for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data(i); + // for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? 
gauge_[i] : u.data(i); } __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const diff --git a/include/kernels/block_orthogonalize.cuh b/include/kernels/block_orthogonalize.cuh index 3a70a4a096..e3e0868c7f 100644 --- a/include/kernels/block_orthogonalize.cuh +++ b/include/kernels/block_orthogonalize.cuh @@ -135,7 +135,8 @@ namespace quda { for (int c = 0; c < nColor; c++) arg.V(parity, x_cb, chirality * spinBlock + s, c, i) = v(s, c); } - __device__ __host__ inline void operator()(dim3 block, dim3 thread) + template // true if all threads in block will enter, even if out of range + __device__ __host__ inline void operator()(dim3 block, dim3 thread, bool alive = true) { int x_coarse = block.x; int x_fine_offset = thread.x; @@ -146,14 +147,20 @@ namespace quda { int x_cb[n_sites_per_thread]; for (int tx = 0; tx < n_sites_per_thread; tx++) { - int x_fine_offset_tx = x_fine_offset * n_sites_per_thread + tx; - // all threads with x_fine_offset greater than aggregate_size_cb are second parity - int parity_offset = (x_fine_offset_tx >= arg.aggregate_size_cb && fineSpin != 1) ? 1 : 0; - x_offset_cb[tx] = x_fine_offset_tx - parity_offset * arg.aggregate_size_cb; - parity[tx] = fineSpin == 1 ? chirality : arg.nParity == 2 ? parity_offset : arg.parity; - - x_cb[tx] = x_offset_cb[tx] >= arg.aggregate_size_cb ? 0 : - arg.coarse_to_fine[ (x_coarse*2 + parity[tx]) * arg.aggregate_size_cb + x_offset_cb[tx] ] - parity[tx]*arg.fineVolumeCB; + if (!allthreads || alive) { + int x_fine_offset_tx = x_fine_offset * n_sites_per_thread + tx; + // all threads with x_fine_offset greater than aggregate_size_cb are second parity + int parity_offset = (x_fine_offset_tx >= arg.aggregate_size_cb && fineSpin != 1) ? 1 : 0; + x_offset_cb[tx] = x_fine_offset_tx - parity_offset * arg.aggregate_size_cb; + parity[tx] = fineSpin == 1 ? chirality : arg.nParity == 2 ? parity_offset : arg.parity; + + x_cb[tx] = x_offset_cb[tx] >= arg.aggregate_size_cb ? + 0 : + arg.coarse_to_fine[(x_coarse * 2 + parity[tx]) * arg.aggregate_size_cb + x_offset_cb[tx]] + - parity[tx] * arg.fineVolumeCB; + } else { + x_offset_cb[tx] = arg.aggregate_size_cb; + } } if (fineSpin == 1) chirality = 0; // when using staggered chirality is mapped to parity diff --git a/include/kernels/block_transpose.cuh b/include/kernels/block_transpose.cuh index 4499dfa07c..d0eb84673e 100644 --- a/include/kernels/block_transpose.cuh +++ b/include/kernels/block_transpose.cuh @@ -73,7 +73,8 @@ namespace quda - V: spatial -> spin/color -> nVec The transpose uses shared memory to avoid strided memory accesses. */ - __device__ __host__ inline void operator()(int x_cb, int) + template // true if all threads in block will enter, even if out of range + __device__ __host__ inline void operator()(int x_cb, int, bool = true) { int parity_color = target::block_idx().z; int color = parity_color % Arg::nColor; diff --git a/include/kernels/clover_outer_product.cuh b/include/kernels/clover_outer_product.cuh index e887e65f0d..65953e4008 100644 --- a/include/kernels/clover_outer_product.cuh +++ b/include/kernels/clover_outer_product.cuh @@ -40,7 +40,7 @@ namespace quda { const ColorSpinorField &p_halo, cvector_ref &x, const ColorSpinorField &x_halo, const std::vector &coeff) : kernel_param(dim3(dim == -1 ? static_cast(x_halo.getDslashConstant().volume_4d_cb) : - x_halo.getDslashConstant().ghostFaceCB[dim], + x_halo.getDslashConstant().ghostFaceCB[dim == -1 ? 0 : dim], x.SiteSubset(), dim == -1 ? 
4 : dim)), n_src(p.size()), force(force), diff --git a/include/kernels/coarse_op_kernel.cuh b/include/kernels/coarse_op_kernel.cuh index 7a59bdfae5..f91f5873d6 100644 --- a/include/kernels/coarse_op_kernel.cuh +++ b/include/kernels/coarse_op_kernel.cuh @@ -1382,7 +1382,7 @@ namespace quda { }; template struct storeCoarseSharedAtomic_impl { - template void operator()(Args...) + template void operator()(Args...) { errorQuda("Shared-memory atomic aggregation not supported on host"); } @@ -1402,9 +1402,9 @@ namespace quda { template using Cache = SharedMemoryCache, DimsStaticConditional<2, 1, 1>>; template using Ops = KernelOps>; - template + template inline __device__ void operator()(VUV &vuv, bool isDiagonal, int coarse_x_cb, int coarse_parity, int i0, int j0, - int parity, const Pack &pack, const Ftor &ftor) + int parity, const Pack &pack, const Ftor &ftor, bool active) { using Arg = typename Ftor::Arg; const Arg &arg = ftor.arg; @@ -1468,57 +1468,61 @@ namespace quda { if (tx < Arg::coarseSpin*Arg::coarseSpin && (parity == 0 || arg.parity_flip == 1) ) { + if (!allthreads || active) { #pragma unroll - for (int i = 0; i < TileType::M; i++) { + for (int i = 0; i < TileType::M; i++) { #pragma unroll - for (int j = 0; j < TileType::N; j++) { - if (pack.dir == QUDA_IN_PLACE) { - // same as dir == QUDA_FORWARDS - arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,i0+i,j0+j, - X[i_block0+i][j_block0+j][x_][s_row][s_col]); - } else { - arg.Y_atomic.atomicAdd(dim_index,coarse_parity,coarse_x_cb,s_row,s_col,i0+i,j0+j, - Y[i_block0+i][j_block0+j][x_][s_row][s_col]); - - if (pack.dir == QUDA_BACKWARDS) { - arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_col,s_row,j0+j,i0+i, - conj(X[i_block0+i][j_block0+j][x_][s_row][s_col])); + for (int j = 0; j < TileType::N; j++) { + if (pack.dir == QUDA_IN_PLACE) { + // same as dir == QUDA_FORWARDS + arg.X_atomic.atomicAdd(0, coarse_parity, coarse_x_cb, s_row, s_col, i0 + i, j0 + j, + X[i_block0 + i][j_block0 + j][x_][s_row][s_col]); } else { - arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,i0+i,j0+j, - X[i_block0+i][j_block0+j][x_][s_row][s_col]); - } - - if (!arg.bidirectional) { - if (Arg::fineSpin != 1 && s_row == s_col) arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,i0+i,j0+j, - X[i_block0+i][j_block0+j][x_][s_row][s_col]); - else arg.X_atomic.atomicAdd(0,coarse_parity,coarse_x_cb,s_row,s_col,i0+i,j0+j, - -X[i_block0+i][j_block0+j][x_][s_row][s_col]); - } - } // dir == QUDA_IN_PLACE + arg.Y_atomic.atomicAdd(dim_index, coarse_parity, coarse_x_cb, s_row, s_col, i0 + i, j0 + j, + Y[i_block0 + i][j_block0 + j][x_][s_row][s_col]); + + if (pack.dir == QUDA_BACKWARDS) { + arg.X_atomic.atomicAdd(0, coarse_parity, coarse_x_cb, s_col, s_row, j0 + j, i0 + i, + conj(X[i_block0 + i][j_block0 + j][x_][s_row][s_col])); + } else { + arg.X_atomic.atomicAdd(0, coarse_parity, coarse_x_cb, s_row, s_col, i0 + i, j0 + j, + X[i_block0 + i][j_block0 + j][x_][s_row][s_col]); + } + + if (!arg.bidirectional) { + if (Arg::fineSpin != 1 && s_row == s_col) + arg.X_atomic.atomicAdd(0, coarse_parity, coarse_x_cb, s_row, s_col, i0 + i, j0 + j, + X[i_block0 + i][j_block0 + j][x_][s_row][s_col]); + else + arg.X_atomic.atomicAdd(0, coarse_parity, coarse_x_cb, s_row, s_col, i0 + i, j0 + j, + -X[i_block0 + i][j_block0 + j][x_][s_row][s_col]); + } + } // dir == QUDA_IN_PLACE + } } } } } }; - template + template __device__ __host__ void storeCoarseSharedAtomic(VUV &vuv, bool isDiagonal, int coarse_x_cb, int coarse_parity, - int i0, int j0, int parity, 
const Ftor &ftor) + int i0, int j0, int parity, const Ftor &ftor, bool active) { using Arg = typename Ftor::Arg; const Arg &arg = ftor.arg; switch (arg.dir) { case QUDA_BACKWARDS: - target::dispatch(vuv, isDiagonal, coarse_x_cb, coarse_parity, i0, j0, parity, - Pack(), ftor); + target::dispatch(vuv, isDiagonal, coarse_x_cb, coarse_parity, i0, j0, + parity, Pack(), ftor, active); break; case QUDA_FORWARDS: - target::dispatch(vuv, isDiagonal, coarse_x_cb, coarse_parity, i0, j0, parity, - Pack(), ftor); + target::dispatch(vuv, isDiagonal, coarse_x_cb, coarse_parity, i0, j0, + parity, Pack(), ftor, active); break; case QUDA_IN_PLACE: - target::dispatch(vuv, isDiagonal, coarse_x_cb, coarse_parity, i0, j0, parity, - Pack(), ftor); + target::dispatch(vuv, isDiagonal, coarse_x_cb, coarse_parity, i0, j0, + parity, Pack(), ftor, active); break; default: break;// do nothing @@ -1605,9 +1609,9 @@ namespace quda { } - template + template __device__ __host__ void computeVUV(const Ftor &ftor, int parity, int x_cb, int i0, int j0, int parity_coarse_, - int coarse_x_cb_) + int coarse_x_cb_, bool active) { using Arg = typename Ftor::Arg; const Arg &arg = ftor.arg; @@ -1634,7 +1638,7 @@ namespace quda { using Ctype = decltype(make_tile_C, false>(arg.vuvTile)); Ctype vuv[Arg::coarseSpin * Arg::coarseSpin]; - multiplyVUV(vuv, arg, parity, x_cb, i0, j0); + if (!allthreads || active) multiplyVUV(vuv, arg, parity, x_cb, i0, j0); if (isDiagonal && !isFromCoarseClover) { #pragma unroll @@ -1642,8 +1646,8 @@ namespace quda { } if (arg.shared_atomic) - storeCoarseSharedAtomic(vuv, isDiagonal, coarse_x_cb, coarse_parity, i0, j0, parity, ftor); - else + storeCoarseSharedAtomic(vuv, isDiagonal, coarse_x_cb, coarse_parity, i0, j0, parity, ftor, active); + else if (!allthreads || active) storeCoarseGlobalAtomic(vuv, isDiagonal, coarse_x_cb, coarse_parity, i0, j0, arg); } @@ -1721,17 +1725,24 @@ namespace quda { @param[in] parity_c_row parity * output color row @param[in] c_col output coarse color column */ - __device__ __host__ inline void operator()(int x_cb, int parity_c_row, int c_col) + template + __device__ __host__ inline void operator()(int x_cb, int parity_c_row, int c_col, bool active = true) { - int parity, parity_coarse, x_coarse_cb, c_row; - target::dispatch(parity_coarse, x_coarse_cb, parity, x_cb, parity_c_row, c_row, c_col, arg); - - if (parity > 1) return; - if (c_row >= arg.vuvTile.M_tiles) return; - if (c_col >= arg.vuvTile.N_tiles) return; - if (!arg.shared_atomic && x_cb >= arg.fineVolumeCB) return; - - computeVUV(*this, parity, x_cb, c_row * arg.vuvTile.M, c_col * arg.vuvTile.N, parity_coarse, x_coarse_cb); + int parity = 0, parity_coarse = 0, x_coarse_cb = 0, c_row = 0; + if (!allthreads || active) + target::dispatch(parity_coarse, x_coarse_cb, parity, x_cb, parity_c_row, c_row, c_col, arg); + + // if (parity > 1) return; + // if (c_row >= arg.vuvTile.M_tiles) return; + // if (c_col >= arg.vuvTile.N_tiles) return; + // if (!arg.shared_atomic && x_cb >= arg.fineVolumeCB) return; + if (parity > 1) active = false; + if (c_row >= arg.vuvTile.M_tiles) active = false; + if (c_col >= arg.vuvTile.N_tiles) active = false; + if (!arg.shared_atomic && x_cb >= arg.fineVolumeCB) active = false; + + computeVUV(*this, parity, x_cb, c_row * arg.vuvTile.M, c_col * arg.vuvTile.N, parity_coarse, + x_coarse_cb, active); } }; @@ -1751,17 +1762,24 @@ namespace quda { @param[in] parity_c_row parity * output color row @param[in] c_col output coarse color column */ - __device__ __host__ inline void operator()(int x_cb, int 
parity_c_row, int c_col) + template + __device__ __host__ inline void operator()(int x_cb, int parity_c_row, int c_col, bool active = true) { - int parity, parity_coarse, x_coarse_cb, c_row; - target::dispatch(parity_coarse, x_coarse_cb, parity, x_cb, parity_c_row, c_row, c_col, arg); - - if (parity > 1) return; - if (c_row >= arg.vuvTile.M_tiles) return; - if (c_col >= arg.vuvTile.N_tiles) return; - if (!arg.shared_atomic && x_cb >= arg.fineVolumeCB) return; - - computeVUV(*this, parity, x_cb, c_row * arg.vuvTile.M, c_col * arg.vuvTile.N, parity_coarse, x_coarse_cb); + int parity = 0, parity_coarse = 0, x_coarse_cb = 0, c_row = 0; + if (!allthreads || active) + target::dispatch(parity_coarse, x_coarse_cb, parity, x_cb, parity_c_row, c_row, c_col, arg); + + // if (parity > 1) return; + // if (c_row >= arg.vuvTile.M_tiles) return; + // if (c_col >= arg.vuvTile.N_tiles) return; + // if (!arg.shared_atomic && x_cb >= arg.fineVolumeCB) return; + if (parity > 1) active = false; + if (c_row >= arg.vuvTile.M_tiles) active = false; + if (c_col >= arg.vuvTile.N_tiles) active = false; + if (!arg.shared_atomic && x_cb >= arg.fineVolumeCB) active = false; + + computeVUV(*this, parity, x_cb, c_row * arg.vuvTile.M, c_col * arg.vuvTile.N, parity_coarse, + x_coarse_cb, active); } }; diff --git a/include/kernels/color_spinor_pack.cuh b/include/kernels/color_spinor_pack.cuh index e51aa85a4d..bd122f9adf 100644 --- a/include/kernels/color_spinor_pack.cuh +++ b/include/kernels/color_spinor_pack.cuh @@ -209,9 +209,9 @@ namespace quda { } }; - template + template __device__ __host__ inline std::enable_if_t - compute_site_max(const Ftor &, int, int, int, int, int) + compute_site_max(const Ftor &, int, int, int, int, int, bool) { return static_cast(1.0); // dummy return for non-block float } @@ -219,24 +219,27 @@ namespace quda { /** Compute the max element over the spin-color components of a given site. 
*/ - template + template __device__ __host__ inline std::enable_if_t - compute_site_max(const Ftor &ftor, int src_idx, int x_cb, int spinor_parity, int spin_block, int color_block) + compute_site_max(const Ftor &ftor, int src_idx, int x_cb, int spinor_parity, int spin_block, int color_block, + bool active) { using real = typename Ftor::Arg::real; const int Ms = spins_per_thread(Ftor::Arg::nSpin); const int Mc = colors_per_thread(Ftor::Arg::nColor); complex thread_max = {0.0, 0.0}; + if (!allthreads || active) { #pragma unroll - for (int spin_local=0; spin_local z = ftor.arg.in[src_idx](spinor_parity, x_cb, s, c); - thread_max.real(max(thread_max.real(), abs(z.real()))); - thread_max.imag(max(thread_max.imag(), abs(z.imag()))); + for (int color_local = 0; color_local < Mc; color_local++) { + int c = color_block + color_local; + complex z = ftor.arg.in[src_idx](spinor_parity, x_cb, s, c); + thread_max.real(max(thread_max.real(), abs(z.real()))); + thread_max.imag(max(thread_max.imag(), abs(z.imag()))); + } } } @@ -306,7 +309,8 @@ namespace quda { } static constexpr const char *filename() { return KERNEL_FILE; } - __device__ __host__ void operator()(int tid, int spin_color_block, int parity) + template + __device__ __host__ void operator()(int tid, int spin_color_block, int parity, bool active = true) { const int Ms = spins_per_thread(Arg::nSpin); const int Mc = colors_per_thread(Arg::nColor); @@ -322,21 +326,22 @@ namespace quda { int src_idx; int x_cb = indexFromFaceIndex(src_idx, dim, dir, ghost_idx, parity, arg); - auto max = compute_site_max(*this, src_idx, x_cb, spinor_parity, spin_block, color_block); + auto max = compute_site_max(*this, src_idx, x_cb, spinor_parity, spin_block, color_block, active); + if (!allthreads || active) { #pragma unroll - for (int spin_local=0; spin_local + __device__ __host__ inline void operator()(int x_cb, int src_flavor, int parity, bool active = true) { using namespace linalg; // for Cholesky const int clover_parity = arg.nParity == 2 ? parity : arg.parity; @@ -214,15 +215,21 @@ namespace quda { const int flavor = src_flavor % 2; int my_flavor_idx = x_cb + flavor * arg.volumeCB; - fermion in = arg.in[src_idx](my_flavor_idx, spinor_parity); - in.toRel(); // change to chiral basis here - + fermion in; int chirality = flavor; // relabel flavor as chirality + Mat A; + if (!allthreads || active) { + in = arg.in[src_idx](my_flavor_idx, spinor_parity); + in.toRel(); // change to chiral basis here + A = arg.clover(x_cb, clover_parity, chirality); + } else { + in = fermion {}; + A = Mat {}; + } + // (C + i mu gamma_5 tau_3 - epsilon tau_1 ) [note: appropriate signs carried in arg.a / arg.b] const complex a(0.0, chirality == 0 ? 
arg.a : -arg.a); - Mat A = arg.clover(x_cb, clover_parity, chirality); - SharedMemoryCache cache {*this}; half_fermion in_chi[n_flavor]; // flavor array of chirally projected fermion @@ -251,27 +258,32 @@ namespace quda { out_chi[flavor] += arg.b * in_chi[1 - flavor]; } - if (arg.inverse) { - if (arg.dynamic_clover) { - Mat A2 = A.square(); - A2 += arg.a2_minus_b2; - Cholesky, N> cholesky(A2); + if (!allthreads || active) { + if (arg.inverse) { + if (arg.dynamic_clover) { + Mat A2 = A.square(); + A2 += arg.a2_minus_b2; + Cholesky, N> cholesky(A2); #pragma unroll - for (int flavor = 0; flavor < n_flavor; flavor++) - out_chi[flavor] = static_cast(0.25) * cholesky.backward(cholesky.forward(out_chi[flavor])); - } else { - Mat Ainv = arg.cloverInv(x_cb, clover_parity, chirality); + for (int flavor = 0; flavor < n_flavor; flavor++) + out_chi[flavor] = static_cast(0.25) * cholesky.backward(cholesky.forward(out_chi[flavor])); + } else { + Mat Ainv = arg.cloverInv(x_cb, clover_parity, chirality); #pragma unroll - for (int flavor = 0; flavor < n_flavor; flavor++) - out_chi[flavor] = static_cast(2.0) * (Ainv * out_chi[flavor]); + for (int flavor = 0; flavor < n_flavor; flavor++) + out_chi[flavor] = static_cast(2.0) * (Ainv * out_chi[flavor]); + } } } swizzle(out_chi, chirality); // undo the flavor-chirality swizzle - fermion out = out_chi[0].chiral_reconstruct(0) + out_chi[1].chiral_reconstruct(1); - out.toNonRel(); // change basis back - arg.out[src_idx](my_flavor_idx, spinor_parity) = out; + if (!allthreads || active) { + fermion out = out_chi[0].chiral_reconstruct(0) + out_chi[1].chiral_reconstruct(1); + out.toNonRel(); // change basis back + + arg.out[src_idx](my_flavor_idx, spinor_parity) = out; + } } }; } diff --git a/include/kernels/dslash_coarse.cuh b/include/kernels/dslash_coarse.cuh index 086a7def06..f630e66c0d 100644 --- a/include/kernels/dslash_coarse.cuh +++ b/include/kernels/dslash_coarse.cuh @@ -339,7 +339,8 @@ namespace quda { } static constexpr const char *filename() { return KERNEL_FILE; } - __device__ __host__ inline void operator()(int x_cb_color_offset, int src_parity, int sMd) + template + __device__ __host__ inline void operator()(int x_cb_color_offset, int src_parity, int sMd, bool active = true) { int x_cb = x_cb_color_offset; int color_offset = 0; @@ -368,11 +369,16 @@ namespace quda { typename CoarseDslashParams::array_t out {}; if (Arg::dslash) { - applyDslash(out, dim, dir, x_cb, src_idx, parity, s, color_block, color_offset, arg); + if (!allthreads || active) { + applyDslash(out, dim, dir, x_cb, src_idx, parity, s, color_block, color_offset, arg); + } target::dispatch(out, dir, dim, *this); } - if (doBulk() && Arg::clover && dir==0 && dim==0) applyClover(out, arg, x_cb, src_idx, parity, s, color_block, color_offset); + if (!allthreads || active) { + if (doBulk() && Arg::clover && dir == 0 && dim == 0) + applyClover(out, arg, x_cb, src_idx, parity, s, color_block, color_offset); + } if (dir==0 && dim==0) { const int my_spinor_parity = (arg.nParity == 2) ? 
parity : 0; @@ -380,13 +386,17 @@ namespace quda { // reduce down to the first group of column-split threads out = warp_combine(out); + if (!allthreads || active) { #pragma unroll - for (int color_local=0; color_local()) arg.out[src_idx](my_spinor_parity, x_cb, s, c) = out[color_local]; - else arg.out[src_idx](my_spinor_parity, x_cb, s, c) += out[color_local]; + for (int color_local = 0; color_local < Mc; color_local++) { + int c = color_block + color_local; // global color index + if (color_offset == 0) { + // if not halo we just store, else we accumulate + if (doBulk()) + arg.out[src_idx](my_spinor_parity, x_cb, s, c) = out[color_local]; + else + arg.out[src_idx](my_spinor_parity, x_cb, s, c) += out[color_local]; + } } } } diff --git a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh index 46e0ae876a..439cce5433 100644 --- a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh +++ b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh @@ -73,8 +73,8 @@ namespace quda template constexpr domainWall4DFusedM5(const Ftor &ftor) : KernelOpsT(ftor), arg(ftor.arg) { } static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation - template - __device__ __host__ __forceinline__ void operator()(int idx, int src_s, int parity) + template + __device__ __host__ __forceinline__ void operator()(int idx, int src_s, int parity, bool alive = true) { typedef typename mapper::type real; typedef ColorSpinor Vector; @@ -82,73 +82,74 @@ namespace quda int src_idx = src_s / arg.Ls; int s = src_s % arg.Ls; - bool active - = mykernel_type == EXTERIOR_KERNEL_ALL ? false : true; // is thread active (non-trival for fused kernel only) + bool active = mykernel_type != EXTERIOR_KERNEL_ALL; // is thread active (non-trival for fused kernel only) int thread_dim; // which dimension is thread working on (fused kernel only) auto coord = getCoords(arg, idx, s, parity, thread_dim); const int my_spinor_parity = arg.nParity == 2 ? 
parity : 0; Vector stencil_out; - applyWilson(stencil_out, arg, coord, parity, idx, thread_dim, active, src_idx); + if (!allthreads || alive) { + applyWilson(stencil_out, arg, coord, parity, idx, thread_dim, active, src_idx); + } Vector out; - constexpr bool shared = true; // Use shared memory - // In the following `x_cb` are all passed as `x_cb = 0`, since it will not be used if `shared = true`, and `shared = true` - if (active) { - - /****** - * Apply M5pre - */ - if (Arg::dslash5_type == Dslash5Type::DSLASH5_MOBIUS_PRE) { - constexpr bool sync = false; - out = d5(*this, stencil_out, - my_spinor_parity, 0, s, src_idx); - } + if (allthreads||active) { + /****** + * Apply M5pre + */ + if (Arg::dslash5_type == Dslash5Type::DSLASH5_MOBIUS_PRE) { + constexpr bool sync = false; + out = d5 + (*this, stencil_out, my_spinor_parity, 0, s, src_idx, alive&&active); + } } int xs = coord.x_cb + s * arg.dc.volume_4d_cb; if (Arg::dslash5_type == Dslash5Type::M5_INV_MOBIUS_M5_INV_DAG) { - /****** - * Apply the two M5inv's: - * this is actually y = 1 * x - kappa_b^2 * m5inv * D4 * in - * out = m5inv-dagger * y - */ - if (active) { - constexpr bool sync = false; - out = variableInv( - *this, stencil_out, my_spinor_parity, 0, s, src_idx); + /****** + * Apply the two M5inv's: + * this is actually y = 1 * x - kappa_b^2 * m5inv * D4 * in + * out = m5inv-dagger * y + */ + if (allthreads||active) { + constexpr bool sync = false; + out = variableInv + (*this, stencil_out, my_spinor_parity, 0, s, src_idx, alive&&active); } - Vector aggregate_external; - if (xpay && mykernel_type == INTERIOR_KERNEL) { - Vector x = arg.x[src_idx](xs, my_spinor_parity); - out = x + arg.a_5[s] * out; - } else if (mykernel_type != INTERIOR_KERNEL && active) { - Vector y = arg.y[src_idx](xs, my_spinor_parity); - aggregate_external = xpay ? arg.a_5[s] * out : out; - out = y + aggregate_external; - } + if (!allthreads||alive) { + Vector aggregate_external; + if (xpay && mykernel_type == INTERIOR_KERNEL) { + Vector x = arg.x[src_idx](xs, my_spinor_parity); + out = x + arg.a_5[s] * out; + } else if (mykernel_type != INTERIOR_KERNEL && active) { + Vector y = arg.y[src_idx](xs, my_spinor_parity); + aggregate_external = xpay ? 
arg.a_5[s] * out : out; + out = y + aggregate_external; + } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.y[src_idx](xs, my_spinor_parity) = out; + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.y[src_idx](xs, my_spinor_parity) = out; - if (mykernel_type != INTERIOR_KERNEL && active) { - Vector x = arg.out[src_idx](xs, my_spinor_parity); - out = x + aggregate_external; + if (mykernel_type != INTERIOR_KERNEL && active) { + Vector x = arg.out[src_idx](xs, my_spinor_parity); + out = x + aggregate_external; + } } bool complete = isComplete(arg, coord); - if (complete && active) { - constexpr bool sync = true; - constexpr bool this_dagger = true; - // Then we apply the second m5inv-dag - out = variableInv( - *this, out, my_spinor_parity, 0, s, src_idx); - } + if (allthreads || (complete && active)) { + constexpr bool sync = true; + constexpr bool this_dagger = true; + // Then we apply the second m5inv-dag + auto tmp = variableInv + (*this, out, my_spinor_parity, 0, s, src_idx, alive && complete && active); + if (alive && complete && active) out = tmp; + } } else if (Arg::dslash5_type == Dslash5Type::DSLASH5_MOBIUS || Arg::dslash5_type == Dslash5Type::DSLASH5_MOBIUS_PRE_M5_MOB) { @@ -159,25 +160,28 @@ namespace quda * or out = m5mob * x - kappa_b^2 * m5pre *D4 * in (Dslash5Type::DSLASH5_PRE_MOBIUS_M5_MOBIUS) */ - if (active) { - if (Arg::dslash5_type == Dslash5Type::DSLASH5_MOBIUS) { out = stencil_out; } + if (allthreads || active) { + if (Arg::dslash5_type == Dslash5Type::DSLASH5_MOBIUS) { out = stencil_out; } - if (Arg::dslash5_type == Dslash5Type::DSLASH5_MOBIUS_PRE_M5_MOB) { - constexpr bool sync = false; - out = d5( - *this, stencil_out, my_spinor_parity, 0, s, src_idx); - } - } + if (Arg::dslash5_type == Dslash5Type::DSLASH5_MOBIUS_PRE_M5_MOB) { + constexpr bool sync = false; + out = d5(*this, stencil_out, my_spinor_parity, 0, s, src_idx, alive && active); + } + } if (xpay && mykernel_type == INTERIOR_KERNEL) { - Vector x = arg.x[src_idx](xs, my_spinor_parity); + Vector x; + if (!allthreads || alive) x = arg.x[src_idx](xs, my_spinor_parity); constexpr bool sync_m5mob = Arg::dslash5_type == Dslash5Type::DSLASH5_MOBIUS ? false : true; - x = d5( - *this, x, my_spinor_parity, 0, s, src_idx); - out = x + arg.a_5[s] * out; + x = d5(*this, x, my_spinor_parity, 0, s, src_idx, alive); + if (!allthreads || alive) out = x + arg.a_5[s] * out; } else if (mykernel_type != INTERIOR_KERNEL && active) { - Vector x = arg.out[src_idx](xs, my_spinor_parity); - out = x + (xpay ? arg.a_5[s] * out : out); + if (!allthreads || alive) { + Vector x = arg.out[src_idx](xs, my_spinor_parity); + out = x + (xpay ? arg.a_5[s] * out : out); + } } } else { @@ -191,20 +195,22 @@ namespace quda if (Arg::dslash5_type == Dslash5Type::M5_INV_MOBIUS) { // Apply the m5inv. constexpr bool sync = false; - out = variableInv( - *this, stencil_out, my_spinor_parity, 0, s, src_idx); + out = variableInv + (*this, stencil_out, my_spinor_parity, 0, s, src_idx, alive); } - if (xpay && mykernel_type == INTERIOR_KERNEL) { - Vector x = arg.x[src_idx](xs, my_spinor_parity); - out = x + arg.a_5[s] * out; - } else if (mykernel_type != INTERIOR_KERNEL && active) { - Vector x = arg.out[src_idx](xs, my_spinor_parity); - out = x + (xpay ? 
arg.a_5[s] * out : out); - } + if (!allthreads || alive) { + if (xpay && mykernel_type == INTERIOR_KERNEL) { + Vector x = arg.x[src_idx](xs, my_spinor_parity); + out = x + arg.a_5[s] * out; + } else if (mykernel_type != INTERIOR_KERNEL && active) { + Vector x = arg.out[src_idx](xs, my_spinor_parity); + out = x + (xpay ? arg.a_5[s] * out : out); + } + } bool complete = isComplete(arg, coord); - if (complete && active) { + if (allthreads || (complete && active)) { /****** * First apply M5inv, and then M5pre @@ -212,12 +218,13 @@ namespace quda if (Arg::dslash5_type == Dslash5Type::M5_INV_MOBIUS_M5_PRE) { // Apply the m5inv. constexpr bool sync_m5inv = false; - out = variableInv( - *this, out, my_spinor_parity, 0, s, src_idx); + auto tmp = variableInv + (*this, out, my_spinor_parity, 0, s, src_idx, alive && complete && active); // Apply the m5pre. constexpr bool sync_m5pre = true; - out = d5(*this, out, my_spinor_parity, - 0, s, src_idx); + tmp = d5 + (*this, tmp, my_spinor_parity, 0, s, src_idx, alive && complete && active); + if (alive && complete && active) out = tmp; } /****** @@ -226,16 +233,17 @@ namespace quda if (Arg::dslash5_type == Dslash5Type::M5_PRE_MOBIUS_M5_INV) { // Apply the m5pre. constexpr bool sync_m5pre = false; - out = d5(*this, out, my_spinor_parity, - 0, s, src_idx); + auto tmp = d5 + (*this, out, my_spinor_parity, 0, s, src_idx, alive && complete && active); // Apply the m5inv. constexpr bool sync_m5inv = true; - out = variableInv( - *this, out, my_spinor_parity, 0, s, src_idx); + tmp = variableInv + (*this, tmp, my_spinor_parity, 0, s, src_idx, alive && complete && active); + if (alive && complete && active) out = tmp; } } } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](xs, my_spinor_parity) = out; + if (alive && (mykernel_type != EXTERIOR_KERNEL_ALL || active)) arg.out[src_idx](xs, my_spinor_parity) = out; } }; diff --git a/include/kernels/dslash_domain_wall_m5.cuh b/include/kernels/dslash_domain_wall_m5.cuh index 9ea0419b8a..25433b46e4 100644 --- a/include/kernels/dslash_domain_wall_m5.cuh +++ b/include/kernels/dslash_domain_wall_m5.cuh @@ -215,9 +215,9 @@ namespace quda using Ops = std::conditional_t, NoKernelOps>; }; - template - __device__ __host__ inline Vector d5(const Ftor &ftor, const Vector &in, int parity, int x_cb, int s, int src_idx) + template + __device__ __host__ inline Vector d5(const Ftor &ftor, const Vector &in, int parity, int x_cb, int s, int src_idx, bool alive) { const Arg &arg = ftor.arg; int local_src_idx = target::thread_idx().y / arg.Ls; @@ -240,19 +240,21 @@ namespace quda cache.save(in.project(4, proj_dir)); cache.sync(); } - const int fwd_s = (s + 1) % arg.Ls; - const int fwd_idx = fwd_s * arg.volume_4d_cb + x_cb; - HalfVector half_in; - if constexpr (shared) { - half_in = cache.load(threadIdx.x, local_src_idx * arg.Ls + fwd_s, parity); - } else { - Vector full_in = arg.in[src_idx](fwd_idx, parity); - half_in = full_in.project(4, proj_dir); - } - if (s == arg.Ls - 1) { - out += (-arg.m_f * half_in).reconstruct(4, proj_dir); - } else { - out += half_in.reconstruct(4, proj_dir); + if (!allthreads || alive) { + const int fwd_s = (s + 1) % arg.Ls; + const int fwd_idx = fwd_s * arg.volume_4d_cb + x_cb; + HalfVector half_in; + if constexpr (shared) { + half_in = cache.load(threadIdx.x, local_src_idx * arg.Ls + fwd_s, parity); + } else { + Vector full_in = arg.in[src_idx](fwd_idx, parity); + half_in = full_in.project(4, proj_dir); + } + if (s == arg.Ls - 1) { + out += (-arg.m_f * half_in).reconstruct(4, proj_dir); + } 
else { + out += half_in.reconstruct(4, proj_dir); + } } } @@ -263,20 +265,22 @@ namespace quda cache.save(in.project(4, proj_dir)); cache.sync(); } - const int back_s = (s + arg.Ls - 1) % arg.Ls; - const int back_idx = back_s * arg.volume_4d_cb + x_cb; - HalfVector half_in; - if constexpr (shared) { - half_in = cache.load(threadIdx.x, local_src_idx * arg.Ls + back_s, parity); - } else { - Vector full_in = arg.in[src_idx](back_idx, parity); - half_in = full_in.project(4, proj_dir); - } - if (s == 0) { - out += (-arg.m_f * half_in).reconstruct(4, proj_dir); - } else { - out += half_in.reconstruct(4, proj_dir); - } + if (!allthreads || alive) { + const int back_s = (s + arg.Ls - 1) % arg.Ls; + const int back_idx = back_s * arg.volume_4d_cb + x_cb; + HalfVector half_in; + if constexpr (shared) { + half_in = cache.load(threadIdx.x, local_src_idx * arg.Ls + back_s, parity); + } else { + Vector full_in = arg.in[src_idx](back_idx, parity); + half_in = full_in.project(4, proj_dir); + } + if (s == 0) { + out += (-arg.m_f * half_in).reconstruct(4, proj_dir); + } else { + out += half_in.reconstruct(4, proj_dir); + } + } } } else { // use_half_vector @@ -291,40 +295,44 @@ namespace quda cache.sync(); } - { // forwards direction - const int fwd_s = (s + 1) % arg.Ls; - const int fwd_idx = fwd_s * arg.volume_4d_cb + x_cb; - const Vector in - = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + fwd_s, parity) : arg.in[src_idx](fwd_idx, parity); - constexpr int proj_dir = dagger ? +1 : -1; - if (s == arg.Ls - 1) { - out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir); - } else { - out += in.project(4, proj_dir).reconstruct(4, proj_dir); + if (!allthreads || alive) { + { // forwards direction + const int fwd_s = (s + 1) % arg.Ls; + const int fwd_idx = fwd_s * arg.volume_4d_cb + x_cb; + const Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + fwd_s, parity) : + arg.in[src_idx](fwd_idx, parity); + constexpr int proj_dir = dagger ? +1 : -1; + if (s == arg.Ls - 1) { + out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir); + } else { + out += in.project(4, proj_dir).reconstruct(4, proj_dir); + } } - } - { // backwards direction - const int back_s = (s + arg.Ls - 1) % arg.Ls; - const int back_idx = back_s * arg.volume_4d_cb + x_cb; - const Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + back_s, parity) : - arg.in[src_idx](back_idx, parity); - constexpr int proj_dir = dagger ? -1 : +1; - if (s == 0) { - out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir); - } else { - out += in.project(4, proj_dir).reconstruct(4, proj_dir); + { // backwards direction + const int back_s = (s + arg.Ls - 1) % arg.Ls; + const int back_idx = back_s * arg.volume_4d_cb + x_cb; + const Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + back_s, parity) : + arg.in[src_idx](back_idx, parity); + constexpr int proj_dir = dagger ? -1 : +1; + if (s == 0) { + out += (-arg.m_f * in.project(4, proj_dir)).reconstruct(4, proj_dir); + } else { + out += in.project(4, proj_dir).reconstruct(4, proj_dir); + } } } } // use_half_vector - if (type == Dslash5Type::DSLASH5_MOBIUS_PRE || type == Dslash5Type::M5_INV_MOBIUS_M5_PRE - || type == Dslash5Type::M5_PRE_MOBIUS_M5_INV) { - Vector diagonal = shared ? in : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); - out = coeff.alpha(s) * out + coeff.beta(s) * diagonal; - } else if (type == Dslash5Type::DSLASH5_MOBIUS) { - Vector diagonal = shared ? 
in : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); - out = coeff.kappa(s) * out + diagonal; + if (!allthreads || alive) { + if (type == Dslash5Type::DSLASH5_MOBIUS_PRE || type == Dslash5Type::M5_INV_MOBIUS_M5_PRE + || type == Dslash5Type::M5_PRE_MOBIUS_M5_INV) { + Vector diagonal = shared ? in : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + out = coeff.alpha(s) * out + coeff.beta(s) * diagonal; + } else if (type == Dslash5Type::DSLASH5_MOBIUS) { + Vector diagonal = shared ? in : arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + out = coeff.kappa(s) * out + diagonal; + } } return out; @@ -346,7 +354,8 @@ namespace quda @param[in] x_b Checkerboarded 4-d space-time index @param[in] s Ls dimension coordinate */ - __device__ __host__ inline void operator()(int x_cb, int src_s, int parity) + template + __device__ __host__ inline void operator()(int x_cb, int src_s, int parity, bool alive = true) { using real = typename Arg::real; coeff_type::value, Arg> coeff(arg); @@ -358,22 +367,24 @@ namespace quda constexpr bool sync = false; constexpr bool shared = false; - Vector out = d5(*this, Vector(), parity, x_cb, s, src_idx); - - if (Arg::xpay) { - if (Arg::type == Dslash5Type::DSLASH5_DWF) { - Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); - out = x + arg.a * out; - } else if (Arg::type == Dslash5Type::DSLASH5_MOBIUS_PRE) { - Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); - out = x + coeff.a(s) * out; - } else if (Arg::type == Dslash5Type::DSLASH5_MOBIUS) { - Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); - out = coeff.a(s) * x + out; + Vector out = d5(*this, Vector(), parity, x_cb, s, src_idx, alive); + + if (!allthreads || alive) { + if (Arg::xpay) { + if (Arg::type == Dslash5Type::DSLASH5_DWF) { + Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); + out = x + arg.a * out; + } else if (Arg::type == Dslash5Type::DSLASH5_MOBIUS_PRE) { + Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); + out = x + coeff.a(s) * out; + } else if (Arg::type == Dslash5Type::DSLASH5_MOBIUS) { + Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); + out = coeff.a(s) * x + out; + } } - } - arg.out[src_idx](s * arg.volume_4d_cb + x_cb, parity) = out; + arg.out[src_idx](s * arg.volume_4d_cb + x_cb, parity) = out; + } } }; @@ -398,9 +409,9 @@ namespace quda @param[in] x_b Checkerboarded 4-d space-time index @param[in] s_ Ls dimension coordinate */ - template + template __device__ __host__ inline Vector constantInv(const Ftor &ftor, const Vector &in, int parity, int x_cb, int s_, - int src_idx) + int src_idx, bool alive) { using Arg = typename Ftor::Arg; const Arg &arg = ftor.arg; @@ -421,23 +432,25 @@ namespace quda Vector out; - for (int s = 0; s < arg.Ls; s++) { + if (!allthreads || alive) { + for (int s = 0; s < arg.Ls; s++) { - Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : - arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); - { - int exp = s_ < s ? arg.Ls - s + s_ : s_ - s; - real factorR = inv * fpow(k, exp) * (s_ < s ? -arg.m_f : static_cast(1.0)); - constexpr int proj_dir = dagger ? -1 : +1; - out += factorR * (in.project(4, proj_dir)).reconstruct(4, proj_dir); - } + { + int exp = s_ < s ? arg.Ls - s + s_ : s_ - s; + real factorR = inv * fpow(k, exp) * (s_ < s ? -arg.m_f : static_cast(1.0)); + constexpr int proj_dir = dagger ? 
-1 : +1; + out += factorR * (in.project(4, proj_dir)).reconstruct(4, proj_dir); + } - { - int exp = s_ > s ? arg.Ls - s_ + s : s - s_; - real factorL = inv * fpow(k, exp) * (s_ > s ? -arg.m_f : static_cast(1.0)); - constexpr int proj_dir = dagger ? +1 : -1; - out += factorL * (in.project(4, proj_dir)).reconstruct(4, proj_dir); + { + int exp = s_ > s ? arg.Ls - s_ + s : s - s_; + real factorL = inv * fpow(k, exp) * (s_ > s ? -arg.m_f : static_cast(1.0)); + constexpr int proj_dir = dagger ? +1 : -1; + out += factorL * (in.project(4, proj_dir)).reconstruct(4, proj_dir); + } } } @@ -467,9 +480,9 @@ namespace quda @param[in] x_b Checkerboarded 4-d space-time index @param[in] s_ Ls dimension coordinate */ - template + template __device__ __host__ inline Vector variableInv(const Ftor &ftor, const Vector &in, int parity, int x_cb, int s_, - int src_idx) + int src_idx, bool alive) { const Arg &arg = ftor.arg; int local_src_idx = target::thread_idx().y / arg.Ls; @@ -486,30 +499,32 @@ namespace quda { // first do R constexpr int proj_dir = dagger ? -1 : +1; - if (shared) { - if (sync) { cache.sync(); } + if constexpr (shared) { + if constexpr (sync) { cache.sync(); } cache.save(in.project(4, proj_dir)); cache.sync(); } - int s = s_; - auto R = coeff.inv(); - HalfVector r; - for (int s_count = 0; s_count < arg.Ls; s_count++) { - auto factorR = (s_ < s ? -arg.m_f * R : R); - - if (shared) { - r += factorR * cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity); - } else { - Vector in = arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); - r += factorR * in.project(4, proj_dir); - } - - R *= coeff.kappa(s); - s = (s + arg.Ls - 1) % arg.Ls; - } - - out += r.reconstruct(4, proj_dir); + if (!allthreads || alive) { + int s = s_; + auto R = coeff.inv(); + HalfVector r; + for (int s_count = 0; s_count < arg.Ls; s_count++) { + auto factorR = (s_ < s ? -arg.m_f * R : R); + + if (shared) { + r += factorR * cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity); + } else { + Vector in = arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + r += factorR * in.project(4, proj_dir); + } + + R *= coeff.kappa(s); + s = (s + arg.Ls - 1) % arg.Ls; + } + + out += r.reconstruct(4, proj_dir); + } } { // second do L @@ -520,24 +535,26 @@ namespace quda cache.sync(); } - int s = s_; - auto L = coeff.inv(); - HalfVector l; - for (int s_count = 0; s_count < arg.Ls; s_count++) { - auto factorL = (s_ > s ? -arg.m_f * L : L); - - if (shared) { - l += factorL * cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity); - } else { - Vector in = arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); - l += factorL * in.project(4, proj_dir); - } - - L *= coeff.kappa(s); - s = (s + 1) % arg.Ls; - } - - out += l.reconstruct(4, proj_dir); + if (!allthreads || alive) { + int s = s_; + auto L = coeff.inv(); + HalfVector l; + for (int s_count = 0; s_count < arg.Ls; s_count++) { + auto factorL = (s_ > s ? -arg.m_f * L : L); + + if (shared) { + l += factorL * cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity); + } else { + Vector in = arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + l += factorL * in.project(4, proj_dir); + } + + L *= coeff.kappa(s); + s = (s + 1) % arg.Ls; + } + + out += l.reconstruct(4, proj_dir); + } } } else { // use_half_vector using Cache = std::conditional_t, const Ftor &>; @@ -548,44 +565,46 @@ namespace quda cache.sync(); } - { // first do R - constexpr int proj_dir = dagger ? -1 : +1; + if (!allthreads || alive) { + { // first do R + constexpr int proj_dir = dagger ? 
-1 : +1; + + int s = s_; + auto R = coeff.inv(); + HalfVector r; + for (int s_count = 0; s_count < arg.Ls; s_count++) { + auto factorR = (s_ < s ? -arg.m_f * R : R); - int s = s_; - auto R = coeff.inv(); - HalfVector r; - for (int s_count = 0; s_count < arg.Ls; s_count++) { - auto factorR = (s_ < s ? -arg.m_f * R : R); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + r += factorR * in.project(4, proj_dir); - Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : - arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); - r += factorR * in.project(4, proj_dir); + R *= coeff.kappa(s); + s = (s + arg.Ls - 1) % arg.Ls; + } - R *= coeff.kappa(s); - s = (s + arg.Ls - 1) % arg.Ls; + out += r.reconstruct(4, proj_dir); } - out += r.reconstruct(4, proj_dir); - } + { // second do L + constexpr int proj_dir = dagger ? +1 : -1; - { // second do L - constexpr int proj_dir = dagger ? +1 : -1; + int s = s_; + auto L = coeff.inv(); + HalfVector l; + for (int s_count = 0; s_count < arg.Ls; s_count++) { + auto factorL = (s_ > s ? -arg.m_f * L : L); - int s = s_; - auto L = coeff.inv(); - HalfVector l; - for (int s_count = 0; s_count < arg.Ls; s_count++) { - auto factorL = (s_ > s ? -arg.m_f * L : L); + Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : + arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); + l += factorL * in.project(4, proj_dir); - Vector in = shared ? cache.load(threadIdx.x, local_src_idx * arg.Ls + s, parity) : - arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); - l += factorL * in.project(4, proj_dir); + L *= coeff.kappa(s); + s = (s + 1) % arg.Ls; + } - L *= coeff.kappa(s); - s = (s + 1) % arg.Ls; + out += l.reconstruct(4, proj_dir); } - - out += l.reconstruct(4, proj_dir); } } // use_half_vector @@ -618,7 +637,8 @@ namespace quda @param[in] x_b Checkerboarded 4-d space-time index @param[in] s Ls dimension coordinate */ - __device__ __host__ inline void operator()(int x_cb, int src_s, int parity) + template + __device__ __host__ inline void operator()(int x_cb, int src_s, int parity, bool alive = true) { constexpr int nSpin = 4; using real = typename Arg::real; @@ -628,21 +648,25 @@ namespace quda int src_idx = src_s / arg.Ls; int s = src_s % arg.Ls; - Vector in = arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); - Vector out; + Vector in, out; + if (!allthreads || alive) { in = arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity); } constexpr bool sync = false; if constexpr (mobius_m5::var_inverse()) { // zMobius, must call variableInv - out = variableInv(*this, in, parity, x_cb, s, src_idx); + out + = variableInv(*this, in, parity, x_cb, s, src_idx, alive); } else { - out = constantInv(*this, in, parity, x_cb, s, src_idx); + out + = constantInv(*this, in, parity, x_cb, s, src_idx, alive); } - if (Arg::xpay) { - Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); - out = x + coeff.a(s) * out; - } + if (!allthreads || alive) { + if (Arg::xpay) { + Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); + out = x + coeff.a(s) * out; + } - arg.out[src_idx](s * arg.volume_4d_cb + x_cb, parity) = out; + arg.out[src_idx](s * arg.volume_4d_cb + x_cb, parity) = out; + } } }; diff --git a/include/kernels/dslash_mobius_eofa.cuh b/include/kernels/dslash_mobius_eofa.cuh index 49e65da6d7..3e2bb4e647 100644 --- a/include/kernels/dslash_mobius_eofa.cuh +++ b/include/kernels/dslash_mobius_eofa.cuh @@ -110,7 +110,8 @@ 
namespace quda } static constexpr const char *filename() { return KERNEL_FILE; } - __device__ __host__ inline void operator()(int x_cb, int src_s, int parity) + template + __device__ __host__ inline void operator()(int x_cb, int src_s, int parity, bool alive = true) { using real = typename Arg::real; typedef ColorSpinor Vector; @@ -121,7 +122,7 @@ namespace quda SharedMemoryCache cache {*this}; Vector out; - cache.save(arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity)); + if (!allthreads || alive) { cache.save(arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity)); } cache.sync(); auto Ls = arg.Ls; @@ -165,11 +166,13 @@ namespace quda } if (Arg::xpay) { // really axpy - Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); - out = arg.a * x + out; - } - } - arg.out[src_idx](s * arg.volume_4d_cb + x_cb, parity) = out; + if (!allthreads || alive) { + Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); + out = arg.a * x + out; + } + } + } + if (!allthreads || alive) { arg.out[src_idx](s * arg.volume_4d_cb + x_cb, parity) = out; } } }; @@ -196,7 +199,8 @@ namespace quda } static constexpr const char *filename() { return KERNEL_FILE; } - __device__ __host__ inline void operator()(int x_cb, int src_s, int parity) + template + __device__ __host__ inline void operator()(int x_cb, int src_s, int parity, bool alive = true) { using real = typename Arg::real; typedef ColorSpinor Vector; @@ -206,7 +210,7 @@ namespace quda const auto sherman_morrison = arg.sherman_morrison; SharedMemoryCache cache {*this}; - cache.save(arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity)); + if (!allthreads || alive) { cache.save(arg.in[src_idx](s * arg.volume_4d_cb + x_cb, parity)); } cache.sync(); Vector out; @@ -233,10 +237,12 @@ namespace quda } } if (Arg::xpay) { // really axpy - Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); - out = x + arg.a * out; + if (!allthreads || alive) { + Vector x = arg.x[src_idx](s * arg.volume_4d_cb + x_cb, parity); + out = x + arg.a * out; + } } - arg.out[src_idx](s * arg.volume_4d_cb + x_cb, parity) = out; + if (!allthreads || alive) { arg.out[src_idx](s * arg.volume_4d_cb + x_cb, parity) = out; } } }; diff --git a/include/kernels/dslash_ndeg_twisted_clover.cuh b/include/kernels/dslash_ndeg_twisted_clover.cuh index 049f129d14..625bf68778 100644 --- a/include/kernels/dslash_ndeg_twisted_clover.cuh +++ b/include/kernels/dslash_ndeg_twisted_clover.cuh @@ -14,7 +14,7 @@ namespace quda static constexpr int length = (nSpin / (nSpin / 2)) * 2 * nColor * nColor * (nSpin / 2) * (nSpin / 2) / 2; typedef typename clover_mapper::type C; typedef typename mapper::type real; - + const C A; /** the clover field */ real a; /** this is the Wilson-dslash scale factor */ real b; /** this is the chiral twist factor */ @@ -58,8 +58,8 @@ namespace quda out(x) = M*in = a * D * in + (A(x) + i*b*gamma_5*tau_3 + c*tau_1)*x Note this routine only exists in xpay form. */ - template - __device__ __host__ __forceinline__ void operator()(int idx, int src_flavor, int parity) + template + __device__ __host__ __forceinline__ void operator()(int idx, int src_flavor, int parity, bool alive = true) { typedef typename mapper::type real; typedef ColorSpinor Vector; @@ -67,9 +67,8 @@ namespace quda int src_idx = src_flavor / 2; int flavor = src_flavor % 2; - bool active - = mykernel_type == EXTERIOR_KERNEL_ALL ? 
false : true; // is thread active (non-trival for fused kernel only) - int thread_dim; // which dimension is thread working on (fused kernel only) + bool active = mykernel_type != EXTERIOR_KERNEL_ALL; // is thread active (non-trival for fused kernel only) + int thread_dim; // which dimension is thread working on (fused kernel only) auto coord = getCoords(arg, idx, flavor, parity, thread_dim); @@ -77,53 +76,64 @@ namespace quda const int my_flavor_idx = coord.x_cb + flavor * arg.dc.volume_4d_cb; Vector out; - if (arg.dd_out.isZero(coord)) { - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; - return; + if (!allthreads || alive) { + if (arg.dd_out.isZero(coord)) { + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; + if constexpr (!allthreads) return; + else alive = false; + } } - // defined in dslash_wilson.cuh - applyWilson(out, arg, coord, parity, idx, thread_dim, active, src_idx); + if (!allthreads || alive) { + // defined in dslash_wilson.cuh + applyWilson(out, arg, coord, parity, idx, thread_dim, active, src_idx); + } if constexpr (mykernel_type == INTERIOR_KERNEL) { if (arg.dd_x.isZero(coord)) { - out = arg.a * out; + if (!allthreads || alive) out = arg.a * out; } else { - // apply the chiral and flavor twists - // use consistent load order across s to ensure better cache locality - Vector x = arg.x[src_idx](my_flavor_idx, my_spinor_parity); SharedMemoryCache cache {*this}; - cache.save(x); + Vector tmp; + if (!allthreads || alive) { + // apply the chiral and flavor twists + // use consistent load order across s to ensure better cache locality + Vector x = arg.x[src_idx](my_flavor_idx, my_spinor_parity); + cache.save(x); - x.toRel(); // switch to chiral basis + x.toRel(); // switch to chiral basis - Vector tmp; #pragma unroll - for (int chirality = 0; chirality < 2; chirality++) { - constexpr int n = Arg::nColor * Arg::nSpin / 2; - HMatrix A = arg.A(coord.x_cb, parity, chirality); - HalfVector x_chi = x.chiral_project(chirality); - HalfVector Ax_chi = A * x_chi; - // i * mu * gamma_5 * tau_3 - const complex b(0.0, (chirality ^ flavor) == 0 ? static_cast(arg.b) : -static_cast(arg.b)); - Ax_chi += b * x_chi; - tmp += Ax_chi.chiral_reconstruct(chirality); + for (int chirality = 0; chirality < 2; chirality++) { + constexpr int n = Arg::nColor * Arg::nSpin / 2; + HMatrix A = arg.A(coord.x_cb, parity, chirality); + HalfVector x_chi = x.chiral_project(chirality); + HalfVector Ax_chi = A * x_chi; + // i * mu * gamma_5 * tau_3 + const complex b(0.0, + (chirality ^ flavor) == 0 ? 
static_cast(arg.b) : -static_cast(arg.b)); + Ax_chi += b * x_chi; + tmp += Ax_chi.chiral_reconstruct(chirality); + } + + tmp.toNonRel(); + // tmp += (c * tau_1) * x } - - tmp.toNonRel(); - // tmp += (c * tau_1) * x cache.sync(); - tmp += arg.c * cache.load_y(target::thread_idx().y + 1 - 2 * flavor); + if (!allthreads || alive) { + tmp += arg.c * cache.load_y(target::thread_idx().y + 1 - 2 * flavor); - // add the Wilson part with normalisation - out = tmp + arg.a * out; + // add the Wilson part with normalisation + out = tmp + arg.a * out; + } } } else if (active) { Vector x = arg.out[src_idx](my_flavor_idx, my_spinor_parity); out = x + arg.a * out; } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; + if (!allthreads || alive) + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; } }; diff --git a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh index 4b1a470db7..fb61259d65 100644 --- a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh +++ b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh @@ -13,7 +13,7 @@ namespace quda using WilsonArg::nSpin; static constexpr int length = (nSpin / (nSpin / 2)) * 2 * nColor * nColor * (nSpin / 2) * (nSpin / 2) / 2; static constexpr bool dynamic_clover = clover::dynamic_inverse(); - + typedef typename mapper::type real; typedef typename clover_mapper::type C; const C A; @@ -64,8 +64,8 @@ namespace quda out(x) = M*in = a*(C + i*b*gamma_5*tau_3 + c*tau_1)/(C^2 + b^2 - c^2)*D*x ( xpay == false ) out(x) = M*in = in + a*(C + i*b*gamma_5*tau_3 + c*tau_1)/(C^2 + b^2 - c^2)*D*x ( xpay == true ) */ - template - __device__ __host__ __forceinline__ void operator()(int idx, int src_flavor, int parity) + template + __device__ __host__ __forceinline__ void operator()(int idx, int src_flavor, int parity, bool alive = true) { using namespace linalg; // for Cholesky typedef typename mapper::type real; @@ -75,98 +75,107 @@ namespace quda int src_idx = src_flavor / 2; int flavor = src_flavor % 2; - - bool active - = mykernel_type == EXTERIOR_KERNEL_ALL ? false : true; // is thread active (non-trival for fused kernel only) - int thread_dim; // which dimension is thread working on (fused kernel only) + bool active = mykernel_type != EXTERIOR_KERNEL_ALL; // is thread active (non-trival for fused kernel only) + int thread_dim; // which dimension is thread working on (fused kernel only) auto coord = getCoords(arg, idx, flavor, parity, thread_dim); const int my_spinor_parity = arg.nParity == 2 ? 
parity : 0; int my_flavor_idx = coord.x_cb + flavor * arg.dc.volume_4d_cb; Vector out; - if (arg.dd_out.isZero(coord)) { - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; - return; + if (!allthreads || alive) { + if (arg.dd_out.isZero(coord)) { + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; + if constexpr (!allthreads) return; + else alive = false; + } } - // defined in dslash_wilson.cuh - applyWilson(out, arg, coord, parity, idx, thread_dim, active, src_idx); + if (!allthreads || alive) { + // defined in dslash_wilson.cuh + applyWilson(out, arg, coord, parity, idx, thread_dim, active, src_idx); - if (mykernel_type != INTERIOR_KERNEL && active) { - // if we're not the interior kernel, then we must sum the partial - Vector x = arg.out[src_idx](my_flavor_idx, my_spinor_parity); - out += x; + if (mykernel_type != INTERIOR_KERNEL && active) { + // if we're not the interior kernel, then we must sum the partial + Vector x = arg.out[src_idx](my_flavor_idx, my_spinor_parity); + out += x; + } } + constexpr int n_flavor = 2; + HalfVector out_chi[n_flavor]; // flavor array of chirally projected fermion if (isComplete(arg, coord) && active) { - out.toRel(); - - constexpr int n_flavor = 2; - HalfVector out_chi[n_flavor]; // flavor array of chirally projected fermion + out.toRel(); #pragma unroll - for (int i = 0; i < n_flavor; i++) out_chi[i] = out.chiral_project(i); - - int chirality = flavor; // relabel flavor as chirality - - SharedMemoryCache cache {*this}; - - auto swizzle = [&](HalfVector x[2], int chirality) { - if (chirality == 0) - cache.save_y(x[1], target::thread_idx().y); - else - cache.save_y(x[0], target::thread_idx().y); - cache.sync(); - if (chirality == 0) - x[1] = cache.load_y(target::thread_idx().y + 1); - else - x[0] = cache.load_y(target::thread_idx().y - 1); - }; - - swizzle(out_chi, chirality); // apply the flavor-chirality swizzle between threads - - // load in the clover matrix - HMat A = arg.A(coord.x_cb, parity, chirality); + for (int i = 0; i < n_flavor; i++) out_chi[i] = out.chiral_project(i); + } - HalfVector A_chi[n_flavor]; + int chirality = flavor; // relabel flavor as chirality + SharedMemoryCache cache {*this}; + auto swizzle = [&](HalfVector x[2], int chirality) { + if (chirality == 0) + cache.save_y(x[1], target::thread_idx().y); + else + cache.save_y(x[0], target::thread_idx().y); + cache.sync(); + if (chirality == 0) + x[1] = cache.load_y(target::thread_idx().y + 1); + else + x[0] = cache.load_y(target::thread_idx().y - 1); + }; + + swizzle(out_chi, chirality); // apply the flavor-chirality swizzle between threads + + if (!allthreads || alive) { + if (isComplete(arg, coord) && active) { + // load in the clover matrix + HMat A = arg.A(coord.x_cb, parity, chirality); + + HalfVector A_chi[n_flavor]; #pragma unroll - for (int flavor_ = 0; flavor_ < n_flavor; flavor_++) { - const complex b(0.0, (chirality^flavor_) == 0 ? arg.b : -arg.b); - A_chi[flavor_] = A * out_chi[flavor_]; - A_chi[flavor_] += b * out_chi[flavor_]; - A_chi[flavor_] += arg.c * out_chi[1 - flavor_]; - } - - if constexpr (Arg::dynamic_clover) { - HMat A2 = A.square(); - A2 += arg.b2_minus_c2; - Cholesky, Arg::nColor * Arg::nSpin / 2> cholesky(A2); + for (int flavor_ = 0; flavor_ < n_flavor; flavor_++) { + const complex b(0.0, (chirality^flavor_) == 0 ? 
arg.b : -arg.b); + A_chi[flavor_] = A * out_chi[flavor_]; + A_chi[flavor_] += b * out_chi[flavor_]; + A_chi[flavor_] += arg.c * out_chi[1 - flavor_]; + } + + if constexpr (Arg::dynamic_clover) { + HMat A2 = A.square(); + A2 += arg.b2_minus_c2; + Cholesky, Arg::nColor * Arg::nSpin / 2> cholesky(A2); #pragma unroll - for (int flavor_ = 0; flavor_ < n_flavor; flavor_++) { - out_chi[flavor_] = static_cast(0.25) * cholesky.backward(cholesky.forward(A_chi[flavor_])); - } - } else { - HMat A2inv = arg.A2inv(coord.x_cb, parity, chirality); + for (int flavor_ = 0; flavor_ < n_flavor; flavor_++) { + out_chi[flavor_] = static_cast(0.25) * cholesky.backward(cholesky.forward(A_chi[flavor_])); + } + } else { + HMat A2inv = arg.A2inv(coord.x_cb, parity, chirality); #pragma unroll - for (int flavor_ = 0; flavor_ < n_flavor; flavor_++) { - out_chi[flavor_] = static_cast(2.0) * (A2inv * A_chi[flavor_]); - } - } - - swizzle(out_chi, chirality); // undo the flavor-chirality swizzle - Vector tmp = out_chi[0].chiral_reconstruct(0) + out_chi[1].chiral_reconstruct(1); - tmp.toNonRel(); // switch back to non-chiral basis - - if (xpay && !arg.dd_x.isZero(coord)) { - Vector x = arg.x[src_idx](my_flavor_idx, my_spinor_parity); - out = x + arg.a * tmp; - } else { - // multiplication with a needed here? - out = arg.a * tmp; - } + for (int flavor_ = 0; flavor_ < n_flavor; flavor_++) { + out_chi[flavor_] = static_cast(2.0) * (A2inv * A_chi[flavor_]); + } + } + } } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; + swizzle(out_chi, chirality); // undo the flavor-chirality swizzle + + if (!allthreads || alive) { + if (isComplete(arg, coord) && active) { + Vector tmp = out_chi[0].chiral_reconstruct(0) + out_chi[1].chiral_reconstruct(1); + tmp.toNonRel(); // switch back to non-chiral basis + + if (xpay && !arg.dd_x.isZero(coord)) { + Vector x = arg.x[src_idx](my_flavor_idx, my_spinor_parity); + out = x + arg.a * tmp; + } else { + // multiplication with a needed here? + out = arg.a * tmp; + } + } + + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; + } } }; } // namespace quda diff --git a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh index 7effb07ae3..8244eb6787 100644 --- a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh +++ b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh @@ -64,8 +64,8 @@ namespace quda - with xpay: out(x) = M*in = x + a*(1+i*b*gamma_5 + c*tau_1)D * in */ - template - __device__ __host__ __forceinline__ void operator()(int idx, int src_flavor, int parity) + template + __device__ __host__ __forceinline__ void operator()(int idx, int src_flavor, int parity, bool alive = true) { typedef typename mapper::type real; typedef ColorSpinor Vector; @@ -73,62 +73,68 @@ namespace quda int src_idx = src_flavor / 2; int flavor = src_flavor % 2; - bool active - = mykernel_type == EXTERIOR_KERNEL_ALL ? false : true; // is thread active (non-trival for fused kernel only) - int thread_dim; // which dimension is thread working on (fused kernel only) + bool active = mykernel_type != EXTERIOR_KERNEL_ALL; // is thread active (non-trival for fused kernel only) + int thread_dim; // which dimension is thread working on (fused kernel only) auto coord = getCoords(arg, idx, flavor, parity, thread_dim); const int my_spinor_parity = arg.nParity == 2 ? 
parity : 0; int my_flavor_idx = coord.x_cb + flavor * arg.dc.volume_4d_cb; Vector out; - if (arg.dd_out.isZero(coord)) { - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; - return; + if (!allthreads || alive) { + if (arg.dd_out.isZero(coord)) { + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; + if constexpr (!allthreads) return; + else alive = false; + } } - if (!dagger || Arg::asymmetric) // defined in dslash_wilson.cuh - applyWilson(out, arg, coord, parity, idx, thread_dim, active, src_idx); - else // defined in dslash_twisted_mass_preconditioned - applyWilsonTM(out, arg, coord, parity, idx, thread_dim, active, src_idx); - - if (xpay && mykernel_type == INTERIOR_KERNEL && !arg.dd_x.isZero(coord)) { - - if (!dagger || Arg::asymmetric) { // apply inverse twist which is undone below - // use consistent load order across s to ensure better cache locality - Vector x0 = arg.x[src_idx](coord.x_cb + 0 * arg.dc.volume_4d_cb, my_spinor_parity); - Vector x1 = arg.x[src_idx](coord.x_cb + 1 * arg.dc.volume_4d_cb, my_spinor_parity); - if (flavor == 0) - out += arg.a_inv * (x0 + arg.b_inv * x0.igamma(4) + arg.c_inv * x1); - else - out += arg.a_inv * (x1 - arg.b_inv * x1.igamma(4) + arg.c_inv * x0); - } else { - Vector x = arg.x[src_idx](my_flavor_idx, my_spinor_parity); - out += x; // just directly add since twist already applied in the dslash + if (!allthreads || alive) { + if (!dagger || Arg::asymmetric) // defined in dslash_wilson.cuh + applyWilson(out, arg, coord, parity, idx, thread_dim, active, src_idx); + else // defined in dslash_twisted_mass_preconditioned + applyWilsonTM(out, arg, coord, parity, idx, thread_dim, active, src_idx); + + if (xpay && mykernel_type == INTERIOR_KERNEL && !arg.dd_x.isZero(coord)) { + if constexpr (!dagger || Arg::asymmetric) { // apply inverse twist which is undone below + // use consistent load order across s to ensure better cache locality + Vector x0 = arg.x[src_idx](coord.x_cb + 0 * arg.dc.volume_4d_cb, my_spinor_parity); + Vector x1 = arg.x[src_idx](coord.x_cb + 1 * arg.dc.volume_4d_cb, my_spinor_parity); + if (flavor == 0) + out += arg.a_inv * (x0 + arg.b_inv * x0.igamma(4) + arg.c_inv * x1); + else + out += arg.a_inv * (x1 - arg.b_inv * x1.igamma(4) + arg.c_inv * x0); + } else { + Vector x = arg.x[src_idx](my_flavor_idx, my_spinor_parity); + out += x; // just directly add since twist already applied in the dslash + } + } else if (mykernel_type != INTERIOR_KERNEL && active) { + // if we're not the interior kernel, then we must sum the partial + Vector x = arg.out[src_idx](my_flavor_idx, my_spinor_parity); + out += x; } - - } else if (mykernel_type != INTERIOR_KERNEL && active) { - // if we're not the interior kernel, then we must sum the partial - Vector x = arg.out[src_idx](my_flavor_idx, my_spinor_parity); - out += x; } if constexpr (!dagger || Arg::asymmetric) { // apply A^{-1} to D*in SharedMemoryCache cache {*this}; - if (isComplete(arg, coord) && active) { - // to apply the preconditioner we need to put "out" in shared memory so the other flavor can access it - cache.save(out); - } - - cache.sync(); // safe to sync in here since other threads will exit - if (isComplete(arg, coord) && active) { - if (flavor == 0) - out = arg.a * (out + arg.b * out.igamma(4) + arg.c * cache.load_y(target::thread_idx().y + 1)); - else - out = arg.a * (out - arg.b * out.igamma(4) + arg.c * cache.load_y(target::thread_idx().y - 1)); - } + if (!allthreads 
|| alive) { + if (isComplete(arg, coord) && active) { + // to apply the preconditioner we need to put "out" in shared memory so the other flavor can access it + cache.save(out); + } + } + cache.sync(); // safe to sync here since other threads will exit if allowed, or all be here + if (!allthreads || alive) { + if (isComplete(arg, coord) && active) { + if (flavor == 0) + out = arg.a * (out + arg.b * out.igamma(4) + arg.c * cache.load_y(target::thread_idx().y + 1)); + else + out = arg.a * (out - arg.b * out.igamma(4) + arg.c * cache.load_y(target::thread_idx().y - 1)); + } + } } - if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; + if (!allthreads || alive) + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out; } }; diff --git a/include/kernels/gauge_fix_ovr.cuh b/include/kernels/gauge_fix_ovr.cuh index 1f147742d2..a4c663040c 100644 --- a/include/kernels/gauge_fix_ovr.cuh +++ b/include/kernels/gauge_fix_ovr.cuh @@ -146,7 +146,7 @@ namespace quda { constexpr computeFix(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) { } static constexpr const char *filename() { return KERNEL_FILE; } - __device__ inline void operator()(int idx, int mu) + template __device__ inline void operator()(int idx, int mu, bool active = true) { using real = typename Arg::real; using Link = Matrix, 3>; @@ -161,7 +161,7 @@ namespace quda { for (int dr = 0; dr < 4; dr++) p += arg.border[dr]; getCoords(x, idx, arg.X, p + parity); } else { - idx = arg.borderpoints[parity][idx]; // load the lattice site assigment + if (!allthreads || active) idx = arg.borderpoints[parity][idx]; // load the lattice site assigment x[3] = idx / (X[0] * X[1] * X[2]); x[2] = (idx / (X[0] * X[1])) % X[2]; x[1] = (idx / X[0]) % X[1]; @@ -188,7 +188,8 @@ namespace quda { parity = 1 - parity; } idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1; - Link link = arg.u(dim, idx, parity); + Link link; + if (!allthreads || active) link = arg.u(dim, idx, parity); if constexpr (Arg::type == 0) { // 8 threads per lattice site, the reduction is performed by shared memory without using atomicadd. @@ -201,11 +202,13 @@ namespace quda { GaugeFixHit_NoAtomicAdd_LessSM(link, arg.relax_boost, mu, *this); } + if (!allthreads || active) arg.u(dim, idx, parity) = link; arg.u(dim, idx, parity) = link; } else if constexpr (Arg::type == 2 || Arg::type == 3) { // 4 threads per lattice site idx = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1; - Link link = arg.u(mu, idx, parity); + Link link; + if (!allthreads || active) link = arg.u(mu, idx, parity); switch (mu) { case 0: x[0] = (x[0] - 1 + X[0]) % X[0]; break; @@ -214,7 +217,8 @@ namespace quda { case 3: x[3] = (x[3] - 1 + X[3]) % X[3]; break; } int idx1 = (((x[3] * X[2] + x[2]) * X[1] + x[1]) * X[0] + x[0]) >> 1; - Link link1 = arg.u(mu, idx1, 1 - parity); + Link link1; + if (!allthreads || active) link1 = arg.u(mu, idx1, 1 - parity); if constexpr (Arg::type == 2) { // 4 threads per lattice site, the reduction is performed by shared memory without using atomicadd. 
@@ -227,8 +231,10 @@ namespace quda { GaugeFixHit_NoAtomicAdd_LessSM(link, link1, arg.relax_boost, mu, *this); } - arg.u(mu, idx, parity) = link; - arg.u(mu, idx1, 1 - parity) = link1; + if (!allthreads || active) { + arg.u(mu, idx, parity) = link; + arg.u(mu, idx1, 1 - parity) = link1; + } } } }; diff --git a/include/kernels/madwf_transfer.cuh b/include/kernels/madwf_transfer.cuh index 616d6a40c0..008631ad4e 100644 --- a/include/kernels/madwf_transfer.cuh +++ b/include/kernels/madwf_transfer.cuh @@ -111,7 +111,8 @@ namespace quda @param[in] x_b Checkerboarded 4-d space-time index @param[in] s The output Ls dimension coordinate */ - __device__ __host__ inline void operator()(int x_cb, int s, int parity) + template + __device__ __host__ inline void operator()(int x_cb, int s, int parity, bool active = true) { constexpr bool dagger = Arg::dagger; @@ -132,14 +133,16 @@ namespace quda } cache.sync(); - Vector out; - // t -> s_in, s-> s_out - for (int t = 0; t < Ls_in; t++) { - Vector in = arg.in(t * volume_4d_cb + x_cb, parity); - int wm_index = dagger ? t * Ls_out + s : s * Ls_in + t; - matrix_vector_multiply(out, reinterpret_cast(cache.data())[wm_index], in); + if (!allthreads || active) { + Vector out; + // t -> s_in, s-> s_out + for (int t = 0; t < Ls_in; t++) { + Vector in = arg.in(t * volume_4d_cb + x_cb, parity); + int wm_index = dagger ? t * Ls_out + s : s * Ls_in + t; + matrix_vector_multiply(out, reinterpret_cast(cache.data())[wm_index], in); + } + arg.out(s * volume_4d_cb + x_cb, parity) = out; } - arg.out(s * volume_4d_cb + x_cb, parity) = out; } }; } // namespace madwf_ml diff --git a/include/kernels/multi_blas_core.cuh b/include/kernels/multi_blas_core.cuh index 0c7db44292..0abe2e37ee 100644 --- a/include/kernels/multi_blas_core.cuh +++ b/include/kernels/multi_blas_core.cuh @@ -15,7 +15,8 @@ namespace quda #ifndef QUDA_FAST_COMPILE_REDUCE constexpr bool enable_warp_split() { return false; } #else - constexpr bool enable_warp_split() { return true; } + // constexpr bool enable_warp_split() { return true; } + constexpr bool enable_warp_split() { return false; } #endif /** @@ -64,12 +65,15 @@ namespace quda @param[in,out] arg Argument struct with required meta data (input/output fields, functor, etc.) 
*/ - template struct MultiBlas_ { + // template struct MultiBlas_ { + template struct MultiBlas_ : only_warp_combine, Arg::n / 2>> { + // std::conditional_t { const Arg &arg; constexpr MultiBlas_(const Arg &arg) : arg(arg) {} static constexpr const char *filename() { return KERNEL_FILE; } - __device__ __host__ inline void operator()(int i, int k, int parity) + template // true if all threads in group will enter, even if out of range + __device__ __host__ inline void operator()(int i, int k, int parity, bool active = true) { using vec = array, Arg::n/2>; @@ -83,22 +87,24 @@ namespace quda const int l_idx = lane_id / vector_site_width; vec x, y, z, w; - if (l_idx == 0 || warp_split == 1) { - if (arg.f.read.Y) arg.Y[k].load(y, idx, parity); - if (arg.f.read.W) arg.W[k].load(w, idx, parity); - } else { - y = ::quda::zero, Arg::n/2>(); - w = ::quda::zero, Arg::n/2>(); - } + if (!allthreads || active) { + if (l_idx == 0 || warp_split == 1) { + if (arg.f.read.Y) arg.Y[k].load(y, idx, parity); + if (arg.f.read.W) arg.W[k].load(w, idx, parity); + } else { + y = ::quda::zero, Arg::n / 2>(); + w = ::quda::zero, Arg::n / 2>(); + } #pragma unroll - for (int l_ = 0; l_ < Arg::NXZ; l_ += warp_split) { - const int l = l_ + l_idx; - if (l < Arg::NXZ || warp_split == 1) { - if (arg.f.read.X) arg.X[l].load(x, idx, parity); - if (arg.f.read.Z) arg.Z[l].load(z, idx, parity); - - arg.f(x, y, z, w, k, l); + for (int l_ = 0; l_ < Arg::NXZ; l_ += warp_split) { + const int l = l_ + l_idx; + if (l < Arg::NXZ || warp_split == 1) { + if (arg.f.read.X) arg.X[l].load(x, idx, parity); + if (arg.f.read.Z) arg.Z[l].load(z, idx, parity); + + arg.f(x, y, z, w, k, l); + } } } @@ -106,9 +112,11 @@ namespace quda if (arg.f.write.Y) y = warp_combine(y); if (arg.f.write.W) w = warp_combine(w); - if (l_idx == 0 || warp_split == 1) { - if (arg.f.write.Y) arg.Y[k].save(y, idx, parity); - if (arg.f.write.W) arg.W[k].save(w, idx, parity); + if (!allthreads || active) { + if (l_idx == 0 || warp_split == 1) { + if (arg.f.write.Y) arg.Y[k].save(y, idx, parity); + if (arg.f.write.W) arg.W[k].save(w, idx, parity); + } } } }; diff --git a/include/kernels/restrictor.cuh b/include/kernels/restrictor.cuh index a73af4f70f..51a8bc854b 100644 --- a/include/kernels/restrictor.cuh +++ b/include/kernels/restrictor.cuh @@ -139,7 +139,8 @@ namespace quda { constexpr Restrictor(const Arg &arg, const Ops &...ops) : KernelOpsT(ops...), arg(arg) { } static constexpr const char *filename() { return KERNEL_FILE; } - __device__ __host__ inline void operator()(dim3 block, dim3 thread) + template + __device__ __host__ inline void operator()(dim3 block, dim3 thread, bool active = true) { int x_fine_offset = thread.x; const int x_coarse = block.x; @@ -149,50 +150,55 @@ namespace quda { const int coarse_color_block = coarse_color_thread * coarse_color_per_thread; vector reduced{0}; - while (x_fine_offset < arg.aggregate_size) { - // all threads with x_fine_offset greater than aggregate_size_cb are second parity - const int parity_offset = x_fine_offset >= arg.aggregate_size_cb ? 1 : 0; - const int x_fine_cb_offset = x_fine_offset % arg.aggregate_size_cb; - const int parity = arg.nParity == 2 ? parity_offset : arg.parity; + if (!allthreads || active) { + while (x_fine_offset < arg.aggregate_size) { + // all threads with x_fine_offset greater than aggregate_size_cb are second parity + const int parity_offset = x_fine_offset >= arg.aggregate_size_cb ? 1 : 0; + const int x_fine_cb_offset = x_fine_offset % arg.aggregate_size_cb; + const int parity = arg.nParity == 2 ? 
parity_offset : arg.parity; - // look-up map is ordered as (coarse-block-id + fine-point-id), - // with fine-point-id parity ordered - const int x_fine_site_id = (x_coarse * 2 + parity) * arg.aggregate_size_cb + x_fine_cb_offset; - const int x_fine = arg.coarse_to_fine[x_fine_site_id]; - const int x_fine_cb = x_fine - parity * arg.in[src_idx].VolumeCB(); + // look-up map is ordered as (coarse-block-id + fine-point-id), + // with fine-point-id parity ordered + const int x_fine_site_id = (x_coarse * 2 + parity) * arg.aggregate_size_cb + x_fine_cb_offset; + const int x_fine = arg.coarse_to_fine[x_fine_site_id]; + const int x_fine_cb = x_fine - parity * arg.in[src_idx].VolumeCB(); - array, Arg::fineSpin * coarse_color_per_thread> tmp{0}; + array, Arg::fineSpin * coarse_color_per_thread> tmp {0}; - rotateCoarseColor(tmp, arg, src_idx, parity, x_fine_cb, coarse_color_block); + rotateCoarseColor(tmp, arg, src_idx, parity, x_fine_cb, coarse_color_block); - // perform any local spin coarsening + // perform any local spin coarsening #pragma unroll - for (int s = 0; s= arg.out[src_idx].VolumeCB() ? 1 : 0; - const int x_coarse_cb = x_coarse - parity_coarse*arg.out[src_idx].VolumeCB(); + if (!allthreads || active) { + if (target::thread_idx().x == 0) { + const int parity_coarse = x_coarse >= arg.out[src_idx].VolumeCB() ? 1 : 0; + const int x_coarse_cb = x_coarse - parity_coarse * arg.out[src_idx].VolumeCB(); #pragma unroll - for (int s = 0; s < Arg::coarseSpin; s++) { + for (int s = 0; s < Arg::coarseSpin; s++) { #pragma unroll - for (int coarse_color_local=0; coarse_color_local std::enable_if_t, T &> constexpr elem(T &a, int i) { return (&a)[i]; } + + template ().x)> + std::enable_if_t, R &> constexpr elem(T &a, int i) + { + return (&a.x)[i]; + } + + template ().x.x), int = 0> + std::enable_if_t, R &> constexpr elem(T &a, int i) + { + return (&a.x.x)[i]; + } + /* Here we use traits to define the greater type used for mixing types of computation involving these types */ diff --git a/include/targets/cuda/shared_memory_helper.h b/include/targets/cuda/shared_memory_helper.h index 69d8c095ce..3b4b46a132 100644 --- a/include/targets/cuda/shared_memory_helper.h +++ b/include/targets/cuda/shared_memory_helper.h @@ -80,8 +80,9 @@ namespace quda /** @brief Constructor for SharedMemory object. */ - template - constexpr SharedMemory(const KernelOps &) : data(cache(get_offset(target::block_dim()))) + template + constexpr SharedMemory(const KernelOps &, const Arg &...arg) : + data(cache(get_offset(target::block_dim(), arg...))) { } diff --git a/include/targets/cuda/target_device.h b/include/targets/cuda/target_device.h index ee7c646172..077504027a 100644 --- a/include/targets/cuda/target_device.h +++ b/include/targets/cuda/target_device.h @@ -32,24 +32,42 @@ namespace quda #ifdef _NVHPC_CUDA // nvc++: run-time dispatch using if target - template