From 5a27e0c252042a808fd533b81b832a29f79d9ae0 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Wed, 16 Dec 2020 16:09:54 -0600 Subject: [PATCH 001/328] initial changes --- include/comm_quda.h | 3 ++- include/deflation.h | 4 ++-- include/quda_api.h | 6 +----- include/transform_reduce.h | 3 ++- lib/interface_quda.cpp | 4 ++++ 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/comm_quda.h b/include/comm_quda.h index 1cb2e7a34f..5ca74c56a9 100644 --- a/include/comm_quda.h +++ b/include/comm_quda.h @@ -1,5 +1,6 @@ #pragma once #include +using size_t = std::size_t; #ifdef __cplusplus extern "C" { @@ -27,7 +28,7 @@ extern "C" { Topology *comm_default_topology(void); // routines related to direct peer-2-peer access - void comm_set_neighbor_ranks(Topology *topo=NULL); + void comm_set_neighbor_ranks(Topology *topo=nullptr); int comm_neighbor_rank(int dir, int dim); /** diff --git a/include/deflation.h b/include/deflation.h index f20f936dc9..9f4359c938 100644 --- a/include/deflation.h +++ b/include/deflation.h @@ -62,11 +62,11 @@ namespace quda { //Check that RV is a composite field: if(RV->IsComposite() == false) errorQuda("\nRitz vectors must be contained in a composite field.\n"); - cudaHostRegister(matProj,ld*tot_dim*sizeof(Complex),cudaHostRegisterDefault); + //cudaHostRegister(matProj,ld*tot_dim*sizeof(Complex),cudaHostRegisterDefault); } ~DeflationParam(){ - cudaHostUnregister(matProj); + //cudaHostUnregister(matProj); if(matProj) delete[] matProj; if(invRitzVals) delete[] invRitzVals; } diff --git a/include/quda_api.h b/include/quda_api.h index 3f8b7bc9e5..db5689a8dc 100644 --- a/include/quda_api.h +++ b/include/quda_api.h @@ -1,11 +1,7 @@ #pragma once -#ifndef __CUDACC_RTC__ -#include -#include -#endif - #include +#include /** @file quda_api.h diff --git a/include/transform_reduce.h b/include/transform_reduce.h index ca8e9d1e2f..f2616c657e 100644 --- a/include/transform_reduce.h +++ b/include/transform_reduce.h @@ -58,7 +58,7 @@ namespace quda template __launch_bounds__(Arg::block_size) __global__ void transform_reduce_kernel(Arg arg) { using count_t = decltype(arg.n_items); - +#if 0 count_t i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y; auto v = arg.v[j]; @@ -71,6 +71,7 @@ namespace quda } reduce(arg, r_, j); +#endif } template diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index f184ea1bcf..c283e485fe 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -67,6 +67,7 @@ static bool redundant_comms = false; #include +#if 0 //for MAGMA lib: #include @@ -93,6 +94,7 @@ void closeMagma(){ } } +#endif cudaGaugeField *gaugePrecise = nullptr; cudaGaugeField *gaugeSloppy = nullptr; @@ -5247,6 +5249,7 @@ void invert_multishift_quda_(void *h_x, void *hp_b, QudaInvertParam *param) { void flush_chrono_quda_(int *index) { flushChronoQuda(*index); } +#if 0 void register_pinned_quda_(void *ptr, size_t *bytes) { cudaHostRegister(ptr, *bytes, cudaHostRegisterDefault); checkCudaError(); @@ -5256,6 +5259,7 @@ void unregister_pinned_quda_(void *ptr) { cudaHostUnregister(ptr); checkCudaError(); } +#endif void new_quda_gauge_param_(QudaGaugeParam *param) { *param = newQudaGaugeParam(); From 4d01016afff09669647529f56b9f76c4d6c94d5a Mon Sep 17 00:00:00 2001 From: James Osborn Date: Mon, 21 Dec 2020 15:06:34 -0600 Subject: [PATCH 002/328] move quda_target.h out of quda_api.h --- include/quda_api.h | 2 +- include/quda_internal.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/quda_api.h b/include/quda_api.h index 149cdf5af7..0820ed09ae 100644 --- a/include/quda_api.h +++ b/include/quda_api.h @@ -2,7 +2,7 @@ #include #include -#include +#include /** @file quda_api.h diff --git a/include/quda_internal.h b/include/quda_internal.h index 61628c18fa..7099232606 100644 --- a/include/quda_internal.h +++ b/include/quda_internal.h @@ -45,6 +45,7 @@ #include #include #include +#include #include #ifdef __cplusplus From 024fe8e6d7312c5fd7d8ca9468be3a1a60b57dd6 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Wed, 13 Jan 2021 12:11:18 -0600 Subject: [PATCH 003/328] working to get sycl target to compile --- include/quda_internal.h | 3 ++- lib/gauge_fix_ovr.cu | 4 +++- tests/utils/host_utils.cpp | 6 ++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/quda_internal.h b/include/quda_internal.h index 168d13b11f..7ceca11d7b 100644 --- a/include/quda_internal.h +++ b/include/quda_internal.h @@ -2,8 +2,9 @@ #include #include +#include -#if defined(QUDA_TARGET_CUDA) || 1 // hack for Jenkins for now +#if defined(QUDA_TARGET_CUDA) //|| 1 // hack for Jenkins for now #include #include #endif diff --git a/lib/gauge_fix_ovr.cu b/lib/gauge_fix_ovr.cu index cdbff9c828..6e04cdd1b2 100644 --- a/lib/gauge_fix_ovr.cu +++ b/lib/gauge_fix_ovr.cu @@ -5,7 +5,7 @@ #include #include #include -#include +//#include #include #include #include @@ -474,6 +474,7 @@ public: for (int dir = 0; dir < 4; dir++) if (comm_dim_partitioned(dir)) nlinksfaces += faceVolume[dir]; +#if 0 thrust::device_ptr array_faceT[2]; thrust::device_ptr array_interiorT[2]; @@ -506,6 +507,7 @@ public: } if (size[0] == size[1]) threads = size[0]; else errorQuda("BORDER: Even and Odd sizes does not match, not supported!!!!, %d:%d", size[0], size[1]); +#endif } /** diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index d2b490f1a8..cd8511ef09 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -1019,6 +1019,9 @@ template void constructUnitGaugeField(Float **res, QudaGaugePar applyGaugeFieldScaling(res, Vh, param); } +template void constructUnitGaugeField(float **res, QudaGaugeParam *param); +template void constructUnitGaugeField(double **res, QudaGaugeParam *param); + // normalize the vector a template static void normalize(complex *a, int len) { @@ -1112,6 +1115,9 @@ template void constructRandomGaugeField(Float **res, QudaGaugeP } } +template void constructRandomGaugeField(float **res, QudaGaugeParam *param, QudaDslashType dslash_type); +template void constructRandomGaugeField(double **res, QudaGaugeParam *param, QudaDslashType dslash_type); + template void constructUnitaryGaugeField(Float **res) { Float *resOdd[4], *resEven[4]; From 2e4a6b3eea115888f2d7320562dcfe2171b737ab Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 26 Jan 2021 15:55:03 -0600 Subject: [PATCH 004/328] fix compile and testing --- include/quda_api.h | 9 ++++++--- lib/coarse_op.cuh | 2 ++ lib/tune.cpp | 5 +++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/quda_api.h b/include/quda_api.h index 64f6ef4025..277c00f6d7 100644 --- a/include/quda_api.h +++ b/include/quda_api.h @@ -30,6 +30,9 @@ namespace quda void *event; }; + qudaError_t qudaLaunchKernel_(const char *file, const int line, + const char *func, const char *kern); + /** @brief Wrapper around cudaLaunchKernel @param[in] func Device function symbol @@ -37,7 +40,7 @@ namespace quda @param[in] args Arguments @param[in] stream Stream identifier */ - qudaError_t qudaLaunchKernel(const void *func, const TuneParam &tp, void **args, qudaStream_t stream); + qudaError_t qudaLaunchKernel_(const void *func, const TuneParam &tp, void **args, qudaStream_t stream); /** @brief Templated wrapper around qudaLaunchKernel which can accept @@ -48,10 +51,10 @@ namespace quda @param[in] stream Stream identifier */ template - qudaError_t qudaLaunchKernel(T *func, const TuneParam &tp, qudaStream_t stream, const Arg &...arg) + qudaError_t qudaLaunchKernel_(T *func, const TuneParam &tp, qudaStream_t stream, const Arg &...arg) { const void *args[] = {&arg...}; - return qudaLaunchKernel(reinterpret_cast(func), tp, const_cast(args), stream); + return qudaLaunchKernel_(reinterpret_cast(func), tp, const_cast(args), stream); } /** diff --git a/lib/coarse_op.cuh b/lib/coarse_op.cuh index fd94e1e739..c9a51fd4c1 100644 --- a/lib/coarse_op.cuh +++ b/lib/coarse_op.cuh @@ -162,6 +162,7 @@ namespace quda { Launch(Arg &arg, qudaError_t &qerror, TuneParam &tp, ComputeType type, bool use_mma, const qudaStream_t &stream) { +#if 0 #ifdef JITIFY using namespace jitify::reflection; #endif @@ -470,6 +471,7 @@ namespace quda { // convert Jitify return error into QUDA error qerror = error == CUDA_SUCCESS ? QUDA_SUCCESS : QUDA_ERROR; +#endif } }; diff --git a/lib/tune.cpp b/lib/tune.cpp index 0b19f454d6..364a993d15 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -786,6 +786,11 @@ namespace quda } float elapsed_time = timer.last() / tunable.tuningIter(); + warningQuda("timer.last(): %g", timer.last()); + warningQuda("tunable.tuningIter(): %i", tunable.tuningIter()); + warningQuda("elapsed_time: %g", elapsed_time); + warningQuda("error: %i", error); + warningQuda("tunable.launchError(): %i", tunable.launchError()); if ((elapsed_time < best_time) && (error == QUDA_SUCCESS) && (tunable.launchError() == QUDA_SUCCESS)) { best_time = elapsed_time; best_param = param; From 2053a4130e58cefefa2b64ca7a2ac7ea8ea3f8bb Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 4 Feb 2021 14:05:44 -0600 Subject: [PATCH 005/328] debug tuning --- lib/tune.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/tune.cpp b/lib/tune.cpp index 45041c65d1..8042831bf2 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -786,11 +786,11 @@ namespace quda } float elapsed_time = timer.last() / tunable.tuningIter(); - warningQuda("timer.last(): %g", timer.last()); - warningQuda("tunable.tuningIter(): %i", tunable.tuningIter()); - warningQuda("elapsed_time: %g", elapsed_time); - warningQuda("error: %i", error); - warningQuda("tunable.launchError(): %i", tunable.launchError()); + //warningQuda("timer.last(): %g", timer.last()); + //warningQuda("tunable.tuningIter(): %i", tunable.tuningIter()); + warningQuda("tune elapsed_time: %g", elapsed_time); + //warningQuda("error: %i", error); + //warningQuda("tunable.launchError(): %i", tunable.launchError()); if ((elapsed_time < best_time) && (error == QUDA_SUCCESS) && (tunable.launchError() == QUDA_SUCCESS)) { best_time = elapsed_time; best_param = param; From ff84fb7bb5000c1bbebaac2a8cc152fd821fe28b Mon Sep 17 00:00:00 2001 From: James Osborn Date: Mon, 8 Feb 2021 14:05:27 -0600 Subject: [PATCH 006/328] turn off message --- lib/tune.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tune.cpp b/lib/tune.cpp index 8042831bf2..693b251a21 100644 --- a/lib/tune.cpp +++ b/lib/tune.cpp @@ -788,7 +788,7 @@ namespace quda float elapsed_time = timer.last() / tunable.tuningIter(); //warningQuda("timer.last(): %g", timer.last()); //warningQuda("tunable.tuningIter(): %i", tunable.tuningIter()); - warningQuda("tune elapsed_time: %g", elapsed_time); + //warningQuda("tune elapsed_time: %g", elapsed_time); //warningQuda("error: %i", error); //warningQuda("tunable.launchError(): %i", tunable.launchError()); if ((elapsed_time < best_time) && (error == QUDA_SUCCESS) && (tunable.launchError() == QUDA_SUCCESS)) { From 75a1c9b0c2d34ca87e4141b6b327e0934c79acd1 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 12 Feb 2021 14:31:03 -0600 Subject: [PATCH 007/328] fixes to get kernels to compile --- include/color_spinor_field_order.h | 4 ++-- include/kernels/blas_core.cuh | 2 +- include/kernels/block_orthogonalize.cuh | 2 ++ include/kernels/reduce_core.cuh | 4 ++-- include/shared_memory_cache_helper.cuh | 2 +- lib/interface_quda.cpp | 7 ++++++- tests/blas_test.cpp | 5 +++++ 7 files changed, 19 insertions(+), 7 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index df9b491c57..28a516f212 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -454,12 +454,12 @@ namespace quda { __device__ __host__ inline void real(const Float &a) { - return fixed ? v[idx].real(storeFloat(round(scale * a))) : v[idx].real(storeFloat(a)); + return fixed ? v[idx].real(storeFloat(quda::round(scale * a))) : v[idx].real(storeFloat(a)); } __device__ __host__ inline void imag(const Float &a) { - return fixed ? v[idx].imag(storeFloat(round(scale * a))) : v[idx].imag(storeFloat(a)); + return fixed ? v[idx].imag(storeFloat(quda::round(scale * a))) : v[idx].imag(storeFloat(a)); } /** diff --git a/include/kernels/blas_core.cuh b/include/kernels/blas_core.cuh index 06c3b50a60..5e9786e9b5 100644 --- a/include/kernels/blas_core.cuh +++ b/include/kernels/blas_core.cuh @@ -76,7 +76,7 @@ namespace quda */ struct BlasFunctor { //! pre-computation routine before the main loop - virtual __device__ __host__ void init() { ; } + __device__ __host__ void init() { ; } }; /** diff --git a/include/kernels/block_orthogonalize.cuh b/include/kernels/block_orthogonalize.cuh index d1731b8650..5de4e6bfc0 100644 --- a/include/kernels/block_orthogonalize.cuh +++ b/include/kernels/block_orthogonalize.cuh @@ -125,6 +125,7 @@ namespace quda { __device__ __host__ inline void operator()(dim3 block, dim3 thread) { +#if 0 int x_coarse = block.x; int x_fine_offset = thread.x; int chirality = block.y; @@ -232,6 +233,7 @@ namespace quda { } } // j } // n +#endif } }; diff --git a/include/kernels/reduce_core.cuh b/include/kernels/reduce_core.cuh index bbf1719913..e0eae8673c 100644 --- a/include/kernels/reduce_core.cuh +++ b/include/kernels/reduce_core.cuh @@ -94,10 +94,10 @@ namespace quda static constexpr bool site_unroll = site_unroll_; //! pre-computation routine called before the "M-loop" - virtual __device__ __host__ void pre() { ; } + __device__ __host__ void pre() { ; } //! post-computation routine called after the "M-loop" - virtual __device__ __host__ void post(reduce_t &) { ; } + __device__ __host__ void post(reduce_t &) { ; } }; /** diff --git a/include/shared_memory_cache_helper.cuh b/include/shared_memory_cache_helper.cuh index ba0a60e1c5..06284fa5a7 100644 --- a/include/shared_memory_cache_helper.cuh +++ b/include/shared_memory_cache_helper.cuh @@ -60,7 +60,7 @@ namespace quda #ifdef __CUDA_ARCH__ extern __shared__ atom_t cache_[]; #else - static atom_t *cache_; + atom_t *cache_; #endif return reinterpret_cast(cache_); } diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 2b2c8b5d8e..2a0b213074 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -523,16 +523,20 @@ void initQudaMemory() { profileInit.TPSTART(QUDA_PROFILE_TOTAL); profileInit.TPSTART(QUDA_PROFILE_INIT); - + warningQuda("TRACE: %s %s %i", __FILE__, __func__, __LINE__); if (!comms_initialized) init_default_comms(); + warningQuda("TRACE: %s %s %i", __FILE__, __func__, __LINE__); loadTuneCache(); device::create_context(); createDslashEvents(); + warningQuda("TRACE: %s %s %i", __FILE__, __func__, __LINE__); blas_lapack::native::init(); + warningQuda("TRACE: %s %s %i", __FILE__, __func__, __LINE__); blas::init(); + warningQuda("TRACE: %s %s %i", __FILE__, __func__, __LINE__); // initalize the memory pool allocators pool::init(); @@ -542,6 +546,7 @@ void initQudaMemory() for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); + warningQuda("TRACE: %s %s %i", __FILE__, __func__, __LINE__); profileInit.TPSTOP(QUDA_PROFILE_INIT); profileInit.TPSTOP(QUDA_PROFILE_TOTAL); } diff --git a/tests/blas_test.cpp b/tests/blas_test.cpp index dbdaed64f5..0559fa5b6e 100644 --- a/tests/blas_test.cpp +++ b/tests/blas_test.cpp @@ -230,6 +230,7 @@ void initFields(prec_pair_t prec_pair) param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; param.create = QUDA_ZERO_FIELD_CREATE; + fprintf(stderr,"creating CPU fields\n"); vH = new cpuColorSpinorField(param); wH = new cpuColorSpinorField(param); xH = new cpuColorSpinorField(param); @@ -268,6 +269,8 @@ void initFields(prec_pair_t prec_pair) QudaPrecision prec = prec_pair.first; QudaPrecision prec_other = prec_pair.second; + fprintf(stderr,"creating GPU fields\n"); + fprintf(stderr,"param.mem_type: %i\n", param.mem_type); param.setPrecision(prec, prec, true); vD = new cudaColorSpinorField(param); wD = new cudaColorSpinorField(param); @@ -275,12 +278,14 @@ void initFields(prec_pair_t prec_pair) yD = new cudaColorSpinorField(param); zD = new cudaColorSpinorField(param); + fprintf(stderr,"creating more GPU fields\n"); param.setPrecision(prec_other, prec_other, true); voD = new cudaColorSpinorField(param); woD = new cudaColorSpinorField(param); xoD = new cudaColorSpinorField(param); yoD = new cudaColorSpinorField(param); zoD = new cudaColorSpinorField(param); + fprintf(stderr,"done creating GPU fields\n"); // create composite fields param.is_composite = true; From e56c1c10b2f7dbb1ed0e24b21ced6eb51fd4eaa0 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 18 Feb 2021 23:27:22 -0600 Subject: [PATCH 008/328] add quda::max --- include/color_spinor_field_order.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 28a516f212..38936cf8ad 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1017,10 +1017,10 @@ namespace quda { norm_type max_[length / 2]; // two-pass to increase ILP (assumes length divisible by two, e.g. complex-valued) #pragma unroll - for (int i = 0; i < length / 2; i++) max_[i] = fmaxf(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length / 2])); + for (int i = 0; i < length / 2; i++) max_[i] = quda::max(fabsf((norm_type)v[i]), fabsf((norm_type)v[i + length / 2])); norm_type scale = 0.0; #pragma unroll - for (int i = 0; i < length / 2; i++) scale = fmaxf(max_[i], scale); + for (int i = 0; i < length / 2; i++) scale = quda::max(max_[i], scale); norm[x+parity*norm_offset] = scale; #ifdef __CUDA_ARCH__ From e5323c73840233040aa66b147b18be80ebccef62 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 19 Feb 2021 14:09:52 -0600 Subject: [PATCH 009/328] added SYCL files --- include/targets/sycl/FFT_Plans.h | 35 + include/targets/sycl/aos.h | 27 + include/targets/sycl/atomic.cuh | 209 ++++++ include/targets/sycl/block_reduction_kernel.h | 59 ++ include/targets/sycl/inline_ptx.h | 88 +++ include/targets/sycl/kernel.h | 123 ++++ include/targets/sycl/math_helper.cuh | 127 ++++ include/targets/sycl/multi_blas_helper.cuh | 265 ++++++++ include/targets/sycl/quda_target.h | 180 +++++ include/targets/sycl/random_helper.h | 112 ++++ include/targets/sycl/reduce_helper.h | 348 ++++++++++ include/targets/sycl/reduction_kernel.h | 99 +++ include/targets/sycl/shortvec.h | 96 +++ include/targets/sycl/target_device.h | 151 +++++ include/targets/sycl/tunable_nd.h | 467 +++++++++++++ include/targets/sycl/tunable_reduction.h | 391 +++++++++++ lib/targets/sycl/blas_lapack_mkl.cpp | 490 ++++++++++++++ lib/targets/sycl/comm_target.cpp | 162 +++++ lib/targets/sycl/device.cpp | 304 +++++++++ lib/targets/sycl/malloc.cpp | 625 ++++++++++++++++++ lib/targets/sycl/quda_api.cpp | 400 +++++++++++ 21 files changed, 4758 insertions(+) create mode 100644 include/targets/sycl/FFT_Plans.h create mode 100644 include/targets/sycl/aos.h create mode 100644 include/targets/sycl/atomic.cuh create mode 100644 include/targets/sycl/block_reduction_kernel.h create mode 100644 include/targets/sycl/inline_ptx.h create mode 100644 include/targets/sycl/kernel.h create mode 100644 include/targets/sycl/math_helper.cuh create mode 100644 include/targets/sycl/multi_blas_helper.cuh create mode 100644 include/targets/sycl/quda_target.h create mode 100644 include/targets/sycl/random_helper.h create mode 100644 include/targets/sycl/reduce_helper.h create mode 100644 include/targets/sycl/reduction_kernel.h create mode 100644 include/targets/sycl/shortvec.h create mode 100644 include/targets/sycl/target_device.h create mode 100644 include/targets/sycl/tunable_nd.h create mode 100644 include/targets/sycl/tunable_reduction.h create mode 100644 lib/targets/sycl/blas_lapack_mkl.cpp create mode 100644 lib/targets/sycl/comm_target.cpp create mode 100644 lib/targets/sycl/device.cpp create mode 100644 lib/targets/sycl/malloc.cpp create mode 100644 lib/targets/sycl/quda_api.cpp diff --git a/include/targets/sycl/FFT_Plans.h b/include/targets/sycl/FFT_Plans.h new file mode 100644 index 0000000000..9078825f26 --- /dev/null +++ b/include/targets/sycl/FFT_Plans.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include + +using FFTPlanHandle = int; +#define FFT_FORWARD 0 +#define FFT_INVERSE 1 + +#define CUFFT_SAFE_CALL(call) + +inline void ApplyFFT(FFTPlanHandle &, float2 *, float2 *, int) +{ + errorQuda("CPU_GAUGE_ALG is disabled so FFTs are also disabled"); +} + +inline void ApplyFFT(FFTPlanHandle &, double2 *, double2 *, int) +{ + errorQuda("CPU_GAUGE_ALG is disabled so FFTs are also disabled"); +} + +inline void SetPlanFFTMany(FFTPlanHandle &, int4, int, QudaPrecision) +{ + errorQuda("CPU_GAUGE_ALG is disabled so FFTs are also disabled"); +} + +inline void SetPlanFFT2DMany(FFTPlanHandle &, int4, int, QudaPrecision) +{ + errorQuda("CPU_GAUGE_ALG is disabled so FFTs are also disabled"); +} + +inline void FFTDestroyPlan(FFTPlanHandle &) +{ + errorQuda("CPU_GAUGE_ALG is disabled so FFTs are also disabled"); +} diff --git a/include/targets/sycl/aos.h b/include/targets/sycl/aos.h new file mode 100644 index 0000000000..262c46e51d --- /dev/null +++ b/include/targets/sycl/aos.h @@ -0,0 +1,27 @@ +#pragma once + +namespace quda { + + template __host__ __device__ void block_load(T out[n], const T *in) + { +#pragma unroll + for (int i = 0; i < n; i++) out[i] = in[i]; + } + + template __host__ __device__ void block_store(T *out, const T in[n]) + { +#pragma unroll + for (int i = 0; i < n; i++) out[i] = in[i]; + } + + template __host__ __device__ void block_load(T &out, const T *in) + { + out = *in; + } + + template __host__ __device__ void block_store(T *out, const T &in) + { + *out = in; + } + +} diff --git a/include/targets/sycl/atomic.cuh b/include/targets/sycl/atomic.cuh new file mode 100644 index 0000000000..a779b1237f --- /dev/null +++ b/include/targets/sycl/atomic.cuh @@ -0,0 +1,209 @@ +#pragma once + +/** + @file atomic.cuh + + @section Description + + Provides definitions of atomic functions that are not native to + CUDA. These are intentionally not declared in the namespace to + avoid confusion when resolving the native atomicAdd functions. + */ + +//inline constexpr auto mo = sycl::memory_order::relaxed; +inline constexpr auto mo = sycl::ONEAPI::memory_order::acq_rel; +//inline constexpr auto mo = memory_order::seq_cst; + +inline constexpr auto ms = sycl::ONEAPI::memory_scope::system; + +//inline constexpr auto as = sycl::access::address_space::generic_space; +inline constexpr auto as = sycl::access::address_space::global_space; + +template +using atomicRef = sycl::ONEAPI::atomic_ref; + +template +static inline atomicRef makeAtomicRef(T *address) { + return atomicRef(*address); +} + +static inline uint __float_as_uint(float x) { + return *reinterpret_cast(&x); +} + +static inline float __uint_as_float(uint x) { + return *reinterpret_cast(&x); +} + +static inline unsigned int atomicMax(unsigned int* address, unsigned int val) +{ + auto ar = makeAtomicRef(address); + auto old = ar.fetch_max(val); + return old; +} + +static inline int atomicCAS(int* address, int compare, int val) +{ + auto ar = makeAtomicRef(address); + auto old = ar.compare_exchange_strong(compare, val); + return old; +} +static inline unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) +{ + auto ar = makeAtomicRef(address); + auto old = ar.compare_exchange_strong(compare, val); + return old; +} + +/** + @brief Implementation of double-precision atomic addition using compare + and swap. Taken from the CUDA programming guide. + + @param addr Address that stores the atomic variable to be updated + @param val Value to be added to the atomic +*/ +static inline int atomicAdd(int* address, int val) +{ + auto ar = makeAtomicRef(address); + auto old = ar.fetch_add(val); + return old; +} +static inline float atomicAdd(float* address, float val) +{ + auto ar = makeAtomicRef(address); + auto old = ar.fetch_add(val); + return old; +} +static inline double atomicAdd(double* address, double val) +{ + auto ar = makeAtomicRef(address); + auto old = ar.fetch_add(val); + return old; +} + +/** + @brief Implementation of double2 atomic addition using two + double-precision additions. + + @param addr Address that stores the atomic variable to be updated + @param val Value to be added to the atomic +*/ +static inline double2 atomicAdd(double2 *addr, double2 val) +{ + double2 old = *addr; + // This is a necessary evil to avoid conflicts between the atomicAdd + // declared in the CUDA headers which are visible for host + // compilation, which cause a conflict when compiled on clang-cuda. + // As a result we do not support any architecture without native + // double precision atomics on clang-cuda. + old.x = atomicAdd((double*)addr, val.x); + old.y = atomicAdd((double*)addr + 1, val.y); + return old; +} + +/** + @brief Implementation of float2 atomic addition using two + single-precision additions. + + @param addr Address that stores the atomic variable to be updated + @param val Value to be added to the atomic +*/ +static inline float2 atomicAdd(float2 *addr, float2 val){ + float2 old = *addr; + old.x = atomicAdd((float*)addr, val.x); + old.y = atomicAdd((float*)addr + 1, val.y); + return old; +} + +/** + @brief Implementation of int2 atomic addition using two + int additions. + + @param addr Address that stores the atomic variable to be updated + @param val Value to be added to the atomic +*/ +static inline int2 atomicAdd(int2 *addr, int2 val){ + int2 old = *addr; + old.x = atomicAdd((int*)addr, val.x); + old.y = atomicAdd((int*)addr + 1, val.y); + return old; +} + +union uint32_short2 { unsigned int i; short2 s; }; + +/** + @brief Implementation of short2 atomic addition using compare + and swap. + + @param addr Address that stores the atomic variable to be updated + @param val Value to be added to the atomic +*/ +static inline short2 atomicAdd(short2 *addr, short2 val){ + uint32_short2 old, assumed, incremented; + old.s = *addr; + do { + assumed.s = old.s; + incremented.s = make_short2(val.x + assumed.s.x, val.y + assumed.s.y); + old.i = atomicCAS((unsigned int*)addr, assumed.i, incremented.i); + } while ( assumed.i != old.i ); + + return old.s; +} + +union uint32_char2 { unsigned short i; char2 s; }; + +/** + @brief Implementation of char2 atomic addition using compare + and swap. + + @param addr Address that stores the atomic variable to be updated + @param val Value to be added to the atomic +*/ +static inline char2 atomicAdd(char2 *addr, char2 val){ + uint32_char2 old, assumed, incremented; + old.s = *addr; + do { + assumed.s = old.s; + incremented.s = make_char2(val.x + assumed.s.x, val.y + assumed.s.y); + old.i = atomicCAS((unsigned int*)addr, assumed.i, incremented.i); + } while ( assumed.i != old.i ); + + return old.s; +} + +/** + @brief Implementation of single-precision atomic max using compare + and swap. May not support NaNs properly... + + @param addr Address that stores the atomic variable to be updated + @param val Value to be added to the atomic +*/ +static inline float atomicMax(float *addr, float val){ + unsigned int old = __float_as_uint(*addr), assumed; + do { + assumed = old; + if (__uint_as_float(old) >= val) break; + + old = atomicCAS((unsigned int*)addr, + assumed, + __float_as_uint(val)); + } while ( assumed != old ); + + return __uint_as_float(old); +} + +/** + @brief Implementation of single-precision atomic max specialized + for positive-definite numbers. Here we take advantage of the + property that when positive floating point numbers are + reinterpretted as unsigned integers, they have the same unique + sorted order. + + @param addr Address that stores the atomic variable to be updated + @param val Value to be added to the atomic +*/ +static inline float atomicAbsMax(float *addr, float val){ + uint32_t val_ = __float_as_uint(val); + uint32_t *addr_ = reinterpret_cast(addr); + return atomicMax(addr_, val_); +} diff --git a/include/targets/sycl/block_reduction_kernel.h b/include/targets/sycl/block_reduction_kernel.h new file mode 100644 index 0000000000..d411c4daa8 --- /dev/null +++ b/include/targets/sycl/block_reduction_kernel.h @@ -0,0 +1,59 @@ +#pragma once + +#include + +namespace quda { + + /** + @brief This helper function swizzles the block index through + mapping the block index onto a matrix and tranposing it. This is + done to potentially increase the cache utilization. Requires + that the argument class has a member parameter "swizzle" which + determines if we are swizzling and a parameter "swizzle_factor" + which is the effective matrix dimension that we are tranposing in + this mapping. + */ + template constexpr int virtual_block_idx(const Arg &arg) + { + int block_idx = blockIdx.x; +#if 0 + if (arg.swizzle) { + // the portion of the grid that is exactly divisible by the number of SMs + const int gridp = gridDim.x - gridDim.x % arg.swizzle_factor; + + block_idx = blockIdx.x; + if (blockIdx.x < gridp) { + // this is the portion of the block that we are going to transpose + const int i = blockIdx.x % arg.swizzle_factor; + const int j = blockIdx.x / arg.swizzle_factor; + + // transpose the coordinates + block_idx = i * (gridp / arg.swizzle_factor) + j; + } + } +#endif + return block_idx; + } + + /** + @brief Generic block kernel. Here, we split the block and thread + indices in the x and y dimension and pass these indices + separately to the transform functor. The x thread dimension is + templated, e.g., for efficient reductions, and typically the y + thread dimension is a trivial vectorizable dimension. + */ + template class Transformer, typename Arg> + void BlockKernel2D(Arg arg) + { +#if 0 + const dim3 block_idx(virtual_block_idx(arg), blockIdx.y, 0); + const dim3 thread_idx(threadIdx.x, threadIdx.y, 0); + const int j = blockDim.y*blockIdx.y + threadIdx.y; + if (j >= arg.threads.y) return; + + Transformer t(arg); + t(block_idx, thread_idx); +#endif + } + +} diff --git a/include/targets/sycl/inline_ptx.h b/include/targets/sycl/inline_ptx.h new file mode 100644 index 0000000000..bb3d012c22 --- /dev/null +++ b/include/targets/sycl/inline_ptx.h @@ -0,0 +1,88 @@ +#pragma once + +/* + Inline ptx instructions for low-level control of code generation. + Primarily these are for doing stores avoiding L1 cache and minimal + impact on L2 (streaming through L2). +*/ + +namespace quda { + + inline void load_streaming_double2(double2 &a, const double2* addr) + { + a.x = addr->x; a.y = addr->y; + } + + inline void load_streaming_float4(float4 &a, const float4* addr) + { + a.x = addr->x; a.y = addr->y; a.z = addr->z; a.w = addr->w; + } + + inline void load_cached_short4(short4 &a, const short4 *addr) + { + a.x = addr->x; + a.y = addr->y; + a.z = addr->z; + a.w = addr->w; + } + + inline void load_cached_short2(short2 &a, const short2 *addr) + { + a.x = addr->x; + a.y = addr->y; + } + + inline void load_global_short4(short4 &a, const short4 *addr) + { + a.x = addr->x; + a.y = addr->y; + a.z = addr->z; + a.w = addr->w; + } + + inline void load_global_short2(short2 &a, const short2 *addr) + { + a.x = addr->x; + a.y = addr->y; + } + + inline void load_global_float4(float4 &a, const float4* addr) + { + a.x = addr->x; a.y = addr->y; a.z = addr->z; a.w = addr->w; + } + + inline void store_streaming_float4(float4* addr, float x, float y, float z, float w) + { + addr->x = x; + addr->y = y; + addr->z = z; + addr->w = w; + } + + inline void store_streaming_short4(short4* addr, short x, short y, short z, short w) + { + addr->x = x; + addr->y = y; + addr->z = z; + addr->w = w; + } + + inline void store_streaming_double2(double2* addr, double x, double y) + { + addr->x = x; + addr->y = y; + } + + inline void store_streaming_float2(float2* addr, float x, float y) + { + addr->x = x; + addr->y = y; + } + + inline void store_streaming_short2(short2* addr, short x, short y) + { + addr->x = x; + addr->y = y; + } + +} // namespace quda diff --git a/include/targets/sycl/kernel.h b/include/targets/sycl/kernel.h new file mode 100644 index 0000000000..95c55c611c --- /dev/null +++ b/include/targets/sycl/kernel.h @@ -0,0 +1,123 @@ +#pragma once +#include + +namespace quda { + + template