diff --git a/.clang-format b/.clang-format index ba8ff9f..99f4b90 100644 --- a/.clang-format +++ b/.clang-format @@ -1,3 +1,8 @@ +IndentWidth: 2 BasedOnStyle: 'google' ColumnLimit: 100 -SortIncludes: false \ No newline at end of file +SortIncludes: false +QualifierAlignment: Right +UseTab: Never +DerivePointerAlignment: false +PointerAlignment: Left diff --git a/inc/mkn/gpu/cuda.hpp b/inc/mkn/gpu/cuda.hpp index 68c3837..c8071f8 100644 --- a/inc/mkn/gpu/cuda.hpp +++ b/inc/mkn/gpu/cuda.hpp @@ -50,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. { \ gpuAssert((ans), __FILE__, __LINE__); \ } -inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) { +inline void gpuAssert(cudaError_t code, char const* file, int line, bool abort = true) { if (code != cudaSuccess) { fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) std::abort(); diff --git a/inc/mkn/gpu/launchers.hpp b/inc/mkn/gpu/launchers.hpp index 32f3509..877d044 100644 --- a/inc/mkn/gpu/launchers.hpp +++ b/inc/mkn/gpu/launchers.hpp @@ -48,7 +48,7 @@ auto as_values(Args&... args) { template struct GDLauncher : public GLauncher { - GDLauncher(std::size_t s, size_t dev = 0) : GLauncher{s, dev} {} + GDLauncher(std::size_t const s, size_t const dev = 0) : GLauncher{s, dev} {} template auto operator()(F&& f, Args&&... args) { @@ -69,7 +69,7 @@ struct GDLauncher : public GLauncher { template struct DLauncher : public Launcher { - DLauncher(size_t /*dev*/ = 0) : Launcher{{}, {}} {} + DLauncher(size_t const /*dev*/ = 0) : Launcher{{}, {}} {} template auto operator()(F&& f, Args&&... args) { diff --git a/inc/mkn/gpu/rocm.hpp b/inc/mkn/gpu/rocm.hpp index 19fdb5b..4eef4d9 100644 --- a/inc/mkn/gpu/rocm.hpp +++ b/inc/mkn/gpu/rocm.hpp @@ -46,7 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. { \ gpuAssert((ans), __FILE__, __LINE__); \ } -inline void gpuAssert(hipError_t code, const char* file, int line, bool abort = true) { +inline void gpuAssert(hipError_t code, char const* file, int line, bool abort = true) { if (code != hipSuccess) { fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line); if (abort) std::abort(); diff --git a/inc/mkn/gpu/tuple.hpp b/inc/mkn/gpu/tuple.hpp index 383a2b9..2f363f6 100644 --- a/inc/mkn/gpu/tuple.hpp +++ b/inc/mkn/gpu/tuple.hpp @@ -121,7 +121,7 @@ struct SpanSet : ASpanSet { curr_pos += sv->sizes[curr_ptr++]; return *this; } - bool operator!=(const iterator& /*other*/) const __device__ { + bool operator!=(iterator const& /*other*/) const __device__ { return curr_ptr != sv->sizes.size(); } Span operator*() const { diff --git a/test/any/array.cpp b/test/any/array.cpp index 516df90..27c80c2 100644 --- a/test/any/array.cpp +++ b/test/any/array.cpp @@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT; static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16; template -__global__ void vectoradd(T* a, const T* b, const T* c) { +__global__ void vectoradd(T* a, T const* b, T const* c) { auto i = mkn::gpu::idx(); a[i] = b[i] + c[i]; } diff --git a/test/any/async_streaming.cpp b/test/any/async_streaming.cpp index 23b0d2d..7a3406d 100644 --- a/test/any/async_streaming.cpp +++ b/test/any/async_streaming.cpp @@ -2,6 +2,7 @@ #include #include +#include "mkn/gpu.hpp" #include "mkn/kul/dbg.hpp" #include "mkn/kul/time.hpp" #include "mkn/gpu/multi_launch.hpp" @@ -179,7 +180,7 @@ std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) { .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 3; })(); std::size_t val = 5; - for (std::size_t i = 0; i < vecs.size(); i++) { + for (std::size_t i = 0; i < vecs.size(); ++i) { if (i % group_size == 0) { for (auto const& e : vecs[i]) if (e != val + 1) return 1; @@ -193,6 +194,42 @@ std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) { return 0; } +std::uint32_t test_threaded_detached_stream_fns(std::size_t const& nthreads = 2) { + using T = double; + KUL_DBG_FUNC_ENTER; + + std::vector> vecs(C, ManagedVector(NUM, 0)); + for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i); + + ManagedVector datas(C); + for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data(); + auto views = datas.data(); + + ThreadedStreamLauncher launcher{vecs, nthreads}; + launcher + .host([&](auto i) mutable { + launcher.streams[i].sync(); // wait for first kernel per stream + for (auto& e : vecs[i]) e += 1; + }) + .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 3; }); + + for (std::size_t i = 0; i < datas.size(); ++i) { + mkn::gpu::GDLauncher{NUM}.stream( + launcher.streams[i], [=, idx = i] __device__() { views[idx][mkn::gpu::idx()] += 1; }); + } + + launcher(); + + std::size_t val = 5; + for (std::size_t i = 0; i < vecs.size(); ++i) { + for (std::size_t j = 0; j < vecs[i].size(); ++j) + if (val != vecs[i][j]) return 1; + ++val; + }; + + return 0; +} + int main() { KOUT(NON) << __FILE__; return test() // @@ -200,5 +237,5 @@ int main() { + test_threaded(6) // + test_threaded_group_barrier() // + test_threaded_host_group_mutex() // - + test_threaded_host_group_idx(); + + test_threaded_host_group_idx() + test_threaded_detached_stream_fns(); } diff --git a/test/cpu/namespace.cpp b/test/cpu/namespace.cpp index 7171a2c..f204ad4 100644 --- a/test/cpu/namespace.cpp +++ b/test/cpu/namespace.cpp @@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT; static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16; template -__global__ void vectoradd(T* a, const T* b, const T* c) { +__global__ void vectoradd(T* a, T const* b, T const* c) { auto i = mkn::gpu::cpu::idx(); a[i] = b[i] + c[i]; } diff --git a/test/cuda/add.cpp b/test/cuda/add.cpp index 3421af3..d976374 100644 --- a/test/cuda/add.cpp +++ b/test/cuda/add.cpp @@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT; static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16; template -__global__ void vectoradd(T* a, const T* b, const T* c) { +__global__ void vectoradd(T* a, T const* b, T const* c) { auto i = mkn::gpu::cuda::idx(); a[i] = b[i] + c[i]; } diff --git a/test/cuda/async.cpp b/test/cuda/async.cpp index 8e6a5e3..b8cca61 100644 --- a/test/cuda/async.cpp +++ b/test/cuda/async.cpp @@ -43,7 +43,7 @@ inline cudaError_t checkCuda(cudaError_t result) { return result; } -__global__ void kernel(float *a, int offset) { +__global__ void kernel(float* a, int offset) { int i = offset + threadIdx.x + blockIdx.x * blockDim.x; float x = (float)i; float s = sinf(x); @@ -51,7 +51,7 @@ __global__ void kernel(float *a, int offset) { a[i] = a[i] + sqrtf(s * s + c * c); } -float maxError(float *a, int n) { +float maxError(float* a, int n) { float maxE = 0; for (int i = 0; i < n; i++) { float error = fabs(a[i] - 1.0f); @@ -60,12 +60,12 @@ float maxError(float *a, int n) { return maxE; } -int main(int argc, char **argv) { - const int blockSize = 256, nStreams = 4; - const int n = 4 * 1024 * blockSize * nStreams; - const int streamSize = n / nStreams; - const int streamBytes = streamSize * sizeof(float); - const int bytes = n * sizeof(float); +int main(int argc, char** argv) { + int const blockSize = 256, nStreams = 4; + int const n = 4 * 1024 * blockSize * nStreams; + int const streamSize = n / nStreams; + int const streamBytes = streamSize * sizeof(float); + int const bytes = n * sizeof(float); int devId = 0; if (argc > 1) devId = atoi(argv[1]); @@ -77,8 +77,8 @@ int main(int argc, char **argv) { // allocate pinned host memory and device memory float *a, *d_a; - checkCuda(cudaMallocHost((void **)&a, bytes)); // host pinned - checkCuda(cudaMalloc((void **)&d_a, bytes)); // device + checkCuda(cudaMallocHost((void**)&a, bytes)); // host pinned + checkCuda(cudaMalloc((void**)&d_a, bytes)); // device float ms; // elapsed time in milliseconds diff --git a/test/cuda/atomic.cpp b/test/cuda/atomic.cpp index 55e53f3..ab1a47e 100644 --- a/test/cuda/atomic.cpp +++ b/test/cuda/atomic.cpp @@ -16,7 +16,7 @@ struct S { } // namespace NS0 template -__global__ void vectoradd(T* a, const T* b, const T* c) { +__global__ void vectoradd(T* a, T const* b, T const* c) { auto i = mkn::gpu::cuda::idx(); NS0::S{a[i]} += b[i] + c[i]; diff --git a/test/hip/add.cpp b/test/hip/add.cpp index df296bb..54421d8 100644 --- a/test/hip/add.cpp +++ b/test/hip/add.cpp @@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT; static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16; template -__global__ void vectoradd(T* a, const T* b, const T* c) { +__global__ void vectoradd(T* a, T const* b, T const* c) { auto i = mkn::gpu::hip::idx(); a[i] = b[i] + c[i]; } diff --git a/test/hip/async.cpp b/test/hip/async.cpp index afac4ae..5aed657 100644 --- a/test/hip/async.cpp +++ b/test/hip/async.cpp @@ -43,7 +43,7 @@ inline void checkHip([[maybe_unused]] hipError_t result) { // return result; } -__global__ void kernel(float *a, int offset) { +__global__ void kernel(float* a, int offset) { int i = offset + threadIdx.x + blockIdx.x * blockDim.x; float x = (float)i; float s = sinf(x); @@ -51,7 +51,7 @@ __global__ void kernel(float *a, int offset) { a[i] = a[i] + sqrtf(s * s + c * c); } -float maxError(float *a, int n) { +float maxError(float* a, int n) { float maxE = 0; for (int i = 0; i < n; i++) { float error = fabs(a[i] - 1.0f); @@ -60,12 +60,12 @@ float maxError(float *a, int n) { return maxE; } -int main(int argc, char **argv) { - const int blockSize = 256, nStreams = 4; - const int n = 4 * 1024 * blockSize * nStreams; - const int streamSize = n / nStreams; - const int streamBytes = streamSize * sizeof(float); - const int bytes = n * sizeof(float); +int main(int argc, char** argv) { + int const blockSize = 256, nStreams = 4; + int const n = 4 * 1024 * blockSize * nStreams; + int const streamSize = n / nStreams; + int const streamBytes = streamSize * sizeof(float); + int const bytes = n * sizeof(float); int devId = 0; if (argc > 1) devId = atoi(argv[1]); @@ -77,8 +77,8 @@ int main(int argc, char **argv) { // allocate pinned host memory and device memory float *a, *d_a; - checkHip(hipHostMalloc((void **)&a, bytes)); // host pinned - checkHip(hipMalloc((void **)&d_a, bytes)); // device + checkHip(hipHostMalloc((void**)&a, bytes)); // host pinned + checkHip(hipMalloc((void**)&d_a, bytes)); // device float ms; // elapsed time in milliseconds