diff --git a/include/ylt/simd_util/avx2/str_split.h b/include/ylt/simd_util/avx2/str_split.h new file mode 100644 index 000000000..15f108d9f --- /dev/null +++ b/include/ylt/simd_util/avx2/str_split.h @@ -0,0 +1,61 @@ +#pragma once +#include + +#include +#include +#include +namespace ylt { +namespace avx2 { + +template +concept StringLike = + std::same_as || std::same_as; +template +__attribute__((__target__("avx2,bmi"))) inline std::vector +simd_str_split(std::string_view string, const char delim) { + auto* pstr = string.data(); + size_t size = string.size(); + size_t start = 0; + + std::vector output; + size_t aligned32_size = size & 0xFFFFFFFFFFFFFFE0UL; + for (size_t i = 0; i < aligned32_size; i += 32) { + __m256i data = + _mm256_lddqu_si256(reinterpret_cast(&pstr[i])); + const __m256i match = _mm256_set1_epi8(delim); + uint32_t mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(data, match)); + while (mask != 0) { + auto j = __builtin_ctzl(mask); + output.emplace_back(&pstr[start], i + j - start); + start = i + j + 1; + mask &= mask - 1; + } + } + + size_t aligned16_size = size & 0xFFFFFFFFFFFFFFF0UL; + if (aligned32_size < aligned16_size) { + __m128i data = _mm_lddqu_si128( + reinterpret_cast(&pstr[aligned32_size])); + const __m128i match = _mm_set1_epi8(delim); + uint32_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(data, match)); + while (mask != 0) { + auto j = __builtin_ctzl(mask); + output.emplace_back(&pstr[start], aligned32_size + j - start); + start = aligned32_size + j + 1; + mask &= mask - 1; + } + } + + size_t i = aligned16_size; + do { + while (pstr[i] != delim && i != size) { + ++i; + } + output.emplace_back(&pstr[start], i - start); + start = i = i + 1; + } while (i <= size); + return output; +} + +} // namespace avx2 +} // namespace ylt \ No newline at end of file diff --git a/include/ylt/simd_util/avx512/str_split.h b/include/ylt/simd_util/avx512/str_split.h new file mode 100644 index 000000000..c6a7ad0c4 --- /dev/null +++ b/include/ylt/simd_util/avx512/str_split.h @@ -0,0 +1,76 @@ +#pragma once +#include + +#include +#include +#include +namespace ylt { +namespace avx512 { + +template +concept StringLike = + std::same_as || std::same_as; +template +__attribute__((__target__("avx512bw,bmi"))) +// auto chose target +inline std::vector +simd_str_split(std::string_view string, const char delim) { + auto* pstr = string.data(); + size_t size = string.size(); + size_t start = 0; + + std::vector output; + + size_t aligned64_size = size & 0xFFFFFFFFFFFFFFC0UL; + for (size_t i = 0; i < aligned64_size; i += 64) { + __m512i data = _mm512_loadu_si512(&pstr[i]); + const __m512i match = _mm512_set1_epi8(delim); + uint64_t mask = _mm512_cmpeq_epi8_mask(data, match); + while (mask != 0) { + auto j = __builtin_ctzll(mask); + output.emplace_back(&pstr[start], i + j - start); + start = i + j + 1; + mask &= mask - 1; + } + } + + size_t aligned32_size = size & 0xFFFFFFFFFFFFFFE0UL; + if (aligned64_size < aligned32_size) { + __m256i data = _mm256_lddqu_si256( + reinterpret_cast(&pstr[aligned64_size])); + const __m256i match = _mm256_set1_epi8(delim); + uint32_t mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(data, match)); + while (mask != 0) { + auto j = __builtin_ctzl(mask); + output.emplace_back(&pstr[start], aligned64_size + j - start); + start = aligned64_size + j + 1; + mask &= mask - 1; + } + } + + size_t aligned16_size = size & 0xFFFFFFFFFFFFFFF0UL; + if (aligned32_size < aligned16_size) { + __m128i data = _mm_lddqu_si128( + reinterpret_cast(&pstr[aligned32_size])); + const __m128i match = _mm_set1_epi8(delim); + uint32_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(data, match)); + while (mask != 0) { + auto j = __builtin_ctzl(mask); + output.emplace_back(&pstr[start], aligned32_size + j - start); + start = aligned32_size + j + 1; + mask &= mask - 1; + } + } + + size_t i = aligned16_size; + do { + while (pstr[i] != delim && i != size) { + ++i; + } + output.emplace_back(&pstr[start], i - start); + start = i = i + 1; + } while (i <= size); + return output; +} +} // namespace avx512 +} // namespace ylt \ No newline at end of file diff --git a/include/ylt/simd_util/common/str_split.h b/include/ylt/simd_util/common/str_split.h new file mode 100644 index 000000000..737b0668c --- /dev/null +++ b/include/ylt/simd_util/common/str_split.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include + +namespace ylt { + +namespace common { +template +concept StringLike = + std::same_as || std::same_as; +template +inline std::vector simd_str_split(std::string_view s, + const char delimiter) { + size_t start = 0; + size_t end = s.find_first_of(delimiter); + + std::vector output; + + while (end <= StringLike::npos) { + output.emplace_back(s.substr(start, end - start)); + + if (end == StringLike::npos) + break; + + start = end + 1; + end = s.find_first_of(delimiter, start); + } + + return output; +} +} // namespace common + +} // namespace ylt \ No newline at end of file diff --git a/include/ylt/simd_util/neon/str_split.h b/include/ylt/simd_util/neon/str_split.h new file mode 100644 index 000000000..7747a59ad --- /dev/null +++ b/include/ylt/simd_util/neon/str_split.h @@ -0,0 +1,92 @@ +#pragma once + +#include + +#include +#include +#include + +namespace ylt { +namespace neon { + +template +concept StringLike = + std::same_as || std::same_as; + +template +inline std::vector simd_str_split(std::string_view string, + const char delim) { + auto* pstr = string.data(); + size_t size = string.size(); + size_t start = 0; + + std::vector output; + // Similar to memchr implementation, the first round of 256-bit detection + size_t aligned32_size = size & 0xFFFFFFFFFFFFFFE0UL; + uint8x16_t match = vmovq_n_u8(delim); + for (size_t i = 0; i < aligned32_size; i += 32) { + uint8x16_t data1 = vld1q_u8(reinterpret_cast(&pstr[i])); + uint8x16_t data2 = + vld1q_u8(reinterpret_cast(&pstr[i + 16])); + uint8x16_t result1 = vceqq_u8(data1, match); + uint8x16_t result2 = vceqq_u8(data2, match); + // Quickly fold the 256-bit detection results to 64-bit. It cannot be + // accurately located, but can be used for quick skipping. + uint64x2_t result64 = vreinterpretq_u64_u8(vorrq_u8(result1, result2)); + result64 = vpaddq_u64(result64, result64); + if (result64[0] != 0) { + // Convert the detection result from 0xFF to 0x01, 0x04, 0x10, 0x40 + // The final fold will form 32 hit marks on 64 bits, and the alternate + // bits will take effect. For example, if all hits are found, the result + // will be 0x55555555555555555UL + uint8x16_t vmask = + vreinterpretq_u8_u64(vdupq_n_u64(0x4010040140100401UL)); + result1 = vandq_u8(result1, vmask); + result2 = vandq_u8(result2, vmask); + result1 = vpaddq_u8(result1, result2); + result1 = vpaddq_u8(result1, result1); + uint64_t mask = vreinterpretq_u64_u8(result1)[0]; + while (mask != 0) { + auto j = __builtin_ctzll(mask) >> 1; + output.emplace_back(&pstr[start], i + j - start); + start = i + j + 1; + mask &= mask - 1; + } + } + } + size_t aligned16_size = size & 0xFFFFFFFFFFFFFFF0UL; + if (aligned32_size < aligned16_size) { + uint8x16_t data = + vld1q_u8(reinterpret_cast(&pstr[aligned32_size])); + uint8x16_t result = vceqq_u8(data, match); + uint64x2_t result64 = vreinterpretq_u64_u8(result); + result64 = vpaddq_u64(result64, result64); + if (result64[0] != 0) { + uint8x16_t vmask = + vreinterpretq_u8_u64(vdupq_n_u64(0x4010040140100401UL)); + result = vandq_u8(result, vmask); + result = vpaddq_u8(result, result); + result = vpaddq_u8(result, result); + uint32_t mask = vreinterpretq_u32_u8(result)[0]; + while (mask != 0) { + auto j = __builtin_ctzl(mask) >> 1; + output.emplace_back(&pstr[start], aligned32_size + j - start); + start = aligned32_size + j + 1; + mask &= mask - 1; + } + } + } + + size_t i = aligned16_size; + do { + while (pstr[i] != delim && i != size) { + ++i; + } + output.emplace_back(&pstr[start], i - start); + start = i = i + 1; + } while (i <= size); + return output; +} + +} // namespace neon +} // namespace ylt \ No newline at end of file diff --git a/include/ylt/simd_util/simd_str_split.h b/include/ylt/simd_util/simd_str_split.h new file mode 100644 index 000000000..1a1b1e16d --- /dev/null +++ b/include/ylt/simd_util/simd_str_split.h @@ -0,0 +1,19 @@ +#pragma once + +#include "ylt_simd_dispatch.h" + +#include INCLUDE_ARCH_FILE(str_split.h) + +namespace ylt { +YLT_USING_ARCH_FUNC(simd_str_split); + +static inline std::vector split_str(std::string_view string, + const char delim) { + return simd_str_split(string, delim); +} + +static inline std::vector split_sv(std::string_view string, + const char delim) { + return simd_str_split(string, delim); +} +} // namespace ylt \ No newline at end of file diff --git a/include/ylt/simd_util/sse/str_split.h b/include/ylt/simd_util/sse/str_split.h new file mode 100644 index 000000000..272d42991 --- /dev/null +++ b/include/ylt/simd_util/sse/str_split.h @@ -0,0 +1,49 @@ +#pragma once + +#include + +#include +#include +#include + +namespace ylt { +namespace sse { + +template +concept StringLike = + std::same_as || std::same_as; +template +__attribute__((__target__("sse4.2"))) inline std::vector +simd_str_split(std::string_view string, const char delim) { + auto* pstr = string.data(); + size_t size = string.size(); + size_t start = 0; + + std::vector output; + size_t aligned16_size = size & 0xFFFFFFFFFFFFFFF0UL; + + for (size_t i = 0; i < aligned16_size; i += 16) { + __m128i data = _mm_lddqu_si128(reinterpret_cast(&pstr[i])); + const __m128i match = _mm_set1_epi8(delim); + uint32_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8(data, match)); + while (mask != 0) { + auto j = __builtin_ctzl(mask); + output.emplace_back(&pstr[start], i + j - start); + start = i + j + 1; + mask &= mask - 1; + } + } + + size_t i = aligned16_size; + do { + while (pstr[i] != delim && i != size) { + ++i; + } + output.emplace_back(&pstr[start], i - start); + start = i = i + 1; + } while (i <= size); + return output; +} + +} // namespace sse +} // namespace ylt \ No newline at end of file diff --git a/include/ylt/simd_util/ylt_cpu_feature.h b/include/ylt/simd_util/ylt_cpu_feature.h new file mode 100644 index 000000000..209351fa5 --- /dev/null +++ b/include/ylt/simd_util/ylt_cpu_feature.h @@ -0,0 +1,21 @@ +#pragma once + +#if defined(__APPLE__) || defined(__linux__) || defined(__FreeBSD__) || \ + defined(__unix__) +#endif + +#if defined(__SSE2__) +#if defined(__SSE4_2__) +#define YLT_HAVE_SSE +#endif +#if defined(__AVX2__) +#define YLT_HAVE_AVX2 +#endif +#if defined(__AVX512F__) +#define YLT_HAVE_AVX512 +#endif +#else +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#define YLT_HAVE_NEON +#endif +#endif diff --git a/include/ylt/simd_util/ylt_simd_dispatch.h b/include/ylt/simd_util/ylt_simd_dispatch.h new file mode 100644 index 000000000..39dbf9a36 --- /dev/null +++ b/include/ylt/simd_util/ylt_simd_dispatch.h @@ -0,0 +1,26 @@ +#pragma once + +#include "ylt_cpu_feature.h" +#include "ylt_simd_macro.h" + +#if defined(YLT_HAVE_AVX512) +#define YLT_USING_ARCH_FUNC(func) using avx512::func +#define INCLUDE_ARCH_FILE(file) YLT_STRINGIFY(avx512/file) +#elif defined(YLT_HAVE_AVX2) +#define YLT_USING_ARCH_FUNC(func) using avx2::func +#define INCLUDE_ARCH_FILE(file) YLT_STRINGIFY(avx2/file) +#elif defined(YLT_HAVE_SSE) +#define YLT_USING_ARCH_FUNC(func) using sse::func +#define INCLUDE_ARCH_FILE(file) YLT_STRINGIFY(sse/file) +#else +#if defined(YLT_HAVE_NEON) +#define YLT_USING_ARCH_FUNC(func) using neon::func +#define INCLUDE_ARCH_FILE(file) YLT_STRINGIFY(neon/file) +#endif +#endif + +#if !defined(YLT_HAVE_AVX2) && !defined(YLT_HAVE_AVX512) && \ + !defined(YLT_HAVE_SSE) && !defined(YLT_HAVE_NEON) +#define YLT_USING_ARCH_FUNC(func) using common::func +#define INCLUDE_ARCH_FILE(file) YLT_STRINGIFY(common/file) +#endif \ No newline at end of file diff --git a/include/ylt/simd_util/ylt_simd_macro.h b/include/ylt/simd_util/ylt_simd_macro.h new file mode 100644 index 000000000..8cdacfa31 --- /dev/null +++ b/include/ylt/simd_util/ylt_simd_macro.h @@ -0,0 +1,6 @@ +#pragma once + +#ifndef YLT_STRINGIFY +#define YLT_STRINGIFY(s) YLT_STRINGIFY2(s) +#define YLT_STRINGIFY2(s) #s +#endif diff --git a/src/simd_util/benchmark/CMakeLists.txt b/src/simd_util/benchmark/CMakeLists.txt new file mode 100644 index 000000000..252eff848 --- /dev/null +++ b/src/simd_util/benchmark/CMakeLists.txt @@ -0,0 +1,25 @@ +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/output/benchmark) + +#set(SIMD_BENCHMARK_COMPILE_FLAGS -mavx2 -mbmi -mlzcnt) + + +#include(FetchContent) +#FetchContent_Declare(googletest +# GIT_REPOSITORY https://github.com/google/googletest.git +# GIT_TAG v1.10.x) + +#FetchContent_Declare( +# googlebenchmark +# GIT_REPOSITORY https://github.com/google/benchmark.git +# GIT_TAG v1.8.0 +#) + +#FetchContent_MakeAvailable( +# googletest +# googlebenchmark) + +#add_executable(simd_util_benchmark main.cpp) +#target_link_libraries(simd_util_benchmark PRIVATE benchmark::benchmark) +#add_test(NAME run_benchmark COMMAND simd_util_benchmark) + +message("ci requires Google Test and Google Benchmark.Remove the comment when the user needs to run the benchmark.") \ No newline at end of file diff --git a/src/simd_util/benchmark/main.cpp b/src/simd_util/benchmark/main.cpp new file mode 100644 index 000000000..5199509f4 --- /dev/null +++ b/src/simd_util/benchmark/main.cpp @@ -0,0 +1,88 @@ +#include +#include // int64_t, uint64_t +#include // timeval, gettimeofday +#include // timespec, clock_gettime +#include + +#include +#include +#include +#include + +inline std::vector normal_str_split(std::string_view s, + const char delimiter) { + size_t start = 0; + size_t end = s.find_first_of(delimiter); + + std::vector output; + + while (end <= std::string_view::npos) { + output.emplace_back(s.substr(start, end - start)); + + if (end == std::string_view::npos) + break; + + start = end + 1; + end = s.find_first_of(delimiter, start); + } + + return output; +} + +inline int64_t gettimeofday_us() { + timeval now; + gettimeofday(&now, NULL); + return now.tv_sec * 1000000L + now.tv_usec; +} + +std::string generate_test_string(size_t length, char delimiter, + size_t avg_segment_len) { + std::string s; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution char_dist('a', 'z'); + std::uniform_int_distribution seg_dist(1, 2 * avg_segment_len); + + while (s.size() < length) { + size_t seg_len = seg_dist(gen); + for (size_t i = 0; i < seg_len && s.size() < length; ++i) { + s += static_cast(char_dist(gen)); + } + if (s.size() < length) + s += delimiter; + } + return s; +} + +// Benchmark runner +template +void BM_Split(benchmark::State& state, Func split_func, + bool return_string_view) { + const size_t str_len = state.range(0); + const char delimiter = ','; + const size_t avg_segment_len = 10; + auto test_str = generate_test_string(str_len, delimiter, avg_segment_len); + + for (auto _ : state) { + if (return_string_view) { + auto result = split_func(test_str, delimiter); + benchmark::DoNotOptimize(result); + } + else { + auto result = split_func(test_str, delimiter); + benchmark::DoNotOptimize(result); + } + } +} + +BENCHMARK_CAPTURE(BM_Split, simd_str_split_sv, ylt::split_sv, true) + ->Args({100}) + ->Args({1000}) + ->Args({10000}); + +BENCHMARK_CAPTURE(BM_Split, naive_str_split_sv, normal_str_split, true) + ->Args({100}) + ->Args({1000}) + ->Args({10000}); + +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/src/simd_util/tests/CMakeLists.txt b/src/simd_util/tests/CMakeLists.txt new file mode 100644 index 000000000..fa11e1c5e --- /dev/null +++ b/src/simd_util/tests/CMakeLists.txt @@ -0,0 +1,9 @@ +set(CMAKE_INCLUDE_CURRENT_DIR ON) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/output/tests/simd_util) + +#set(SIMD_TEST_COMPILE_FLAGS -mavx2 -mbmi -mlzcnt) + +add_executable(util_simd_string_split_test test_simd_string_split.cpp main.cpp) +#target_compile_options(util_simd_string_split_test PRIVATE ${SIMD_TEST_COMPILE_FLAGS}) + +add_test(NAME util_simd_string_split_test COMMAND util_simd_string_split_test) \ No newline at end of file diff --git a/src/simd_util/tests/main.cpp b/src/simd_util/tests/main.cpp new file mode 100644 index 000000000..c5890983a --- /dev/null +++ b/src/simd_util/tests/main.cpp @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2023, Alibaba Group Holding Limited; + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#define DOCTEST_CONFIG_IMPLEMENT + +#include "doctest.h" + +// doctest comments +// 'function' : must be 'attribute' - see issue #182 +DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4007) +int main(int argc, char** argv) { return doctest::Context(argc, argv).run(); } +DOCTEST_MSVC_SUPPRESS_WARNING_POP \ No newline at end of file diff --git a/src/simd_util/tests/test_simd_string_split.cpp b/src/simd_util/tests/test_simd_string_split.cpp new file mode 100644 index 000000000..1edba6e0d --- /dev/null +++ b/src/simd_util/tests/test_simd_string_split.cpp @@ -0,0 +1,27 @@ +#include + +#include "doctest.h" + +TEST_CASE("test string_view split") { + std::string_view sv_tmp = + "hello world\t127.0.0.1\t1024\twww.yalantinglibs.com"; + + auto tokens = ylt::split_sv(sv_tmp, '\t'); + + CHECK(tokens[0] == "hello world"); + CHECK(tokens[1] == "127.0.0.1"); + CHECK(tokens[2] == "1024"); + CHECK(tokens[3] == "www.yalantinglibs.com"); +} + +TEST_CASE("test string split") { + std::string_view sv_tmp = + "hello world\t127.0.0.1\t1024\twww.yalantinglibs.com"; + + auto tokens = ylt::split_str(sv_tmp, '\t'); + + CHECK(tokens[0] == "hello world"); + CHECK(tokens[1] == "127.0.0.1"); + CHECK(tokens[2] == "1024"); + CHECK(tokens[3] == "www.yalantinglibs.com"); +} \ No newline at end of file