From ee6c878b50e4775defa873e4fb2adf5086f8c14f Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 12 Mar 2025 04:49:29 -0700 Subject: [PATCH 1/7] switch strncopy to memmove for DTO --- cachelib/cachebench/cache/Cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h index 6f1d35683d..c129360e0b 100644 --- a/cachelib/cachebench/cache/Cache.h +++ b/cachelib/cachebench/cache/Cache.h @@ -1331,7 +1331,7 @@ void Cache::setStringItem(WriteHandle& handle, } auto ptr = reinterpret_cast(getMemory(handle)); - std::strncpy(ptr, str.c_str(), dataSize); + std::memmove(ptr, str.c_str(), dataSize); // Make sure the copied string ends with null char if (str.size() + 1 > dataSize) { From 79f401b5996f73a9228bd67f83d584c86e83570e Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 30 Apr 2025 17:22:44 -0700 Subject: [PATCH 2/7] add dto async --- cachelib/cachebench/runner/CacheStressor.h | 23 ++++++++++++++++++++-- cachelib/cachebench/util/Config.cpp | 1 + cachelib/cachebench/util/Config.h | 3 +++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/cachelib/cachebench/runner/CacheStressor.h b/cachelib/cachebench/runner/CacheStressor.h index af3b8b9101..279dec3372 100644 --- a/cachelib/cachebench/runner/CacheStressor.h +++ b/cachelib/cachebench/runner/CacheStressor.h @@ -28,6 +28,8 @@ #include #include +#include + #include "cachelib/cachebench/cache/Cache.h" #include "cachelib/cachebench/cache/TimeStampTicker.h" #include "cachelib/cachebench/runner/Stressor.h" @@ -43,6 +45,11 @@ namespace cachebench { constexpr uint32_t kNvmCacheWarmUpCheckRate = 1000; +void async_memcpy_callback(void *arg) { + auto &fn = *reinterpret_cast*>(arg); + fn(); +} + // Implementation of stressor that uses a workload generator to stress an // instance of the cache. All item's value in CacheStressor follows CacheValue // schema, which contains a few integers for sanity checks use. So it is invalid @@ -493,8 +500,20 @@ class CacheStressor : public Stressor { ++stats.setFailure; return OpResultType::kSetFailure; } else { - populateItem(it, itemValue); - cache_->insertOrReplace(it); + if (config_.useDTOAsync && size > 32*1024) { + //it->markMoving(); + auto insertToCache = [&] { + cache_->insertOrReplace(it); + }; + + std::function fn = insertToCache; + dto_memcpy_async( + it->getMemory(), itemValue.data(), size, &async_memcpy_callback, &insertToCache); + //it->unmarkMoving(); + } else { + populateItem(it, itemValue); + cache_->insertOrReplace(it); + } return OpResultType::kSetSuccess; } } diff --git a/cachelib/cachebench/util/Config.cpp b/cachelib/cachebench/util/Config.cpp index 133074e50c..c66bd5213d 100644 --- a/cachelib/cachebench/util/Config.cpp +++ b/cachelib/cachebench/util/Config.cpp @@ -68,6 +68,7 @@ StressorConfig::StressorConfig(const folly::dynamic& configJson) { JSONSetVal(configJson, checkNvmCacheWarmUp); JSONSetVal(configJson, useCombinedLockForIterators); + JSONSetVal(configJson, useDTOAsync); if (configJson.count("poolDistributions")) { for (auto& it : configJson["poolDistributions"]) { diff --git a/cachelib/cachebench/util/Config.h b/cachelib/cachebench/util/Config.h index dcb2ea3b63..7a5b1225ba 100644 --- a/cachelib/cachebench/util/Config.h +++ b/cachelib/cachebench/util/Config.h @@ -331,6 +331,9 @@ struct StressorConfig : public JSONConfig { uint64_t timestampFactor{1000}; bool useCombinedLockForIterators{false}; + + // if we want to use async DSA function + bool useDTOAsync{false}; // admission policy for cache. std::shared_ptr admPolicy{}; From b9b944a6a0b9415be28af945d262c66ef65537dc Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 10 Sep 2025 08:23:40 -0700 Subject: [PATCH 3/7] add DTO build in getdeps and cmake, use compile definitions for DTO api --- build/fbcode_builder/manifests/accel-config | 9 +++++++++ build/fbcode_builder/manifests/cachelib | 1 + build/fbcode_builder/manifests/dto | 14 ++++++++++++++ build/fbcode_builder/manifests/uuid | 9 +++++++++ cachelib/CMakeLists.txt | 10 ++++++++++ cachelib/cachebench/CMakeLists.txt | 5 ++++- cachelib/cachebench/runner/CacheStressor.h | 16 +++++++++------- 7 files changed, 56 insertions(+), 8 deletions(-) create mode 100644 build/fbcode_builder/manifests/accel-config create mode 100644 build/fbcode_builder/manifests/dto create mode 100644 build/fbcode_builder/manifests/uuid diff --git a/build/fbcode_builder/manifests/accel-config b/build/fbcode_builder/manifests/accel-config new file mode 100644 index 0000000000..1a51058147 --- /dev/null +++ b/build/fbcode_builder/manifests/accel-config @@ -0,0 +1,9 @@ +[manifest] +name = accel-config + +[rpms] +accel-config-devel + +[debs.distro=ubuntu] +libaccel-config-dev + diff --git a/build/fbcode_builder/manifests/cachelib b/build/fbcode_builder/manifests/cachelib index c340a2bb56..1386034f15 100644 --- a/build/fbcode_builder/manifests/cachelib +++ b/build/fbcode_builder/manifests/cachelib @@ -25,6 +25,7 @@ zstd mvfst numa libaio +dto # cachelib also depends on openssl but since the latter requires a platform- # specific configuration we rely on the folly manifest to provide this # dependency to avoid duplication. diff --git a/build/fbcode_builder/manifests/dto b/build/fbcode_builder/manifests/dto new file mode 100644 index 0000000000..9d171ef933 --- /dev/null +++ b/build/fbcode_builder/manifests/dto @@ -0,0 +1,14 @@ +[manifest] +name = dto + +[git] +repo_url = https://github.com/intel/DTO.git +branch = cachelib + +[build] +builder = cmake + +[dependencies] +accel-config +uuid +numa diff --git a/build/fbcode_builder/manifests/uuid b/build/fbcode_builder/manifests/uuid new file mode 100644 index 0000000000..d334fdd732 --- /dev/null +++ b/build/fbcode_builder/manifests/uuid @@ -0,0 +1,9 @@ +[manifest] +name = uuid + +[rpms] +libuuid-devel + +[debs] +uuid-dev + diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt index 20598fc55e..89efa6d2b4 100644 --- a/cachelib/CMakeLists.txt +++ b/cachelib/CMakeLists.txt @@ -44,6 +44,16 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) option(BUILD_TESTS "If enabled, compile the tests." ON) +option(BUILD_WITH_DTO "If enabled, build with the DTO library for DSA support." OFF) + +if (BUILD_WITH_DTO) + find_package(DTO REQUIRED) + if (DTO_FOUND) + message(STATUS "DTO found, remember to configure DSA devices for acceleration. If no DSA device is found, cachelib will fallback to software path.") + add_compile_definitions(HAVE_DTO) + endif() +endif () + set(BIN_INSTALL_DIR bin CACHE STRING "The subdirectory where binaries should be installed") diff --git a/cachelib/cachebench/CMakeLists.txt b/cachelib/cachebench/CMakeLists.txt index 712957f4f0..d33efb3d20 100644 --- a/cachelib/cachebench/CMakeLists.txt +++ b/cachelib/cachebench/CMakeLists.txt @@ -67,12 +67,15 @@ else() target_compile_definitions(cachelib_cachebench PRIVATE SKIP_OPTION_SIZE_VERIFY) endif() - add_executable (cachebench main.cpp) add_executable (binary_trace_gen binary_trace_gen.cpp) target_link_libraries(cachebench cachelib_cachebench) target_link_libraries(binary_trace_gen cachelib_binary_trace_gen) +if (BUILD_WITH_DTO) + target_link_libraries(cachebench accel-config DTO::dto) +endif() + install( TARGETS cachebench diff --git a/cachelib/cachebench/runner/CacheStressor.h b/cachelib/cachebench/runner/CacheStressor.h index 279dec3372..a53cfb53f2 100644 --- a/cachelib/cachebench/runner/CacheStressor.h +++ b/cachelib/cachebench/runner/CacheStressor.h @@ -28,7 +28,10 @@ #include #include +#ifdef HAVE_DTO #include +#define DTO_DSA_MIN_THRESHOLD (32 * 1024) +#endif #include "cachelib/cachebench/cache/Cache.h" #include "cachelib/cachebench/cache/TimeStampTicker.h" @@ -500,20 +503,19 @@ class CacheStressor : public Stressor { ++stats.setFailure; return OpResultType::kSetFailure; } else { - if (config_.useDTOAsync && size > 32*1024) { - //it->markMoving(); +#ifdef HAVE_DTO + if (config_.useDTOAsync && size >= DTO_DSA_MIN_THRESHOLD) { auto insertToCache = [&] { cache_->insertOrReplace(it); }; - std::function fn = insertToCache; dto_memcpy_async( it->getMemory(), itemValue.data(), size, &async_memcpy_callback, &insertToCache); - //it->unmarkMoving(); - } else { - populateItem(it, itemValue); - cache_->insertOrReplace(it); + return OpResultType::kSetSuccess; } +#endif + populateItem(it, itemValue); + cache_->insertOrReplace(it); return OpResultType::kSetSuccess; } } From 914a88bd025d2f3119e29e4d20ee40ca7cb74e3e Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 10 Sep 2025 09:11:55 -0700 Subject: [PATCH 4/7] update to add CRC offload --- cachelib/navy/CMakeLists.txt | 5 +++++ cachelib/navy/block_cache/BlockCache.cpp | 26 ++++++++++++++++++++++-- cachelib/navy/common/Hash.cpp | 7 +++++++ cachelib/navy/common/Hash.h | 1 - 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/cachelib/navy/CMakeLists.txt b/cachelib/navy/CMakeLists.txt index ace0ce07f0..09a0ae7426 100644 --- a/cachelib/navy/CMakeLists.txt +++ b/cachelib/navy/CMakeLists.txt @@ -61,10 +61,15 @@ target_link_libraries(cachelib_navy PUBLIC GTest::gmock ) +if (BUILD_WITH_DTO) + target_link_libraries(cachelib_navy PUBLIC DTO::dto) +endif() + install(TARGETS cachelib_navy EXPORT cachelib-exports DESTINATION ${LIB_INSTALL_DIR} ) + if (BUILD_TESTS) add_library(navy_test_support testing/BufferGen.cpp diff --git a/cachelib/navy/block_cache/BlockCache.cpp b/cachelib/navy/block_cache/BlockCache.cpp index 5cae3fb3e8..0d1c8c6cea 100644 --- a/cachelib/navy/block_cache/BlockCache.cpp +++ b/cachelib/navy/block_cache/BlockCache.cpp @@ -23,6 +23,10 @@ #include #include +#ifdef HAVE_DTO +#include +#endif + #include "cachelib/common/inject_pause.h" #include "cachelib/navy/block_cache/SparseMapIndex.h" #include "cachelib/navy/common/Hash.h" @@ -31,6 +35,11 @@ namespace facebook::cachelib::navy { +void async_memcpy_crc_cb(void *arg) { + auto &fn = *reinterpret_cast*>(arg); + fn(); +} + BlockCache::Config& BlockCache::Config::validate() { XDCHECK_NE(scheduler, nullptr); if (!device || !evictionPolicy) { @@ -702,11 +711,24 @@ Status BlockCache::writeEntry(RelAddress addr, auto desc = new (buffer.data() + descOffset) EntryDesc(hk.key().size(), value.size(), hk.keyHash()); if (checksumData_) { +#ifdef HAVE_DTO + auto keyCopy = [hk, descOffset, &buffer]() { + // Copy the key to the buffer at the end + buffer.copyFrom(descOffset - hk.key().size(), makeView(hk.key())); + }; + std::function fn = keyCopy; + //buffer data is dest, value is src, keyCopy is function to execute while waiting + desc->cs = dto_memcpy_crc_async(buffer.data(), static_cast(value.data()), value.size(), &async_memcpy_crc_cb, &fn); +#else desc->cs = checksum(value); + buffer.copyFrom(descOffset - hk.key().size(), makeView(hk.key())); + buffer.copyFrom(0, value); +#endif + } else { + buffer.copyFrom(descOffset - hk.key().size(), makeView(hk.key())); + buffer.copyFrom(0, value); } - buffer.copyFrom(descOffset - hk.key().size(), makeView(hk.key())); - buffer.copyFrom(0, value); regionManager_.write(addr, std::move(buffer)); logicalWrittenCount_.add(hk.key().size() + value.size()); diff --git a/cachelib/navy/common/Hash.cpp b/cachelib/navy/common/Hash.cpp index 50ef925e46..b242ed7a0a 100644 --- a/cachelib/navy/common/Hash.cpp +++ b/cachelib/navy/common/Hash.cpp @@ -17,6 +17,9 @@ #include "cachelib/navy/common/Hash.h" #include +#ifdef HAVE_DTO +#include +#endif namespace facebook::cachelib::navy { uint64_t hashBuffer(BufferView key, uint64_t seed) { @@ -24,6 +27,10 @@ uint64_t hashBuffer(BufferView key, uint64_t seed) { } uint32_t checksum(BufferView data, uint32_t startingChecksum) { +#ifdef HAVE_DTO + return dto_crc(data.data(), data.size(), nullptr, nullptr); +#else return folly::crc32(data.data(), data.size(), startingChecksum); +#endif } } // namespace facebook::cachelib::navy diff --git a/cachelib/navy/common/Hash.h b/cachelib/navy/common/Hash.h index 7eed7464f8..b5ec034a4a 100644 --- a/cachelib/navy/common/Hash.h +++ b/cachelib/navy/common/Hash.h @@ -17,7 +17,6 @@ #pragma once #include - #include "cachelib/common/Hash.h" #include "cachelib/navy/common/Buffer.h" From 8d988b4d981d56b683184e2a80dd5e4062117a27 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 10 Sep 2025 09:45:35 -0700 Subject: [PATCH 5/7] guard callbacks with define and fix linespace --- cachelib/cachebench/CMakeLists.txt | 1 + cachelib/cachebench/runner/CacheStressor.h | 2 ++ cachelib/cachebench/util/Config.h | 2 +- cachelib/navy/CMakeLists.txt | 1 - cachelib/navy/block_cache/BlockCache.cpp | 3 +++ cachelib/navy/common/Hash.h | 1 + 6 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cachelib/cachebench/CMakeLists.txt b/cachelib/cachebench/CMakeLists.txt index d33efb3d20..385a988155 100644 --- a/cachelib/cachebench/CMakeLists.txt +++ b/cachelib/cachebench/CMakeLists.txt @@ -67,6 +67,7 @@ else() target_compile_definitions(cachelib_cachebench PRIVATE SKIP_OPTION_SIZE_VERIFY) endif() + add_executable (cachebench main.cpp) add_executable (binary_trace_gen binary_trace_gen.cpp) target_link_libraries(cachebench cachelib_cachebench) diff --git a/cachelib/cachebench/runner/CacheStressor.h b/cachelib/cachebench/runner/CacheStressor.h index a53cfb53f2..e8cdc7532c 100644 --- a/cachelib/cachebench/runner/CacheStressor.h +++ b/cachelib/cachebench/runner/CacheStressor.h @@ -48,10 +48,12 @@ namespace cachebench { constexpr uint32_t kNvmCacheWarmUpCheckRate = 1000; +#ifdef HAVE_DTO void async_memcpy_callback(void *arg) { auto &fn = *reinterpret_cast*>(arg); fn(); } +#endif // Implementation of stressor that uses a workload generator to stress an // instance of the cache. All item's value in CacheStressor follows CacheValue diff --git a/cachelib/cachebench/util/Config.h b/cachelib/cachebench/util/Config.h index 7a5b1225ba..1bfec941ae 100644 --- a/cachelib/cachebench/util/Config.h +++ b/cachelib/cachebench/util/Config.h @@ -331,7 +331,7 @@ struct StressorConfig : public JSONConfig { uint64_t timestampFactor{1000}; bool useCombinedLockForIterators{false}; - + // if we want to use async DSA function bool useDTOAsync{false}; diff --git a/cachelib/navy/CMakeLists.txt b/cachelib/navy/CMakeLists.txt index 09a0ae7426..92d9a8308b 100644 --- a/cachelib/navy/CMakeLists.txt +++ b/cachelib/navy/CMakeLists.txt @@ -69,7 +69,6 @@ install(TARGETS cachelib_navy EXPORT cachelib-exports DESTINATION ${LIB_INSTALL_DIR} ) - if (BUILD_TESTS) add_library(navy_test_support testing/BufferGen.cpp diff --git a/cachelib/navy/block_cache/BlockCache.cpp b/cachelib/navy/block_cache/BlockCache.cpp index 0d1c8c6cea..c223fa12a3 100644 --- a/cachelib/navy/block_cache/BlockCache.cpp +++ b/cachelib/navy/block_cache/BlockCache.cpp @@ -35,10 +35,13 @@ namespace facebook::cachelib::navy { +#ifdef HAVE_DTO void async_memcpy_crc_cb(void *arg) { auto &fn = *reinterpret_cast*>(arg); fn(); } +#endif + BlockCache::Config& BlockCache::Config::validate() { XDCHECK_NE(scheduler, nullptr); diff --git a/cachelib/navy/common/Hash.h b/cachelib/navy/common/Hash.h index b5ec034a4a..7eed7464f8 100644 --- a/cachelib/navy/common/Hash.h +++ b/cachelib/navy/common/Hash.h @@ -17,6 +17,7 @@ #pragma once #include + #include "cachelib/common/Hash.h" #include "cachelib/navy/common/Buffer.h" From 80aa2ce8dc3b647647c3f0454da4b618a2b2ab79 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 10 Sep 2025 10:54:54 -0700 Subject: [PATCH 6/7] make DTO enabled by default --- cachelib/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt index 89efa6d2b4..610d35b806 100644 --- a/cachelib/CMakeLists.txt +++ b/cachelib/CMakeLists.txt @@ -44,7 +44,7 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) option(BUILD_TESTS "If enabled, compile the tests." ON) -option(BUILD_WITH_DTO "If enabled, build with the DTO library for DSA support." OFF) +option(BUILD_WITH_DTO "If enabled, build with the DTO library for DSA support." ON) if (BUILD_WITH_DTO) find_package(DTO REQUIRED) From 4205a9301caf0d1f4e258b68b0fff7a02f76de4b Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Thu, 11 Sep 2025 07:48:45 -0700 Subject: [PATCH 7/7] use DTO_API define to make API path independent of transparent intercept --- cachelib/CMakeLists.txt | 15 ++++++++++++++- cachelib/cachebench/runner/CacheStressor.h | 6 +++--- cachelib/navy/block_cache/BlockCache.cpp | 6 +++--- cachelib/navy/common/Hash.cpp | 4 ++-- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt index 610d35b806..8f20c921c6 100644 --- a/cachelib/CMakeLists.txt +++ b/cachelib/CMakeLists.txt @@ -50,10 +50,23 @@ if (BUILD_WITH_DTO) find_package(DTO REQUIRED) if (DTO_FOUND) message(STATUS "DTO found, remember to configure DSA devices for acceleration. If no DSA device is found, cachelib will fallback to software path.") - add_compile_definitions(HAVE_DTO) endif() endif () +include(CMakeDependentOption) +# USE_DTO_API is only meaningful if BUILD_WITH_DTO is ON *and* DTO was found +cmake_dependent_option( + USE_DTO_API + "Use DTO library API functions for DSA acceleration." + OFF + "BUILD_WITH_DTO;DTO_FOUND" + OFF +) +if (USE_DTO_API) + message(STATUS "Using DTO API for offloading") + add_compile_definitions(DTO_API) +endif() + set(BIN_INSTALL_DIR bin CACHE STRING "The subdirectory where binaries should be installed") diff --git a/cachelib/cachebench/runner/CacheStressor.h b/cachelib/cachebench/runner/CacheStressor.h index e8cdc7532c..ddc04b38e8 100644 --- a/cachelib/cachebench/runner/CacheStressor.h +++ b/cachelib/cachebench/runner/CacheStressor.h @@ -28,7 +28,7 @@ #include #include -#ifdef HAVE_DTO +#ifdef DTO_API #include #define DTO_DSA_MIN_THRESHOLD (32 * 1024) #endif @@ -48,7 +48,7 @@ namespace cachebench { constexpr uint32_t kNvmCacheWarmUpCheckRate = 1000; -#ifdef HAVE_DTO +#ifdef DTO_API void async_memcpy_callback(void *arg) { auto &fn = *reinterpret_cast*>(arg); fn(); @@ -505,7 +505,7 @@ class CacheStressor : public Stressor { ++stats.setFailure; return OpResultType::kSetFailure; } else { -#ifdef HAVE_DTO +#ifdef DTO_API if (config_.useDTOAsync && size >= DTO_DSA_MIN_THRESHOLD) { auto insertToCache = [&] { cache_->insertOrReplace(it); diff --git a/cachelib/navy/block_cache/BlockCache.cpp b/cachelib/navy/block_cache/BlockCache.cpp index c223fa12a3..deb4fc4456 100644 --- a/cachelib/navy/block_cache/BlockCache.cpp +++ b/cachelib/navy/block_cache/BlockCache.cpp @@ -23,7 +23,7 @@ #include #include -#ifdef HAVE_DTO +#ifdef DTO_API #include #endif @@ -35,7 +35,7 @@ namespace facebook::cachelib::navy { -#ifdef HAVE_DTO +#ifdef DTO_API void async_memcpy_crc_cb(void *arg) { auto &fn = *reinterpret_cast*>(arg); fn(); @@ -714,7 +714,7 @@ Status BlockCache::writeEntry(RelAddress addr, auto desc = new (buffer.data() + descOffset) EntryDesc(hk.key().size(), value.size(), hk.keyHash()); if (checksumData_) { -#ifdef HAVE_DTO +#ifdef DTO_API auto keyCopy = [hk, descOffset, &buffer]() { // Copy the key to the buffer at the end buffer.copyFrom(descOffset - hk.key().size(), makeView(hk.key())); diff --git a/cachelib/navy/common/Hash.cpp b/cachelib/navy/common/Hash.cpp index b242ed7a0a..4f9092af04 100644 --- a/cachelib/navy/common/Hash.cpp +++ b/cachelib/navy/common/Hash.cpp @@ -17,7 +17,7 @@ #include "cachelib/navy/common/Hash.h" #include -#ifdef HAVE_DTO +#ifdef DTO_API #include #endif @@ -27,7 +27,7 @@ uint64_t hashBuffer(BufferView key, uint64_t seed) { } uint32_t checksum(BufferView data, uint32_t startingChecksum) { -#ifdef HAVE_DTO +#ifdef DTO_API return dto_crc(data.data(), data.size(), nullptr, nullptr); #else return folly::crc32(data.data(), data.size(), startingChecksum);