diff --git a/.gitignore b/.gitignore index 14f6bad..5c5b126 100644 --- a/.gitignore +++ b/.gitignore @@ -43,7 +43,33 @@ vendor/local/* src/rust/.cargo/config src/version.h +# Excel temporary files +.~lock.*# +*.tmp + # Sub projects -vendor/pcre2/* +vendor/pcre2/ + +# Build dependency artifacts +vendor/abseil-cpp/ +vendor/boost/ +vendor/ctre/ +vendor/jansson/ +vendor/yara/ +vendor/hyperscan/ +vendor/oniguruma/ +vendor/re2/ +vendor/tre/ + +# Result files and benchmarks +results*.txt +results*.csv +*.xlsx +test_input.txt +titles.md +build_deps.sh + +# Screenshots +*.png diff --git a/CMakeLists.txt b/CMakeLists.txt index 400b20e..5575b7d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ cmake_minimum_required(VERSION 3.0) -project(RegexPeformance C CXX) +project(RegexPerformance C CXX) if(NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 20) @@ -15,13 +15,13 @@ endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(GENERAL_C_FLAGS "-march=native -Wall -Wstack-usage=5000 -fdiagnostics-color -pipe -fsigned-char -fno-asynchronous-unwind-tables -fno-stack-protector -Wunused-parameter") +set(GENERAL_C_FLAGS "-march=native -mtune=native -Wall -Wstack-usage=5000 -fdiagnostics-color -pipe -fsigned-char -fno-asynchronous-unwind-tables -fno-stack-protector -Wunused-parameter") set(CMAKE_C_FLAGS "-std=c11 ${GENERAL_C_FLAGS}" CACHE STRING "additional CFLAGS" FORCE) set(CMAKE_C_FLAGS_DEBUG "-O0 -g") set(CMAKE_C_FLAGS_RELEASE "-O3") -set(CMAKE_CXX_FLAGS "-std=c++11 ${GENERAL_C_FLAGS}" CACHE STRING "additional CFLAGS" FORCE) +set(CMAKE_CXX_FLAGS "-std=c++20 ${GENERAL_C_FLAGS}" CACHE STRING "additional CFLAGS" FORCE) set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") set(CMAKE_CXX_FLAGS_RELEASE "-O3") diff --git a/README.md b/README.md index 66e1939..4a1c030 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,15 @@ This tool is based on the work of John Maddock (See his own regex comparison [he and the sljit project (See their regex comparison 
[here](http://sljit.sourceforge.net/regex_perf.html)). ## Requirements + +### Modern Clang 19.1.6 Toolchain (Recommended) +When using the modern toolchain build, all dependencies are automatically handled by the build script. Just ensure you have access to the toolchain environment: +- Access to `/ssd/hblib-installer/ubuntu-20.04/tools/sourceme.sh` +- CMake 3.24.2 (included in toolchain) +- Clang 19.1.6 (included in toolchain) +- All regex engine dependencies built automatically + +### Legacy System Requirements | dependency | version | |------------|----------| | Cmake | >=3.0 | @@ -51,6 +60,38 @@ regex crate for defined expressions. The different engines have different requirements which are not described here. Please see the related project documentations. +On a stock Ubuntu 20.04 AWS box, the following packages had to be installed to complete the build: +```bash +$ apt install build-essential cmake rustc cargo automake autoconf autopoint autogen \ + libtool libprotobuf-dev libprotobuf-c-dev protobuf-compiler ninja-build \ + ragel libpcap pcaputils pkg-config libboost-dev flex bison +``` + +### Modern Clang 19.1.6 Toolchain Build (Recommended) + +For optimal performance with the latest toolchain, use the automated build script with the modern Clang 19.1.6 toolchain: + +```bash +# Source the modern toolchain environment +source /ssd/hblib-installer/ubuntu-20.04/tools/sourceme.sh + +# Clean build from scratch +./build_deps_simple.sh + +# Build the main project +mkdir -p build && cd build +CC=clang CXX=clang++ cmake .. 
+make -j$(nproc) +``` + +This approach: +- Uses Clang 19.1.6 compiler with LLVM tools +- Builds all dependencies with modern toolchain +- Includes latest RE2 with Abseil dependencies +- Supports all 11 regex engines with optimal performance + +### Legacy Build Method + In the case all depencies are fulfilled, just configure and build the cmake based project: ```bash @@ -71,6 +112,13 @@ make regex_perf The test tool calls each engine with a defined set of different regular expression on a given file. The repository contains a ~16Mbyte large text file (3200.txt) which can be used for measuring. +```bash +# When using the modern toolchain, source the environment first +source /ssd/hblib-installer/ubuntu-20.04/tools/sourceme.sh +build/src/regex_perf -f 3200.txt +``` + +For legacy builds: ```bash ./src/regex_perf -f ./3200.txt ``` @@ -98,8 +146,38 @@ python3 ../genspreadsheet.py results.csv It will save an Excel spreadsheet with the name `regex-results-YYYYMMDD-HHMMSS.xlsx` in the current directory. +## Compiling with clang + libc++ + +Unfortunately it is not possible to build with both GCC/libstdc++ and clang+libc++ at the +same time, because CMake selects a single compiler per build directory. + +To run with clang+libc++ use the following recipe: +```bash +mkdir build && cd build +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_EXE_LINKER_FLAGS="-lc++abi -lc++" \ + -DCMAKE_CXX_COMPILER=/usr/local/bin/clang++ \ + -DCMAKE_C_COMPILER=/usr/local/bin/clang \ + -DCMAKE_CXX_FLAGS_INIT="-std=c++20 -stdlib=libc++ -march=native -mtune=native" \ + -G Ninja .. 
+``` + +## Current Build Status + +**Latest Update (2025-09-28)**: Successfully rebuilt from scratch with modern Clang 19.1.6 toolchain +- ✅ All 11 regex engines working: CTRE, Boost, C++ std, PCRE (3 variants), RE2, Oniguruma, TRE, Rust regex (2 variants) +- ✅ Latest RE2 with Abseil dependencies properly linked +- ✅ Performance tests running successfully +- ⚠️ CTRE has known issues with case-insensitive patterns and word boundaries +- 🚀 Best performers: Rust regex, PCRE-JIT, RE2 + ## Results -These results were obtained in an AMD Threadripper 3960X (Zen2) at 3.8 GHz running Ubuntu 20.04.5 LTS. +These results were obtained in an AMD Threadripper 3960X (Zen2) at 3.8 GHz running Ubuntu 20.04.5 LTS. + +![Updated Performance Results](results_threadripper.png "Performance Results") + +IceLake Xeon Platinum 8375C @ 2.90GHz (AWS C6i instance) - no mitigations -![Updated Performance Results](results_20221012.png "Performance Results") +![IceLake Server](results_icelake.png "Results Ice Lake") \ No newline at end of file diff --git a/build_deps_simple.sh b/build_deps_simple.sh new file mode 100755 index 0000000..f9545fd --- /dev/null +++ b/build_deps_simple.sh @@ -0,0 +1,262 @@ +#!/bin/bash + +# Simplified script to build working regex engine dependencies +# Focuses on engines compatible with CMake 3.16 + +set -e + +VENDOR_DIR="/home/hbucher/git/regex-performance/vendor" +LOCAL_DIR="$VENDOR_DIR/local" +BUILD_TYPE="${BUILD_TYPE:-Release}" +PARALLEL_JOBS="${PARALLEL_JOBS:-4}" + +# Create directories +mkdir -p "$LOCAL_DIR"/{lib,include,bin} +cd "$VENDOR_DIR" + +echo "=== Building compatible regex engine dependencies ===" +echo "Build type: $BUILD_TYPE" +echo "CMake version: $(cmake --version | head -1)" + +# Function to clone or update repository +clone_or_update() { + local name=$1 + local url=$2 + local branch=${3:-main} + + if [ -d "$name" ]; then + echo "Updating $name..." + cd "$name" + /usr/bin/git pull origin "$branch" || true + cd .. 
+ else + echo "Cloning $name from $url..." + /usr/bin/git clone "$url" "$name" + if [ "$branch" != "main" ] && [ "$branch" != "master" ]; then + cd "$name" + /usr/bin/git checkout "$branch" + cd .. + fi + fi +} + +# Source the modern Clang toolchain +echo "Sourcing modern Clang toolchain..." +source /ssd/hblib-installer/ubuntu-20.04/tools/sourceme.sh + +# Function to build with cmake +build_cmake() { + local name=$1 + shift + echo "Building $name with CMake using Clang with -march=native..." + mkdir -p build + cd build + CC=clang CXX=clang++ cmake .. \ + -DCMAKE_BUILD_TYPE="$BUILD_TYPE" \ + -DCMAKE_INSTALL_PREFIX="$LOCAL_DIR" \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_CXX_FLAGS="-fPIC -march=native -mtune=native" \ + -DCMAKE_C_FLAGS="-fPIC -march=native -mtune=native" \ + -DBUILD_SHARED_LIBS=OFF \ + "$@" + make -j"$PARALLEL_JOBS" + make install + cd .. +} + +# Function to build with autotools +build_autotools() { + local name=$1 + shift + echo "Building $name with autotools using Clang with -march=native..." + mkdir -p build + cd build + CC=clang CXX=clang++ \ + CFLAGS="-march=native -mtune=native" \ + CXXFLAGS="-march=native -mtune=native" \ + ../configure \ + --prefix="$LOCAL_DIR" \ + --enable-static \ + --disable-shared \ + "$@" + make -j"$PARALLEL_JOBS" + make install + cd .. +} + +echo "=== Building PCRE2 ===" +if [ ! -f "$LOCAL_DIR/lib/libpcre2-8.a" ]; then + clone_or_update "pcre2" "https://github.com/PhilipHazel/pcre2.git" "master" + cd pcre2 + /usr/bin/git submodule update --init --recursive + build_cmake "pcre2" \ + -DPCRE2_SUPPORT_JIT=ON \ + -DPCRE2_BUILD_TESTS=OFF \ + -DPCRE2_BUILD_PCRE2GREP=OFF + cd .. +else + echo "PCRE2 already built, skipping..." +fi + +echo "=== Building Oniguruma ===" +if [ ! -f "$LOCAL_DIR/lib/libonig.so" ]; then + clone_or_update "oniguruma" "https://github.com/kkos/oniguruma.git" "master" + cd oniguruma + build_cmake "oniguruma" \ + -DBUILD_TEST=OFF \ + -DINSTALL_DOCUMENTATION=OFF + cd .. 
+else + echo "Oniguruma already built, skipping..." +fi + +echo "=== Building TRE ===" +if [ ! -f "$LOCAL_DIR/lib/libtre.so" ]; then + clone_or_update "tre" "https://github.com/laurikari/tre.git" "master" + cd tre + ./utils/autogen.sh + build_autotools "tre" + cd .. +else + echo "TRE already built, skipping..." +fi + +echo "=== Building Jansson (required for YARA) ===" +if [ ! -f "$LOCAL_DIR/lib/libjansson.a" ]; then + clone_or_update "jansson" "https://github.com/akheron/jansson.git" "master" + cd jansson + build_cmake "jansson" \ + -DJANSSON_BUILD_SHARED_LIBS=OFF \ + -DJANSSON_BUILD_DOCS=OFF + cd .. +else + echo "Jansson already built, skipping..." +fi + +echo "=== Building YARA ===" +if [ ! -f "$LOCAL_DIR/lib/libyara.a" ]; then + clone_or_update "yara" "https://github.com/VirusTotal/yara.git" "master" + cd yara + ./bootstrap.sh + # Set environment variables so YARA can find Jansson + export PKG_CONFIG_PATH="$LOCAL_DIR/lib/pkgconfig:$PKG_CONFIG_PATH" + export CPPFLAGS="-I$LOCAL_DIR/include $CPPFLAGS" + export LDFLAGS="-L$LOCAL_DIR/lib $LDFLAGS" + export LD_LIBRARY_PATH="$LOCAL_DIR/lib:$LD_LIBRARY_PATH" + build_autotools "yara" \ + --enable-cuckoo \ + --disable-magic \ + --enable-dotnet + cd .. +else + echo "YARA already built, skipping..." +fi + +echo "=== Building Abseil (required for RE2) ===" +if [ ! -f "$LOCAL_DIR/lib/libabsl_base.a" ]; then + echo "Cloning Abseil from GitHub..." + clone_or_update "abseil-cpp" "https://github.com/abseil/abseil-cpp.git" "main" + cd abseil-cpp + build_cmake "abseil-cpp" \ + -DABSL_BUILD_TESTING=OFF \ + -DABSL_USE_GOOGLETEST_HEAD=OFF \ + -DCMAKE_CXX_STANDARD=17 + cd .. +else + echo "Abseil already built, skipping..." +fi + +echo "=== Building RE2 (latest version) ===" +if [ ! -f "$LOCAL_DIR/lib/libre2.a" ]; then + echo "Cloning latest RE2 from GitHub..." + clone_or_update "re2" "https://github.com/google/re2.git" "main" + cd re2 + build_cmake "re2" \ + -DRE2_BUILD_TESTING=OFF \ + -DCMAKE_PREFIX_PATH="$LOCAL_DIR" + cd .. 
+else + echo "RE2 already built, skipping..." +fi + +echo "=== Building simplified Boost ===" +if [ ! -f "$LOCAL_DIR/lib/libboost_regex.a" ]; then + clone_or_update "boost" "https://github.com/boostorg/boost.git" "boost-1.82.0" + cd boost + # Initialize only essential submodules + /usr/bin/git submodule update --init --recursive \ + tools/build \ + tools/boost_install \ + libs/regex \ + libs/config \ + libs/headers \ + libs/throw_exception \ + libs/exception \ + libs/assert \ + libs/core \ + libs/static_assert \ + libs/type_traits \ + libs/mpl \ + libs/preprocessor \ + libs/predef + + ./bootstrap.sh --with-toolset=clang + ./b2 headers + ./b2 install -q \ + --prefix="$LOCAL_DIR" \ + --with-regex \ + toolset=clang \ + variant=release \ + link=static \ + threading=multi \ + cxxstd=17 \ + cxxflags="-march=native -mtune=native" \ + cflags="-march=native -mtune=native" + cd .. +else + echo "Boost already built, skipping..." +fi + +echo "=== Building Hyperscan ===" +if [ ! -f "$LOCAL_DIR/lib/libhs.a" ]; then + clone_or_update "hyperscan" "https://github.com/intel/hyperscan.git" "master" + cd hyperscan + build_cmake "hyperscan" \ + -DFAT_RUNTIME=OFF \ + -DBUILD_EXAMPLES=OFF \ + -DBUILD_STATIC_LIBS=ON \ + -DBUILD_SHARED_LIBS=OFF + cd .. +else + echo "Hyperscan already built, skipping..." +fi + +echo "=== Setting up CTRE (header-only) ===" +if [ ! -d "$LOCAL_DIR/include/ctre" ]; then + echo "Cloning CTRE from GitHub..." + /usr/bin/git clone https://github.com/hanickadot/compile-time-regular-expressions.git ctre + cd ctre + # CTRE is header-only, just copy headers + cp -r include/* "$LOCAL_DIR/include/" + cd .. +else + echo "CTRE already set up, skipping..." 
+fi + +echo "=== Build summary ===" +echo "Libraries built in: $LOCAL_DIR" +echo "" +echo "✅ PCRE2 (with JIT)" +echo "✅ Oniguruma" +echo "✅ TRE" +echo "✅ Jansson (dependency for YARA)" +echo "✅ YARA (with Jansson support)" +echo "✅ RE2 (latest version with Abseil dependencies)" +echo "✅ Boost.Regex (minimal build with Clang)" +echo "✅ Hyperscan (Intel's vectorized pattern matching)" +echo "✅ CTRE (header-only, C++20 required)" +echo "✅ Rust regex (built automatically by main project)" +echo "✅ C++ std::regex (system)" +echo "" +echo "Total: 11 regex engines ready for benchmarking!" +echo "Now you can build the main project with these engines." \ No newline at end of file diff --git a/genspreadsheet.py b/genspreadsheet.py index 5feffbc..f13e9e4 100644 --- a/genspreadsheet.py +++ b/genspreadsheet.py @@ -8,13 +8,8 @@ print("Usage: genspreadsheet.py \n") sys.exit(0) -regexre = re.compile('Regex:\s*(.*)') -resultre = re.compile('\[\s*(\S+)\]\s*time:\s*([\d\.]+).*matches:\s*(\d+)') - infilename = sys.argv[1] -current_regex = None results = {} -stats = None scanners = set() with open( infilename, "r" ) as filein: headers = filein.readline().split(';') @@ -37,14 +32,15 @@ workbook = xlsxwriter.Workbook(outfilename) worksheet = workbook.add_worksheet() worksheet.hide_gridlines(2) -worksheet.set_column(0,0,30) +worksheet.set_column(0,0,35) worksheet.set_column(1,len(scanners),10) worksheet.set_row(0,20) # Add a bold format to use to highlight cells. -bold = workbook.add_format({'bold': True}) -boldrot = workbook.add_format({'bold': True}) -boldrot.set_rotation(0) +headerfmt = workbook.add_format({'bold': True}) +headerfmt.set_bg_color('gray') +headerfmt.set_font_color('white') +headerfmt.set_rotation(0) highfmt = workbook.add_format({'bold': True}) highfmt.set_bg_color( 'orange' ) highfmt.set_font_color( 'white' ) @@ -56,19 +52,19 @@ warnfmt.set_font_color( 'black' ) warnfmt.set_align('center') -# Write some data headers. -scanners = list(scanners) +# Write headers. 
+scanners = sorted(list(scanners)) row = 0 for col,scanner in enumerate(scanners): - worksheet.write( row, col+1, scanner, boldrot ) -worksheet.write( row, 0, "Regex", bold) + worksheet.write( row, col+1, scanner, headerfmt ) +worksheet.write( row, 0, "Regex", headerfmt ) for regex,stats in results.items(): values = sorted([ ms for ms in stats.values() ]) lowcut = values[1] highcut = values[-2] row += 1 - worksheet.write( row, 0, regex, bold ) + worksheet.write( row, 0, regex, headerfmt ) for col,scanner in enumerate(scanners): if scanner not in stats: worksheet.write( row, col+1, "n/a", warnfmt ) diff --git a/results_icelake.png b/results_icelake.png new file mode 100644 index 0000000..a7b5d88 Binary files /dev/null and b/results_icelake.png differ diff --git a/results_threadripper.png b/results_threadripper.png new file mode 100644 index 0000000..c3a58a0 Binary files /dev/null and b/results_threadripper.png differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 33391e5..dc49298 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,7 +41,7 @@ endif() if(NOT ${INCLUDE_RE2} MATCHES "disabled") add_definitions(-DINCLUDE_RE2) set(REGEX_SOURCES ${REGEX_SOURCES} re2.cpp) - set(REGEX_ENGINES ${REGEX_ENGINES} re2) + # Note: RE2 is linked via CMake target, not added to REGEX_ENGINES endif() if(NOT ${INCLUDE_CPPSTD} MATCHES "disabled") @@ -58,7 +58,7 @@ endif() if(NOT ${INCLUDE_YARA} MATCHES "disabled") add_definitions(-DINCLUDE_YARA) set(REGEX_SOURCES ${REGEX_SOURCES} yara.c) - set(REGEX_ENGINES ${REGEX_ENGINES} yara) + set(REGEX_ENGINES ${REGEX_ENGINES} yara jansson) endif() include_directories( @@ -78,4 +78,13 @@ add_executable(regex_perf ${REGEX_SOURCES}) add_dependencies(regex_perf librregex) -target_link_libraries(regex_perf rregex ${REGEX_ENGINES} pthread dl) +# Use CMake to find and link dependencies properly +set(EXTRA_LIBRARIES "") +if(NOT ${INCLUDE_RE2} MATCHES "disabled") + # Set the prefix path so CMake can find our locally built packages + 
list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/vendor/local") + find_package(re2 REQUIRED) + set(EXTRA_LIBRARIES re2::re2) +endif() + +target_link_libraries(regex_perf rregex ${REGEX_ENGINES} ${EXTRA_LIBRARIES} pthread dl) diff --git a/src/cppstd.cpp b/src/cppstd.cpp index cc8f0cc..6950d66 100644 --- a/src/cppstd.cpp +++ b/src/cppstd.cpp @@ -24,8 +24,12 @@ extern "C" int cppstd_find_all(char* pattern, char* subject, int subject_len, in double * times = (double*) std::calloc(repeat, sizeof(double)); int const times_len = repeat; + TIME_TYPE test_start; + GET_TIME(test_start); do { + // Check for timeout before each iteration + CHECK_TIMEOUT(test_start); GET_TIME(start); found = search_all( rx, text ); GET_TIME(end); diff --git a/src/main.c b/src/main.c index 104f01a..19fe6c2 100644 --- a/src/main.c +++ b/src/main.c @@ -2,6 +2,10 @@ #include #include #include +#include +#include +#include +#include #include "main.h" #include "version.h" @@ -9,6 +13,9 @@ static char* data = NULL; static int data_len = 0; +// Global timeout setting (default 1 second) +double timeout_ms = 1000.0; + struct engines { char * name; int (*find_all)(char* pattern, char* subject, int subject_len, int repeat, struct result * result); @@ -69,41 +76,85 @@ static char * regex [] = { "(.*?,){13}z" }; -void load(char const * file_name) +void cleanup_data(void) { - int i; + if (data) { + free(data); + data = NULL; + data_len = 0; + } +} - FILE * f; - f = fopen(file_name, "rb"); - if (!f) { - fprintf(stderr, "Cannot open '%s'!\n", file_name); - return; +int load(char const * file_name) +{ + struct stat st; + size_t bytes_read = 0; + + // Validate file exists and get size safely + if (stat(file_name, &st) != 0) { + fprintf(stderr, "Cannot stat file '%s': %s\n", file_name, strerror(errno)); + return -1; } - fseek(f, 0, SEEK_END); - data_len = ftell(f); - fseek(f, 0, SEEK_SET); + // Check for reasonable file size (prevent huge allocations) + if (st.st_size <= 0 || st.st_size > 1024 * 1024 * 1024) { 
// 1GB limit + fprintf(stderr, "File '%s' has invalid size: %ld bytes\n", file_name, st.st_size); + return -1; + } + FILE * f = fopen(file_name, "rb"); + if (!f) { + fprintf(stderr, "Cannot open '%s': %s\n", file_name, strerror(errno)); + return -1; + } + + data_len = st.st_size; data = (char*)malloc(data_len + 1); if (!data) { - fprintf(stderr, "Cannot allocate memory!\n"); + fprintf(stderr, "Cannot allocate %d bytes for file '%s': %s\n", + data_len + 1, file_name, strerror(errno)); fclose(f); - return; + return -1; } - data[data_len] = '\0'; - int size = fread(data, data_len, 1, f); - if (size == 0) { - fprintf(stderr, "Reading file failed!\n"); + // Read file in chunks with proper bounds checking + while (bytes_read < data_len) { + size_t to_read = data_len - bytes_read; + size_t chunk_size = fread(data + bytes_read, 1, to_read, f); + + if (chunk_size == 0) { + if (feof(f)) { + break; // End of file reached + } else if (ferror(f)) { + fprintf(stderr, "Error reading file '%s': %s\n", file_name, strerror(errno)); + cleanup_data(); + fclose(f); + return -1; + } + } + bytes_read += chunk_size; } + fclose(f); - for (i = 0; i < data_len; ++i) { + // Verify we read the expected amount + if (bytes_read != data_len) { + fprintf(stderr, "Warning: Expected %d bytes, read %zu bytes from '%s'\n", + data_len, bytes_read, file_name); + data_len = bytes_read; // Adjust to actual size + } + + data[data_len] = '\0'; // Null terminate + + // Convert line endings + for (int i = 0; i < data_len; ++i) { if (data[i] == '\r') { data[i] = '\n'; } } + fprintf(stdout, "'%s' loaded. 
(Length: %d bytes)\n", file_name, data_len); + return 0; } void find_all(char* pattern, char* subject, int subject_len, int repeat, struct result * engine_results) @@ -113,17 +164,48 @@ void find_all(char* pattern, char* subject, int subject_len, int repeat, struct fprintf(stdout, "-----------------\nRegex: '%s'\n", pattern); for (iter = 0; iter < sizeof(engines)/sizeof(engines[0]); iter++) { + // Warmup for JIT engines + if (needs_warmup(engines[iter].name)) { + struct result warmup_result = {0}; + int warmup_cycles = 3; + engines[iter].find_all(pattern, subject, subject_len, warmup_cycles, &warmup_result); + // Warmup result is discarded + } + + // Record baseline memory before engine execution + record_memory_baseline(&(engine_results[iter])); + int ret = engines[iter].find_all(pattern, subject, subject_len, repeat, &(engine_results[iter])); + + // Record peak memory after engine execution + record_memory_peak(&(engine_results[iter])); + if (ret == -1) { engine_results[iter].time = 0; engine_results[iter].time_sd = 0; + engine_results[iter].confidence_95_lower = 0; + engine_results[iter].confidence_95_upper = 0; engine_results[iter].matches = 0; engine_results[iter].score = 0; + engine_results[iter].samples_used = 0; + // Memory fields are already set by record_memory_peak + } else if (ret == -2) { + // Timeout occurred + engine_results[iter].time = timeout_ms; + engine_results[iter].time_sd = 0; + engine_results[iter].confidence_95_lower = timeout_ms; + engine_results[iter].confidence_95_upper = timeout_ms; + engine_results[iter].matches = -1; // Indicate timeout + engine_results[iter].score = 0; + engine_results[iter].samples_used = 0; + printf("[%10s] time: TIMEOUT (>%.1f ms)\n", engines[iter].name, timeout_ms); } else { printResult(engines[iter].name, &(engine_results[iter])); } } + // Note: Validation is now done in main() after all patterns are tested + int score_points = 5; for (int top = 0; top < score_points; top++) { double best = 0; @@ -147,10 +229,17 
@@ void find_all(char* pattern, char* subject, int subject_len, int repeat, struct } +int compare_doubles(const void *a, const void *b) { + double da = *(const double*)a; + double db = *(const double*)b; + return (da > db) - (da < db); +} + void get_mean_and_derivation(double * times, uint32_t times_len, struct result * res) { double mean, sd, var, sum = 0.0, sdev = 0.0; int32_t iter; + uint32_t valid_count = times_len; if (times == NULL || res == NULL || times_len == 0) { return; @@ -159,33 +248,272 @@ void get_mean_and_derivation(double * times, uint32_t times_len, struct result * if (times_len == 1) { res->time = times[0]; res->time_sd = 0; + return; } - /* get mean value */ - for (iter = 0; iter < times_len; iter++) { - sum += times[iter]; + // Create a copy for outlier detection + double *sorted_times = malloc(times_len * sizeof(double)); + if (!sorted_times) { + // Fallback to original algorithm if malloc fails + for (iter = 0; iter < times_len; iter++) { + sum += times[iter]; + } + mean = sum / times_len; + + for (iter = 0; iter < times_len; iter++) { + sdev += (times[iter] - mean) * (times[iter] - mean); + } + var = sdev / (times_len - 1); + sd = sqrt(var); + + res->time = mean; + res->time_sd = sd; + return; + } + + memcpy(sorted_times, times, times_len * sizeof(double)); + qsort(sorted_times, times_len, sizeof(double), compare_doubles); + + // Remove extreme outliers (beyond 2.5 std devs) only if we have enough samples + if (times_len >= 10) { + // Calculate initial stats to identify outliers + for (iter = 0; iter < times_len; iter++) { + sum += sorted_times[iter]; + } + mean = sum / times_len; + + sdev = 0.0; + for (iter = 0; iter < times_len; iter++) { + sdev += (sorted_times[iter] - mean) * (sorted_times[iter] - mean); + } + sd = sqrt(sdev / (times_len - 1)); + + // Remove outliers beyond 2.5 standard deviations + double threshold = 2.5 * sd; + valid_count = 0; + sum = 0.0; + + for (iter = 0; iter < times_len; iter++) { + if (fabs(sorted_times[iter] - 
mean) <= threshold) { + sum += sorted_times[iter]; + valid_count++; + } + } + + if (valid_count < times_len / 2) { + // Too many outliers, use all data + valid_count = times_len; + sum = 0.0; + for (iter = 0; iter < times_len; iter++) { + sum += sorted_times[iter]; + } + } + } else { + // Use all data for small sample sizes + for (iter = 0; iter < times_len; iter++) { + sum += sorted_times[iter]; + } } - mean = sum / times_len; - /* get variance */ + mean = sum / valid_count; + + // Recalculate variance with outliers removed + sdev = 0.0; + uint32_t count = 0; + double threshold = (times_len >= 10) ? 2.5 * sd : DBL_MAX; + for (iter = 0; iter < times_len; iter++) { - sdev += (times[iter] - mean) * (times[iter] - mean); + if (times_len < 10 || fabs(sorted_times[iter] - mean) <= threshold) { + sdev += (sorted_times[iter] - mean) * (sorted_times[iter] - mean); + count++; + } } - var = sdev / (times_len - 1); - /* get standard derivation */ - sd = sqrt(var); + if (count > 1) { + var = sdev / (count - 1); + sd = sqrt(var); + } else { + sd = 0.0; + } res->time = mean; res->time_sd = sd; + res->samples_used = count; + + // Calculate 95% confidence interval + if (count > 1) { + // t-distribution critical value for 95% confidence (approximation for large samples) + double t_critical = (count >= 30) ? 
1.96 : 2.776; // Conservative estimate for small samples + if (count >= 5 && count < 30) { + // Rough approximation for t-distribution + t_critical = 2.5 - 0.5 * (count - 5) / 25.0; + } + + double margin_of_error = t_critical * (sd / sqrt(count)); + res->confidence_95_lower = mean - margin_of_error; + res->confidence_95_upper = mean + margin_of_error; + + // Ensure confidence bounds are non-negative + if (res->confidence_95_lower < 0) { + res->confidence_95_lower = 0; + } + } else { + res->confidence_95_lower = mean; + res->confidence_95_upper = mean; + } + + free(sorted_times); } void printResult(char * name, struct result * res) { - fprintf(stdout, "[%10s] time: %7.1f ms (+/- %4.1f %%), matches: %8d\n", name, res->time, (res->time_sd / res->time) * 100, res->matches); + double cv = (res->time > 0) ? (res->time_sd / res->time) * 100 : 0; + const char* stability = ""; + + // Measurement stability indicators + if (cv > 30) { + stability = " [UNSTABLE]"; + } else if (cv > 15) { + stability = " [NOISY]"; + } else if (cv > 5) { + stability = " [VARIABLE]"; + } + + // Format memory information + char memory_info[64] = ""; + if (res->memory_rss_peak_kb > 0) { + if (res->memory_delta_kb > 0) { + snprintf(memory_info, sizeof(memory_info), " mem: %ld kB (+%ld)", + res->memory_rss_peak_kb, res->memory_delta_kb); + } else { + snprintf(memory_info, sizeof(memory_info), " mem: %ld kB", + res->memory_rss_peak_kb); + } + } + + fprintf(stdout, "[%10s] time: %7.1f ms [%6.1f-%6.1f] (±%4.1f%%)%s matches: %8d (n=%d)%s\n", + name, res->time, + res->confidence_95_lower, res->confidence_95_upper, + cv, memory_info, res->matches, res->samples_used, stability); fflush(stdout); } +int validate_results(char* pattern, struct result* results, char** engine_names, int num_engines) +{ + int reference_matches = -1; + char* reference_engine = NULL; + int validation_errors = 0; + int valid_engines = 0; + + // Find a reference match count from the first successful engine + for (int i = 0; i < 
num_engines; i++) { + if (results[i].time > 0) { // Engine succeeded + if (reference_matches == -1) { + reference_matches = results[i].matches; + reference_engine = engine_names[i]; + } + valid_engines++; + } + } + + // If no engines succeeded, skip validation + if (valid_engines == 0) { + fprintf(stderr, "WARNING: No engines succeeded for pattern '%s'\n", pattern); + return 0; + } + + // If only one engine succeeded, can't validate + if (valid_engines == 1) { + fprintf(stdout, "INFO: Only one engine (%s) succeeded for pattern '%s', skipping validation\n", + reference_engine, pattern); + return 0; + } + + // Compare all successful engines against reference + for (int i = 0; i < num_engines; i++) { + if (results[i].time > 0) { // Engine succeeded + if (results[i].matches != reference_matches) { + fprintf(stderr, "VALIDATION ERROR: Pattern '%s' - %s found %d matches, %s found %d matches\n", + pattern, engine_names[i], results[i].matches, + reference_engine, reference_matches); + validation_errors++; + } + } + } + + if (validation_errors == 0 && valid_engines > 1) { + // Only print success message for complex patterns to reduce noise + if (reference_matches > 1000 || strstr(pattern, "\\p{") || strstr(pattern, "(?i)")) { + fprintf(stdout, "VALIDATION OK: All %d engines agree on %d matches for pattern '%s'\n", + valid_engines, reference_matches, pattern); + } + } + + return validation_errors; +} + +int needs_warmup(const char* engine_name) +{ + // JIT engines benefit from warmup cycles + return (strstr(engine_name, "jit") != NULL || + strstr(engine_name, "re2") != NULL || + strstr(engine_name, "rust") != NULL); +} + +long get_memory_usage_kb(const char* metric) +{ + FILE* status_file = fopen("/proc/self/status", "r"); + if (!status_file) { + return -1; + } + + char line[256]; + long value = -1; + + while (fgets(line, sizeof(line), status_file)) { + if (strncmp(line, metric, strlen(metric)) == 0) { + // Parse the value (format: "MetricName: 1234 kB") + char* colon = 
strchr(line, ':'); + if (colon) { + value = atol(colon + 1); + break; + } + } + } + + fclose(status_file); + return value; +} + +void record_memory_baseline(struct result* res) +{ + res->memory_start_kb = get_memory_usage_kb("VmRSS"); + if (res->memory_start_kb < 0) { + res->memory_start_kb = 0; + } +} + +void record_memory_peak(struct result* res) +{ + long current_peak = get_memory_usage_kb("VmHWM"); + long current_rss = get_memory_usage_kb("VmRSS"); + + if (current_peak > 0) { + res->memory_rss_peak_kb = current_peak; + res->memory_peak_kb = get_memory_usage_kb("VmPeak"); + + if (res->memory_start_kb > 0 && current_rss > res->memory_start_kb) { + res->memory_delta_kb = current_rss - res->memory_start_kb; + } else { + res->memory_delta_kb = 0; + } + } else { + res->memory_rss_peak_kb = 0; + res->memory_peak_kb = 0; + res->memory_delta_kb = 0; + } +} + int main(int argc, char **argv) { char const * file = NULL; @@ -193,7 +521,7 @@ int main(int argc, char **argv) int repeat = 5; int c = 0; - while ((c = getopt(argc, argv, "n:hvf:o:")) != -1) { + while ((c = getopt(argc, argv, "n:hvf:o:t:")) != -1) { switch (c) { case 'f': file = optarg; @@ -204,6 +532,13 @@ int main(int argc, char **argv) case 'o': out_file = optarg; break; + case 't': + timeout_ms = atof(optarg) * 1000.0; // Convert seconds to milliseconds + if (timeout_ms <= 0) { + fprintf(stderr, "Timeout must be a positive number (in seconds)\n"); + exit(EXIT_FAILURE); + } + break; case 'v': printf("%s\n", VERSION_STRING); exit(EXIT_SUCCESS); @@ -213,6 +548,7 @@ int main(int argc, char **argv) printf(" -f\tInput file.\n"); printf(" -n\tSet number of repetitions. Default: 5\n"); printf(" -o\tWrite measured data into CSV file.\n"); + printf(" -t\tSet timeout in seconds. 
Default: 1.0\n"); printf(" -v\tGet the application version and build date.\n"); printf(" -h\tPrint this help message\n\n"); exit(EXIT_SUCCESS); @@ -224,17 +560,29 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } - load(file); - if (data_len == 0) { + if (load(file) != 0) { + fprintf(stderr, "Failed to load file '%s'\n", file); exit(EXIT_FAILURE); } + // Register cleanup function for proper memory management + atexit(cleanup_data); + struct result results[sizeof(regex)/sizeof(regex[0])][sizeof(engines)/sizeof(engines[0])] = {0}; struct result engine_results[sizeof(engines)/sizeof(engines[0])] = {0}; + int total_validation_errors = 0; for (int iter = 0; iter < sizeof(regex)/sizeof(regex[0]); iter++) { find_all(regex[iter], data, data_len, repeat, results[iter]); + // Validate results for this pattern + char* engine_names[sizeof(engines)/sizeof(engines[0])]; + for (int iiter = 0; iiter < sizeof(engines)/sizeof(engines[0]); iiter++) { + engine_names[iiter] = engines[iiter].name; + } + int errors = validate_results(regex[iter], results[iter], engine_names, sizeof(engines)/sizeof(engines[0])); + total_validation_errors += errors; + for (int iiter = 0; iiter < sizeof(engines)/sizeof(engines[0]); iiter++) { engine_results[iiter].time += results[iter][iiter].time; engine_results[iiter].score += results[iter][iiter].score; @@ -243,7 +591,116 @@ int main(int argc, char **argv) fprintf(stdout, "-----------------\nTotal Results:\n"); for (int iter = 0; iter < sizeof(engines)/sizeof(engines[0]); iter++) { - fprintf(stdout, "[%10s] time: %7.1f ms, score: %6u points,\n", engines[iter].name, engine_results[iter].time, engine_results[iter].score); + char memory_summary[32] = ""; + if (engine_results[iter].memory_rss_peak_kb > 0) { + snprintf(memory_summary, sizeof(memory_summary), ", mem: %ld kB", + engine_results[iter].memory_rss_peak_kb); + } + fprintf(stdout, "[%10s] time: %7.1f ms, score: %6u points%s\n", + engines[iter].name, engine_results[iter].time, 
engine_results[iter].score, memory_summary); + } + + // Print validation and stability summary + fprintf(stdout, "-----------------\nValidation Summary:\n"); + if (total_validation_errors == 0) { + fprintf(stdout, "✓ ALL ENGINES PASSED VALIDATION - No discrepancies found\n"); + } else { + fprintf(stderr, "✗ VALIDATION FAILURES: %d discrepancies found across patterns\n", total_validation_errors); + fprintf(stderr, "WARNING: Results may not be reliable due to engine inconsistencies\n"); + } + + fprintf(stdout, "-----------------\nMeasurement Quality:\n"); + int unstable_engines = 0, noisy_engines = 0, variable_engines = 0, stable_engines = 0; + + for (int iter = 0; iter < sizeof(engines)/sizeof(engines[0]); iter++) { + if (engine_results[iter].time > 0) { + double cv = (engine_results[iter].time_sd / engine_results[iter].time) * 100; + if (cv > 30) unstable_engines++; + else if (cv > 15) noisy_engines++; + else if (cv > 5) variable_engines++; + else stable_engines++; + } + } + + fprintf(stdout, "Stable engines: %d, Variable: %d, Noisy: %d, Unstable: %d\n", + stable_engines, variable_engines, noisy_engines, unstable_engines); + + if (unstable_engines > 0) { + fprintf(stderr, "WARNING: %d engines show unstable measurements (>30%% CV)\n", unstable_engines); + } + + // Memory vs Speed Analysis + fprintf(stdout, "\n-----------------\n"); + fprintf(stdout, "Memory vs Speed Analysis:\n"); + + // Calculate average memory usage and speed across all regex patterns + double avg_memory[sizeof(engines)/sizeof(engines[0])] = {0}; + double avg_time[sizeof(engines)/sizeof(engines[0])] = {0}; + int valid_results[sizeof(engines)/sizeof(engines[0])] = {0}; + + for (int e = 0; e < sizeof(engines)/sizeof(engines[0]); e++) { + for (int r = 0; r < sizeof(regex)/sizeof(regex[0]); r++) { + if (results[r][e].time > 0 && results[r][e].memory_peak_kb > 0) { + avg_memory[e] += results[r][e].memory_delta_kb; + avg_time[e] += results[r][e].time; + valid_results[e]++; + } + } + if (valid_results[e] 
> 0) { + avg_memory[e] /= valid_results[e]; + avg_time[e] /= valid_results[e]; + } + } + + // Find most memory efficient and fastest engines + int most_efficient = -1, fastest = -1; + double min_memory = 1e9, min_time = 1e9; + + for (int e = 0; e < sizeof(engines)/sizeof(engines[0]); e++) { + if (valid_results[e] > 0) { + if (avg_memory[e] < min_memory) { + min_memory = avg_memory[e]; + most_efficient = e; + } + if (avg_time[e] < min_time) { + min_time = avg_time[e]; + fastest = e; + } + } + } + + // Report memory efficiency rankings + fprintf(stdout, "Memory Efficiency Rankings (avg delta KB):\n"); + for (int e = 0; e < sizeof(engines)/sizeof(engines[0]); e++) { + if (valid_results[e] > 0) { + fprintf(stdout, " %s: %.1f kB avg\n", engines[e].name, avg_memory[e]); + } + } + + fprintf(stdout, "Speed Rankings (avg ms):\n"); + for (int e = 0; e < sizeof(engines)/sizeof(engines[0]); e++) { + if (valid_results[e] > 0) { + fprintf(stdout, " %s: %.1f ms avg\n", engines[e].name, avg_time[e]); + } + } + + if (most_efficient >= 0 && fastest >= 0) { + fprintf(stdout, "\nSummary:\n"); + fprintf(stdout, " Most memory efficient: %s (%.1f kB avg)\n", + engines[most_efficient].name, avg_memory[most_efficient]); + fprintf(stdout, " Fastest: %s (%.1f ms avg)\n", + engines[fastest].name, avg_time[fastest]); + + if (most_efficient == fastest) { + fprintf(stdout, " %s wins both speed and memory efficiency!\n", + engines[fastest].name); + } else { + // Calculate memory/speed ratio for trade-off analysis + fprintf(stdout, " Trade-off: %s uses %.1fx more memory for %.1fx speed improvement\n", + engines[fastest].name, + avg_memory[fastest] / avg_memory[most_efficient], + avg_time[most_efficient] / avg_time[fastest]); + } } if (out_file != NULL) { @@ -267,6 +724,9 @@ int main(int argc, char **argv) for (iter = 0; iter < sizeof(engines)/sizeof(engines[0]); iter++) { fprintf(f, "%s [matches];", engines[iter].name); } + for (iter = 0; iter < sizeof(engines)/sizeof(engines[0]); iter++) { + 
fprintf(f, "%s [mem_kb];", engines[iter].name); + } fprintf(f, "\n"); /* write data */ @@ -282,6 +742,9 @@ int main(int argc, char **argv) for (iiter = 0; iiter < sizeof(engines)/sizeof(engines[0]); iiter++) { fprintf(f, "%d;", results[iter][iiter].matches); } + for (iiter = 0; iiter < sizeof(engines)/sizeof(engines[0]); iiter++) { + fprintf(f, "%ld;", results[iter][iiter].memory_delta_kb); + } fprintf(f, "\n"); } diff --git a/src/main.h b/src/main.h index 973f72a..14db4e3 100644 --- a/src/main.h +++ b/src/main.h @@ -12,15 +12,41 @@ extern "C" { #define TIME_DIFF_IN_MS(begin, end) (((double) (end - begin)) * 1000 / CLOCKS_PER_SEC) #define UNUSED __attribute__((unused)) +// Global timeout setting (in milliseconds) +extern double timeout_ms; +#define CHECK_TIMEOUT(start_time) \ + do { \ + TIME_TYPE now; \ + GET_TIME(now); \ + if (TIME_DIFF_IN_MS(start_time, now) > timeout_ms) { \ + return -2; /* Timeout error code */ \ + } \ + } while(0) + struct result { int score; double time; double time_sd; + double confidence_95_lower; + double confidence_95_upper; int matches; + int samples_used; // How many samples after outlier removal + + // Memory usage tracking + long memory_peak_kb; // Peak memory usage during execution + long memory_rss_peak_kb; // Peak resident set size + long memory_start_kb; // Baseline memory before execution + long memory_delta_kb; // Memory increase during execution }; void get_mean_and_derivation(double * times, uint32_t times_len, struct result * res); void printResult(char * name, struct result * res); +void cleanup_data(void); +int validate_results(char* pattern, struct result* results, char** engine_names, int num_engines); +int needs_warmup(const char* engine_name); +long get_memory_usage_kb(const char* metric); +void record_memory_baseline(struct result* res); +void record_memory_peak(struct result* res); #ifdef INCLUDE_CTRE int ctre_find_all(char* pattern, char* subject, int subject_len, int repeat, struct result * res); diff --git 
a/src/pcre2.c b/src/pcre2.c index 9e92264..2cecadb 100644 --- a/src/pcre2.c +++ b/src/pcre2.c @@ -78,8 +78,12 @@ int pcre2_find_all(char* pattern, char* subject, int subject_len, int repeat, in double * times = calloc(repeat, sizeof(double)); int const times_len = repeat; + TIME_TYPE test_start; + GET_TIME(test_start); do { + // Check for timeout before each iteration + CHECK_TIMEOUT(test_start); found = 0; ptr = subject; len = subject_len; diff --git a/src/tre.c b/src/tre.c index f1b6ffc..557176f 100644 --- a/src/tre.c +++ b/src/tre.c @@ -22,8 +22,12 @@ int tre_find_all(char* pattern, char* subject, int subject_len, int repeat, stru double * times = calloc(repeat, sizeof(double)); int const times_len = repeat; + TIME_TYPE test_start; + GET_TIME(test_start); do { + // Check for timeout before each iteration + CHECK_TIMEOUT(test_start); found = 0; ptr = subject; len = subject_len; diff --git a/src/yara.c b/src/yara.c index fd56093..ddc6a7f 100644 --- a/src/yara.c +++ b/src/yara.c @@ -92,8 +92,13 @@ int yara_find_all(char* pattern, char* subject, int subject_len, int repeat, str double * times = calloc(repeat, sizeof(double)); int const times_len = repeat; + TIME_TYPE test_start; + GET_TIME(test_start); do { + // Check for timeout before each iteration + CHECK_TIMEOUT(test_start); + counter = 0; GET_TIME(start); yr_rules_scan_mem(rules, (const uint8_t*) subject, subject_len, 0, capture_matches, &counter, 0); diff --git a/vendor/CMakeLists.txt b/vendor/CMakeLists.txt index 5da456d..8750710 100644 --- a/vendor/CMakeLists.txt +++ b/vendor/CMakeLists.txt @@ -9,7 +9,7 @@ include(ExternalProject) function(AddExternalProject NAME LIB_NAME URL TAG) string(TOUPPER ${NAME} UP_NAME) - set(INCLUDE_${UP_NAME} "local" CACHE STRING "Use ${NAME} library form local built, system or disable usage.") + set(INCLUDE_${UP_NAME} "system" CACHE STRING "Use ${NAME} library form local built, system or disable usage.") set_property(CACHE INCLUDE_${UP_NAME} PROPERTY STRINGS "local" "system" 
"disabled") message("-- Include ${NAME}: ${INCLUDE_${UP_NAME}}") @@ -25,6 +25,7 @@ function(AddExternalProject NAME LIB_NAME URL TAG) lib${NAME} GIT_REPOSITORY ${URL} GIT_TAG ${TAG} + GIT_EXECUTABLE /usr/bin/git PREFIX ${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${NAME} TMP_DIR ${PROJECT_BINARY_DIR}/${NAME}-tmp @@ -37,6 +38,7 @@ function(AddExternalProject NAME LIB_NAME URL TAG) ExternalProject_Add( lib${NAME} GIT_REPOSITORY ${URL} + GIT_EXECUTABLE /usr/bin/git PREFIX ${CMAKE_CURRENT_SOURCE_DIR} SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${NAME} TMP_DIR ${PROJECT_BINARY_DIR}/${NAME}-tmp @@ -71,77 +73,108 @@ function(AddExternalProject NAME LIB_NAME URL TAG) endif() endfunction() -# hyperscan -AddExternalProject( - "hyperscan" - "hs" - "https://github.com/01org/hyperscan.git" - "master" - -DCMAKE_BUILD_TYPE=Release -DFAT_RUNTIME=OFF -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/local -) - -# oniguruma -AddExternalProject( - "oniguruma" - "onig" - "https://github.com/kkos/oniguruma.git" - "master" - -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/local -) - -# re2 -AddExternalProject( - "re2" - "re2" - "https://github.com/google/re2.git" - "main" - -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/local -) - -# tre -AddExternalProject( - "tre" - "tre" - "https://github.com/laurikari/tre.git" - "master" - cd ${CMAKE_CURRENT_SOURCE_DIR}/tre/ && ./utils/autogen.sh && cd ${PROJECT_BINARY_DIR}/tre-build && ${CMAKE_CURRENT_SOURCE_DIR}/tre/configure --prefix=${CMAKE_CURRENT_SOURCE_DIR}/local -) - -# pcre2 -AddExternalProject( - "pcre2" - "pcre2-8" - "https://github.com/PhilipHazel/pcre2.git" - "master" - -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/local -DPCRE2_SUPPORT_JIT=ON -) + + +# PCRE2 +set(INCLUDE_PCRE2 "system" CACHE STRING "Use pcre2 library form local built, system or disable usage.") +set_property(CACHE INCLUDE_PCRE2 PROPERTY STRINGS "local" 
"system" "disabled") +message("-- Include pcre2: ${INCLUDE_PCRE2}") +if(INCLUDE_PCRE2 MATCHES "system") + find_library(LIB_PCRE2 pcre2-8 ${CMAKE_CURRENT_SOURCE_DIR}/local/lib) + if(NOT LIB_PCRE2) + message(FATAL_ERROR " > Library 'pcre2-8' not found on system.\n") + else() + message(" > Found pcre2 library: ${LIB_PCRE2}\n") + endif() +endif() + +# Oniguruma +set(INCLUDE_ONIGURUMA "system" CACHE STRING "Use oniguruma library form local built, system or disable usage.") +set_property(CACHE INCLUDE_ONIGURUMA PROPERTY STRINGS "local" "system" "disabled") +message("-- Include oniguruma: ${INCLUDE_ONIGURUMA}") +if(INCLUDE_ONIGURUMA MATCHES "system") + find_library(LIB_ONIGURUMA onig ${CMAKE_CURRENT_SOURCE_DIR}/local/lib) + if(NOT LIB_ONIGURUMA) + message(FATAL_ERROR " > Library 'onig' not found on system.\n") + else() + message(" > Found oniguruma library: ${LIB_ONIGURUMA}\n") + endif() +endif() + +# TRE +set(INCLUDE_TRE "system" CACHE STRING "Use tre library form local built, system or disable usage.") +set_property(CACHE INCLUDE_TRE PROPERTY STRINGS "local" "system" "disabled") +message("-- Include tre: ${INCLUDE_TRE}") +if(INCLUDE_TRE MATCHES "system") + find_library(LIB_TRE tre ${CMAKE_CURRENT_SOURCE_DIR}/local/lib) + if(NOT LIB_TRE) + message(FATAL_ERROR " > Library 'tre' not found on system.\n") + else() + message(" > Found tre library: ${LIB_TRE}\n") + endif() +endif() + +# RE2 +set(INCLUDE_RE2 "system" CACHE STRING "Use re2 library form local built, system or disable usage.") +set_property(CACHE INCLUDE_RE2 PROPERTY STRINGS "local" "system" "disabled") +message("-- Include re2: ${INCLUDE_RE2}") +if(INCLUDE_RE2 MATCHES "system") + find_library(LIB_RE2 re2 ${CMAKE_CURRENT_SOURCE_DIR}/local/lib) + if(NOT LIB_RE2) + message(FATAL_ERROR " > Library 're2' not found on system.\n") + else() + message(" > Found re2 library: ${LIB_RE2}\n") + endif() +endif() + +# CTRE (header-only) +set(INCLUDE_CTRE "system" CACHE STRING "Use ctre library form local built, system or 
disable usage.") +set_property(CACHE INCLUDE_CTRE PROPERTY STRINGS "local" "system" "disabled") +message("-- Include ctre: ${INCLUDE_CTRE}") + +set(INCLUDE_BOOST "system" CACHE STRING "Use boost::regex library form local built, system or disable usage.") +set_property(CACHE INCLUDE_BOOST PROPERTY STRINGS "local" "system" "disabled") +message("-- Include boost: ${INCLUDE_BOOST}") +if(INCLUDE_BOOST MATCHES "system") + find_library(LIB_BOOST boost_regex ${CMAKE_CURRENT_SOURCE_DIR}/local/lib) + if(NOT LIB_BOOST) + message(FATAL_ERROR " > Library 'boost_regex' not found on system.\n") + else() + message(" > Found boost library: ${LIB_BOOST}\n") + endif() +endif() + +# YARA +set(INCLUDE_YARA "system" CACHE STRING "Use yara library form local built, system or disable usage.") +set_property(CACHE INCLUDE_YARA PROPERTY STRINGS "local" "system" "disabled") +message("-- Include yara: ${INCLUDE_YARA}") +if(INCLUDE_YARA MATCHES "system") + find_library(LIB_YARA yara ${CMAKE_CURRENT_SOURCE_DIR}/local/lib) + if(NOT LIB_YARA) + message(FATAL_ERROR " > Library 'yara' not found on system.\n") + else() + message(" > Found yara library: ${LIB_YARA}\n") + endif() +endif() + +# Hyperscan +set(INCLUDE_HYPERSCAN "system" CACHE STRING "Use hyperscan library form local built, system or disable usage.") +set_property(CACHE INCLUDE_HYPERSCAN PROPERTY STRINGS "local" "system" "disabled") +message("-- Include hyperscan: ${INCLUDE_HYPERSCAN}") +if(INCLUDE_HYPERSCAN MATCHES "system") + find_library(LIB_HYPERSCAN hs ${CMAKE_CURRENT_SOURCE_DIR}/local/lib) + if(NOT LIB_HYPERSCAN) + message(FATAL_ERROR " > Library 'hs' not found on system.\n") + else() + message(" > Found hyperscan library: ${LIB_HYPERSCAN}\n") + endif() +endif() # c++ standard set(INCLUDE_CPPSTD "system" CACHE STRING "Use cppstd library form local built, system or disable usage.") set_property(CACHE INCLUDE_CPPSTD PROPERTY STRINGS "local" "system" "disabled") message("-- Include cppstd: ${INCLUDE_CPPSTD}") -# ctre -# 
-AddExternalProject( - "ctre" - "ctre" - "https://github.com/hanickadot/compile-time-regular-expressions.git" - "master" - -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/local -) - -# yara -AddExternalProject( - "yara" - "yara" - "https://github.com/VirusTotal/yara.git" - "master" - cd ${CMAKE_CURRENT_SOURCE_DIR}/yara/ && ./bootstrap.sh && cd ${PROJECT_BINARY_DIR}/yara-build && ${CMAKE_CURRENT_SOURCE_DIR}/yara/configure --prefix=${CMAKE_CURRENT_SOURCE_DIR}/local - -) - -# boost - I'm not going to build boost here -set(INCLUDE_BOOST "system" CACHE STRING "Use boost::regex library form local built, system or disable usage.") -set_property(CACHE INCLUDE_BOOST PROPERTY STRINGS "local" "system" "disabled") -message("-- Include boost: ${INCLUDE_BOOST}") + + + diff --git a/vendor/hyperscan b/vendor/hyperscan deleted file mode 160000 index 64a995b..0000000 --- a/vendor/hyperscan +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 64a995bf445d86b74eb0f375624ffc85682eadfe diff --git a/vendor/oniguruma b/vendor/oniguruma deleted file mode 160000 index c24467b..0000000 --- a/vendor/oniguruma +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c24467b93176868d3ddd86a00bf0f8644dc65a80 diff --git a/vendor/re2 b/vendor/re2 deleted file mode 160000 index d826d9f..0000000 --- a/vendor/re2 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d826d9fcb68c62996c1b7c0a45d604e22d814952 diff --git a/vendor/tre b/vendor/tre deleted file mode 160000 index 6092368..0000000 --- a/vendor/tre +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6092368aabdd0dbb0fbceb2766a37b98e0ff6911