diff --git a/arc/README.md b/arc/README.md new file mode 100644 index 000000000..e9e644e03 --- /dev/null +++ b/arc/README.md @@ -0,0 +1,175 @@ +# Arc - ClickBench Benchmark + +Arc is a high-performance time-series data warehouse built on DuckDB, Parquet, and object storage. + +## System Information + +- **System:** Arc +- **Date:** 2025-10-15 +- **Machine:** m3_max (14 cores, 36GB RAM) +- **Tags:** Python, time-series, DuckDB, Parquet, columnar, HTTP API +- **License:** AGPL-3.0 +- **Repository:** https://github.com/Basekick-Labs/arc + +## Performance + +Arc achieves: +- **Write throughput:** 1.89M records/sec (MessagePack binary protocol) +- **ClickBench:** ~22 seconds total (43 analytical queries) +- **Storage:** DuckDB + Parquet with MinIO/S3/GCS backends + +## Prerequisites + +- Ubuntu/Debian Linux (or compatible) +- Python 3.11+ +- 8GB+ RAM recommended +- Internet connection for dataset download +- Sudo access (only if system dependencies are missing) + +## Quick Start + +The benchmark script handles everything automatically: + +```bash +./benchmark.sh +``` + +This will: +1. Create Python virtual environment (no system packages modified) +2. Clone Arc repository +3. Install dependencies in venv +4. Start Arc server with optimal worker count (2x CPU cores) +5. Download ClickBench dataset (14GB parquet file) +6. Run 43 queries × 3 iterations +7. Output results in ClickBench JSON format + +## Manual Steps + +### 1. Install Dependencies + +```bash +sudo apt-get update -y +sudo apt-get install -y python3-pip python3-venv wget curl +``` + +### 2. Create Virtual Environment + +```bash +python3 -m venv arc-venv +source arc-venv/bin/activate +``` + +### 3. Clone and Setup Arc + +```bash +git clone https://github.com/Basekick-Labs/arc.git +cd arc +pip install -r requirements.txt +mkdir -p data logs +``` + +### 4. Create API Token + +```bash +python3 << 'EOF' +from api.auth import AuthManager + +auth = AuthManager(db_path='./data/arc.db') +token = auth.create_token(name='clickbench', description='ClickBench benchmark') +print(f"Token: {token}") +EOF +``` + +### 5. Start Arc Server + +```bash +# Auto-detect cores +CORES=$(nproc) +WORKERS=$((CORES * 2)) + +# Start server +gunicorn -w $WORKERS -b 0.0.0.0:8000 \ + -k uvicorn.workers.UvicornWorker \ + --timeout 300 \ + api.main:app +``` + +### 6. Download Dataset + +```bash +wget https://datasets.clickhouse.com/hits_compatible/hits.parquet +``` + +### 7. Run Benchmark + +```bash +export ARC_URL="http://localhost:8000" +export ARC_API_KEY="your-token-from-step-4" +export DATABASE="clickbench" +export TABLE="hits" + +./run.sh +``` + +**Note:** The benchmark uses Apache Arrow columnar format for optimal performance. Requires `pyarrow` to be installed. + +## Configuration + +Arc uses optimal settings for ClickBench (all automatic, no configuration needed): + +- **Workers:** Auto-detected cores × 2 (optimal for analytical workloads) +- **Query cache:** Disabled (per ClickBench rules) +- **Storage:** Local filesystem (fastest for single-node) +- **Timeout:** 300 seconds per query +- **Format:** Apache Arrow (columnar, high-performance) + +## Results Format + +Results are output in official ClickBench format: + +``` +Load time: 0 +Data size: 14779976446 +[0.0226, 0.0233, 0.0284] +[0.0324, 0.0334, 0.0392] +... 
+``` + +- **Load time:** Arc queries Parquet files directly without a data loading phase (load time = 0) +- **Data size:** Size of the dataset in bytes (14GB) +- **Query results:** 43 lines, each containing 3 execution times (in seconds) for the same query + +## Notes + +- **Virtual Environment:** All dependencies installed in isolated venv (no `--break-system-packages` needed) +- **Authentication:** Uses Arc's built-in token auth (simpler than Permission-based auth) +- **Query Cache:** Disabled to ensure fair benchmark (no cache hits) +- **Worker Count:** Auto-detected based on CPU cores, optimized for analytical workloads +- **Timeout:** Generous 300s timeout for complex queries + +## Architecture + +``` +ClickBench Query → Arc Arrow API → DuckDB → Parquet File → Arrow Results +``` + +Arc queries the Parquet file directly via DuckDB's `read_parquet()` function and returns results in Apache Arrow columnar format for maximum efficiency. + +## Performance Characteristics + +Arc is optimized for: +- **High-throughput writes** (1.89M RPS with MessagePack) +- **Analytical queries** (DuckDB's columnar engine) +- **Columnar data transfer** (Apache Arrow IPC for efficient results) +- **Object storage** (S3, GCS, MinIO compatibility) +- **Time-series workloads** (built-in time-based indexing) + +## Support + +- GitHub: https://github.com/Basekick-Labs/arc +- Issues: https://github.com/Basekick-Labs/arc/issues +- Docs: https://docs.arc.basekick.com (coming soon) + +## License + +Arc Core is licensed under AGPL-3.0. diff --git a/arc/benchmark.sh b/arc/benchmark.sh new file mode 100755 index 000000000..55451132e --- /dev/null +++ b/arc/benchmark.sh @@ -0,0 +1,393 @@ +#!/bin/bash +# Arc ClickBench Complete Benchmark Script +# This script installs Arc, loads data, and runs the benchmark + +set -e + +# Check and install system dependencies +echo "Checking system dependencies..." + +MISSING_DEPS=() +command -v python3 >/dev/null 2>&1 || MISSING_DEPS+=("python3") +command -v pip3 >/dev/null 2>&1 || MISSING_DEPS+=("python3-pip") +command -v wget >/dev/null 2>&1 || MISSING_DEPS+=("wget") +command -v curl >/dev/null 2>&1 || MISSING_DEPS+=("curl") + +# Check for python3-venv by detecting Python version +PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}' | cut -d. -f1,2) +VENV_PACKAGE="python${PYTHON_VERSION}-venv" + +# Try to create a test venv to check if venv is properly installed +if ! python3 -m venv --help >/dev/null 2>&1 || ! python3 -c "import ensurepip" 2>/dev/null; then + MISSING_DEPS+=("$VENV_PACKAGE") +fi + +if [ ${#MISSING_DEPS[@]} -eq 0 ]; then + echo "[OK] All system dependencies are already installed" +else + echo "Installing missing dependencies: ${MISSING_DEPS[*]}" + sudo apt-get update -y + sudo apt-get install -y "${MISSING_DEPS[@]}" +fi + +# Create Python virtual environment +echo "Creating Python virtual environment..." +python3 -m venv arc-venv +source arc-venv/bin/activate + +# Clone Arc repository if not exists +if [ ! -d "arc" ]; then + echo "Cloning Arc repository..." + git clone https://github.com/Basekick-Labs/arc.git +fi + +cd arc + +# Install Arc dependencies in venv +echo "Installing Arc dependencies..." +pip install --upgrade pip +pip install -r requirements.txt + +# Create data directory +mkdir -p data logs + +# Create or reuse API token for benchmark +echo "Setting up API token..." 
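+# The Python heredoc below uses Arc's AuthManager to create a benchmark token
+# (named clickbench-<timestamp>, falling back to creating or reusing a token
+# named 'clickbench') and writes it to ../arc_token.txt so run.sh can
+# authenticate against the Arc API.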
+python3 << 'EOF' +from api.auth import AuthManager +import os +import time + +# Initialize auth manager +auth = AuthManager(db_path='./data/arc.db') + +# Try to create token, or reuse if exists +token = None +token_name = f'clickbench-{int(time.time())}' + +try: + # Try to create new token with timestamp + token = auth.create_token( + name=token_name, + description='ClickBench benchmark access' + ) + print(f"Created new API token: {token_name}") +except Exception as e: + # If that fails, try with a simple name and catch if exists + try: + token = auth.create_token( + name='clickbench', + description='ClickBench benchmark access' + ) + print(f"Created API token: clickbench") + except ValueError: + # Token already exists, list and use existing one + print("Token 'clickbench' already exists, retrieving...") + tokens = auth.list_tokens() + for t in tokens: + if t.get('name') == 'clickbench': + token = t.get('token') + print(f"Reusing existing token: clickbench") + break + + if not token: + raise Exception("Could not create or retrieve token") + +# Write token to file for run.sh to use +with open('../arc_token.txt', 'w') as f: + f.write(token) +EOF + +ARC_TOKEN=$(cat ../arc_token.txt) +echo "Token ready: $ARC_TOKEN" + +# Auto-detect CPU cores (supports Linux and macOS) +if command -v nproc > /dev/null 2>&1; then + # Linux: use nproc + CORES=$(nproc) +elif command -v sysctl > /dev/null 2>&1; then + # macOS: use sysctl + CORES=$(sysctl -n hw.ncpu 2>/dev/null || sysctl -n hw.logicalcpu 2>/dev/null || echo 4) +elif [ -f /proc/cpuinfo ]; then + # Linux fallback: parse /proc/cpuinfo + CORES=$(grep -c processor /proc/cpuinfo) +else + # Final fallback + CORES=4 +fi + +# Use 2x cores for optimal analytical performance (automatic) +WORKERS=$((CORES * 2)) +echo "Starting Arc with $WORKERS workers ($CORES cores detected, 2x multiplier for optimal performance)..." + +# Create minimal .env if not exists +if [ ! -f ".env" ]; then + cat > .env << 'ENVEOF' +# Arc Configuration for ClickBench +STORAGE_BACKEND=local +LOCAL_STORAGE_PATH=./minio-data +PORT=8000 +HOST=0.0.0.0 +LOG_LEVEL=WARNING +QUERY_CACHE_ENABLED=false +BUFFER_MAX_SIZE=50000 +BUFFER_MAX_AGE=5 +ENVEOF +fi + +# Start Arc server in background +gunicorn -w $WORKERS -b 0.0.0.0:8000 \ + -k uvicorn.workers.UvicornWorker \ + --timeout 300 \ + --access-logfile /dev/null \ + --error-logfile ../arc.log \ + --log-level warning \ + api.main:app > /dev/null 2>&1 & + +ARC_PID=$! +echo "Arc started with PID: $ARC_PID" + +# Wait for Arc to be ready (up to 30 seconds) +echo "Waiting for Arc to be ready..." +for i in {1..30}; do + if curl -s -f http://localhost:8000/health > /dev/null 2>&1; then + echo "[OK] Arc is ready!" + break + fi + if [ $i -eq 30 ]; then + echo "Error: Arc failed to start within 30 seconds" + echo "Last 50 lines of logs:" + tail -50 ../arc.log + kill $ARC_PID 2>/dev/null || true + exit 1 + fi + sleep 1 +done + +cd .. + +# Download and prepare dataset +DATASET_FILE="hits.parquet" +DATASET_URL="https://datasets.clickhouse.com/hits_compatible/hits.parquet" +EXPECTED_SIZE=14779976446 # 14GB + +if [ -f "$DATASET_FILE" ]; then + CURRENT_SIZE=$(stat -f%z "$DATASET_FILE" 2>/dev/null || stat -c%s "$DATASET_FILE" 2>/dev/null) + if [ "$CURRENT_SIZE" -eq "$EXPECTED_SIZE" ]; then + echo "[OK] Dataset already downloaded (14GB)" + else + echo "[WARNING] Dataset exists but size mismatch (expected: $EXPECTED_SIZE, got: $CURRENT_SIZE)" + echo "Re-downloading dataset..." 
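+        # A size mismatch usually means an interrupted download: remove the
+        # partial file and fetch a fresh copy below.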
+ rm -f "$DATASET_FILE" + wget --continue --progress=dot:giga "$DATASET_URL" + fi +else + echo "Downloading ClickBench dataset (14GB)..." + wget --continue --progress=dot:giga "$DATASET_URL" +fi + +FILE_SIZE=$(du -h "$DATASET_FILE" | cut -f1) +echo "Dataset size: $FILE_SIZE ($DATASET_FILE)" + +# Count rows using DuckDB +echo "Counting rows..." +python3 << 'EOF' +import duckdb +conn = duckdb.connect() +count = conn.execute("SELECT COUNT(*) FROM read_parquet('hits.parquet')").fetchone()[0] +print(f"Dataset contains {count:,} rows") +EOF + +# Set environment variables for benchmarking +export ARC_URL="http://localhost:8000" +export ARC_API_KEY="$ARC_TOKEN" +export DATABASE="clickbench" +export TABLE="hits" + +# Load data into Arc by copying parquet file to storage +echo "" +echo "Loading ClickBench data into Arc..." +echo "================================================" + +STORAGE_BASE="arc/data/arc" +TARGET_DIR="$STORAGE_BASE/$DATABASE/$TABLE" +TARGET_FILE="$TARGET_DIR/hits.parquet" + +# Create target directory +mkdir -p "$TARGET_DIR" + +# Check if already loaded +if [ -f "$TARGET_FILE" ]; then + SOURCE_SIZE=$(stat -f%z "$DATASET_FILE" 2>/dev/null || stat -c%s "$DATASET_FILE" 2>/dev/null) + TARGET_SIZE=$(stat -f%z "$TARGET_FILE" 2>/dev/null || stat -c%s "$TARGET_FILE" 2>/dev/null) + + if [ "$SOURCE_SIZE" -eq "$TARGET_SIZE" ]; then + echo "[OK] Data already loaded (14GB)" + echo " Location: $TARGET_FILE" + else + echo "[WARNING] Existing file has different size, reloading..." + rm -f "$TARGET_FILE" + echo " Copying parquet file to Arc storage..." + cp "$DATASET_FILE" "$TARGET_FILE" + echo "[OK] Data loaded successfully!" + fi +else + echo " Copying parquet file to Arc storage..." + echo " Source: $DATASET_FILE" + echo " Target: $TARGET_FILE" + cp "$DATASET_FILE" "$TARGET_FILE" + echo "[OK] Data loaded successfully!" + echo " Table: $DATABASE.$TABLE" + TARGET_SIZE=$(du -h "$TARGET_FILE" | cut -f1) + echo " Size: $TARGET_SIZE" +fi + +echo "" +echo "Data loading complete." + +# Verify query cache configuration +echo "" +echo "Verifying query cache configuration..." +cd arc +python3 << 'CACHECHECK' +import os +import sys + +# Check all possible cache configuration sources +print("=" * 70) +print("Query Cache Configuration Check") +print("=" * 70) + +# 1. Check arc.conf +cache_in_conf = None +try: + from config_loader import get_config + arc_config = get_config() + cache_config = arc_config.config.get('query_cache', {}) + cache_in_conf = cache_config.get('enabled', None) + print(f" arc.conf: enabled = {cache_in_conf}") +except Exception as e: + print(f" arc.conf: Error reading: {e}") + +# 2. Check .env file +cache_in_env = None +if os.path.exists('.env'): + with open('.env', 'r') as f: + for line in f: + if line.strip().startswith('QUERY_CACHE_ENABLED'): + cache_in_env = line.split('=')[1].strip().lower() + print(f" .env: QUERY_CACHE_ENABLED = {cache_in_env}") + break + if cache_in_env is None: + print(f" .env: QUERY_CACHE_ENABLED not set") +else: + print(f" .env: File not found") + +# 3. Check environment variable +cache_in_os_env = os.getenv("QUERY_CACHE_ENABLED") +if cache_in_os_env: + print(f" Environment: QUERY_CACHE_ENABLED = {cache_in_os_env}") +else: + print(f" Environment: QUERY_CACHE_ENABLED not set") + +# 4. 
Check what init_query_cache will actually use +print("") +try: + from api.query_cache import init_query_cache + cache_instance = init_query_cache() + if cache_instance is None: + print(f"[OK] FINAL RESULT: Query cache is DISABLED") + else: + print(f"[ERROR] FINAL RESULT: Query cache is ENABLED") + print(f" TTL: {cache_instance.ttl_seconds}s") + print(f" Max size: {cache_instance.max_size}") + print(f"\n [WARNING] Cache must be disabled for valid benchmark results!") +except Exception as e: + print(f"[ERROR] Error checking cache initialization: {e}") + +print("=" * 70) +CACHECHECK + +cd .. + +# Test API token before running benchmark +echo "" +echo "Testing API token authentication..." +TEST_RESPONSE=$(curl -s -w "\n%{http_code}" -H "x-api-key: $ARC_API_KEY" "$ARC_URL/health") +HTTP_CODE=$(echo "$TEST_RESPONSE" | tail -n1) +if [ "$HTTP_CODE" = "200" ]; then + echo "[OK] API token is valid" +else + echo "[ERROR] API token test failed (HTTP $HTTP_CODE)" + echo "Response: $(echo "$TEST_RESPONSE" | head -n-1)" + echo "" + echo "Debugging: Let's verify the token exists in the database..." + cd arc + python3 << 'DEBUGEOF' +from api.auth import AuthManager +auth = AuthManager(db_path='./data/arc.db') +tokens = auth.list_tokens() +print(f"Found {len(tokens)} tokens in database:") +for t in tokens: + print(f" - {t.get('name')}: {t.get('token')[:20]}...") +DEBUGEOF + cd .. + echo "" + echo "Error: Cannot proceed without valid authentication" + kill $ARC_PID 2>/dev/null || true + exit 1 +fi + +# Run benchmark +echo "" +echo "Running ClickBench queries via Arc Arrow API..." +echo "================================================" +echo "(Logging to log.txt, this may take a few minutes...)" +./run.sh > log.txt 2>&1 +echo "Benchmark execution complete!" + +# Stop Arc +echo "" +echo "Stopping Arc..." +kill $ARC_PID 2>/dev/null || true +wait $ARC_PID 2>/dev/null || true + +# Deactivate venv +deactivate + +# Format results for ClickBench (official format) +echo "" +echo "Formatting results..." + +# Extract timing values from log +cat log.txt | grep -oE '^[0-9]+\.[0-9]+|^null' | \ + awk '{ + if (NR % 3 == 1) printf "["; + printf "%s", $1; + if (NR % 3 == 0) print "]"; + else printf ", "; + }' > results.txt + +# Output in official ClickBench format +echo "" +echo "[OK] Benchmark complete!" +echo "" +echo "================================================" +echo "Official ClickBench Results" +echo "================================================" +echo "" + +# Load time (Arc doesn't load data, it queries Parquet directly) +echo "Load time: 0" + +# Data size in bytes +echo "Data size: $EXPECTED_SIZE" + +# Query results (43 lines) +cat results.txt + +echo "" +echo "================================================" +echo "Results saved to: results.txt" +echo "Full logs saved to: log.txt" +echo "================================================" diff --git a/arc/create.sql b/arc/create.sql new file mode 100644 index 000000000..9c5f4857a --- /dev/null +++ b/arc/create.sql @@ -0,0 +1,116 @@ +-- Arc ClickBench Schema +-- +-- Note: Arc queries Parquet files directly via DuckDB, so no explicit table creation is needed. +-- The benchmark.sh script copies the hits.parquet file to Arc's storage directory: +-- arc/data/arc/clickbench/hits/hits.parquet +-- +-- Arc automatically detects and queries Parquet files without requiring schema definition. +-- This file documents the equivalent schema for reference. 
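+--
+-- For reference only, a direct DuckDB query over the copied file would look like
+-- the following (illustrative; the path reflects where benchmark.sh places the
+-- file, relative to the arc directory):
+--
+--   SELECT COUNT(*) FROM read_parquet('data/arc/clickbench/hits/hits.parquet');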
+ +CREATE TABLE hits ( + WatchID BIGINT, + JavaEnable SMALLINT, + Title VARCHAR, + GoodEvent SMALLINT, + EventTime BIGINT, + EventDate USMALLINT, + CounterID INTEGER, + ClientIP INTEGER, + RegionID INTEGER, + UserID BIGINT, + CounterClass SMALLINT, + OS SMALLINT, + UserAgent SMALLINT, + URL VARCHAR, + Referer VARCHAR, + IsRefresh SMALLINT, + RefererCategoryID SMALLINT, + RefererRegionID INTEGER, + URLCategoryID SMALLINT, + URLRegionID INTEGER, + ResolutionWidth SMALLINT, + ResolutionHeight SMALLINT, + ResolutionDepth SMALLINT, + FlashMajor SMALLINT, + FlashMinor SMALLINT, + FlashMinor2 VARCHAR, + NetMajor SMALLINT, + NetMinor SMALLINT, + UserAgentMajor SMALLINT, + UserAgentMinor VARCHAR, + CookieEnable SMALLINT, + JavascriptEnable SMALLINT, + IsMobile SMALLINT, + MobilePhone SMALLINT, + MobilePhoneModel VARCHAR, + Params VARCHAR, + IPNetworkID INTEGER, + TraficSourceID SMALLINT, + SearchEngineID SMALLINT, + SearchPhrase VARCHAR, + AdvEngineID SMALLINT, + IsArtifical SMALLINT, + WindowClientWidth SMALLINT, + WindowClientHeight SMALLINT, + ClientTimeZone SMALLINT, + ClientEventTime BIGINT, + SilverlightVersion1 SMALLINT, + SilverlightVersion2 SMALLINT, + SilverlightVersion3 INTEGER, + SilverlightVersion4 SMALLINT, + PageCharset VARCHAR, + CodeVersion INTEGER, + IsLink SMALLINT, + IsDownload SMALLINT, + IsNotBounce SMALLINT, + FUniqID BIGINT, + OriginalURL VARCHAR, + HID INTEGER, + IsOldCounter SMALLINT, + IsEvent SMALLINT, + IsParameter SMALLINT, + DontCountHits SMALLINT, + WithHash SMALLINT, + HitColor VARCHAR, + LocalEventTime BIGINT, + Age SMALLINT, + Sex SMALLINT, + Income SMALLINT, + Interests SMALLINT, + Robotness SMALLINT, + RemoteIP INTEGER, + WindowName INTEGER, + OpenerName INTEGER, + HistoryLength SMALLINT, + BrowserLanguage VARCHAR, + BrowserCountry VARCHAR, + SocialNetwork VARCHAR, + SocialAction VARCHAR, + HTTPError SMALLINT, + SendTiming INTEGER, + DNSTiming INTEGER, + ConnectTiming INTEGER, + ResponseStartTiming INTEGER, + ResponseEndTiming INTEGER, + FetchTiming INTEGER, + SocialSourceNetworkID SMALLINT, + SocialSourcePage VARCHAR, + ParamPrice BIGINT, + ParamOrderID VARCHAR, + ParamCurrency VARCHAR, + ParamCurrencyID SMALLINT, + OpenstatServiceName VARCHAR, + OpenstatCampaignID VARCHAR, + OpenstatAdID VARCHAR, + OpenstatSourceID VARCHAR, + UTMSource VARCHAR, + UTMMedium VARCHAR, + UTMCampaign VARCHAR, + UTMContent VARCHAR, + UTMTerm VARCHAR, + FromTag VARCHAR, + HasGCLID SMALLINT, + RefererHash BIGINT, + URLHash BIGINT, + CLID INTEGER +); diff --git a/arc/queries.sql b/arc/queries.sql new file mode 100644 index 000000000..64b056b91 --- /dev/null +++ b/arc/queries.sql @@ -0,0 +1,47 @@ +-- ClickBench Queries for Arc +-- Original: https://github.com/ClickHouse/ClickBench +-- Adapted for Arc's DuckDB engine (table name: clickbench.hits) + +SELECT COUNT(*) FROM clickbench.hits; +SELECT COUNT(*) FROM clickbench.hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM clickbench.hits; +SELECT AVG(UserID) FROM clickbench.hits; +SELECT COUNT(DISTINCT UserID) FROM clickbench.hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM clickbench.hits; +SELECT MIN(EventDate), MAX(EventDate) FROM clickbench.hits; +SELECT AdvEngineID, COUNT(*) FROM clickbench.hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM clickbench.hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM clickbench.hits GROUP BY 
RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM clickbench.hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM clickbench.hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM clickbench.hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM clickbench.hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM clickbench.hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM clickbench.hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM clickbench.hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM clickbench.hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, minute(to_timestamp(EventTime)) AS m, SearchPhrase, COUNT(*) FROM clickbench.hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM clickbench.hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM clickbench.hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM clickbench.hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM clickbench.hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM clickbench.hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM clickbench.hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM clickbench.hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM clickbench.hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM clickbench.hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM clickbench.hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), 
SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM clickbench.hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM clickbench.hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM clickbench.hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM clickbench.hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM clickbench.hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM clickbench.hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM clickbench.hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= 15887 AND EventDate <= 15917 AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= 15887 AND EventDate <= 15917 AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= 15887 AND EventDate <= 15917 AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= 15887 AND EventDate <= 15917 AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 
62 AND EventDate >= 15887 AND EventDate <= 15917 AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= 15887 AND EventDate <= 15917 AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', to_timestamp(EventTime)) AS M, COUNT(*) AS PageViews FROM clickbench.hits WHERE CounterID = 62 AND EventDate >= 15900 AND EventDate <= 15901 AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', to_timestamp(EventTime)) ORDER BY DATE_TRUNC('minute', to_timestamp(EventTime)) LIMIT 10 OFFSET 1000; diff --git a/arc/results/c6a.4xlarge.json b/arc/results/c6a.4xlarge.json new file mode 100644 index 000000000..0075400bc --- /dev/null +++ b/arc/results/c6a.4xlarge.json @@ -0,0 +1,56 @@ +{ + "system": "Arc", + "date": "2025-10-16", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["time-series"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + [0.1771, 0.0489, 0.0306], + [0.0724, 0.0483, 0.0482], + [0.0792, 0.0728, 0.0732], + [0.0755, 0.0738, 0.0721], + [0.3664, 0.3545, 0.3401], + [0.5707, 0.5549, 0.5394], + [0.0960, 0.0587, 0.0592], + [0.1152, 0.0580, 0.0515], + [0.4881, 0.4437, 0.4640], + [0.6132, 0.5848, 0.5753], + [0.1467, 0.1459, 0.1451], + [0.1950, 0.1970, 0.1816], + [0.5801, 0.5568, 0.5706], + [0.9320, 0.9209, 0.8731], + [0.5911, 0.5856, 0.5947], + [0.4006, 0.4228, 0.4202], + [1.0630, 1.0253, 1.0050], + [0.7713, 0.7968, 0.7761], + [3.3695, 3.3090, 3.3101], + [0.0750, 0.0619, 0.0677], + [1.0353, 0.9025, 0.9012], + [0.8437, 0.8448, 0.8360], + [1.6622, 1.6610, 1.6706], + [0.5193, 0.5336, 0.5057], + [0.1918, 0.1977, 0.1965], + [0.2946, 0.2899, 0.2886], + [0.1373, 0.1341, 0.1675], + [1.0057, 0.9827, 1.0277], + [9.0295, 9.1074, 9.1164], + [0.0816, 0.0762, 0.0802], + [0.7967, 0.6669, 0.5835], + [0.6877, 0.6892, 0.6830], + [2.0258, 1.9906, 1.9201], + [2.2859, 2.3226, 2.3100], + [2.4650, 2.4027, 2.4308], + [0.5759, 0.7953, 0.6377], + [0.2055, 0.1400, 0.1667], + [0.1341, 0.1267, 0.1235], + [0.0912, 0.0903, 0.0901], + [0.2624, 0.2692, 0.2877], + [0.0640, 0.0624, 0.0609], + [0.0729, 0.0595, 0.0571], + [0.2177, 0.2407, 0.2197] + ] +} diff --git a/arc/results/m3_max.json b/arc/results/m3_max.json new file mode 100644 index 000000000..d4cab0132 --- /dev/null +++ b/arc/results/m3_max.json @@ -0,0 +1,56 @@ +{ + "system": "Arc", + "date": "2025-10-15", + "machine": "M3 Pro Max: 14 Cores, 36GB", + "cluster_size": 1, + "proprietary": "no", + "tuned": "no", + "tags": ["time-series"], + "load_time": 0, + "data_size": 14779976446, + "result": [ + [0.0268, 0.0238, 0.0219], + [0.0365, 0.0335, 0.0343], + [0.0559, 0.0478, 0.0457], + [0.0683, 0.0554, 0.0520], + [0.1714, 0.1749, 0.1791], + [0.2990, 0.2767, 0.2812], + [0.0430, 0.0396, 0.0365], + [0.0393, 0.0393, 0.0340], + [0.2293, 0.2175, 0.2173], + [0.3047, 0.3122, 0.3065], + [0.0989, 0.0912, 0.0978], + [0.1058, 0.1036, 0.1046], + [0.2837, 0.2848, 0.3015], + [0.4204, 0.4309, 0.4429], + [0.3361, 0.3174, 0.3112], + [0.2122, 0.2008, 0.2017], + [0.5108, 0.5335, 0.5193], + [0.4918, 0.4830, 0.4810], + [1.6410, 1.6183, 1.6337], + [0.0685, 0.0600, 0.0661], + [0.8242, 0.5983, 0.5881], + [0.4983, 0.4987, 0.4998], + [1.0744, 1.0914, 1.1013], + 
[0.3505, 0.3212, 0.3235], + [0.1281, 0.1211, 0.1214], + [0.1995, 0.1768, 0.1786], + [0.0870, 0.0851, 0.0956], + [0.6845, 0.6735, 0.6748], + [7.8790, 8.0568, 8.0540], + [0.0667, 0.0678, 0.0613], + [0.3354, 0.3023, 0.3058], + [0.3767, 0.3575, 0.3388], + [1.2356, 0.9725, 1.0842], + [1.0723, 1.0657, 1.0676], + [1.0918, 1.1146, 1.1810], + [0.2625, 0.2573, 0.2693], + [0.0767, 0.0745, 0.0825], + [0.0864, 0.0874, 0.0780], + [0.0515, 0.0521, 0.0534], + [0.1459, 0.1314, 0.1357], + [0.0520, 0.0382, 0.0409], + [0.0413, 0.0402, 0.0383], + [0.1223, 0.1212, 0.1203] + ] +} diff --git a/arc/run.sh b/arc/run.sh new file mode 100755 index 000000000..8b8ac56ff --- /dev/null +++ b/arc/run.sh @@ -0,0 +1,95 @@ +#!/bin/bash +# Arc ClickBench Benchmark Runner +# Queries Arc via HTTP API using Apache Arrow columnar format + +TRIES=3 +DATABASE="${DATABASE:-clickbench}" +TABLE="${TABLE:-hits}" +ARC_URL="${ARC_URL:-http://localhost:8000}" +ARC_API_KEY="${ARC_API_KEY:-benchmark-test-key}" + +# Check if Arc is running +echo "Checking if Arc is running at $ARC_URL..." >&2 +if ! curl -s -f "$ARC_URL/health" > /dev/null 2>&1; then + echo "Error: Arc is not running at $ARC_URL" >&2 + echo "Please start Arc first or set ARC_URL environment variable" >&2 + exit 1 +fi + +echo "Arc is running. Querying table: $DATABASE.$TABLE (Apache Arrow)" >&2 +echo "Using API key: ${ARC_API_KEY:0:20}..." >&2 + +python3 << EOF +import requests +import time +import sys + +try: + import pyarrow as pa +except ImportError: + print("Error: pyarrow is required for Arrow format", file=sys.stderr) + print("Install with: pip install pyarrow", file=sys.stderr) + sys.exit(1) + +ARC_URL = "$ARC_URL" +API_KEY = "$ARC_API_KEY" +DATABASE = "$DATABASE" +TABLE = "$TABLE" + +# Headers for API requests +headers = { + "x-api-key": API_KEY, + "Content-Type": "application/json" +} + +# Read queries +with open('queries.sql') as f: + content = f.read() + +# Remove comment lines +lines = [line for line in content.split('\n') if not line.strip().startswith('--')] +clean_content = '\n'.join(lines) + +# Split by semicolons and filter empties +queries = [] +for query in clean_content.split(';'): + query = query.strip() + if query: + queries.append(query) + +print(f"Running {len(queries)} queries via Apache Arrow API...", file=sys.stderr) + +# Run each query 3 times +for i, query_sql in enumerate(queries, 1): + for run in range(3): + try: + start = time.perf_counter() + + response = requests.post( + f"{ARC_URL}/query/arrow", + headers=headers, + json={"sql": query_sql}, + timeout=300 + ) + + if response.status_code == 200: + # Parse Arrow IPC stream to ensure data is received + reader = pa.ipc.open_stream(response.content) + arrow_table = reader.read_all() + elapsed = time.perf_counter() - start + print(f"{elapsed:.4f}") + else: + print("null") + if run == 0: + print(f"Query {i} failed: {response.status_code} - {response.text[:200]}", file=sys.stderr) + except requests.exceptions.Timeout: + print("null") + if run == 0: + print(f"Query {i} timed out", file=sys.stderr) + except Exception as e: + print("null") + if run == 0: + print(f"Query {i} error: {e}", file=sys.stderr) + +print("Benchmark complete!", file=sys.stderr) +EOF diff --git a/arc/template.json b/arc/template.json new file mode 100644 index 000000000..526bafb74 --- /dev/null +++ b/arc/template.json @@ -0,0 +1,6 @@ +{ + "system": "Arc", + "proprietary": "no", + "tuned": "no", + "tags": ["Python", "time-series", "DuckDB", "Parquet", "columnar"] +}