forked from zilliztech/VectorDBBench
Add enVector #1
Merged
Commits (331 commits, changes from all commits)
Add quantization option for pgvector with support for halfvec
lucagiac81 1678077
Add cli support for running benchmark with custom dataset in pgvector…
Sheharyar570 cba7043
feat: upgrade pgvecto.rs sdk to v0.2.2
cutecutecat d323821
Randomly pick start idx of test dataset in concurrency search.
Sheharyar570 7f32779
Added optimzation for Opensearch
navneet1v 467b00a
record the test time; add version / note info for milvus and zillizcl…
alwayslove2013 a64d326
fix bug: date to datetime
alwayslove2013 dc73c2a
update leaderboard data
alwayslove2013 b9a0ce5
fix leaderboard data: zillizcloud version
alwayslove2013 37b0c7c
Fixed custom_case key error in parameters dict in CLI command.
Sheharyar570 c5aa67e
Refactored command options for consistency.
Sheharyar570 61e2808
Updated readme, added custom case related command options information.
Sheharyar570 b531680
update the instruction for adding custom_case support in new CLI impl…
Sheharyar570 32c1a53
add key for plotly_chart
alwayslove2013 523726f
add key for plotly_chart
alwayslove2013 6939b91
fix pinecone client
alwayslove2013 d209659
Support for pgdiskann client (#388)
wahajali 6a05c30
increase timeout
alwayslove2013 1d8b218
Binary Quantization Support for pgvector HNSW Algorithm (#389)
Sheharyar570 7619acb
fix weaviate client bug
alwayslove2013 36316d4
Fix code
acanadil 6cb5898
remove older zillizcloud test results from leaderboard
alwayslove2013 df18476
fixed pgvectorivvflat cli reranking key error bug
Sheharyar570 653bd39
set default value of quantized_fetch_limit to 100 in case of ivfflat,…
Sheharyar570 f498e71
update comment
Sheharyar570 183f47b
Add rate runner
XuanYang-cn 983ea4c
fix conc_latency_p99 calculation; add conc_latency_avg metric; conc_t…
alwayslove2013 0230184
Added AlloyDB client
Sheharyar570 a641244
Remove query that set storage to plain.
Sheharyar570 6a1477a
Add default value for pre_reordering_num_neighbors in cli options.
Sheharyar570 b0553bc
fix: Donot refresh load
XuanYang-cn c6a2e79
fix: invalid value for --max-num-levels when using CLI.
Sheharyar570 2250e62
Add Milvus auth support through user_name and password fields (#416)
teynar fa14f04
support alibaba cloud elasticsearch (#418)
xingshaomin 0f9d9c8
enhance: refine read write cases
XuanYang-cn 185541a
add aliyun opensearch client
8a1e18e
bug fix: cost time should be removed from the results of the serial_s…
alwayslove2013 0b524d9
fix: Opensearch requirements
XuanYang-cn 91d1eed
add aliyun Opensearch requirements
9aa58f0
update readme
alwayslove2013 4481615
Removed the Filter Path from Search, so we can get the full response
ba6bd9b
add support to provide custom port in pgvector
shaharuk-yb 834c440
show_default=True
shaharuk-yb 378841f
updated the opensearch to use column id instead of _id
370106f
Modified the code to use internal column _id instead of id.
c0be2c7
added HNSW params to index creation and search
Luka958Pixion 2a0c43b
fixed types
Luka958Pixion 8627b8f
fixed query
Luka958Pixion c002756
fixed ef_runtime
Luka958Pixion f9dec08
enhance: Refine the coding style and enable lint-action
XuanYang-cn 4940bf7
fix bug
alwayslove2013 6814767
fix: Unable to run vebbench and cli
XuanYang-cn 6dfcadf
enhance: Unify optimize and remove ready_to_load
XuanYang-cn b1c43fd
add mongodb client
zhuwenxing ac02b14
add mongodb client in readme
zhuwenxing 94f4c6e
add some risk warnings for custom dataset
alwayslove2013 7a00ca0
Bump grpcio from 1.53.0 to 1.53.2 in /install
dependabot[bot] 7811d73
add mongodb config
zhuwenxing 2359747
Opensearch interal configuration parameters (#463)
Xavierantony1982 ec439e4
ui control num of concurrencies
Caroline-an777 e4d987c
Update README.md
xiaofan-luan 9c3446f
environs version should <14.1.0
alwayslove2013 26b483c
Support GPU_BRUTE_FORCE index for Milvus (#476)
Rachit-Chaudhary11 040041b
Add table quantization type
lucagiac81 38a9a32
Support MariaDB database (#375)
HugoWenTD f7d9210
Add TiDB backend (#484)
breezewish d2f102e
CLI fix for GPU index (#485)
Rachit-Chaudhary11 4d0cedd
remove duplicated code
yuyuankang 809024f
feat: initial commit
MansorY23 d6364b7
Add vespa integration
nuvotex-tk 8432f6f
remove redundant empty_field config check for qdrant and tidb
alwayslove2013 0a96299
reformat all
alwayslove2013 6be5c2b
fix cli crush
alwayslove2013 6d3f4a4
downgrade streamlit version
pauvez b979e79
add more milvus index types: hnsw sq/pq/prq; ivf rabitq
alwayslove2013 3f9c498
add more milvus index types: ivf_pq
alwayslove2013 f7f551e
Add HNSW support for Clickhouse client (#500)
MansorY23 42af186
fix bugs when use custom_dataset without groundtruth file
alwayslove2013 9a912f6
fix: prevent the frontend from crashing on invalid indexes in results
s-h-a-d-o-w 7720bd4
fix ruff warnings
s-h-a-d-o-w de9aa90
Fix formatting
s-h-a-d-o-w 0122126
Add lancedb
s-h-a-d-o-w 75bbdfb
Add --task-label option for cli (#517)
LoveYou3000 67f0f2e
Add qdrant cli
s-h-a-d-o-w 2b966c3
Update README.md
yuyuankang 4efbe83
Fixing Bugs in Benchmarking ClickHouse with vectordbbench (#523)
yuyuankang 9f5ea99
Add --concurrency-timeout option to avoid long time waiting (#521)
LoveYou3000 aa13197
add alias: VDBBench
alwayslove2013 7a1dc5e
LanceDB: Improve serial latency by only selecting id
s-h-a-d-o-w a898095
add --num-shards option for milvus performance test case (#526)
LoveYou3000 bf03df3
Add a batch cli to support the batch execution of multiple cases. (#530)
LoveYou3000 6c37626
Fixing bugs in aws opensearch client and added fp16 support (#529)
navneet1v 564b75e
Bugfix: add num_shards option to MilvusHNSW
LoveYou3000 ed6a291
BugFix: An error occurs when the password option is not passed.
LoveYou3000 f0c88d1
Add support for Qdrant local setup (#533)
ZebinRen 1943931
Fix python import in MemoryDB client
ChristophKoerner 0bda245
upgrade ruff / black, reformat all
alwayslove2013 2884761
change lancedb vector type to float32
ZebinRen af8cc1a
add num_shards to MilvusConfig.to_dict()
ZebinRen 4d6d8df
expose lancedb index parameters to the cli interface (#537)
ZebinRen 40da33c
Add parameters of aws opensearch, support hnsw engine options, suppor…
norrishuang 34b3b25
Add OceanBase Database Support to VectorDBBench (#540)
wyfanxiao ef25859
VectorDBBench 1.0 (#543)
alwayslove2013 22fe3ed
generate leaderboard_v2 data
alwayslove2013 9e8003d
update some docs
alwayslove2013 9edb001
fix bug: set default num_shards to 1
alwayslove2013 2447d7f
update elastic_cloud results
alwayslove2013 8b464f5
Fix: Correct typos in README.md (#550)
triplechecker-com 2682e9a
fix bugs: remove None from download_files
alwayslove2013 d70c751
upgrade aliyun opensearch client (#552)
hust-xing adcef10
add ivf rabitq for command #553 (#554)
MageChiu 00ad2ec
Fixed the issue where the welcome page image could not be loaded. (#556)
zhuwenxing aa9ff4b
Fix return to result page error (#557)
zhuwenxing 099d404
Fix run tidb will return error (#559)
JaySon-Huang e5965fe
feat: Add OSS OpenSearch client support (#562)
akhilpathivada 2a4d0ef
feat: test client for aws s3 vectors
alwayslove2013 d342beb
upgrade black/ruff and fix lint issues
alwayslove2013 73d1560
s3vectors standard test results
alwayslove2013 dc3efcd
add int filter (#545)
Caroline-an777 caa0979
fix(oss-opensearch): Resolve streaming crashes and improve code relia…
akhilpathivada d6d1693
feat: Add P95 latency metrics alongside P99 system-wide (#573)
akhilpathivada 3d135a4
update the max level from 3 to 10 for zillizcloud
alwayslove2013 21dd5d0
Add S3vector Engine for AWS OpenSearch, fixed some bugs of AWS OpenSe…
norrishuang ea3af3e
Fix missing assets (#575)
emmanuel-ferdman 1a4023a
Add nbits parameter to IVF_PQ index and adapt new filter logic (#576)
wyfanxiao f253a41
Added support for Product Quantization in pg_diskann (#579)
wahajali 4e01e4a
update leaderboard: add s3vectors results; add streaming results
alwayslove2013 df52ec8
update leaderboard data: use 90p search stage results as streaming pe…
alwayslove2013 44fd93e
fix(rate_runner): ensure thread safety for pgvector in concurrent ins…
alwayslove2013 0a9f309
add env parameters of dataset download from AWS S3 Or Aliyun OSS (#583)
norrishuang c12fe70
optimize:milvus add replica-number parameter (#588)
liyunqiu666 b6c3088
Added Alibaba Cloud Hologres support to VectorDBBench. (#591)
xiaolanlianhua 80dcfb7
fix: fix the arguments missing for `LanceDB::insert_embeddings` (#592)
TheR1sing3un 800d7d0
feat: add EnVector support to VectorDBBench
782e5fe
feat: add EnVector support to VectorDBBench
0c413da
add ivf flat
euphoria0-0 0e9ce3e
add ivf flat
euphoria0-0 53ec88d
fix readme
euphoria0-0 94b0b7d
fix readme
euphoria0-0 ce3a636
fix readme
euphoria0-0 cc11163
fix readme
euphoria0-0 e619faf
add num_per_batch env var
euphoria0-0 4487d98
add num_per_batch env var
euphoria0-0 3120c24
support hyper params
euphoria0-0 bcdddf3
support hyper params
euphoria0-0 0819eea
WIP: fix pickle error
euphoria0-0 6fb69bf
WIP: fix pickle error
euphoria0-0 dec9d75
fix missing ivf-flat configs
euphoria0-0 3bf28db
fix missing ivf-flat configs
euphoria0-0 7f5f432
create_index when no index
euphoria0-0 c4041be
create_index when no index
euphoria0-0 92cbf7d
fix batch size
euphoria0-0 4bee9d6
fix batch size
euphoria0-0 20f49bd
Update README.md
euphoria0-0 2a2e921
Update README.md
euphoria0-0 e9744bf
Update .env.example
euphoria0-0 faf964d
Update .env.example
euphoria0-0 5cb4f95
add mm mode
euphoria0-0 2e83651
add mm mode
euphoria0-0 79091d7
Merge branch 'fix/config' of github.com:CryptoLabInc/VectorDBBench in…
euphoria0-0 7c2a91e
Merge branch 'fix/config' of github.com:CryptoLabInc/VectorDBBench in…
euphoria0-0 4fca725
cli add supported eval mode list
euphoria0-0 f1b21a7
cli add supported eval mode list
euphoria0-0 8c1a319
rm indextype
euphoria0-0 9c745e2
rm indextype
euphoria0-0 b65af1a
fix choices type
euphoria0-0 d88c988
fix choices type
euphoria0-0 9afe622
fix readme && fix NUM_PER_BATCH
euphoria0-0 5d68a00
fix readme && fix NUM_PER_BATCH
euphoria0-0 601fe0e
add trained_centroids
euphoria0-0 2be382d
add trained_centroids
euphoria0-0 348d630
update
euphoria0-0 1d9371b
update
euphoria0-0 505f6e2
revert
euphoria0-0 34bdc41
revert
euphoria0-0 756d997
add centroids path
euphoria0-0 ba9ad94
add centroids path
euphoria0-0 989d277
fix readme
euphoria0-0 8f767e3
fix readme
euphoria0-0 e7b0a7a
add example result in readme
euphoria0-0 86a9f65
add example result in readme
euphoria0-0 8411937
WIP: fix lock error
euphoria0-0 ef85613
WIP: fix lock error
euphoria0-0 417a748
fix batch size
euphoria0-0 49cd0b4
fix batch size
euphoria0-0 edd6747
fix num per batch env var
euphoria0-0 ab75253
fix num per batch env var
euphoria0-0 99803c4
fix config
euphoria0-0 a154a06
fix config
euphoria0-0 7df4c0d
support virtual cluster tree
euphoria0-0 7c1bc7f
fix config
euphoria0-0 b7bb8ca
add benchmark script
euphoria0-0 e157710
fix log
euphoria0-0 698d9c2
fix benchmark script
euphoria0-0 a70f71c
fix numpy sequence stack
euphoria0-0 e153442
add type config in benchmark
euphoria0-0 3e85e98
add logs
euphoria0-0 337b0ca
rm copy state
euphoria0-0 aa6a5b1
fix eliminated
euphoria0-0 2e275fa
fix eliminated
euphoria0-0 2d37643
fix np take
euphoria0-0 a7cbd4f
fix np take
euphoria0-0 76d7e64
fix insert metadata
euphoria0-0 3ae5f92
fix benchmark script
euphoria0-0 0cabd0c
fix overwrite nprobe
euphoria0-0 4615ba1
fix
euphoria0-0 b962650
fix get tree
euphoria0-0 7c7a857
fix centroid file path
euphoria0-0 9e84cd3
add debug log for centroid list
euphoria0-0 8b37bf0
fix to delete the index instead of all indexes when drop_old
euphoria0-0 134fa04
fix drop_index
euphoria0-0 7f9d4aa
update vct as dfs version
euphoria0-0 4feb093
fix
euphoria0-0 66fa44b
rename centroids as centroids_path
euphoria0-0 228e91c
rename centroids as centroids_path
euphoria0-0 6d19492
add envector config file
euphoria0-0 37bfe16
fix
euphoria0-0 4f006b9
add neighbors file
euphoria0-0 4cd059a
fix
euphoria0-0 4c0870f
fix
euphoria0-0 1eacc4a
rename
euphoria0-0 5ff4dd8
fix
euphoria0-0 7814d0a
fix comment
euphoria0-0 44cd1ec
rm unused commented
euphoria0-0 440ff55
fix
euphoria0-0 1935c65
update
euphoria0-0 62db666
fix
euphoria0-0 b84ccaa
update vct
euphoria0-0 8cb2cfb
fix comments
euphoria0-0 810c875
fix vct opt
euphoria0-0 5f035fb
fix readme
euphoria0-0 8c3b06b
fix
euphoria0-0 c36621e
fix
euphoria0-0 9f71c13
fix preprare datset
euphoria0-0 6b5ae17
fix preprare datset
euphoria0-0 1aee4b4
fix
euphoria0-0 b9bd9fd
fix log
euphoria0-0 71da08a
rm req
euphoria0-0 ace9835
Merge branch 'fix/config' into feat/add-vct-pubmed-centroids
euphoria0-0 0b7a3f8
add centroid download
euphoria0-0 36e92ef
fix readme
euphoria0-0 0cf66ad
add comments
euphoria0-0 880cb49
activate comments
euphoria0-0 8ffa0da
add envector readme link
euphoria0-0 4eaa3f1
add bloomberg
euphoria0-0 b494caa
add options
euphoria0-0 8444440
fix readme
euphoria0-0 76ee6df
update readme
euphoria0-0 4884064
Merge pull request #3 from CryptoLabInc/feat/add-vct-pubmed-centroids
euphoria0-0 16d417b
Update vectordb_bench/backend/clients/envector/config.py
euphoria0-0

Files changed

File: README_ENVECTOR.md (new file, 132 lines)

# enVector with ANN (GAS) in VectorDBBench

This guide demonstrates how to use enVector with an ANN index in VectorDBBench.

Basic usage of enVector with VectorDBBench follows the standard procedure for [VectorDBBench](https://github.com/zilliztech/VectorDBBench).

## Structure

```bash
.
├── centroids
│   └── embeddinggemma-300m
│       ├── centroids.npy              # centroids file for ANN
│       └── tree_info.pkl              # tree metadata for ANN
├── dataset
│   └── pubmed768d400k                 # VectorDB ANN benchmark dataset
│       ├── neighbors.parquet
│       ├── test.parquet
│       └── train.parquet
├── README_ENVECTOR.md
└── scripts
    ├── run_benchmark.sh               # benchmark script
    ├── envector_pubmed_config.yml     # benchmark config file
    └── prepare_dataset.py             # download and prepare ground-truth neighbors for the dataset
```

## Prerequisites

### Install Python Dependencies

```bash
# 1. Create your environment
python -m venv .venv
source .venv/bin/activate

# 2. Install VectorDBBench
pip install -e .

# 3. Install es2
pip install es2==1.2.0a4
```
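
You can optionally confirm the pinned client version installed correctly:

```bash
pip show es2  # should report Version: 1.2.0a4
```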

### Prepare dataset

Prepare the artifacts for the ANN benchmark with `scripts/prepare_dataset.py`, which will:

- download the dataset from HuggingFace
- prepare the ground-truth neighbors
- download the centroids and tree metadata for the GAS index corresponding to the embedding model

For the ANN benchmark, we provide two datasets via HuggingFace:

- PUBMED768D400K: [cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m)
- BLOOMBERG768D368K: [cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m)
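
As a quick sanity check, you can load a dataset and inspect its layout first (a minimal sketch; it assumes the `emb` embedding column that `scripts/prepare_dataset.py` reads):

```python
from datasets import load_dataset
import numpy as np

# Expect "train" and "test" splits.
ds = load_dataset("cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m")
print(ds)

# Each row carries a 768-dimensional embedding in the "emb" column.
emb = np.asarray(ds["train"][0]["emb"], dtype="float32")
print(emb.shape)  # expected: (768,)
```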

We also provide centroids and tree metadata for the embedding model used in the ANN benchmark:

- GAS Centroids: [cryptolab-playground/gas-centroids](https://huggingface.co/datasets/cryptolab-playground/gas-centroids)
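
Once downloaded, the artifacts can be inspected directly (a sketch; the expected centroid count matches the `nlist: 32768` used in the configs below, which is an assumption about how these centroids were built):

```python
import pickle
import numpy as np

# Centroids are a plain NumPy array of shape (num_centroids, dim).
centroids = np.load("./centroids/embeddinggemma-300m/centroids.npy")
print(centroids.shape)  # expected: (32768, 768)

# Tree metadata for the VCT index; its structure is specific to GAS.
with open("./centroids/embeddinggemma-300m/tree_info.pkl", "rb") as f:
    tree_info = pickle.load(f)
print(type(tree_info))
```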

To prepare a dataset, run, for example:

```bash
# Prepare dataset
python ./scripts/prepare_dataset.py \
    -d cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m \
    -e embeddinggemma-300m
```

Then you can find the following generated files:

```bash
.
├── centroids
│   └── embeddinggemma-300m
│       ├── centroids.npy
│       └── tree_info.pkl
└── dataset
    └── pubmed768d400k
        ├── neighbors.parquet
        ├── test.parquet
        └── train.parquet
```

### Prepare enVector Server

To run an enVector server with ANN, refer to the [enVector Deployment repository](https://github.com/CryptoLabInc/envector-deployment). For example, you can start the server with the following commands:

```bash
# Start enVector server
git clone https://github.com/CryptoLabInc/envector-deployment
cd envector-deployment/docker-compose
./start_envector.sh
```

We provide four enVector Docker images:

- `cryptolabinc/es2e:v1.2.0-alpha.4`
- `cryptolabinc/es2b:v1.2.0-alpha.4`
- `cryptolabinc/es2o:v1.2.0-alpha.4`
- `cryptolabinc/es2c:v1.2.0-alpha.4`
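
For example, to pull one of the images ahead of time:

```bash
docker pull cryptolabinc/es2e:v1.2.0-alpha.4
```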

### Set Environment Variables

```bash
# Set environment variables
export DATASET_LOCAL_DIR="./dataset"
export NUM_PER_BATCH=4096
```

## Run Benchmark

Refer to `./scripts/run_benchmark.sh` or `./scripts/envector_benchmark_config.yml` for benchmarking enVector with ANN (VCT), or use the following command:

```bash
export NUM_PER_BATCH=500000  # set to the database size for efficiency with IVF_FLAT
python -m vectordb_bench.cli.vectordbbench envectorivfflat \
    --uri "localhost:50050" \
    --eval-mode mm \
    --case-type PerformanceCustomDataset \
    --db-label "PUBMED768D400K-IVF" \
    --custom-case-name PUBMED768D400K \
    --custom-dataset-name PUBMED768D400K \
    --custom-dataset-dir "" \
    --custom-dataset-size 400335 \
    --custom-dataset-dim 768 \
    --custom-dataset-file-count 1 \
    --custom-dataset-with-gt \
    --skip-custom-dataset-use-shuffled \
    --train-centroids True \
    --is-vct True \
    --centroids-path "./centroids/embeddinggemma-300m/centroids.npy" \
    --vct-path "./centroids/embeddinggemma-300m/tree_info.pkl" \
    --nlist 32768 \
    --nprobe 6
```

File: benchmark config for the BLOOMBERG768D368K dataset (new file, 42 lines)

```yaml
envectorflat:
  uri: localhost:50050
  eval_mode: mm
  case_type: PerformanceCustomDataset
  db_label: BLOOMBERG768D368K-FLAT
  custom_case_name: BLOOMBERG768D368K
  custom_case_description: BLOOMBERG768D368K benchmark (768D, 368K vectors)
  custom_dataset_name: BLOOMBERG768D368K
  custom_dataset_dir:
  custom_dataset_size: 368816
  custom_dataset_dim: 768
  custom_dataset_file_count: 1
  custom_dataset_use_shuffled: false
  custom_dataset_with_gt: true
  k: 10
  drop_old: true
  load: true

envectorivfflat:
  uri: localhost:50050
  eval_mode: mm
  case_type: PerformanceCustomDataset
  db_label: BLOOMBERG768D368K-IVF
  custom_case_name: BLOOMBERG768D368K
  custom_case_description: BLOOMBERG768D368K benchmark (768D, 368K vectors)
  custom_dataset_name: BLOOMBERG768D368K
  custom_dataset_dir:
  custom_dataset_size: 368816
  custom_dataset_dim: 768
  custom_dataset_file_count: 1
  custom_dataset_use_shuffled: false
  custom_dataset_with_gt: true
  k: 10
  nlist: 32768
  nprobe: 6
  train_centroids: true
  is_vct: true
  centroids_path: centroids/embeddinggemma-300m/centroids.npy
  vct_path: centroids/embeddinggemma-300m/tree_info.pkl
  drop_old: true
  load: true
```

File: scripts/envector_pubmed_config.yml (new file, 42 lines)

```yaml
envectorflat:
  uri: localhost:50050
  eval_mode: mm
  case_type: PerformanceCustomDataset
  db_label: PUBMED768D400K-FLAT
  custom_case_name: PUBMED768D400K
  custom_case_description: PUBMED768D400K benchmark (768D, 400K vectors)
  custom_dataset_name: PUBMED768D400K
  custom_dataset_dir:
  custom_dataset_size: 400335
  custom_dataset_dim: 768
  custom_dataset_file_count: 1
  custom_dataset_use_shuffled: false
  custom_dataset_with_gt: true
  k: 10
  drop_old: true
  load: true

envectorivfflat:
  uri: localhost:50050
  eval_mode: mm
  case_type: PerformanceCustomDataset
  db_label: PUBMED768D400K-IVF
  custom_case_name: PUBMED768D400K
  custom_case_description: PUBMED768D400K benchmark (768D, 400K vectors)
  custom_dataset_name: PUBMED768D400K
  custom_dataset_dir:
  custom_dataset_size: 400335
  custom_dataset_dim: 768
  custom_dataset_file_count: 1
  custom_dataset_use_shuffled: false
  custom_dataset_with_gt: true
  k: 10
  nlist: 32768
  nprobe: 6
  train_centroids: true
  is_vct: true
  centroids_path: centroids/embeddinggemma-300m/centroids.npy
  vct_path: centroids/embeddinggemma-300m/tree_info.pkl
  drop_old: true
  load: true
```
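
To sanity-check a config before a run, you can confirm it parses and echo the index parameters (a minimal sketch using PyYAML):

```python
import yaml  # pip install pyyaml

with open("scripts/envector_pubmed_config.yml") as f:
    cfg = yaml.safe_load(f)

ivf = cfg["envectorivfflat"]
print(ivf["nlist"], ivf["nprobe"])  # expected: 32768 6
```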

File: scripts/prepare_dataset.py (new file, 114 lines)

```python
import argparse
import os

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import wget
from datasets import load_dataset

import faiss


def get_args():
    parser = argparse.ArgumentParser(
        description="Prepare dataset and ground truth neighbors for benchmarking."
    )
    parser.add_argument(
        "-d", "--dataset-name",
        type=str,
        default="cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m",
        help="Huggingface dataset name to download.",
        choices=[
            "cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m",
            "cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m",
        ],
    )
    parser.add_argument(
        "--dataset-dir",
        type=str,
        default="./dataset/pubmed768d400k",
        help="Dataset directory to save the dataset and neighbors.",
    )
    parser.add_argument(
        "-e", "--embedding-model",
        type=str,
        default="embeddinggemma-300m",
        help="Embedding model name to download centroids for.",
    )
    parser.add_argument(
        "--centroids-dir",
        type=str,
        default="./centroids",
        help="Directory to save the centroids and tree info.",
    )
    return parser.parse_args()


def download_dataset(
    dataset_name: str,
    output_dir: str = "./dataset/pubmed768d400k",
) -> None:
    """Download dataset from Huggingface and save as Parquet files."""
    # load dataset
    ds = load_dataset(dataset_name)
    train = ds["train"].to_pandas()
    test = ds["test"].to_pandas()

    # write to parquet
    train_table = pa.Table.from_pandas(train)
    pq.write_table(train_table, f"{output_dir}/train.parquet")

    test_table = pa.Table.from_pandas(test)
    pq.write_table(test_table, f"{output_dir}/test.parquet")


def prepare_neighbors(
    data_dir: str = "./dataset/pubmed768d400k",
) -> None:
    """Prepare ground truth neighbors using brute-force flat search and save as Parquet."""
    # load dataset
    train = pd.read_parquet(f"{data_dir}/train.parquet")
    test = pd.read_parquet(f"{data_dir}/test.parquet")

    train = np.stack(train["emb"].to_list()).astype("float32")
    test = np.stack(test["emb"].to_list()).astype("float32")
    dim = train.shape[1]

    # flat search
    index = faiss.IndexFlatIP(dim)
    index.add(train)
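
    # NOTE: IndexFlatIP ranks by raw inner product; this matches cosine
    # similarity only if the embeddings are unit-normalized (assumed here,
    # not enforced).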

    k = len(test)
    distances, indices = index.search(test, k)
    print(distances.shape, indices.shape)

    # save flat search result as neighbors
    df = pd.DataFrame({
        "id": np.arange(len(indices)),
        "neighbors_id": indices.tolist(),
    })

    table = pa.Table.from_pandas(df)
    pq.write_table(table, f"{data_dir}/neighbors.parquet")


def download_centroids(embedding_model: str, dataset_dir: str) -> None:
    """Download pre-computed centroids and tree info for GAS VCT index."""
    if embedding_model != "embeddinggemma-300m":
        raise ValueError(f"Centroids for {embedding_model} currently not available.")

    # https://huggingface.co/datasets/cryptolab-playground/gas-centroids
    dataset_link = f"https://huggingface.co/datasets/cryptolab-playground/gas-centroids/resolve/main/{embedding_model}"

    # download
    os.makedirs(os.path.join(dataset_dir, embedding_model), exist_ok=True)
    wget.download(f"{dataset_link}/centroids.npy", out=os.path.join(dataset_dir, embedding_model, "centroids.npy"))
    wget.download(f"{dataset_link}/tree_info.pkl", out=os.path.join(dataset_dir, embedding_model, "tree_info.pkl"))


if __name__ == "__main__":
    args = get_args()
    os.makedirs(args.dataset_dir, exist_ok=True)

    download_dataset(args.dataset_name, args.dataset_dir)
    prepare_neighbors(args.dataset_dir)
    download_centroids(args.embedding_model, args.centroids_dir)
```