diff --git a/README.md b/README.md
index a6784fabd..3298b2536 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
-# enVector with ANN (GAS) in VectorDBBench
+# enVector in VectorDBBench
 
-The guide on how to use enVector with ANN index in VectorDBBench is available in [README_ENVECTOR.md](README_ENVECTOR.md).
+**Quick start:** The guide on how to use **enVector** in VectorDBBench is available here:
+
+👉 [README_ENVECTOR.md](README_ENVECTOR.md)
 
 The following is the original content of the README in VectorDBBench:
 
diff --git a/README_ENVECTOR.md b/README_ENVECTOR.md
index 68bd785de..1ea5c2a7c 100644
--- a/README_ENVECTOR.md
+++ b/README_ENVECTOR.md
@@ -1,6 +1,6 @@
-# enVector with ANN (GAS) in VectorDBBench
+# enVector in VectorDBBench
 
-This guide demonstrates how to use enVector with an ANN index in VectorDBBench.
+This guide demonstrates how to use enVector in VectorDBBench.
 
 Basic usage of enVector with VectorDBBench follows the standard procedure for [VectorDBBench](https://github.com/zilliztech/VectorDBBench).
@@ -18,7 +18,7 @@ Basic usage of enVector with VectorDBBench follows the standard procedure for [V
 │   ├── test.npy
 │   └── train.pkl
 ├── README_ENVECTOR.md
-├── scripts
+└── scripts
     ├── run_benchmark.sh            # benchmark script
     ├── envector_pubmed_config.yml  # benchmark config file
     └── prepare_dataset.py          # download and prepare ground truth neighbors for dataset
@@ -35,8 +35,8 @@
 source .venv/bin/activate
 
 # 2. Install VectorDBBench
 pip install -e .
 
-# 3. Install es2
-pip install es2==1.2.0a4
+# 3. Install pyenvector
+pip install pyenvector==1.2.0a5
 ```
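
Before preparing any data, it can be worth sanity-checking the installation against a running enVector server. The snippet below is a minimal sketch, not part of the repository: it uses only `pyenvector` calls that appear in the client diff further down (`ev.init`, `ev.get_index_list`, `ev.disconnect`), mirrors the `localhost:50050` address and `mm` eval mode from the README examples, and assumes the key arguments may be omitted for a local setup.

```python
# Minimal connectivity check (a sketch, not a repo script). The calls mirror
# those in vectordb_bench/backend/clients/envector/envector.py; key_path and
# key_id are omitted on the assumption that they are optional locally.
import pyenvector as ev

ev.init(address="localhost:50050", eval_mode="mm")
print(ev.get_index_list())  # names of the indexes currently on the server
ev.disconnect()
```
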
 
 ### Prepare dataset
 
 Prepare the following artifacts for the ANN benchmark with `scripts/prepare_dataset.py`:
 - download centroids and tree metadata for the GAS index corresponding to the embedding model
 
 For the ANN benchmark, we provide two datasets via HuggingFace:
-- PUBMED768D400K: [cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m)
-- BLOOMBERG768D368K: [cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m)
+- `PUBMED768D400K`: [cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/pubmed-arxiv-abstract-embedding-gemma-300m)
+- `BLOOMBERG768D368K`: [cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m](https://huggingface.co/datasets/cryptolab-playground/Bloomberg-Financial-News-embedding-gemma-300m)
 
 Also, we provide centroids and tree metadata for the corresponding embedding model used in the ANN benchmark:
 - GAS Centroids: [cryptolab-playground/gas-centroids](https://huggingface.co/datasets/cryptolab-playground/gas-centroids)
 
@@ -63,7 +63,7 @@ python ./scripts/prepare_dataset.py \
   -e embeddinggemma-300m
 
-Then, you can find the following generated files:
+The script generates the following files:
 
 ```bash
 .
@@ -91,25 +91,38 @@ cd envector-deployment/docker-compose
 ```
 
 We provide four enVector Docker Images:
-- `cryptolabinc/es2e:v1.2.0-alpha.4`
-- `cryptolabinc/es2b:v1.2.0-alpha.4`
-- `cryptolabinc/es2o:v1.2.0-alpha.4`
-- `cryptolabinc/es2c:v1.2.0-alpha.4`
+- `cryptolabinc/es2e:v1.2.0-alpha.5`
+- `cryptolabinc/es2b:v1.2.0-alpha.5`
+- `cryptolabinc/es2o:v1.2.0-alpha.5`
+- `cryptolabinc/es2c:v1.2.0-alpha.5`
 
 ### Set Environment Variables
 
 ```bash
 # Set environment variables
-export DATASET_LOCAL_DIR="./dataset"
-export NUM_PER_BATCH=4096
+export DATASET_LOCAL_DIR="./dataset"  # dataset directory (default: /tmp/vectordb_bench/dataset)
+export NUM_PER_BATCH=4096             # default batch size for enVector
 ```
 
-## Run Benchmark
+## Run Our ANN Benchmark
 
-Refer to `./scripts/run_benchmark.sh` or `./scripts/envector_benchmark_config.yml` for benchmarks with enVector with ANN (VCT), or use the following command:
+We provide an enVector-customized ANN index, called "GAS", designed to perform efficient IVF-FLAT-based ANN search over the encrypted index.
+We evaluate enVector on the two benchmark datasets we provide:
+- `PUBMED768D400K`
+- `BLOOMBERG768D368K`
+
+Run the provided shell script (`./scripts/run_benchmark.sh`) as follows:
 
 ```bash
-export NUM_PER_BATCH=500000 # set to the database size for efficiency with IVF_FLAT
+./scripts/run_benchmark.sh --type flat  # FLAT
+./scripts/run_benchmark.sh --type ivf   # IVF-FLAT with the enVector-customized ANN (GAS)
+```
+
+For more details on the ANN (GAS) benchmarks, refer to `run_benchmark.sh` or `envector_{benchmark}_config.yml` in the `scripts` directory, or use the following command directly:
+
+```bash
+export NUM_PER_BATCH=500000 # set to the database size when using IVF_FLAT
 python -m vectordb_bench.cli.vectordbbench envectorivfflat \
     --uri "localhost:50050" \
     --eval-mode mm \
@@ -123,10 +136,68 @@ python -m vectordb_bench.cli.vectordbbench envectorivfflat \
     --custom-dataset-file-count 1 \
     --custom-dataset-with-gt \
     --skip-custom-dataset-use-shuffled \
+    --k 10 \
     --train-centroids True \
     --is-vct True \
     --centroids-path "./centroids/embeddinggemma-300m/centroids.npy" \
     --vct-path "./centroids/embeddinggemma-300m/tree_info.pkl" \
     --nlist 32768 \
     --nprobe 6
-```
\ No newline at end of file
+```
+
+Note that **`NUM_PER_BATCH` must currently be set to the database size** when using an IVF-based ANN index with enVector.
+We will support an adjustable `NUM_PER_BATCH` for ANN soon.
+
+## Run VectorDBBench Benchmark
+
+Run the following commands to benchmark enVector with VectorDBBench's built-in cases.
+
+```bash
+# flat
+python -m vectordb_bench.cli.vectordbbench envectorflat \
+    --uri "localhost:50050" \
+    --case-type "Performance1536D500K" \
+    --db-label "Performance1536D500K-FLAT"
+
+# ivf: IVF-FLAT with random centroids
+export NUM_PER_BATCH=500000 # set to the database size when using IVF_FLAT
+python -m vectordb_bench.cli.vectordbbench envectorivfflat \
+    --uri "localhost:50050" \
+    --case-type "Performance1536D500K" \
+    --db-label "Performance1536D500K-IVF-FLAT" \
+    --nlist 250 \
+    --nprobe 6
+
+# ivf-trained: IVF-FLAT with trained centroids via k-means (centroids built by sklearn, etc.)
+export NUM_PER_BATCH=500000 # set to the database size when using IVF_FLAT
+python -m vectordb_bench.cli.vectordbbench envectorivfflat \
+    --uri "localhost:50050" \
+    --case-type "Performance1536D500K" \
+    --db-label "Performance1536D500K-IVF-FLAT" \
+    --train-centroids True \
+    --centroids-path "./centroids/kmeans_centroids.npy" \
+    --nlist 250 \
+    --nprobe 6
+```
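
The `--centroids-path` option in the ivf-trained run expects a NumPy array of coarse centroids. The README leaves the construction of `kmeans_centroids.npy` to the user ("built by sklearn, etc."), so the following is one possible recipe rather than a project script; the file name and `nlist` value simply match the command above.

```python
# One way to build the centroids file consumed by --centroids-path.
# This is a sketch: how you load the train split depends on your dataset
# format (the prepared dataset stores train.pkl / test.npy).
import numpy as np
from sklearn.cluster import KMeans

def build_centroids(train_vectors: np.ndarray, nlist: int, out_path: str) -> None:
    """Cluster the train split into `nlist` coarse centroids and save them."""
    kmeans = KMeans(n_clusters=nlist, n_init=10, random_state=0).fit(train_vectors)
    np.save(out_path, kmeans.cluster_centers_.astype(np.float32))

# e.g. build_centroids(train_vectors, nlist=250, out_path="./centroids/kmeans_centroids.npy")
```
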
+
+### CLI Options
+
+**enVector Types for VectorDBBench**
+- `envectorflat`: FLAT as the index type for enVector
+- `envectorivfflat`: IVF_FLAT as the index type for enVector
+
+**Common Options for enVector**
+- `--uri`: enVector server URI
+- `--eval-mode`: FHE evaluation mode on the server. Use `mm` for enhanced performance.
+
+**ANN Options for enVector**
+- `--nlist`: Number of coarse clusters for IVF_FLAT
+- `--nprobe`: Number of clusters to scan during search for IVF_FLAT
+- `--train-centroids`: Whether to use trained centroids for IVF_FLAT
+- `--centroids-path`: Path to the trained centroids
+- `--is-vct`: Whether to use the VCT approach for IVF_GAS
+- `--vct-path`: Path to the trained VCT metadata for IVF_GAS
+
+**Benchmark Options** follow the conventions of VectorDBBench;
+see [VectorDBBench Options](https://github.com/zilliztech/VectorDBBench?tab=readme-ov-file#custom-dataset-for-performance-case) for details.
\ No newline at end of file
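
The `--custom-dataset-with-gt` flag used in the ANN benchmark relies on ground-truth neighbors, which `prepare_dataset.py` downloads ready-made. To make the notion concrete, here is a brute-force sketch of how such ground truth is defined: the exact top-k neighbors of each test vector. This is an illustration only, assuming L2 distance; the benchmark's actual metric and tooling may differ.

```python
# Illustration only: exact top-k neighbors ("ground truth") by brute force.
# prepare_dataset.py downloads precomputed ground truth; this shows the idea.
import numpy as np

def exact_topk(train: np.ndarray, test: np.ndarray, k: int = 10) -> np.ndarray:
    """Indices of the k nearest train vectors for each test vector (L2 distance)."""
    # Squared L2 distances via the expansion |a - b|^2 = |a|^2 - 2ab + |b|^2.
    d2 = (test**2).sum(1)[:, None] - 2.0 * (test @ train.T) + (train**2).sum(1)[None, :]
    return np.argsort(d2, axis=1)[:, :k]
```
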
diff --git a/vectordb_bench/backend/clients/envector/envector.py b/vectordb_bench/backend/clients/envector/envector.py
index 0e34f44bb..e546f6390 100644
--- a/vectordb_bench/backend/clients/envector/envector.py
+++ b/vectordb_bench/backend/clients/envector/envector.py
@@ -7,8 +7,8 @@
 from pathlib import Path
 from typing import Any
 
-import es2
 import numpy as np
+import pyenvector as ev
 
 from vectordb_bench.backend.filter import Filter, FilterOp
 
@@ -51,12 +51,12 @@ def __init__(
         self._vector_index_name = "vector_idx"
         self._scalar_id_index_name = "id_sort_idx"
         self._scalar_labels_index_name = "labels_idx"
-        self.col: es2.Index | None = None
+        self.col: ev.Index | None = None
 
         self.is_vct: bool = False
         self.vct_params: dict[str, Any] = {}
 
-        es2.init(
+        ev.init(
             address=self.db_config.get("uri"),
             key_path=self.db_config.get("key_path"),
             key_id=self.db_config.get("key_id"),
@@ -64,8 +64,8 @@
         )
         if drop_old:
             log.info(f"{self.name} client drop_old index: {self.collection_name}")
-            if self.collection_name in es2.get_index_list():
-                es2.drop_index(self.collection_name)
+            if self.collection_name in ev.get_index_list():
+                ev.drop_index(self.collection_name)
 
         # Create the collection
         log.info(f"{self.name} create index: {self.collection_name}")
@@ -73,10 +73,10 @@
         index_kwargs = dict(kwargs)
         self._ensure_index(dim, index_kwargs)
 
-        es2.disconnect()
+        ev.disconnect()
 
     def _ensure_index(self, dim: int, index_kwargs: dict[str, Any]):
-        if self.collection_name in es2.get_index_list():
+        if self.collection_name in ev.get_index_list():
             log.info(f"{self.name} index {self.collection_name} already exists, skip creating")
             self.is_vct = self.case_config.index_param().get("is_vct", False)
             log.debug(f"IS_VCT: {self.is_vct}")
@@ -94,7 +94,7 @@ def _create_index(self, dim: int, index_kwargs: dict[str, Any]):
         if index_type == "IVF_FLAT":
             self._adjust_batch_size()
 
-        es2.create_index(
+        ev.create_index(
             index_name=self.collection_name,
             dim=dim,
             key_path=self.db_config.get("key_path"),
@@ -146,16 +146,16 @@ def init(self):
         >>> self.insert_embeddings()
         >>> self.search_embedding()
         """
-        es2.init(
+        ev.init(
             address=self.db_config.get("uri"),
             key_path=self.db_config.get("key_path"),
             key_id=self.db_config.get("key_id"),
             eval_mode=self.case_config.eval_mode,
         )
         try:
-            self.col = es2.Index(self.collection_name)
+            self.col = ev.Index(self.collection_name)
             if self.is_vct:
-                log.debug(f"VCT: {self.col.index_config.index_param.index_params['virtual_cluster']}")
+                log.debug(f"VCT: {self.col.index_config.index_param.index_params.get('virtual_cluster')}")
                 is_vct = self.case_config.index_param().get("is_vct", False)
                 assert self.is_vct == is_vct, "is_vct mismatch"
                 vct_path = self.case_config.index_param().get("vct_path", None)
@@ -163,7 +163,7 @@ def init(self):
             yield
         finally:
             self.col = None
-            es2.disconnect()
+            ev.disconnect()
 
     def create_index(self):
         pass
@@ -194,8 +194,6 @@ def insert_embeddings(
         assert self.col is not None
         assert len(embeddings) == len(metadata)
 
-        log.debug(f"IS_VCT: {self.is_vct}")
-
        insert_count = 0
        try:
            for batch_start_offset in range(0, len(embeddings), self.batch_size):
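
For context on the `NUM_PER_BATCH` note in the README: `insert_embeddings` above slices the data by `self.batch_size`, which is why an IVF-based index currently needs the whole database in a single batch. The exact wiring from the environment variable to `self.batch_size` is not shown in this diff, so the sketch below illustrates the pattern rather than the client's actual code.

```python
# Illustration of the batching pattern used by insert_embeddings above.
# Reading NUM_PER_BATCH from the environment is an assumption based on the
# README; the client's real plumbing is not part of this diff.
import os

batch_size = int(os.environ.get("NUM_PER_BATCH", "4096"))

def iter_batches(embeddings: list, metadata: list, batch_size: int):
    """Yield aligned (embeddings, metadata) slices of at most batch_size items."""
    for start in range(0, len(embeddings), batch_size):
        yield embeddings[start : start + batch_size], metadata[start : start + batch_size]
```
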