Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 10 additions & 14 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,26 +14,22 @@ jobs:
compare-php:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/checkout@v6
- uses: actions/setup-python@v6
with:
python-version: 3.x

# A naive `docker compose up` would first build the `python-api` container and then
# start all services, which kickstarts Elastic Search and building indices.
# But since those two steps are independent, we can parallelize them to save time.
- run: |
{ docker compose build python-api; docker compose up -d python-api; } &
docker compose up -d --wait php-api
- run: docker container ls && docker image ls
- run: docker exec python-api python -m pytest -xv -m "php"
# https://github.com/docker/compose/issues/10596
- run: docker compose --profile "python" --profile "php" up --detach --wait --remove-orphans || exit $(docker compose ps -q | xargs docker inspect -f '{{.State.ExitCode}}' | grep -v '^0' | wc -l)
- run: docker container ls
- run: docker exec openml-python-rest-api python -m pytest -v -m "php"
python:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v4
- uses: actions/checkout@v6
- uses: actions/setup-python@v6
with:
python-version: 3.x
- run: docker compose up -d --wait database python-api
- run: docker compose --profile "python" up --detach --wait --remove-orphans || exit $(docker compose ps -q | xargs docker inspect -f '{{.State.ExitCode}}' | grep -v '^0' | wc -l)
- run: docker container ls
- run: docker exec python-api python -m pytest -xv -m "not php"
- run: docker exec openml-python-rest-api python -m pytest -v -m "not php"
73 changes: 51 additions & 22 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,33 @@
services:
database:
image: "openml/test-database"
profiles: ["python", "php", "all"]
image: "openml/test-database:20240105"
container_name: "openml-test-database"
environment:
MYSQL_ROOT_PASSWORD: ok
ports:
- "3306:3306"
healthcheck:
test: ["CMD", "mysqladmin", "ping", "-h", "localhost"]
start_period: 30s
start_interval: 1s
timeout: 3s
interval: 5s
retries: 10

database-setup:
profiles: ["python", "php", "all"]
image: mysql
container_name: "openml-test-database-setup"
volumes:
- ./docker/database/update.sh:/database-update.sh
command: /bin/sh -c "/database-update.sh"
depends_on:
database:
condition: service_healthy

docs:
profiles: ["all"]
build:
context: .
dockerfile: docker/docs/Dockerfile
Expand All @@ -16,10 +36,35 @@ services:
volumes:
- .:/docs

elasticsearch:
profiles: ["php", "all"]
image: docker.elastic.co/elasticsearch/elasticsearch:6.8.23
container_name: "openml-elasticsearch"
platform: "linux/amd64"
ports:
- "9200:9200" # also known as /es (nginx)
- "9300:9300"
env_file: docker/elasticsearch/.env
healthcheck:
test: curl 127.0.0.1:9200/_cluster/health | grep -e "green"
start_period: 30s
start_interval: 5s
timeout: 3s
interval: 10s
deploy:
resources:
limits:
cpus: '1'
memory: 1G
reservations:
cpus: '0.2'
memory: 250M

php-api:
image: "openml/php-rest-api"
build:
context: ./docker/php/
profiles: ["php", "all"]
image: "openml/php-rest-api:v1.2.2"
container_name: "openml-php-rest-api"
env_file: docker/php/.env
ports:
- "8002:80"
depends_on:
Expand All @@ -35,7 +80,8 @@ services:
interval: 1m

python-api:
container_name: "python-api"
profiles: ["python", "all"]
container_name: "openml-python-rest-api"
build:
context: .
dockerfile: docker/python/Dockerfile
Expand All @@ -45,20 +91,3 @@ services:
- .:/python-api
depends_on:
- database

elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:8.10.4
container_name: "elasticsearch"
ports:
- "9200:9200"
- "9300:9300"
environment:
- ELASTIC_PASSWORD=default
- discovery.type=single-node
- xpack.security.enabled=false
healthcheck:
test: curl 127.0.0.1:9200/_cluster/health | grep -e "green"
start_period: 30s
start_interval: 5s
timeout: 3s
interval: 1m
31 changes: 31 additions & 0 deletions docker/database/update.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash
# One-shot fixup script executed against the test database once it is healthy
# (invoked by the database-setup service in docker-compose.yaml).
# Abort on the first failing statement instead of silently continuing,
# so a broken migration is visible in the container exit code.
set -e

# Change the filepath of openml.file
# from "https://www.openml.org/data/download/1666876/phpFsFYVN"
# to "http://minio:9000/datasets/0000/0001/phpFsFYVN"
mysql -hdatabase -uroot -pok -e 'UPDATE openml.file SET filepath = CONCAT("http://minio:9000/datasets/0000/", LPAD(id, 4, "0"), "/", SUBSTRING_INDEX(filepath, "/", -1)) WHERE extension="arff";'

# Update openml.expdb.dataset with the same url
mysql -hdatabase -uroot -pok -e 'UPDATE openml_expdb.dataset DS, openml.file FL SET DS.url = FL.filepath WHERE DS.did = FL.id;'

# Create the data_feature_description TABLE. TODO: can we make sure this table exists already?
mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'CREATE TABLE IF NOT EXISTS `data_feature_description` (
`did` int unsigned NOT NULL,
`index` int unsigned NOT NULL,
`uploader` mediumint unsigned NOT NULL,
`date` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
`description_type` enum("plain", "ontology") NOT NULL,
`value` varchar(256) NOT NULL,
KEY `did` (`did`,`index`),
CONSTRAINT `data_feature_description_ibfk_1` FOREIGN KEY (`did`, `index`) REFERENCES `data_feature` (`did`, `index`) ON DELETE CASCADE ON UPDATE CASCADE
)'

# SET dataset 1 to active (used in unittests java)
mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'INSERT IGNORE INTO dataset_status VALUES (1, "active", "2024-01-01 00:00:00", 1)'
mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'DELETE FROM dataset_status WHERE did = 2 AND status = "deactivated";'

# Temporary fix in case the database missed the kaggle table. The PHP Rest API expects the table to be there, while indexing.
mysql -hdatabase -uroot -pok -Dopenml_expdb -e 'CREATE TABLE IF NOT EXISTS `kaggle` (`dataset_id` int(11) DEFAULT NULL, `kaggle_link` varchar(500) DEFAULT NULL)'
3 changes: 3 additions & 0 deletions docker/elasticsearch/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ELASTIC_PASSWORD=default
discovery.type=single-node
xpack.security.enabled=false
14 changes: 14 additions & 0 deletions docker/php/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
API_KEY=AD000000000000000000000000000000
BASE_URL=http://php-api:80/
MINIO_URL=http://minio:9000/
DB_HOST_OPENML=database:3306
DB_HOST_EXPDB=database:3306
DB_USER_OPENML=root
DB_PASS_OPENML=ok
DB_USER_EXPDB_WRITE=root
DB_PASS_EXPDB_WRITE=ok
DB_USER_EXPDB_READ=root
DB_PASS_EXPDB_READ=ok
ES_URL=elasticsearch:9200
ES_PASSWORD=default
INDEX_ES_DURING_STARTUP=false
14 changes: 12 additions & 2 deletions src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

TomlTable = dict[str, typing.Any]

CONFIG_PATH = Path(__file__).parent / "config.toml"


def _apply_defaults_to_siblings(configuration: TomlTable) -> TomlTable:
defaults = configuration["defaults"]
Expand All @@ -19,9 +21,17 @@ def _apply_defaults_to_siblings(configuration: TomlTable) -> TomlTable:


@functools.cache
def load_database_configuration(file: Path = Path(__file__).parent / "config.toml") -> TomlTable:
configuration = tomllib.loads(file.read_text())
def _load_configuration(file: Path) -> TomlTable:
return typing.cast(TomlTable, tomllib.loads(file.read_text()))


def load_routing_configuration(file: Path = CONFIG_PATH) -> TomlTable:
return typing.cast(TomlTable, _load_configuration(file)["routing"])


@functools.cache
def load_database_configuration(file: Path = CONFIG_PATH) -> TomlTable:
configuration = _load_configuration(file)
database_configuration = _apply_defaults_to_siblings(
configuration["databases"],
)
Expand Down
4 changes: 4 additions & 0 deletions src/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,7 @@ database="openml_expdb"

[databases.openml]
database="openml"

[routing]
minio_url="http://minio:9000/"
server_url="http://php-api:80/"
11 changes: 7 additions & 4 deletions src/core/formatting.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import html

from config import load_routing_configuration
from schemas.datasets.openml import DatasetFileFormat
from sqlalchemy.engine import Row

Expand All @@ -24,14 +25,16 @@ def _format_parquet_url(dataset: Row) -> str | None:
if dataset.format.lower() != DatasetFileFormat.ARFF:
return None

minio_base_url = "https://openml1.win.tue.nl"
return f"{minio_base_url}/dataset{dataset.did}/dataset_{dataset.did}.pq"
minio_base_url = load_routing_configuration()["minio_url"]
ten_thousands_prefix = f"{dataset.did // 10_000:04d}"
padded_id = f"{dataset.did:04d}"
return f"{minio_base_url}datasets/{ten_thousands_prefix}/{padded_id}/dataset_{dataset.did}.pq"


def _format_dataset_url(dataset: Row) -> str:
base_url = "https://test.openml.org"
base_url = load_routing_configuration()["server_url"]
filename = f"{html.escape(dataset.name)}.{dataset.format.lower()}"
return f"{base_url}/data/v1/download/{dataset.file_id}/{filename}"
return f"{base_url}data/v1/download/{dataset.file_id}/{filename}"


def _safe_unquote(text: str | None) -> str | None:
Expand Down
1 change: 0 additions & 1 deletion src/routers/openml/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,6 @@ def get_dataset(
row_id_attribute=row_id_attribute,
url=dataset_url,
parquet_url=parquet_url,
minio_url=parquet_url,
file_id=dataset.file_id,
format=dataset.format.lower(),
paper_url=dataset.paper_url or None,
Expand Down
8 changes: 1 addition & 7 deletions src/routers/openml/flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,7 @@ def get_flow(flow_id: int, expdb: Annotated[Connection, Depends(expdb_connection
tags = get_flow_tags(flow_id, expdb)

flow_rows = get_flow_subflows(flow_id, expdb)
subflows = [
{
"identifier": flow.identifier,
"flow": get_flow(flow_id=flow.child_id, expdb=expdb),
}
for flow in flow_rows
]
subflows = [get_flow(flow_id=flow.child_id, expdb=expdb) for flow in flow_rows]

return Flow(
id_=flow.id,
Expand Down
4 changes: 3 additions & 1 deletion src/routers/openml/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
from typing import Annotated, Any

import config
import xmltodict
from database.datasets import get_dataset
from database.tasks import (
Expand Down Expand Up @@ -145,7 +146,8 @@ def _fill_json_template(
# I believe that the operations below are always part of string output, so
# we don't need to be careful to avoid losing typedness
template = template.replace("[TASK:id]", str(task.task_id))
return template.replace("[CONSTANT:base_url]", "https://test.openml.org/")
server_url = config.load_routing_configuration()["server_url"]
return template.replace("[CONSTANT:base_url]", server_url)


@router.get("/{task_id}")
Expand Down
2 changes: 1 addition & 1 deletion src/schemas/datasets/mldcat_ap.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ class DataService(JsonLDObject):


class JsonLDGraph(BaseModel):
context: str | dict[str, HttpUrl] = Field(default_factory=dict, serialization_alias="@context")
context: str | dict[str, HttpUrl] = Field(default_factory=dict, serialization_alias="@context") # type: ignore
graph: list[
Distribution | DataService | Dataset | Quality | Feature | Agent | MD5Checksum
] = Field(default_factory=list, serialization_alias="@graph")
Expand Down
6 changes: 0 additions & 6 deletions src/schemas/datasets/openml.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,6 @@ class DatasetMetadata(BaseModel):
"description": "URL of the parquet dataset data file.",
},
)
minio_url: HttpUrl | None = Field(
json_schema_extra={
"example": "http://openml1.win.tue.nl/dataset2/dataset_2.pq",
"description": "Deprecated, I think.",
},
)
file_id: int = Field(json_schema_extra={"example": 1})
format_: DatasetFileFormat = Field(
json_schema_extra={"example": DatasetFileFormat.ARFF},
Expand Down
4 changes: 3 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from routers.dependencies import expdb_connection, userdb_connection
from sqlalchemy import Connection, Engine

PHP_API_URL = "http://openml-php-rest-api:80/api/v1/json"


class ApiKey(StrEnum):
ADMIN: str = "AD000000000000000000000000000000"
Expand Down Expand Up @@ -42,7 +44,7 @@ def user_test() -> Connection:

@pytest.fixture()
def php_api() -> httpx.Client:
with httpx.Client(base_url="http://server-api-php-api-1:80/api/v1/json") as client:
with httpx.Client(base_url=PHP_API_URL) as client:
yield client


Expand Down
4 changes: 2 additions & 2 deletions tests/constants.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
PRIVATE_DATASET_ID = {130}
IN_PREPARATION_ID = {1, 33}
DEACTIVATED_DATASETS = {2, 131}
IN_PREPARATION_ID = {33}
DEACTIVATED_DATASETS = {131}
DATASETS = set(range(1, 132))

NUMBER_OF_DATASETS = len(DATASETS)
Expand Down
Loading