8 changes: 8 additions & 0 deletions openml/_api/__init__.py
@@ -0,0 +1,8 @@
from openml._api.runtime.core import APIContext


def set_api_version(version: str, *, strict: bool = False) -> None:
    api_context.set_version(version=version, strict=strict)


api_context = APIContext()
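
A minimal usage sketch of the new entry point (APIContext.set_version is defined in runtime/core.py, which is not part of this section; the "v1"/"v2" labels are assumed from the config module below):

# Usage sketch, not part of the diff.
from openml._api import api_context, set_api_version

set_api_version("v2")                # route subsequent calls through the v2 backend
set_api_version("v1", strict=True)   # strict mode; exact semantics are defined by APIContext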
62 changes: 62 additions & 0 deletions openml/_api/config.py
@@ -0,0 +1,62 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Literal

DelayMethod = Literal["human", "robot"]


@dataclass
class APIConfig:
    server: str
    base_url: str
    key: str
    timeout: int = 10  # seconds


@dataclass
class APISettings:
    v1: APIConfig
    v2: APIConfig


@dataclass
class ConnectionConfig:
    retries: int = 3
    delay_method: DelayMethod = "human"
    delay_time: int = 1  # seconds

    def __post_init__(self) -> None:
        if self.delay_method not in ("human", "robot"):
            raise ValueError(f"delay_method must be 'human' or 'robot', got {self.delay_method}")


@dataclass
class CacheConfig:
    dir: str = "~/.openml/cache"
    ttl: int = 60 * 60 * 24 * 7  # one week


@dataclass
class Settings:
    api: APISettings
    connection: ConnectionConfig
    cache: CacheConfig


settings = Settings(
    api=APISettings(
        v1=APIConfig(
            server="https://www.openml.org/",
            base_url="api/v1/xml/",
            key="...",
        ),
        v2=APIConfig(
            server="http://127.0.0.1:8001/",
            base_url="",
            key="...",
        ),
    ),
    connection=ConnectionConfig(),
    cache=CacheConfig(),
)
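
The settings object is a module-level singleton of plain dataclasses, so reading or overriding it at runtime could look like the sketch below (whether in-place mutation is the intended override mechanism is an assumption):

# Sketch, assuming settings is meant to be read and mutated in place.
from openml._api.config import ConnectionConfig, settings

print(settings.api.v1.server)   # "https://www.openml.org/"
settings.connection = ConnectionConfig(retries=5, delay_method="robot", delay_time=2)
settings.cache.ttl = 60 * 60    # shrink the cache TTL to one hour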
3 changes: 3 additions & 0 deletions openml/_api/http/__init__.py
@@ -0,0 +1,3 @@
from openml._api.http.client import HTTPClient

__all__ = ["HTTPClient"]
151 changes: 151 additions & 0 deletions openml/_api/http/client.py
@@ -0,0 +1,151 @@
from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING, Any
from urllib.parse import urlencode, urljoin, urlparse

import requests
from requests import Response

from openml.__version__ import __version__
from openml._api.config import settings

if TYPE_CHECKING:
    from openml._api.config import APIConfig


class CacheMixin:
    @property
    def dir(self) -> str:
        return settings.cache.dir

    @property
    def ttl(self) -> int:
        return settings.cache.ttl

    def _get_cache_dir(self, url: str, params: dict[str, Any]) -> Path:
        parsed_url = urlparse(url)
        netloc_parts = parsed_url.netloc.split(".")[::-1]  # reverse domain
        path_parts = parsed_url.path.strip("/").split("/")

        # remove api_key and serialize params if any
        filtered_params = {k: v for k, v in params.items() if k != "api_key"}
        params_part = [urlencode(filtered_params)] if filtered_params else []

        return Path(self.dir).joinpath(*netloc_parts, *path_parts, *params_part)

    def _get_cache_response(self, cache_dir: Path) -> Response:  # noqa: ARG002
        return Response()

    def _set_cache_response(self, cache_dir: Path, response: Response) -> None:  # noqa: ARG002
        return None


class HTTPClient(CacheMixin):
    def __init__(self, config: APIConfig) -> None:
        self.config = config
        self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"}

    @property
    def server(self) -> str:
        return self.config.server

    @property
    def base_url(self) -> str:
        return self.config.base_url

    @property
    def key(self) -> str:
        return self.config.key

    @property
    def timeout(self) -> int:
        return self.config.timeout

    def request(
        self,
        method: str,
        path: str,
        *,
        use_cache: bool = False,
        use_api_key: bool = False,
        **request_kwargs: Any,
    ) -> Response:
        url = urljoin(self.server, urljoin(self.base_url, path))

        params = request_kwargs.pop("params", {})
        params = params.copy()
        if use_api_key:
            params["api_key"] = self.key

        headers = request_kwargs.pop("headers", {})
        headers = headers.copy()
        headers.update(self.headers)

        timeout = request_kwargs.pop("timeout", self.timeout)
        cache_dir = self._get_cache_dir(url, params)

        if use_cache:
            try:
                return self._get_cache_response(cache_dir)
            # TODO: handle ttl expired error
            except Exception:
                raise

        response = requests.request(
            method=method,
            url=url,
            params=params,
            headers=headers,
            timeout=timeout,
            **request_kwargs,
        )

        if use_cache:
            self._set_cache_response(cache_dir, response)

        return response

    def get(
        self,
        path: str,
        *,
        use_cache: bool = False,
        use_api_key: bool = False,
        **request_kwargs: Any,
    ) -> Response:
        # TODO: remove override when cache is implemented
        use_cache = False
        return self.request(
            method="GET",
            path=path,
            use_cache=use_cache,
            use_api_key=use_api_key,
            **request_kwargs,
        )

    def post(
        self,
        path: str,
        **request_kwargs: Any,
    ) -> Response:
        return self.request(
            method="POST",
            path=path,
            use_cache=False,
            use_api_key=True,
            **request_kwargs,
        )

    def delete(
        self,
        path: str,
        **request_kwargs: Any,
    ) -> Response:
        return self.request(
            method="DELETE",
            path=path,
            use_cache=False,
            use_api_key=True,
            **request_kwargs,
        )
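
A usage sketch of the client (the endpoint path is illustrative; GET requests currently bypass the cache because get() forces use_cache to False):

# Sketch: resolve a v1 URL, send the request, note where a cached copy would live.
from openml._api.config import settings
from openml._api.http import HTTPClient

client = HTTPClient(settings.api.v1)
# GET https://www.openml.org/api/v1/xml/data/61 with the openml-python user-agent and default timeout
response = client.get("data/61")
print(response.status_code)
# _get_cache_dir would map that URL to ~/.openml/cache/org/openml/www/api/v1/xml/data/61
# post() and delete() always append the configured api_key to the query parameters.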
Empty file added openml/_api/http/utils.py
Empty file.
5 changes: 5 additions & 0 deletions openml/_api/resources/__init__.py
@@ -0,0 +1,5 @@
from openml._api.resources.datasets import DatasetsV1, DatasetsV2
from openml._api.resources.studies import StudiesV1, StudiesV2
from openml._api.resources.tasks import TasksV1, TasksV2

__all__ = ["DatasetsV1", "DatasetsV2", "StudiesV1", "StudiesV2", "TasksV1", "TasksV2"]
45 changes: 45 additions & 0 deletions openml/_api/resources/base.py
@@ -0,0 +1,45 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    import pandas as pd
    from requests import Response

    from openml._api.http import HTTPClient
    from openml.datasets.dataset import OpenMLDataset
    from openml.tasks.task import OpenMLTask


class ResourceAPI:
    def __init__(self, http: HTTPClient):
        self._http = http


class DatasetsAPI(ResourceAPI, ABC):
    @abstractmethod
    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]: ...


class TasksAPI(ResourceAPI, ABC):
    @abstractmethod
    def get(
        self,
        task_id: int,
        *,
        return_response: bool = False,
    ) -> OpenMLTask | tuple[OpenMLTask, Response]: ...


class StudiesAPI(ResourceAPI, ABC):
    @abstractmethod
    def list(  # noqa: PLR0913
        self,
        limit: int | None = None,
        offset: int | None = None,
        status: str | None = None,
        main_entity_type: str | None = None,
        uploader: list[int] | None = None,
        benchmark_suite: int | None = None,
    ) -> pd.DataFrame: ...
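
The concrete resources below implement these contracts and talk to the server through the HTTPClient injected via ResourceAPI; a wiring sketch (the actual version dispatch presumably lives in the runtime APIContext, which is not shown here):

# Wiring sketch, assuming callers pair each resource with a version-specific HTTPClient.
from openml._api.config import settings
from openml._api.http import HTTPClient
from openml._api.resources import DatasetsV1, StudiesV1

http_v1 = HTTPClient(settings.api.v1)
datasets = DatasetsV1(http_v1)   # DatasetsAPI.get is still a stub in this PR
studies = StudiesV1(http_v1)     # StudiesAPI.list is implemented against the v1 XML endpoint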
20 changes: 20 additions & 0 deletions openml/_api/resources/datasets.py
@@ -0,0 +1,20 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from openml._api.resources.base import DatasetsAPI

if TYPE_CHECKING:
    from requests import Response

    from openml.datasets.dataset import OpenMLDataset


class DatasetsV1(DatasetsAPI):
    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
        raise NotImplementedError


class DatasetsV2(DatasetsAPI):
    def get(self, dataset_id: int) -> OpenMLDataset | tuple[OpenMLDataset, Response]:
        raise NotImplementedError
80 changes: 80 additions & 0 deletions openml/_api/resources/studies.py
@@ -0,0 +1,80 @@
from __future__ import annotations

import pandas as pd
import xmltodict

from openml._api.resources.base import StudiesAPI


class StudiesV1(StudiesAPI):
    def list(  # noqa: PLR0913
        self,
        limit: int | None = None,
        offset: int | None = None,
        status: str | None = None,
        main_entity_type: str | None = None,
        uploader: list[int] | None = None,
        benchmark_suite: int | None = None,
    ) -> pd.DataFrame:
        api_call = "study/list"

        if limit is not None:
            api_call += f"/limit/{limit}"
        if offset is not None:
            api_call += f"/offset/{offset}"
        if status is not None:
            api_call += f"/status/{status}"
        if main_entity_type is not None:
            api_call += f"/main_entity_type/{main_entity_type}"
        if uploader is not None:
            api_call += f"/uploader/{','.join(str(u) for u in uploader)}"
        if benchmark_suite is not None:
            api_call += f"/benchmark_suite/{benchmark_suite}"

        response = self._http.get(api_call)
        xml_string = response.text

        # Parse XML and convert to DataFrame
        study_dict = xmltodict.parse(xml_string, force_list=("oml:study",))

        assert isinstance(study_dict["oml:study_list"]["oml:study"], list), type(
            study_dict["oml:study_list"],
        )
        assert study_dict["oml:study_list"]["@xmlns:oml"] == "http://openml.org/openml", study_dict[
            "oml:study_list"
        ]["@xmlns:oml"]

        studies = {}
        for study_ in study_dict["oml:study_list"]["oml:study"]:
            expected_fields = {
                "oml:id": ("id", int),
                "oml:alias": ("alias", str),
                "oml:main_entity_type": ("main_entity_type", str),
                "oml:benchmark_suite": ("benchmark_suite", int),
                "oml:name": ("name", str),
                "oml:status": ("status", str),
                "oml:creation_date": ("creation_date", str),
                "oml:creator": ("creator", int),
            }
            study_id = int(study_["oml:id"])
            current_study = {}
            for oml_field_name, (real_field_name, cast_fn) in expected_fields.items():
                if oml_field_name in study_:
                    current_study[real_field_name] = cast_fn(study_[oml_field_name])
            current_study["id"] = int(current_study["id"])
            studies[study_id] = current_study

        return pd.DataFrame.from_dict(studies, orient="index")


class StudiesV2(StudiesAPI):
    def list(  # noqa: PLR0913
        self,
        limit: int | None = None,
        offset: int | None = None,
        status: str | None = None,
        main_entity_type: str | None = None,
        uploader: list[int] | None = None,
        benchmark_suite: int | None = None,
    ) -> pd.DataFrame:
        raise NotImplementedError("V2 API implementation is not yet available")
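
A hedged example of the v1 listing (filter values are illustrative): the call below builds the path study/list/status/all/main_entity_type/task and returns a DataFrame indexed by study id, with columns taken from the expected_fields mapping (only fields present in the XML appear):

# Example call; filter values and selected columns are illustrative.
from openml._api.config import settings
from openml._api.http import HTTPClient
from openml._api.resources import StudiesV1

studies_api = StudiesV1(HTTPClient(settings.api.v1))
df = studies_api.list(status="all", main_entity_type="task")
# Possible columns: id, alias, main_entity_type, benchmark_suite, name, status, creation_date, creator
print(df[["name", "status", "creation_date"]].head())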