diff --git a/openml/extensions/base/__init__.py b/openml/extensions/base/__init__.py new file mode 100644 index 000000000..d85c0b268 --- /dev/null +++ b/openml/extensions/base/__init__.py @@ -0,0 +1,13 @@ +# License: BSD 3-Clause + +"""Base classes for OpenML extensions.""" + +from openml.extensions.base._connector import OpenMLAPIConnector +from openml.extensions.base._executor import ModelExecutor +from openml.extensions.base._serializer import ModelSerializer + +__all__ = [ + "ModelExecutor", + "ModelSerializer", + "OpenMLAPIConnector", +] diff --git a/openml/extensions/base/_connector.py b/openml/extensions/base/_connector.py new file mode 100644 index 000000000..9ad66307a --- /dev/null +++ b/openml/extensions/base/_connector.py @@ -0,0 +1,28 @@ +# License: BSD 3-Clause + +"""Base class for OpenML API connectors.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from openml.extensions.base import ModelExecutor, ModelSerializer + + +class OpenMLAPIConnector(ABC): + """Base class for OpenML API connectors.""" + + @abstractmethod + def serializer(self) -> ModelSerializer: + """Return the serializer for this API.""" + + @abstractmethod + def executor(self) -> ModelExecutor: + """Return the executor for this API.""" + + @classmethod + @abstractmethod + def supports(cls, estimator: Any) -> bool: + """High-level check if this connector supports the estimator instance or flow.""" diff --git a/openml/extensions/base/_executor.py b/openml/extensions/base/_executor.py new file mode 100644 index 000000000..67184a3b3 --- /dev/null +++ b/openml/extensions/base/_executor.py @@ -0,0 +1,151 @@ +# License: BSD 3-Clause + +"""Base class for estimator executors.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from collections import OrderedDict +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + import numpy as np + import scipy.sparse + + from 
openml.flows import OpenMLFlow + from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration + from openml.tasks.task import OpenMLTask + + +class ModelExecutor(ABC): + """Define runtime execution semantics for a specific API type.""" + + @abstractmethod + def seed_model(self, model: Any, seed: int | None) -> Any: + """Set the seed of all the unseeded components of a model and return the seeded model. + + Required so that all seed information can be uploaded to OpenML for reproducible results. + + Parameters + ---------- + model : Any + The model to be seeded + seed : int or None + + Returns + ------- + model + """ + + @abstractmethod + def _run_model_on_fold( # noqa: PLR0913 + self, + model: Any, + task: OpenMLTask, + X_train: np.ndarray | scipy.sparse.spmatrix, + rep_no: int, + fold_no: int, + y_train: np.ndarray | None = None, + X_test: np.ndarray | scipy.sparse.spmatrix | None = None, + ) -> tuple[np.ndarray, np.ndarray | None, OrderedDict[str, float], OpenMLRunTrace | None]: + """Run a model on a single repeat and fold of the task. + + Returns the data that is necessary to construct the OpenML Run object. Is used by + :func:`openml.runs.run_flow_on_task`. + + Parameters + ---------- + model : Any + The UNTRAINED model to run. The model instance will be copied and not altered. + task : OpenMLTask + The task to run the model on. + X_train : array-like + Training data for the given repetition and fold. + rep_no : int + The repeat of the experiment (0-based; in case of 1 time CV, always 0) + fold_no : int + The fold nr of the experiment (0-based; in case of holdout, always 0) + y_train : Optional[np.ndarray] (default=None) + Target attributes for supervised tasks. In case of classification, these are integer + indices to the potential classes specified by dataset. + X_test : Optional, array-like (default=None) + Test attributes to test for generalization in supervised tasks. + + Returns + ------- + predictions : np.ndarray + Model predictions. 
+ probabilities : Optional, np.ndarray + Predicted probabilities (only applicable for supervised classification tasks). + user_defined_measures : OrderedDict[str, float] + User defined measures that were generated on this fold + trace : Optional, OpenMLRunTrace + Hyperparameter optimization trace (only applicable for supervised tasks with + hyperparameter optimization). + """ + + @abstractmethod + def check_if_model_fitted(self, model: Any) -> bool: + """Returns True/False denoting if the model has already been fitted/trained. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + + @abstractmethod + def obtain_parameter_values( + self, + flow: OpenMLFlow, + model: Any = None, + ) -> list[dict[str, Any]]: + """Extracts all parameter settings required for the flow from the model. + + If no explicit model is provided, the parameters will be extracted from `flow.model` + instead. + + Parameters + ---------- + flow : OpenMLFlow + OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server) + + model: Any, optional (default=None) + The model from which to obtain the parameter values. Must match the flow signature. + If None, use the model specified in ``OpenMLFlow.model``. + + Returns + ------- + list + A list of dicts, where each dict has the following entries: + - ``oml:name`` : str: The OpenML parameter name + - ``oml:value`` : mixed: A representation of the parameter value + - ``oml:component`` : int: flow id to which the parameter belongs + """ + + # Abstract methods for hyperparameter optimization + + @abstractmethod + def instantiate_model_from_hpo_class( + self, + model: Any, + trace_iteration: OpenMLTraceIteration, + ) -> Any: + """Instantiate a base model which can be searched over by the hyperparameter optimization + model. + + Parameters + ---------- + model : Any + A hyperparameter optimization model which defines the model to be instantiated. 
+ trace_iteration : OpenMLTraceIteration + Describing the hyperparameter settings to instantiate. + + Returns + ------- + Any + """ + # TODO a trace belongs to a run and therefore a flow -> simplify this part of the interface! diff --git a/openml/extensions/base/_serializer.py b/openml/extensions/base/_serializer.py new file mode 100644 index 000000000..f2673d4c4 --- /dev/null +++ b/openml/extensions/base/_serializer.py @@ -0,0 +1,73 @@ +# License: BSD 3-Clause + +"""Base class for estimator serializers.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from openml.flows import OpenMLFlow + + +class ModelSerializer(ABC): + """Handle the conversion between estimator instances and OpenML Flows.""" + + @classmethod + @abstractmethod + def can_handle_model(cls, model: Any) -> bool: + """Check whether a model flow can be handled by this extension. + + This is typically done by checking the type of the model, or the package it belongs to. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + + @abstractmethod + def model_to_flow(self, model: Any) -> OpenMLFlow: + """Transform a model to a flow for uploading it to OpenML. + + Parameters + ---------- + model : Any + + Returns + ------- + OpenMLFlow + """ + + @abstractmethod + def flow_to_model( + self, + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + strict_version: bool = True, # noqa: FBT002, FBT001 + ) -> Any: + """Instantiate a model from the flow representation. + + Parameters + ---------- + flow : OpenMLFlow + + initialize_with_defaults : bool, optional (default=False) + If this flag is set, the hyperparameter values of flows will be + ignored and a model with its defaults is returned. + + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. 
+ + Returns + ------- + Any + """ + + @abstractmethod + def get_version_information(self) -> list[str]: + """Return dependency and version information.""" diff --git a/openml/extensions/registry.py b/openml/extensions/registry.py new file mode 100644 index 000000000..e3bc0788e --- /dev/null +++ b/openml/extensions/registry.py @@ -0,0 +1,50 @@ +# License: BSD 3-Clause + +"""Extension registry.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from openml.exceptions import PyOpenMLError +from openml.extensions.sklearn import SklearnAPIConnector + +if TYPE_CHECKING: + from openml.extensions.base import OpenMLAPIConnector + +API_CONNECTOR_REGISTRY: list[type[OpenMLAPIConnector]] = [ + SklearnAPIConnector, +] + + +def resolve_api_connector(estimator: Any) -> OpenMLAPIConnector: + """ + Identify and return the appropriate OpenML API connector for a given estimator. + + This function iterates through the global ``API_CONNECTOR_REGISTRY`` to find + the first connector class that supports the provided estimator instance or OpenML flow. + If a matching connector is found, it is instantiated and returned. + + Parameters + ---------- + estimator : Any + The estimator instance (e.g., a scikit-learn estimator) or OpenML flow for + which an API connector is required. + + Returns + ------- + OpenMLAPIConnector + An instance of the matching API connector. + + Raises + ------ + PyOpenMLError + If no connector in the registry supports the provided estimator. The + first connector whose ``supports`` check passes is used; multiple + matching connectors are not treated as an error. 
+ """ + for connector_cls in API_CONNECTOR_REGISTRY: + if connector_cls.supports(estimator): + return connector_cls() + + raise PyOpenMLError("No OpenML API connector supports this estimator.") diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py index ce32fec7d..e0bd979da 100644 --- a/openml/flows/__init__.py +++ b/openml/flows/__init__.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause -from .flow import OpenMLFlow -from .functions import ( +from openml.flows.flow import OpenMLFlow +from openml.flows.functions import ( assert_flows_equal, delete_flow, flow_exists, @@ -9,6 +9,7 @@ get_flow_id, list_flows, ) +from openml.flows.utils import estimator_to_flow, flow_to_estimator __all__ = [ "OpenMLFlow", @@ -18,4 +19,6 @@ "flow_exists", "assert_flows_equal", "delete_flow", + "estimator_to_flow", + "flow_to_estimator", ] diff --git a/openml/flows/utils.py b/openml/flows/utils.py new file mode 100644 index 000000000..0e2c2020f --- /dev/null +++ b/openml/flows/utils.py @@ -0,0 +1,60 @@ +# License: BSD 3-Clause + +"""Utility functions for OpenML extensions.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from openml.extensions.registry import resolve_api_connector + +if TYPE_CHECKING: + from openml.flows import OpenMLFlow + + +def flow_to_estimator( + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + strict_version: bool = True, # noqa: FBT002, FBT001 +) -> Any: + """Instantiate a model from the flow representation. + + Parameters + ---------- + flow : OpenMLFlow + + initialize_with_defaults : bool, optional (default=False) + If this flag is set, the hyperparameter values of flows will be + ignored and an estimator with its defaults is returned. + + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. + + Returns + ------- + estimator_instance : Any + The corresponding estimator instance. 
+ """ + connector = resolve_api_connector(flow) + return connector.serializer().flow_to_model( + flow, + initialize_with_defaults=initialize_with_defaults, + strict_version=strict_version, + ) + + +def estimator_to_flow(estimator_instance: Any) -> OpenMLFlow: + """Convert an estimator instance to an OpenML flow. + + Parameters + ---------- + estimator_instance : Any + The estimator instance to convert. + + Returns + ------- + flow : openml.flows.OpenMLFlow + The corresponding OpenML flow. + """ + connector = resolve_api_connector(estimator_instance) + return connector.serializer().model_to_flow(estimator_instance) diff --git a/tests/test_extensions/test_base.py b/tests/test_extensions/test_base.py new file mode 100644 index 000000000..46abcf4ca --- /dev/null +++ b/tests/test_extensions/test_base.py @@ -0,0 +1,143 @@ +# License: BSD 3-Clause + +"""Test OpenML extension base classes and registry.""" + +import pytest +from collections import OrderedDict + +from openml.exceptions import PyOpenMLError +from openml.extensions.base import ( + ModelSerializer, + ModelExecutor, + OpenMLAPIConnector, +) +from openml.extensions.registry import resolve_api_connector + + +class TestModelSerializer: + """Test ModelSerializer abstract base class.""" + + def test_is_abstract(self): + """ModelSerializer should not be instantiable.""" + with pytest.raises(TypeError): + ModelSerializer() # noqa: B024 + + class DummySerializer(ModelSerializer): + @classmethod + def can_handle_model(cls, model): + return True + + def model_to_flow(self, model): + return "dummy_flow" + + def flow_to_model(self, flow, initialize_with_defaults=False, strict_version=True): + return "dummy_model" + + def get_version_information(self): + return ["dummy>=0.1"] + + def test_concrete_implementation(self): + serializer = self.DummySerializer() + + assert serializer.can_handle_model(object()) is True + assert serializer.model_to_flow("model") == "dummy_flow" + assert serializer.flow_to_model("flow") == 
"dummy_model" + assert serializer.get_version_information() == ["dummy>=0.1"] + + +class TestModelExecutor: + """Test ModelExecutor abstract base class.""" + + def test_is_abstract(self): + """ModelExecutor should not be instantiable.""" + with pytest.raises(TypeError): + ModelExecutor() # noqa: B024 + + class DummyExecutor(ModelExecutor): + def seed_model(self, model, seed): + return model + + def _run_model_on_fold( + self, + model, + task, + X_train, + rep_no, + fold_no, + y_train=None, + X_test=None, + ): + return ( + [], # predictions + None, # probabilities + OrderedDict(), # user_defined_measures + None, # trace + ) + + def check_if_model_fitted(self, model): + return False + + def obtain_parameter_values(self, flow, model=None): + return [] + + def instantiate_model_from_hpo_class(self, model, trace_iteration): + return model + + def test_concrete_implementation(self): + executor = self.DummyExecutor() + + assert executor.seed_model("model", 42) == "model" + assert executor.check_if_model_fitted("model") is False + assert executor.obtain_parameter_values("flow") == [] + + +class TestOpenMLAPIConnector: + """Test OpenMLAPIConnector abstract base class.""" + + def test_is_abstract(self): + """OpenMLAPIConnector should not be instantiable.""" + with pytest.raises(TypeError): + OpenMLAPIConnector() # noqa: B024 + + class DummySerializer: + pass + + class DummyExecutor: + pass + + class DummyConnector(OpenMLAPIConnector): + def serializer(self): + return TestOpenMLAPIConnector.DummySerializer() + + def executor(self): + return TestOpenMLAPIConnector.DummyExecutor() + + @classmethod + def supports(cls, estimator): + return estimator == "supported" + + def test_concrete_implementation(self): + connector = self.DummyConnector() + + assert isinstance(connector.serializer(), self.DummySerializer) + assert isinstance(connector.executor(), self.DummyExecutor) + assert self.DummyConnector.supports("supported") is True + assert self.DummyConnector.supports("unsupported") 
is False + + def test_resolve_api_connector_success(self, monkeypatch): + monkeypatch.setattr( + "openml.extensions.registry.API_CONNECTOR_REGISTRY", + [self.DummyConnector], + ) + + connector = resolve_api_connector("supported") + assert isinstance(connector, self.DummyConnector) + + def test_resolve_api_connector_no_match(self, monkeypatch): + monkeypatch.setattr( + "openml.extensions.registry.API_CONNECTOR_REGISTRY", + [], + ) + + with pytest.raises(PyOpenMLError, match="No OpenML API connector supports"): + resolve_api_connector("anything")