From 2c0c1aa0e115af4d1dc6d9c0df1db0fb5d5d6ff5 Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Fri, 2 Jan 2026 14:12:11 +0530 Subject: [PATCH 01/15] [ENH] Refactor `Extension` --- openml/extensions/connectors/__init__.py | 5 + openml/extensions/connectors/base.py | 25 ++++ openml/extensions/execution/__init__.py | 5 + openml/extensions/execution/base.py | 109 +++++++++++++++++ openml/extensions/registry.py | 57 +++++++++ openml/extensions/serialization/__init__.py | 5 + openml/extensions/serialization/base.py | 124 ++++++++++++++++++++ 7 files changed, 330 insertions(+) create mode 100644 openml/extensions/connectors/__init__.py create mode 100644 openml/extensions/connectors/base.py create mode 100644 openml/extensions/execution/__init__.py create mode 100644 openml/extensions/execution/base.py create mode 100644 openml/extensions/registry.py create mode 100644 openml/extensions/serialization/__init__.py create mode 100644 openml/extensions/serialization/base.py diff --git a/openml/extensions/connectors/__init__.py b/openml/extensions/connectors/__init__.py new file mode 100644 index 000000000..5bbd28033 --- /dev/null +++ b/openml/extensions/connectors/__init__.py @@ -0,0 +1,5 @@ +# License: BSD 3-Clause + +"""Base classes for OpenML API connectors.""" + +from openml.extensions.connectors.base import OpenMLAPIConnector \ No newline at end of file diff --git a/openml/extensions/connectors/base.py b/openml/extensions/connectors/base.py new file mode 100644 index 000000000..048adcdf3 --- /dev/null +++ b/openml/extensions/connectors/base.py @@ -0,0 +1,25 @@ +# License: BSD 3-Clause + +from abc import ABC, abstractmethod +from typing import Any + +from openml.extensions.execution import ModelExecutor +from openml.extensions.serialization import ModelSerializer + +class OpenMLAPIConnector(ABC): + """ + Base class for OpenML API connectors. + """ + + @abstractmethod + def serializer(self) -> ModelSerializer: + """Return the serializer for this API.""" + + @abstractmethod + def executor(self) -> ModelExecutor: + """Return the executor for this API.""" + + @classmethod + @abstractmethod + def supports(cls, model: Any) -> bool: + """High-level check if this connector supports the model.""" \ No newline at end of file diff --git a/openml/extensions/execution/__init__.py b/openml/extensions/execution/__init__.py new file mode 100644 index 000000000..c41c09ba7 --- /dev/null +++ b/openml/extensions/execution/__init__.py @@ -0,0 +1,5 @@ +# License: BSD 3-Clause + +"""Base class for estimator executors.""" + +from openml.extensions.execution.base import ModelExecutor \ No newline at end of file diff --git a/openml/extensions/execution/base.py b/openml/extensions/execution/base.py new file mode 100644 index 000000000..148f123f2 --- /dev/null +++ b/openml/extensions/execution/base.py @@ -0,0 +1,109 @@ +# License: BSD 3-Clause + +"""Base class for estimator executors.""" + +from abc import ABC, abstractmethod +from typing import Any +from collections import OrderedDict + +import numpy as np +import scipy.sparse + +from openml.tasks.task import OpenMLTask +from openml.runs.trace import OpenMLRunTrace + +class ModelExecutor(ABC): + """Define runtime execution semantics for a specific API type.""" + + @classmethod + @abstractmethod + def can_handle_model(cls, model: Any) -> bool: + """Check whether a model flow can be handled by this extension. + + This is typically done by checking the type of the model, or the package it belongs to. 
+ + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + + @abstractmethod + def seed_model(self, model: Any, seed: int | None) -> Any: + """Set the seed of all the unseeded components of a model and return the seeded model. + + Required so that all seed information can be uploaded to OpenML for reproducible results. + + Parameters + ---------- + model : Any + The model to be seeded + seed : int + + Returns + ------- + model + """ + + @abstractmethod + def _run_model_on_fold( # noqa: PLR0913 + self, + model: Any, + task: OpenMLTask, + X_train: np.ndarray | scipy.sparse.spmatrix, + rep_no: int, + fold_no: int, + y_train: np.ndarray | None = None, + X_test: np.ndarray | scipy.sparse.spmatrix | None = None, + ) -> tuple[np.ndarray, np.ndarray | None, OrderedDict[str, float], OpenMLRunTrace | None]: + """Run a model on a repeat, fold, subsample triplet of the task. + + Returns the data that is necessary to construct the OpenML Run object. Is used by + :func:`openml.runs.run_flow_on_task`. + + Parameters + ---------- + model : Any + The UNTRAINED model to run. The model instance will be copied and not altered. + task : OpenMLTask + The task to run the model on. + X_train : array-like + Training data for the given repetition and fold. + rep_no : int + The repeat of the experiment (0-based; in case of 1 time CV, always 0) + fold_no : int + The fold nr of the experiment (0-based; in case of holdout, always 0) + y_train : Optional[np.ndarray] (default=None) + Target attributes for supervised tasks. In case of classification, these are integer + indices to the potential classes specified by dataset. + X_test : Optional, array-like (default=None) + Test attributes to test for generalization in supervised tasks. + + Returns + ------- + predictions : np.ndarray + Model predictions. + probabilities : Optional, np.ndarray + Predicted probabilities (only applicable for supervised classification tasks). + user_defined_measures : OrderedDict[str, float] + User defined measures that were generated on this fold + trace : Optional, OpenMLRunTrace + Hyperparameter optimization trace (only applicable for supervised tasks with + hyperparameter optimization). + """ + + @abstractmethod + def check_if_model_fitted(self, model: Any) -> bool: + """Returns True/False denoting if the model has already been fitted/trained. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ \ No newline at end of file diff --git a/openml/extensions/registry.py b/openml/extensions/registry.py new file mode 100644 index 000000000..2ab6cc785 --- /dev/null +++ b/openml/extensions/registry.py @@ -0,0 +1,57 @@ +# License: BSD 3-Clause + +from typing import Any + +from openml.exceptions import OpenMLException +from openml.extensions.connectors import OpenMLAPIConnector + +from openml_sklearn import SklearnExtension + +API_CONNECTOR_REGISTRY: list[type[OpenMLAPIConnector]] = [ + SklearnExtension, # TODO: I need to refactor SklearnExtension +] + +def resolve_api_connector(estimator: Any) -> OpenMLAPIConnector: + """ + Identifies and returns the appropriate OpenML API connector for a given estimator. + + This function iterates through the global ``API_CONNECTOR_REGISTRY`` to find + a connector class that supports the provided estimator object. If exactly one + matching connector is found, it is instantiated and returned. + + Parameters + ---------- + estimator : Any + The estimator object (e.g., a scikit-learn estimator) for which an API + connector is required. 
+ + Returns + ------- + OpenMLAPIConnector + An instance of the matching API connector. + + Raises + ------ + OpenMLException + If no connector is found in the registry that supports the provided + model, or if multiple connectors in the registry claim support for + the provided model. + """ + candidates = [ + connector for connector in API_CONNECTOR_REGISTRY + if connector.supports(estimator) + ] + + if not candidates: + raise OpenMLException( + "No OpenML API connector found for this estimator." + ) + + if len(candidates) > 1: + names = [c.__name__ for c in candidates] + raise OpenMLException( + "Multiple API connectors match this estimator:\n" + + "\n".join(f"- {n}" for n in names) + ) + + return candidates[0]() diff --git a/openml/extensions/serialization/__init__.py b/openml/extensions/serialization/__init__.py new file mode 100644 index 000000000..e8bf5d200 --- /dev/null +++ b/openml/extensions/serialization/__init__.py @@ -0,0 +1,5 @@ +# License: BSD 3-Clause + +"""Base classes for estimator serializors.""" + +from openml.extensions.serialization.base import ModelSerializer \ No newline at end of file diff --git a/openml/extensions/serialization/base.py b/openml/extensions/serialization/base.py new file mode 100644 index 000000000..2e275af2e --- /dev/null +++ b/openml/extensions/serialization/base.py @@ -0,0 +1,124 @@ +# License: BSD 3-Clause + +"""Base class for estimator serializors.""" + +from abc import ABC, abstractmethod +from typing import Any + +from openml.flows import OpenMLFlow +from openml.runs.trace import OpenMLTraceIteration + + +class ModelSerializer(ABC): + """Handle the conversion between estimator instances and OpenML Flows.""" + + @classmethod + @abstractmethod + def can_handle_model(cls, model: Any) -> bool: + """Check whether a model flow can be handled by this extension. + + This is typically done by checking the type of the model, or the package it belongs to. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + + @abstractmethod + def model_to_flow(self, model: Any) -> OpenMLFlow: + """Transform a model to a flow for uploading it to OpenML. + + Parameters + ---------- + model : Any + + Returns + ------- + OpenMLFlow + """ + + @abstractmethod + def flow_to_model( + self, + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + strict_version: bool = True, # noqa: FBT002, FBT001 + ) -> Any: + """Instantiate a model from the flow representation. + + Parameters + ---------- + flow : OpenMLFlow + + initialize_with_defaults : bool, optional (default=False) + If this flag is set, the hyperparameter values of flows will be + ignored and a flow with its defaults is returned. + + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. + + Returns + ------- + Any + """ + + @abstractmethod + def obtain_parameter_values( + self, + flow: OpenMLFlow, + model: Any = None, + ) -> list[dict[str, Any]]: + """Extracts all parameter settings required for the flow from the model. + + If no explicit model is provided, the parameters will be extracted from `flow.model` + instead. + + Parameters + ---------- + flow : OpenMLFlow + OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server) + + model: Any, optional (default=None) + The model from which to obtain the parameter values. Must match the flow signature. + If None, use the model specified in ``OpenMLFlow.model``. 
+ + Returns + ------- + list + A list of dicts, where each dict has the following entries: + - ``oml:name`` : str: The OpenML parameter name + - ``oml:value`` : mixed: A representation of the parameter value + - ``oml:component`` : int: flow id to which the parameter belongs + """ + + @abstractmethod + def get_version_information(self) -> list[str]: + """Return dependency and version information.""" + + # Abstract methods for hyperparameter optimization + + @abstractmethod + def instantiate_model_from_hpo_class( + self, + model: Any, + trace_iteration: OpenMLTraceIteration, + ) -> Any: + """Instantiate a base model which can be searched over by the hyperparameter optimization + model. + + Parameters + ---------- + model : Any + A hyperparameter optimization model which defines the model to be instantiated. + trace_iteration : OpenMLTraceIteration + Describing the hyperparameter settings to instantiate. + + Returns + ------- + Any + """ + # TODO a trace belongs to a run and therefore a flow -> simplify this part of the interface! \ No newline at end of file From 2aab335a39a7044ee34427a05d9647579f63cb59 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 2 Jan 2026 08:49:05 +0000 Subject: [PATCH 02/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/extensions/connectors/__init__.py | 2 +- openml/extensions/connectors/base.py | 15 +++++----- openml/extensions/execution/__init__.py | 2 +- openml/extensions/execution/base.py | 16 +++++++---- openml/extensions/registry.py | 32 ++++++++++----------- openml/extensions/serialization/__init__.py | 2 +- openml/extensions/serialization/base.py | 11 ++++--- 7 files changed, 44 insertions(+), 36 deletions(-) diff --git a/openml/extensions/connectors/__init__.py b/openml/extensions/connectors/__init__.py index 5bbd28033..d9ebb7fd8 100644 --- a/openml/extensions/connectors/__init__.py +++ b/openml/extensions/connectors/__init__.py @@ -2,4 +2,4 @@ """Base classes for OpenML API connectors.""" -from openml.extensions.connectors.base import OpenMLAPIConnector \ No newline at end of file +from openml.extensions.connectors.base import OpenMLAPIConnector diff --git a/openml/extensions/connectors/base.py b/openml/extensions/connectors/base.py index 048adcdf3..df753c04e 100644 --- a/openml/extensions/connectors/base.py +++ b/openml/extensions/connectors/base.py @@ -1,15 +1,16 @@ # License: BSD 3-Clause +from __future__ import annotations from abc import ABC, abstractmethod -from typing import Any +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from openml.extensions.execution import ModelExecutor + from openml.extensions.serialization import ModelSerializer -from openml.extensions.execution import ModelExecutor -from openml.extensions.serialization import ModelSerializer class OpenMLAPIConnector(ABC): - """ - Base class for OpenML API connectors. 
- """ + """Base class for OpenML API connectors.""" @abstractmethod def serializer(self) -> ModelSerializer: @@ -22,4 +23,4 @@ def executor(self) -> ModelExecutor: @classmethod @abstractmethod def supports(cls, model: Any) -> bool: - """High-level check if this connector supports the model.""" \ No newline at end of file + """High-level check if this connector supports the model.""" diff --git a/openml/extensions/execution/__init__.py b/openml/extensions/execution/__init__.py index c41c09ba7..245a16db4 100644 --- a/openml/extensions/execution/__init__.py +++ b/openml/extensions/execution/__init__.py @@ -2,4 +2,4 @@ """Base class for estimator executors.""" -from openml.extensions.execution.base import ModelExecutor \ No newline at end of file +from openml.extensions.execution.base import ModelExecutor diff --git a/openml/extensions/execution/base.py b/openml/extensions/execution/base.py index 148f123f2..018fe2964 100644 --- a/openml/extensions/execution/base.py +++ b/openml/extensions/execution/base.py @@ -2,15 +2,19 @@ """Base class for estimator executors.""" +from __future__ import annotations + from abc import ABC, abstractmethod -from typing import Any from collections import OrderedDict +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + import numpy as np + import scipy.sparse -import numpy as np -import scipy.sparse + from openml.runs.trace import OpenMLRunTrace + from openml.tasks.task import OpenMLTask -from openml.tasks.task import OpenMLTask -from openml.runs.trace import OpenMLRunTrace class ModelExecutor(ABC): """Define runtime execution semantics for a specific API type.""" @@ -106,4 +110,4 @@ def check_if_model_fitted(self, model: Any) -> bool: Returns ------- bool - """ \ No newline at end of file + """ diff --git a/openml/extensions/registry.py b/openml/extensions/registry.py index 2ab6cc785..0bdd9cd2f 100644 --- a/openml/extensions/registry.py +++ b/openml/extensions/registry.py @@ -1,28 +1,32 @@ # License: BSD 3-Clause +from __future__ import annotations -from typing import Any +from typing import TYPE_CHECKING, Any + +from openml_sklearn import SklearnExtension from openml.exceptions import OpenMLException -from openml.extensions.connectors import OpenMLAPIConnector -from openml_sklearn import SklearnExtension +if TYPE_CHECKING: + from openml.extensions.connectors import OpenMLAPIConnector API_CONNECTOR_REGISTRY: list[type[OpenMLAPIConnector]] = [ - SklearnExtension, # TODO: I need to refactor SklearnExtension + SklearnExtension, # TODO: I need to refactor SklearnExtension ] + def resolve_api_connector(estimator: Any) -> OpenMLAPIConnector: """ Identifies and returns the appropriate OpenML API connector for a given estimator. - This function iterates through the global ``API_CONNECTOR_REGISTRY`` to find - a connector class that supports the provided estimator object. If exactly one + This function iterates through the global ``API_CONNECTOR_REGISTRY`` to find + a connector class that supports the provided estimator object. If exactly one matching connector is found, it is instantiated and returned. Parameters ---------- estimator : Any - The estimator object (e.g., a scikit-learn estimator) for which an API + The estimator object (e.g., a scikit-learn estimator) for which an API connector is required. 
Returns @@ -33,25 +37,21 @@ def resolve_api_connector(estimator: Any) -> OpenMLAPIConnector: Raises ------ OpenMLException - If no connector is found in the registry that supports the provided - model, or if multiple connectors in the registry claim support for + If no connector is found in the registry that supports the provided + model, or if multiple connectors in the registry claim support for the provided model. """ candidates = [ - connector for connector in API_CONNECTOR_REGISTRY - if connector.supports(estimator) + connector for connector in API_CONNECTOR_REGISTRY if connector.supports(estimator) ] if not candidates: - raise OpenMLException( - "No OpenML API connector found for this estimator." - ) + raise OpenMLException("No OpenML API connector found for this estimator.") if len(candidates) > 1: names = [c.__name__ for c in candidates] raise OpenMLException( - "Multiple API connectors match this estimator:\n" - + "\n".join(f"- {n}" for n in names) + "Multiple API connectors match this estimator:\n" + "\n".join(f"- {n}" for n in names) ) return candidates[0]() diff --git a/openml/extensions/serialization/__init__.py b/openml/extensions/serialization/__init__.py index e8bf5d200..f21df84bd 100644 --- a/openml/extensions/serialization/__init__.py +++ b/openml/extensions/serialization/__init__.py @@ -2,4 +2,4 @@ """Base classes for estimator serializors.""" -from openml.extensions.serialization.base import ModelSerializer \ No newline at end of file +from openml.extensions.serialization.base import ModelSerializer diff --git a/openml/extensions/serialization/base.py b/openml/extensions/serialization/base.py index 2e275af2e..96026fc56 100644 --- a/openml/extensions/serialization/base.py +++ b/openml/extensions/serialization/base.py @@ -2,11 +2,14 @@ """Base class for estimator serializors.""" +from __future__ import annotations + from abc import ABC, abstractmethod -from typing import Any +from typing import TYPE_CHECKING, Any -from openml.flows import OpenMLFlow -from openml.runs.trace import OpenMLTraceIteration +if TYPE_CHECKING: + from openml.flows import OpenMLFlow + from openml.runs.trace import OpenMLTraceIteration class ModelSerializer(ABC): @@ -121,4 +124,4 @@ def instantiate_model_from_hpo_class( ------- Any """ - # TODO a trace belongs to a run and therefore a flow -> simplify this part of the interface! \ No newline at end of file + # TODO a trace belongs to a run and therefore a flow -> simplify this part of the interface! 
From 1365bf62701d274f50cc298b8ce297efe35cb3e8 Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Fri, 2 Jan 2026 14:32:07 +0530 Subject: [PATCH 03/15] correct openml exception --- openml/extensions/registry.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openml/extensions/registry.py b/openml/extensions/registry.py index 0bdd9cd2f..4cb1a2d84 100644 --- a/openml/extensions/registry.py +++ b/openml/extensions/registry.py @@ -5,7 +5,7 @@ from openml_sklearn import SklearnExtension -from openml.exceptions import OpenMLException +from openml.exceptions import PyOpenMLError if TYPE_CHECKING: from openml.extensions.connectors import OpenMLAPIConnector @@ -46,11 +46,11 @@ def resolve_api_connector(estimator: Any) -> OpenMLAPIConnector: ] if not candidates: - raise OpenMLException("No OpenML API connector found for this estimator.") + raise PyOpenMLError("No OpenML API connector found for this estimator.") if len(candidates) > 1: names = [c.__name__ for c in candidates] - raise OpenMLException( + raise PyOpenMLError( "Multiple API connectors match this estimator:\n" + "\n".join(f"- {n}" for n in names) ) From 67c0efb92a1b615f0e511ad4bbbb64c76b6d59d3 Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Fri, 2 Jan 2026 14:51:42 +0530 Subject: [PATCH 04/15] use __all__ for imports in __init__ --- openml/extensions/connectors/__init__.py | 2 ++ openml/extensions/execution/__init__.py | 2 ++ openml/extensions/serialization/__init__.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/openml/extensions/connectors/__init__.py b/openml/extensions/connectors/__init__.py index d9ebb7fd8..31d358d70 100644 --- a/openml/extensions/connectors/__init__.py +++ b/openml/extensions/connectors/__init__.py @@ -3,3 +3,5 @@ """Base classes for OpenML API connectors.""" from openml.extensions.connectors.base import OpenMLAPIConnector + +__all__ = ["OpenMLAPIConnector"] diff --git a/openml/extensions/execution/__init__.py b/openml/extensions/execution/__init__.py index 245a16db4..0939998e8 100644 --- a/openml/extensions/execution/__init__.py +++ b/openml/extensions/execution/__init__.py @@ -3,3 +3,5 @@ """Base class for estimator executors.""" from openml.extensions.execution.base import ModelExecutor + +__all__ = ["ModelExecutor"] diff --git a/openml/extensions/serialization/__init__.py b/openml/extensions/serialization/__init__.py index f21df84bd..df618b60f 100644 --- a/openml/extensions/serialization/__init__.py +++ b/openml/extensions/serialization/__init__.py @@ -3,3 +3,5 @@ """Base classes for estimator serializors.""" from openml.extensions.serialization.base import ModelSerializer + +__all__ = ["ModelSerializer"] From e5850efc932e3f550b31270c31c3bec3dd5a36bb Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Fri, 2 Jan 2026 16:08:00 +0530 Subject: [PATCH 05/15] update registry --- openml/extensions/connectors/base.py | 3 +++ openml/extensions/registry.py | 22 ++++++++-------------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/openml/extensions/connectors/base.py b/openml/extensions/connectors/base.py index df753c04e..398c3c656 100644 --- a/openml/extensions/connectors/base.py +++ b/openml/extensions/connectors/base.py @@ -1,4 +1,7 @@ # License: BSD 3-Clause + +"""Base class for OpenML API connectors.""" + from __future__ import annotations from abc import ABC, abstractmethod diff --git a/openml/extensions/registry.py b/openml/extensions/registry.py index 4cb1a2d84..23d220898 100644 --- a/openml/extensions/registry.py +++ b/openml/extensions/registry.py @@ -1,4 +1,7 @@ # License: BSD 
3-Clause + +"""Extension registry.""" + from __future__ import annotations from typing import TYPE_CHECKING, Any @@ -17,7 +20,7 @@ def resolve_api_connector(estimator: Any) -> OpenMLAPIConnector: """ - Identifies and returns the appropriate OpenML API connector for a given estimator. + Identify and return the appropriate OpenML API connector for a given estimator. This function iterates through the global ``API_CONNECTOR_REGISTRY`` to find a connector class that supports the provided estimator object. If exactly one @@ -41,17 +44,8 @@ def resolve_api_connector(estimator: Any) -> OpenMLAPIConnector: model, or if multiple connectors in the registry claim support for the provided model. """ - candidates = [ - connector for connector in API_CONNECTOR_REGISTRY if connector.supports(estimator) - ] - - if not candidates: - raise PyOpenMLError("No OpenML API connector found for this estimator.") - - if len(candidates) > 1: - names = [c.__name__ for c in candidates] - raise PyOpenMLError( - "Multiple API connectors match this estimator:\n" + "\n".join(f"- {n}" for n in names) - ) + for connector_cls in API_CONNECTOR_REGISTRY: + if connector_cls.supports(estimator): + return connector_cls() - return candidates[0]() + raise PyOpenMLError("No OpenML API connector supports this estimator.") From 00da7a9691add4f4c3ceaa099573de61779c8fab Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Mon, 5 Jan 2026 12:17:56 +0530 Subject: [PATCH 06/15] update registry and file structure --- openml/extensions/base/__init__.py | 13 ++ .../base.py => base/_connector.py} | 7 +- .../{execution/base.py => base/_executor.py} | 72 +++++++--- openml/extensions/base/_serializer.py | 73 ++++++++++ openml/extensions/connectors/__init__.py | 7 - openml/extensions/execution/__init__.py | 7 - openml/extensions/registry.py | 15 +-- openml/extensions/serialization/__init__.py | 7 - openml/extensions/serialization/base.py | 127 ------------------ 9 files changed, 151 insertions(+), 177 deletions(-) create mode 100644 openml/extensions/base/__init__.py rename openml/extensions/{connectors/base.py => base/_connector.py} (69%) rename openml/extensions/{execution/base.py => base/_executor.py} (64%) create mode 100644 openml/extensions/base/_serializer.py delete mode 100644 openml/extensions/connectors/__init__.py delete mode 100644 openml/extensions/execution/__init__.py delete mode 100644 openml/extensions/serialization/__init__.py delete mode 100644 openml/extensions/serialization/base.py diff --git a/openml/extensions/base/__init__.py b/openml/extensions/base/__init__.py new file mode 100644 index 000000000..d85c0b268 --- /dev/null +++ b/openml/extensions/base/__init__.py @@ -0,0 +1,13 @@ +# License: BSD 3-Clause + +"""Base classes for OpenML extensions.""" + +from openml.extensions.base._connector import OpenMLAPIConnector +from openml.extensions.base._executor import ModelExecutor +from openml.extensions.base._serializer import ModelSerializer + +__all__ = [ + "ModelExecutor", + "ModelSerializer", + "OpenMLAPIConnector", +] diff --git a/openml/extensions/connectors/base.py b/openml/extensions/base/_connector.py similarity index 69% rename from openml/extensions/connectors/base.py rename to openml/extensions/base/_connector.py index 398c3c656..9ad66307a 100644 --- a/openml/extensions/connectors/base.py +++ b/openml/extensions/base/_connector.py @@ -8,8 +8,7 @@ from typing import TYPE_CHECKING, Any if TYPE_CHECKING: - from openml.extensions.execution import ModelExecutor - from openml.extensions.serialization import ModelSerializer + from 
openml.extensions.base import ModelExecutor, ModelSerializer class OpenMLAPIConnector(ABC): @@ -25,5 +24,5 @@ def executor(self) -> ModelExecutor: @classmethod @abstractmethod - def supports(cls, model: Any) -> bool: - """High-level check if this connector supports the model.""" + def supports(cls, estimator: Any) -> bool: + """High-level check if this connector supports the estimator instance or flow.""" diff --git a/openml/extensions/execution/base.py b/openml/extensions/base/_executor.py similarity index 64% rename from openml/extensions/execution/base.py rename to openml/extensions/base/_executor.py index 018fe2964..67184a3b3 100644 --- a/openml/extensions/execution/base.py +++ b/openml/extensions/base/_executor.py @@ -12,29 +12,14 @@ import numpy as np import scipy.sparse - from openml.runs.trace import OpenMLRunTrace + from openml.flows import OpenMLFlow + from openml.runs.trace import OpenMLRunTrace, OpenMLTraceIteration from openml.tasks.task import OpenMLTask class ModelExecutor(ABC): """Define runtime execution semantics for a specific API type.""" - @classmethod - @abstractmethod - def can_handle_model(cls, model: Any) -> bool: - """Check whether a model flow can be handled by this extension. - - This is typically done by checking the type of the model, or the package it belongs to. - - Parameters - ---------- - model : Any - - Returns - ------- - bool - """ - @abstractmethod def seed_model(self, model: Any, seed: int | None) -> Any: """Set the seed of all the unseeded components of a model and return the seeded model. @@ -111,3 +96,56 @@ def check_if_model_fitted(self, model: Any) -> bool: ------- bool """ + + @abstractmethod + def obtain_parameter_values( + self, + flow: OpenMLFlow, + model: Any = None, + ) -> list[dict[str, Any]]: + """Extracts all parameter settings required for the flow from the model. + + If no explicit model is provided, the parameters will be extracted from `flow.model` + instead. + + Parameters + ---------- + flow : OpenMLFlow + OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server) + + model: Any, optional (default=None) + The model from which to obtain the parameter values. Must match the flow signature. + If None, use the model specified in ``OpenMLFlow.model``. + + Returns + ------- + list + A list of dicts, where each dict has the following entries: + - ``oml:name`` : str: The OpenML parameter name + - ``oml:value`` : mixed: A representation of the parameter value + - ``oml:component`` : int: flow id to which the parameter belongs + """ + + # Abstract methods for hyperparameter optimization + + @abstractmethod + def instantiate_model_from_hpo_class( + self, + model: Any, + trace_iteration: OpenMLTraceIteration, + ) -> Any: + """Instantiate a base model which can be searched over by the hyperparameter optimization + model. + + Parameters + ---------- + model : Any + A hyperparameter optimization model which defines the model to be instantiated. + trace_iteration : OpenMLTraceIteration + Describing the hyperparameter settings to instantiate. + + Returns + ------- + Any + """ + # TODO a trace belongs to a run and therefore a flow -> simplify this part of the interface! 
diff --git a/openml/extensions/base/_serializer.py b/openml/extensions/base/_serializer.py new file mode 100644 index 000000000..f2673d4c4 --- /dev/null +++ b/openml/extensions/base/_serializer.py @@ -0,0 +1,73 @@ +# License: BSD 3-Clause + +"""Base class for estimator serializors.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from openml.flows import OpenMLFlow + + +class ModelSerializer(ABC): + """Handle the conversion between estimator instances and OpenML Flows.""" + + @classmethod + @abstractmethod + def can_handle_model(cls, model: Any) -> bool: + """Check whether a model flow can be handled by this extension. + + This is typically done by checking the type of the model, or the package it belongs to. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + + @abstractmethod + def model_to_flow(self, model: Any) -> OpenMLFlow: + """Transform a model to a flow for uploading it to OpenML. + + Parameters + ---------- + model : Any + + Returns + ------- + OpenMLFlow + """ + + @abstractmethod + def flow_to_model( + self, + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + strict_version: bool = True, # noqa: FBT002, FBT001 + ) -> Any: + """Instantiate a model from the flow representation. + + Parameters + ---------- + flow : OpenMLFlow + + initialize_with_defaults : bool, optional (default=False) + If this flag is set, the hyperparameter values of flows will be + ignored and a flow with its defaults is returned. + + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. + + Returns + ------- + Any + """ + + @abstractmethod + def get_version_information(self) -> list[str]: + """Return dependency and version information.""" diff --git a/openml/extensions/connectors/__init__.py b/openml/extensions/connectors/__init__.py deleted file mode 100644 index 31d358d70..000000000 --- a/openml/extensions/connectors/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# License: BSD 3-Clause - -"""Base classes for OpenML API connectors.""" - -from openml.extensions.connectors.base import OpenMLAPIConnector - -__all__ = ["OpenMLAPIConnector"] diff --git a/openml/extensions/execution/__init__.py b/openml/extensions/execution/__init__.py deleted file mode 100644 index 0939998e8..000000000 --- a/openml/extensions/execution/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# License: BSD 3-Clause - -"""Base class for estimator executors.""" - -from openml.extensions.execution.base import ModelExecutor - -__all__ = ["ModelExecutor"] diff --git a/openml/extensions/registry.py b/openml/extensions/registry.py index 23d220898..e3bc0788e 100644 --- a/openml/extensions/registry.py +++ b/openml/extensions/registry.py @@ -6,15 +6,14 @@ from typing import TYPE_CHECKING, Any -from openml_sklearn import SklearnExtension - from openml.exceptions import PyOpenMLError +from openml.extensions.sklearn import SklearnAPIConnector if TYPE_CHECKING: - from openml.extensions.connectors import OpenMLAPIConnector + from openml.extensions.base import OpenMLAPIConnector API_CONNECTOR_REGISTRY: list[type[OpenMLAPIConnector]] = [ - SklearnExtension, # TODO: I need to refactor SklearnExtension + SklearnAPIConnector, ] @@ -23,14 +22,14 @@ def resolve_api_connector(estimator: Any) -> OpenMLAPIConnector: Identify and return the appropriate OpenML API connector for a given estimator. 
This function iterates through the global ``API_CONNECTOR_REGISTRY`` to find - a connector class that supports the provided estimator object. If exactly one - matching connector is found, it is instantiated and returned. + a connector class that supports the provided estimator instance or OpenML flow. + If a matching connector is found, it is instantiated and returned. Parameters ---------- estimator : Any - The estimator object (e.g., a scikit-learn estimator) for which an API - connector is required. + The estimator instance (e.g., a scikit-learn estimator) or OpenML flow for + which an API connector is required. Returns ------- diff --git a/openml/extensions/serialization/__init__.py b/openml/extensions/serialization/__init__.py deleted file mode 100644 index df618b60f..000000000 --- a/openml/extensions/serialization/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# License: BSD 3-Clause - -"""Base classes for estimator serializors.""" - -from openml.extensions.serialization.base import ModelSerializer - -__all__ = ["ModelSerializer"] diff --git a/openml/extensions/serialization/base.py b/openml/extensions/serialization/base.py deleted file mode 100644 index 96026fc56..000000000 --- a/openml/extensions/serialization/base.py +++ /dev/null @@ -1,127 +0,0 @@ -# License: BSD 3-Clause - -"""Base class for estimator serializors.""" - -from __future__ import annotations - -from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from openml.flows import OpenMLFlow - from openml.runs.trace import OpenMLTraceIteration - - -class ModelSerializer(ABC): - """Handle the conversion between estimator instances and OpenML Flows.""" - - @classmethod - @abstractmethod - def can_handle_model(cls, model: Any) -> bool: - """Check whether a model flow can be handled by this extension. - - This is typically done by checking the type of the model, or the package it belongs to. - - Parameters - ---------- - model : Any - - Returns - ------- - bool - """ - - @abstractmethod - def model_to_flow(self, model: Any) -> OpenMLFlow: - """Transform a model to a flow for uploading it to OpenML. - - Parameters - ---------- - model : Any - - Returns - ------- - OpenMLFlow - """ - - @abstractmethod - def flow_to_model( - self, - flow: OpenMLFlow, - initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 - strict_version: bool = True, # noqa: FBT002, FBT001 - ) -> Any: - """Instantiate a model from the flow representation. - - Parameters - ---------- - flow : OpenMLFlow - - initialize_with_defaults : bool, optional (default=False) - If this flag is set, the hyperparameter values of flows will be - ignored and a flow with its defaults is returned. - - strict_version : bool, default=True - Whether to fail if version requirements are not fulfilled. - - Returns - ------- - Any - """ - - @abstractmethod - def obtain_parameter_values( - self, - flow: OpenMLFlow, - model: Any = None, - ) -> list[dict[str, Any]]: - """Extracts all parameter settings required for the flow from the model. - - If no explicit model is provided, the parameters will be extracted from `flow.model` - instead. - - Parameters - ---------- - flow : OpenMLFlow - OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server) - - model: Any, optional (default=None) - The model from which to obtain the parameter values. Must match the flow signature. - If None, use the model specified in ``OpenMLFlow.model``. 
- - Returns - ------- - list - A list of dicts, where each dict has the following entries: - - ``oml:name`` : str: The OpenML parameter name - - ``oml:value`` : mixed: A representation of the parameter value - - ``oml:component`` : int: flow id to which the parameter belongs - """ - - @abstractmethod - def get_version_information(self) -> list[str]: - """Return dependency and version information.""" - - # Abstract methods for hyperparameter optimization - - @abstractmethod - def instantiate_model_from_hpo_class( - self, - model: Any, - trace_iteration: OpenMLTraceIteration, - ) -> Any: - """Instantiate a base model which can be searched over by the hyperparameter optimization - model. - - Parameters - ---------- - model : Any - A hyperparameter optimization model which defines the model to be instantiated. - trace_iteration : OpenMLTraceIteration - Describing the hyperparameter settings to instantiate. - - Returns - ------- - Any - """ - # TODO a trace belongs to a run and therefore a flow -> simplify this part of the interface! From 373fa53697e00ecacfdb390f1d326a9a95f468df Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Mon, 5 Jan 2026 12:30:32 +0530 Subject: [PATCH 07/15] [DO NOT MERGE] Refactor openml-sklearn back into openml-python --- openml/extensions/sklearn/__init__.py | 13 + openml/extensions/sklearn/connector.py | 68 + openml/extensions/sklearn/executor.py | 784 ++++++++++++ openml/extensions/sklearn/serializer.py | 1508 +++++++++++++++++++++++ 4 files changed, 2373 insertions(+) create mode 100644 openml/extensions/sklearn/__init__.py create mode 100644 openml/extensions/sklearn/connector.py create mode 100644 openml/extensions/sklearn/executor.py create mode 100644 openml/extensions/sklearn/serializer.py diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py new file mode 100644 index 000000000..4e22407be --- /dev/null +++ b/openml/extensions/sklearn/__init__.py @@ -0,0 +1,13 @@ +# License: BSD 3-Clause + +"""OpenML extension for Scikit-learn.""" + +from openml.extensions.sklearn.connector import SklearnAPIConnector +from openml.extensions.sklearn.executor import SklearnExecutor +from openml.extensions.sklearn.serializer import SklearnSerializer + +__all__ = [ + "SklearnAPIConnector", + "SklearnExecutor", + "SklearnSerializer", +] diff --git a/openml/extensions/sklearn/connector.py b/openml/extensions/sklearn/connector.py new file mode 100644 index 000000000..795baf640 --- /dev/null +++ b/openml/extensions/sklearn/connector.py @@ -0,0 +1,68 @@ +# License: BSD 3-Clause + +"""Connector for the Scikit-learn extension.""" + +from __future__ import annotations + +from openml.extensions.base import OpenMLAPIConnector +from openml.extensions.sklearn.executor import SklearnExecutor +from openml.extensions.sklearn.serializer import SklearnSerializer +from openml.flows import OpenMLFlow + + +class SklearnAPIConnector(OpenMLAPIConnector): + """ + Connector for the Scikit-learn extension. + + This class provides the interface to connect Scikit-learn models and flows + to the OpenML API, handling both serialization and execution compatibility checks. + """ + + def serializer(self) -> SklearnSerializer: + """ + Return the serializer for Scikit-learn estimators. + + Returns + ------- + SklearnSerializer + The serializer instance capable of handling Scikit-learn estimator. + """ + return SklearnSerializer() + + def executor(self) -> SklearnExecutor: + """ + Return the executor for Scikit-learn estimators. 
+ + Returns + ------- + SklearnExecutor + The executor instance capable of running Scikit-learn estimators. + """ + return SklearnExecutor() + + @classmethod + def supports(cls, estimator) -> bool: + """ + Check if this connector supports the given model or flow. + + Parameters + ---------- + estimator : Any or OpenMLFlow + The Scikit-learn estimator instance or OpenMLFlow object. + + Returns + ------- + bool + True if both the serializer and executor can handle the provided + estimator or flow, False otherwise. + """ + serializer = SklearnSerializer() + SklearnExecutor() + + if isinstance(estimator, OpenMLFlow): + support = serializer.can_handle_flow(estimator) + + else: + support = serializer.can_handle_model(estimator) + + return support diff --git a/openml/extensions/sklearn/executor.py b/openml/extensions/sklearn/executor.py new file mode 100644 index 000000000..69d54a102 --- /dev/null +++ b/openml/extensions/sklearn/executor.py @@ -0,0 +1,784 @@ +# License: BSD 3-Clause +from __future__ import annotations + +import json +import logging +import time +import warnings +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, List, cast + +import numpy as np +import pandas as pd +import sklearn.base +import sklearn.model_selection +import sklearn.pipeline + +import openml +from openml.exceptions import PyOpenMLError +from openml.extensions.base import ModelExecutor +from openml.flows import OpenMLFlow +from openml.runs.trace import PREFIX, OpenMLRunTrace, OpenMLTraceIteration +from openml.tasks import ( + OpenMLClassificationTask, + OpenMLClusteringTask, + OpenMLLearningCurveTask, + OpenMLRegressionTask, + OpenMLSupervisedTask, + OpenMLTask, +) + +if TYPE_CHECKING: + import scipy.sparse + +logger = logging.getLogger(__name__) + +SKLEARN_PIPELINE_STRING_COMPONENTS = ("drop", "passthrough") +COMPONENT_REFERENCE = "component_reference" +COMPOSITION_STEP_CONSTANT = "composition_step_constant" + + +class SklearnExecutor(ModelExecutor): + """Executor for Scikit-learn estimators.""" + + def seed_model(self, model: Any, seed: int | None = None) -> Any: # noqa: C901 + """Set the random state of all the unseeded components of a model and return the seeded + model. + + Required so that all seed information can be uploaded to OpenML for reproducible results. + + Models that are already seeded will maintain the seed. In this case, + only integer seeds are allowed (An exception is raised when a RandomState was used as + seed). + + Parameters + ---------- + model : sklearn model + The model to be seeded + seed : int + The seed to initialize the RandomState with. Unseeded subcomponents + will be seeded with a random number from the RandomState. + + Returns + ------- + Any + """ + + def _seed_current_object(current_value): + if isinstance(current_value, int): # acceptable behaviour + return False + + if isinstance(current_value, np.random.RandomState): + raise ValueError( + "Models initialized with a RandomState object are not " + "supported. Please seed with an integer. ", + ) + + if current_value is not None: + raise ValueError( + "Models should be seeded with int or None (this should never happen). 
", + ) + + return True + + rs = np.random.RandomState(seed) + model_params = model.get_params() + random_states = {} + for param_name in sorted(model_params): + if "random_state" in param_name: + current_value = model_params[param_name] + # important to draw the value at this point (and not in the if + # statement) this way we guarantee that if a different set of + # subflows is seeded, the same number of the random generator is + # used + new_value = rs.randint(0, 2**16) + if _seed_current_object(current_value): + random_states[param_name] = new_value + + # Also seed CV objects! + elif isinstance(model_params[param_name], sklearn.model_selection.BaseCrossValidator): + if not hasattr(model_params[param_name], "random_state"): + continue + + current_value = model_params[param_name].random_state + new_value = rs.randint(0, 2**16) + if _seed_current_object(current_value): + model_params[param_name].random_state = new_value + + model.set_params(**random_states) + return model + + def check_if_model_fitted(self, model: Any) -> bool: + """Returns True/False denoting if the model has already been fitted/trained + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + from sklearn.exceptions import NotFittedError + from sklearn.utils.validation import check_is_fitted + + try: + # check if model is fitted + check_is_fitted(model) + + # Creating random dummy data of arbitrary size + dummy_data = np.random.uniform(size=(10, 3)) # noqa: NPY002 + # Using 'predict' instead of 'sklearn.utils.validation.check_is_fitted' for a more + # robust check that works across sklearn versions and models. Internally, 'predict' + # should call 'check_is_fitted' for every concerned attribute, thus offering a more + # assured check than explicit calls to 'check_is_fitted' + model.predict(dummy_data) + # Will reach here if the model was fit on a dataset with 3 features + return True + except NotFittedError: # needs to be the first exception to be caught + # Model is not fitted, as is required + return False + except ValueError: + # Will reach here if the model was fit on a dataset with more or less than 3 features + return True + + def _run_model_on_fold( # noqa: PLR0915, PLR0913, C901, PLR0912 + self, + model: Any, + task: OpenMLTask, + X_train: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame, + rep_no: int, + fold_no: int, + y_train: np.ndarray | None = None, + X_test: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame | None = None, + ) -> tuple[ + np.ndarray, + pd.DataFrame | None, + OrderedDict[str, float], + OpenMLRunTrace | None, + ]: + """Run a model on a repeat,fold,subsample triplet of the task and return prediction + information. + + Furthermore, it will measure run time measures in case multi-core behaviour allows this. + * exact user cpu time will be measured if the number of cores is set (recursive throughout + the model) exactly to 1 + * wall clock time will be measured if the number of cores is set (recursive throughout the + model) to any given number (but not when it is set to -1) + + Returns the data that is necessary to construct the OpenML Run object. Is used by + run_task_get_arff_content. Do not use this function unless you know what you are doing. + + Parameters + ---------- + model : Any + The UNTRAINED model to run. The model instance will be copied and not altered. + task : OpenMLTask + The task to run the model on. + X_train : array-like + Training data for the given repetition and fold. 
+ rep_no : int + The repeat of the experiment (0-based; in case of 1 time CV, always 0) + fold_no : int + The fold nr of the experiment (0-based; in case of holdout, always 0) + y_train : Optional[np.ndarray] (default=None) + Target attributes for supervised tasks. In case of classification, these are integer + indices to the potential classes specified by dataset. + X_test : Optional, array-like (default=None) + Test attributes to test for generalization in supervised tasks. + + Returns + ------- + pred_y : np.ndarray + Predictions on the training/test set, depending on the task type. + For supervised tasks, predictions are on the test set. + For unsupervised tasks, predictions are on the training set. + proba_y : pd.DataFrame, optional + Predicted probabilities for the test set. + None, if task is not Classification or Learning Curve prediction. + user_defined_measures : OrderedDict[str, float] + User defined measures that were generated on this fold + trace : OpenMLRunTrace, optional + arff trace object from a fitted model and the trace content obtained by + repeatedly calling ``run_model_on_task`` + """ + + def _prediction_to_probabilities( + y: np.ndarray | list, + model_classes: list[Any], + class_labels: list[str] | None, + ) -> pd.DataFrame: + """Transforms predicted probabilities to match with OpenML class indices. + + Parameters + ---------- + y : np.ndarray + Predicted probabilities (possibly omitting classes if they were not present in the + training data). + model_classes : list + List of classes known_predicted by the model, ordered by their index. + class_labels : list + List of classes as stored in the task object fetched from server. + + Returns + ------- + pd.DataFrame + """ + if class_labels is None: + raise ValueError("The task has no class labels") + + if isinstance(y_train, np.ndarray) and isinstance(class_labels[0], str): + # mapping (decoding) the predictions to the categories + # creating a separate copy to not change the expected pred_y type + y = [class_labels[pred] for pred in y] # list or numpy array of predictions + + # model_classes: sklearn classifier mapping from original array id to + # prediction index id + if not isinstance(model_classes, list): + raise ValueError("please convert model classes to list prior to calling this fn") + + # DataFrame allows more accurate mapping of classes as column names + result = pd.DataFrame( + 0, + index=np.arange(len(y)), + columns=model_classes, + dtype=np.float32, + ) + for obs, prediction in enumerate(y): + result.loc[obs, prediction] = 1.0 + return result + + if isinstance(task, OpenMLSupervisedTask): + if y_train is None: + raise TypeError("argument y_train must not be of type None") + if X_test is None: + raise TypeError("argument X_test must not be of type None") + + model_copy = sklearn.base.clone(model, safe=True) + # sanity check: prohibit users from optimizing n_jobs + self._prevent_optimize_n_jobs(model_copy) + # measures and stores runtimes + user_defined_measures = OrderedDict() # type: 'OrderedDict[str, float]' + try: + # for measuring runtime. 
Only available since Python 3.3 + modelfit_start_cputime = time.process_time() + modelfit_start_walltime = time.time() + + if isinstance(task, OpenMLSupervisedTask): + model_copy.fit(X_train, y_train) # type: ignore + elif isinstance(task, OpenMLClusteringTask): + model_copy.fit(X_train) # type: ignore + + modelfit_dur_cputime = (time.process_time() - modelfit_start_cputime) * 1000 + modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000 + + user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime + refit_time = model_copy.refit_time_ * 1000 if hasattr(model_copy, "refit_time_") else 0 # type: ignore + user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime + + except AttributeError as e: + # typically happens when training a regressor on classification task + raise PyOpenMLError(str(e)) from e + + if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): + # search for model classes_ (might differ depending on modeltype) + # first, pipelines are a special case (these don't have a classes_ + # object, but rather borrows it from the last step. We do this manually, + # because of the BaseSearch check) + if isinstance(model_copy, sklearn.pipeline.Pipeline): + used_estimator = model_copy.steps[-1][-1] + else: + used_estimator = model_copy + + if self._is_hpo_class(used_estimator): + model_classes = used_estimator.best_estimator_.classes_ + else: + model_classes = used_estimator.classes_ + + if not isinstance(model_classes, list): + model_classes = model_classes.tolist() + + # to handle the case when dataset is numpy and categories are encoded + # however the class labels stored in task are still categories + if isinstance(y_train, np.ndarray) and isinstance( + cast("List", task.class_labels)[0], + str, + ): + model_classes = [cast("List[str]", task.class_labels)[i] for i in model_classes] + + modelpredict_start_cputime = time.process_time() + modelpredict_start_walltime = time.time() + + # In supervised learning this returns the predictions for Y, in clustering + # it returns the clusters + if isinstance(task, OpenMLSupervisedTask): + pred_y = model_copy.predict(X_test) + elif isinstance(task, OpenMLClusteringTask): + pred_y = model_copy.predict(X_train) + else: + raise ValueError(task) + + modelpredict_duration_cputime = (time.process_time() - modelpredict_start_cputime) * 1000 + user_defined_measures["usercpu_time_millis_testing"] = modelpredict_duration_cputime + user_defined_measures["usercpu_time_millis"] = ( + modelfit_dur_cputime + modelpredict_duration_cputime + ) + modelpredict_duration_walltime = (time.time() - modelpredict_start_walltime) * 1000 + user_defined_measures["wall_clock_time_millis_testing"] = modelpredict_duration_walltime + user_defined_measures["wall_clock_time_millis"] = ( + modelfit_dur_walltime + modelpredict_duration_walltime + refit_time + ) + + if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): + try: + proba_y = model_copy.predict_proba(X_test) + proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy + except AttributeError: # predict_proba is not available when probability=False + proba_y = _prediction_to_probabilities(pred_y, model_classes, task.class_labels) + + if task.class_labels is not None: + if proba_y.shape[1] != len(task.class_labels): + # Remap the probabilities in case there was a class missing + # at training time. By default, the classification targets + # are mapped to be zero-based indices to the actual classes. 
+ # Therefore, the model_classes contain the correct indices to + # the correct probability array. Example: + # classes in the dataset: 0, 1, 2, 3, 4, 5 + # classes in the training set: 0, 1, 2, 4, 5 + # then we need to add a column full of zeros into the probabilities + # for class 3 because the rest of the library expects that the + # probabilities are ordered the same way as the classes are ordered). + message = ( + f"Estimator only predicted for {proba_y.shape[1]}/{len(task.class_labels)}" + " classes!" + ) + warnings.warn(message, stacklevel=2) + openml.config.logger.warning(message) + + for _i, col in enumerate(task.class_labels): + # adding missing columns with 0 probability + if col not in model_classes: + proba_y[col] = 0 + # We re-order the columns to move possibly added missing columns into place. + proba_y = proba_y[task.class_labels] + else: + raise ValueError("The task has no class labels") + + if not np.all(set(proba_y.columns) == set(task.class_labels)): + missing_cols = list(set(task.class_labels) - set(proba_y.columns)) + raise ValueError("Predicted probabilities missing for the columns: ", missing_cols) + + elif isinstance(task, (OpenMLRegressionTask, OpenMLClusteringTask)): + proba_y = None + else: + raise TypeError(type(task)) + + if self._is_hpo_class(model_copy): + trace_data = self._extract_trace_data(model_copy, rep_no, fold_no) + trace: OpenMLRunTrace | None = self._obtain_arff_trace( + model_copy, + trace_data, + ) + else: + trace = None + + return pred_y, proba_y, user_defined_measures, trace + + def obtain_parameter_values( # noqa: C901, PLR0915 + self, + flow: OpenMLFlow, + model: Any = None, + ) -> list[dict[str, Any]]: + """Extracts all parameter settings required for the flow from the model. + + If no explicit model is provided, the parameters will be extracted from `flow.model` + instead. + + Parameters + ---------- + flow : OpenMLFlow + OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server) + + model: Any, optional (default=None) + The model from which to obtain the parameter values. Must match the flow signature. + If None, use the model specified in ``OpenMLFlow.model``. + + Returns + ------- + list + A list of dicts, where each dict has the following entries: + - ``oml:name`` : str: The OpenML parameter name + - ``oml:value`` : mixed: A representation of the parameter value + - ``oml:component`` : int: flow id to which the parameter belongs + """ + openml.flows.functions._check_flow_for_server_id(flow) + + def get_flow_dict(_flow): + flow_map = {_flow.name: _flow.flow_id} + for subflow in _flow.components: + flow_map.update(get_flow_dict(_flow.components[subflow])) + return flow_map + + def extract_parameters( # noqa: PLR0915, PLR0912, C901 + _flow, + _flow_dict, + component_model, + _main_call=False, # noqa: FBT002 + main_id=None, + ): + def is_subcomponent_specification(values): + # checks whether the current value can be a specification of + # subcomponents, as for example the value for steps parameter + # (in Pipeline) or transformers parameter (in + # ColumnTransformer). + return ( + # Specification requires list/tuple of list/tuple with + # at least length 2. 
+ isinstance(values, (tuple, list)) + and all(isinstance(item, (tuple, list)) and len(item) > 1 for item in values) + # And each component needs to be a flow or interpretable string + and all( + isinstance(item[1], openml.flows.OpenMLFlow) + or ( + isinstance(item[1], str) + and item[1] in SKLEARN_PIPELINE_STRING_COMPONENTS + ) + for item in values + ) + ) + + # _flow is openml flow object, _param dict maps from flow name to flow + # id for the main call, the param dict can be overridden (useful for + # unit tests / sentinels) this way, for flows without subflows we do + # not have to rely on _flow_dict + exp_parameters = set(_flow.parameters) + if ( + isinstance(component_model, str) + and component_model in SKLEARN_PIPELINE_STRING_COMPONENTS + ): + model_parameters = set() + else: + model_parameters = set(component_model.get_params(deep=False)) + if len(exp_parameters.symmetric_difference(model_parameters)) != 0: + flow_params = sorted(exp_parameters) + model_params = sorted(model_parameters) + raise ValueError( + "Parameters of the model do not match the " + "parameters expected by the " + "flow:\nexpected flow parameters: " + f"{flow_params}\nmodel parameters: {model_params}", + ) + exp_components = set(_flow.components) + if ( + isinstance(component_model, str) + and component_model in SKLEARN_PIPELINE_STRING_COMPONENTS + ): + model_components = set() + else: + _ = set(component_model.get_params(deep=False)) + model_components = { + mp + for mp in component_model.get_params(deep=True) + if "__" not in mp and mp not in _ + } + if len(exp_components.symmetric_difference(model_components)) != 0: + is_problem = True + if len(exp_components - model_components) > 0: + # If an expected component is not returned as a component by get_params(), + # this means that it is also a parameter -> we need to check that this is + # actually the case + difference = exp_components - model_components + component_in_model_parameters = [] + for component in difference: + if component in model_parameters: + component_in_model_parameters.append(True) + else: + component_in_model_parameters.append(False) + is_problem = not all(component_in_model_parameters) + if is_problem: + flow_components = sorted(exp_components) + model_components = sorted(model_components) + raise ValueError( + "Subcomponents of the model do not match the " + "parameters expected by the " + "flow:\nexpected flow subcomponents: " + f"{flow_components}\nmodel subcomponents: {model_components}", + ) + + _params = [] + for _param_name in _flow.parameters: + _current = OrderedDict() + _current["oml:name"] = _param_name + + current_param_values = self.model_to_flow(component_model.get_params()[_param_name]) + + # Try to filter out components (a.k.a. subflows) which are + # handled further down in the code (by recursively calling + # this function)! + if isinstance(current_param_values, openml.flows.OpenMLFlow): + continue + + if is_subcomponent_specification(current_param_values): + # complex parameter value, with subcomponents + parsed_values = [] + for subcomponent in current_param_values: + # scikit-learn stores usually tuples in the form + # (name (str), subcomponent (mixed), argument + # (mixed)). OpenML replaces the subcomponent by an + # OpenMLFlow object. + if len(subcomponent) < 2 or len(subcomponent) > 3: + raise ValueError("Component reference should be size {2,3}. 
") + + subcomponent_identifier = subcomponent[0] + subcomponent_flow = subcomponent[1] + if not isinstance(subcomponent_identifier, str): + raise TypeError( + "Subcomponent identifier should be of type string, " + f"but is {type(subcomponent_identifier)}", + ) + if not isinstance(subcomponent_flow, (openml.flows.OpenMLFlow, str)): + if ( + isinstance(subcomponent_flow, str) + and subcomponent_flow in SKLEARN_PIPELINE_STRING_COMPONENTS + ): + pass + else: + raise TypeError( + "Subcomponent flow should be of type flow, but is" + f" {type(subcomponent_flow)}", + ) + + current = { + "oml-python:serialized_object": COMPONENT_REFERENCE, + "value": { + "key": subcomponent_identifier, + "step_name": subcomponent_identifier, + }, + } + if len(subcomponent) == 3: + if not isinstance(subcomponent[2], list) and not isinstance( + subcomponent[2], + OrderedDict, + ): + raise TypeError( + "Subcomponent argument should be list or OrderedDict", + ) + current["value"]["argument_1"] = subcomponent[2] + parsed_values.append(current) + parsed_values = json.dumps(parsed_values) + else: + # vanilla parameter value + parsed_values = json.dumps(current_param_values) + + _current["oml:value"] = parsed_values + if _main_call: + _current["oml:component"] = main_id + else: + _current["oml:component"] = _flow_dict[_flow.name] + _params.append(_current) + + for _identifier in _flow.components: + subcomponent_model = component_model.get_params()[_identifier] + _params.extend( + extract_parameters( + _flow.components[_identifier], + _flow_dict, + subcomponent_model, + ), + ) + return _params + + flow_dict = get_flow_dict(flow) + model = model if model is not None else flow.model + return extract_parameters(flow, flow_dict, model, _main_call=True, main_id=flow.flow_id) + + def _openml_param_name_to_sklearn( + self, + openml_parameter: openml.setups.OpenMLParameter, + flow: OpenMLFlow, + ) -> str: + """ + Converts the name of an OpenMLParameter into the sklean name, given a flow. + + Parameters + ---------- + openml_parameter: OpenMLParameter + The parameter under consideration + + flow: OpenMLFlow + The flow that provides context. + + Returns + ------- + sklearn_parameter_name: str + The name the parameter will have once used in scikit-learn + """ + if not isinstance(openml_parameter, openml.setups.OpenMLParameter): + raise ValueError("openml_parameter should be an instance of OpenMLParameter") + if not isinstance(flow, OpenMLFlow): + raise ValueError("flow should be an instance of OpenMLFlow") + + flow_structure = flow.get_structure("name") + if openml_parameter.flow_name not in flow_structure: + raise ValueError("Obtained OpenMLParameter and OpenMLFlow do not correspond. ") + name = openml_parameter.flow_name # for PEP8 + return "__".join(flow_structure[name] + [openml_parameter.parameter_name]) + + ################################################################################################ + # Methods for hyperparameter optimization + + def _is_hpo_class(self, model: Any) -> bool: + """Check whether the model performs hyperparameter optimization. + + Used to check whether an optimization trace can be extracted from the model after + running it. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + return isinstance(model, sklearn.model_selection._search.BaseSearchCV) + + def instantiate_model_from_hpo_class( + self, + model: Any, + trace_iteration: OpenMLTraceIteration, + ) -> Any: + """Instantiate a ``base_estimator`` which can be searched over by the hyperparameter + optimization model. 
+ + Parameters + ---------- + model : Any + A hyperparameter optimization model which defines the model to be instantiated. + trace_iteration : OpenMLTraceIteration + Describing the hyperparameter settings to instantiate. + + Returns + ------- + Any + """ + if not self._is_hpo_class(model): + raise AssertionError( + f"Flow model {model} is not an instance of" + " sklearn.model_selection._search.BaseSearchCV", + ) + base_estimator = model.estimator + base_estimator.set_params(**trace_iteration.get_parameters()) + return base_estimator + + def _extract_trace_data(self, model, rep_no, fold_no): + """Extracts data from a machine learning model's cross-validation results + and creates an ARFF (Attribute-Relation File Format) trace. + + Parameters + ---------- + model : Any + A fitted hyperparameter optimization model. + rep_no : int + The repetition number. + fold_no : int + The fold number. + + Returns + ------- + A list of ARFF tracecontent. + """ + arff_tracecontent = [] + for itt_no in range(len(model.cv_results_["mean_test_score"])): + # we use the string values for True and False, as it is defined in + # this way by the OpenML server + selected = "false" + if itt_no == model.best_index_: + selected = "true" + test_score = model.cv_results_["mean_test_score"][itt_no] + arff_line = [rep_no, fold_no, itt_no, test_score, selected] + for key in model.cv_results_: + if key.startswith("param_"): + value = model.cv_results_[key][itt_no] + # Built-in serializer does not convert all numpy types, + # these methods convert them to built-in types instead. + if isinstance(value, np.generic): + # For scalars it actually returns scalars, not a list + value = value.tolist() + serialized_value = json.dumps(value) if value is not np.ma.masked else np.nan + arff_line.append(serialized_value) + arff_tracecontent.append(arff_line) + return arff_tracecontent + + def _obtain_arff_trace( + self, + model: Any, + trace_content: list, + ) -> OpenMLRunTrace: + """Create arff trace object from a fitted model and the trace content obtained by + repeatedly calling ``run_model_on_task``. + + Parameters + ---------- + model : Any + A fitted hyperparameter optimization model. + + trace_content : List[List] + Trace content obtained by ``openml.runs.run_flow_on_task``. 
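+            Each row is expected to follow the layout produced by ``_extract_trace_data``,
+            i.e. ``[repeat, fold, iteration, evaluation, selected, <serialized parameters>]``.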
+ + Returns + ------- + OpenMLRunTrace + """ + if not self._is_hpo_class(model): + raise AssertionError( + f"Flow model {model} is not an instance of " + "sklearn.model_selection._search.BaseSearchCV", + ) + if not hasattr(model, "cv_results_"): + raise ValueError("model should contain `cv_results_`") + + # attributes that will be in trace arff, regardless of the model + trace_attributes = [ + ("repeat", "NUMERIC"), + ("fold", "NUMERIC"), + ("iteration", "NUMERIC"), + ("evaluation", "NUMERIC"), + ("selected", ["true", "false"]), + ] + + # model dependent attributes for trace arff + for key in model.cv_results_: + if key.startswith("param_"): + # supported types should include all types, including bool, + # int float + supported_basic_types = (bool, int, float, str) + for param_value in model.cv_results_[key]: + if isinstance(param_value, np.generic): + param_value = param_value.tolist() # noqa: PLW2901 + if ( + isinstance(param_value, supported_basic_types) + or param_value is None + or param_value is np.ma.masked + ): + # basic string values + type = "STRING" # noqa: A001 + elif isinstance(param_value, (list, tuple)) and all( + isinstance(i, int) for i in param_value + ): + # list of integers (usually for selecting features) + # hyperparameter layer_sizes of MLPClassifier + type = "STRING" # noqa: A001 + else: + raise TypeError(f"Unsupported param type in param grid: {key}") + + # renamed the attribute param to parameter, as this is a required + # OpenML convention - this also guards against name collisions + # with the required trace attributes + attribute = (PREFIX + key[6:], type) # type: ignore + trace_attributes.append(attribute) + + return OpenMLRunTrace.generate( + trace_attributes, + trace_content, + ) diff --git a/openml/extensions/sklearn/serializer.py b/openml/extensions/sklearn/serializer.py new file mode 100644 index 000000000..26413afec --- /dev/null +++ b/openml/extensions/sklearn/serializer.py @@ -0,0 +1,1508 @@ +# License: BSD 3-Clause + +"""Serializer for the Scikit-learn estimators.""" + +from __future__ import annotations + +import contextlib +import copy +import importlib +import inspect +import json +import logging +import re +import sys +import traceback +import warnings +from collections import OrderedDict +from json.decoder import JSONDecodeError +from re import IGNORECASE +from typing import Any, Callable, Sized + +import numpy as np +import scipy.stats +import sklearn.base +import sklearn.model_selection +import sklearn.pipeline +from packaging.version import Version + +import openml +from openml.exceptions import PyOpenMLError +from openml.extensions.base import ModelSerializer +from openml.flows import OpenMLFlow + +logger = logging.getLogger(__name__) + + +DEPENDENCIES_PATTERN = re.compile( + r"^(?P[\w\-]+)((?P==|>=|>)" + r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$", +) + +# NOTE(eddiebergman): This was imported before but became deprecated, +# as a result I just enumerated them manually by copy-ing and pasting, +# recommended solution in Numpy 2.0 guide was to explicitly list them. 
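+# These scalar types matter during serialization: hyperparameter values such as
+# np.int64(5) are converted to plain Python scalars via ``.item()`` before being
+# JSON-encoded, so SIMPLE_TYPES below has to cover the numpy scalars that
+# estimators commonly expose through get_params().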
+SIMPLE_NUMPY_TYPES = [ + np.int8, + np.int16, + np.int32, + np.int64, + np.longlong, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.ulonglong, + np.float16, + np.float32, + np.float64, + np.longdouble, + np.complex64, + np.complex128, + np.clongdouble, +] +SIMPLE_TYPES = (bool, int, float, str, *SIMPLE_NUMPY_TYPES) + + +SKLEARN_PIPELINE_STRING_COMPONENTS = ("drop", "passthrough") +COMPONENT_REFERENCE = "component_reference" +COMPOSITION_STEP_CONSTANT = "composition_step_constant" + + +class SklearnSerializer(ModelSerializer): + """Serializer for Scikit-learn estimators.""" + + @classmethod + def can_handle_flow(cls, flow: OpenMLFlow) -> bool: + """Check whether a given describes a scikit-learn estimator. + + This is done by parsing the ``external_version`` field. + + Parameters + ---------- + flow : OpenMLFlow + + Returns + ------- + bool + """ + return cls._is_sklearn_flow(flow) + + @classmethod + def can_handle_model(cls, model: Any) -> bool: + """Check whether a model is an instance of ``sklearn.base.BaseEstimator``. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + return isinstance(model, sklearn.base.BaseEstimator) + + def flow_to_model( + self, + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + strict_version: bool = True, # noqa: FBT001, FBT002 + ) -> Any: + """Initializes a sklearn model based on a flow. + + Parameters + ---------- + flow : mixed + the object to deserialize (can be flow object, or any serialized + parameter value that is accepted by) + + initialize_with_defaults : bool, optional (default=False) + If this flag is set, the hyperparameter values of flows will be + ignored and a flow with its defaults is returned. + + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. + + Returns + ------- + mixed + """ + return self._deserialize_sklearn( + flow, + initialize_with_defaults=initialize_with_defaults, + strict_version=strict_version, + ) + + def model_to_flow(self, model: Any) -> OpenMLFlow: + """Transform a scikit-learn model to a flow for uploading it to OpenML. + + Parameters + ---------- + model : Any + + Returns + ------- + OpenMLFlow + """ + # Necessary to make pypy not complain about all the different possible return types + return self._serialize_sklearn(model) + + @classmethod + def trim_flow_name( # noqa: C901 + cls, + long_name: str, + extra_trim_length: int = 100, + _outer: bool = True, # noqa: FBT001, FBT002 + ) -> str: + """Shorten generated sklearn flow name to at most ``max_length`` characters. + + Flows are assumed to have the following naming structure: + ``(model_selection)? (pipeline)? (steps)+`` + and will be shortened to: + ``sklearn.(selection.)?(pipeline.)?(steps)+`` + e.g. (white spaces and newlines added for readability) + + .. code :: + + sklearn.pipeline.Pipeline( + columntransformer=sklearn.compose._column_transformer.ColumnTransformer( + numeric=sklearn.pipeline.Pipeline( + imputer=sklearn.preprocessing.imputation.Imputer, + standardscaler=sklearn.preprocessing.data.StandardScaler), + nominal=sklearn.pipeline.Pipeline( + simpleimputer=sklearn.impute.SimpleImputer, + onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), + variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + svc=sklearn.svm.classes.SVC) + + -> + ``sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)`` + + Parameters + ---------- + long_name : str + The full flow name generated by the scikit-learn extension. 
+ extra_trim_length: int (default=100) + If the trimmed name would exceed `extra_trim_length` characters, additional trimming + of the short name is performed. This reduces the produced short name length. + There is no guarantee the end result will not exceed `extra_trim_length`. + _outer : bool (default=True) + For internal use only. Specifies if the function is called recursively. + + Returns + ------- + str + + """ + + def remove_all_in_parentheses(string: str) -> str: + string, removals = re.subn(r"\([^()]*\)", "", string) + while removals > 0: + string, removals = re.subn(r"\([^()]*\)", "", string) + return string + + # Generally, we want to trim all hyperparameters, the exception to that is for model + # selection, as the `estimator` hyperparameter is very indicative of what is in the flow. + # So we first trim name of the `estimator` specified in mode selection. For reference, in + # the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and + # keep it in the final trimmed flow name: + # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer, + # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: ERA001, E501 + # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator= + # sklearn.tree.tree.DecisionTreeClassifier)) + if "sklearn.model_selection" in long_name: + start_index = long_name.index("sklearn.model_selection") + estimator_start = ( + start_index + long_name[start_index:].index("estimator=") + len("estimator=") + ) + + model_select_boilerplate = long_name[start_index:estimator_start] + # above is .g. "sklearn.model_selection._search.RandomizedSearchCV(estimator=" + model_selection_class = model_select_boilerplate.split("(")[0].split(".")[-1] + + # Now we want to also find and parse the `estimator`, for this we find the closing + # parenthesis to the model selection technique: + closing_parenthesis_expected = 1 + for char in long_name[estimator_start:]: + if char == "(": + closing_parenthesis_expected += 1 + if char == ")": + closing_parenthesis_expected -= 1 + if closing_parenthesis_expected == 0: + break + + _end: int = estimator_start + len(long_name[estimator_start:]) - 1 + model_select_pipeline = long_name[estimator_start:_end] + + trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False) + _, trimmed_pipeline = trimmed_pipeline.split(".", maxsplit=1) # trim module prefix + model_select_short = f"sklearn.{model_selection_class}[{trimmed_pipeline}]" + name = long_name[:start_index] + model_select_short + long_name[_end + 1 :] + else: + name = long_name + + module_name = long_name.split(".")[0] + short_name = module_name + ".{}" + + if name.startswith("sklearn.pipeline"): + full_pipeline_class, pipeline = name[:-1].split("(", maxsplit=1) + pipeline_class = full_pipeline_class.split(".")[-1] + # We don't want nested pipelines in the short name, so we trim all complicated + # subcomponents, i.e. those with parentheses: + pipeline = remove_all_in_parentheses(pipeline) + + # then the pipeline steps are formatted e.g.: + # step1name=sklearn.submodule.ClassName,step2name... + components = [component.split(".")[-1] for component in pipeline.split(",")] + pipeline = f"{pipeline_class}({','.join(components)})" + if len(short_name.format(pipeline)) > extra_trim_length: + pipeline = f"{pipeline_class}(...,{components[-1]})" + else: + # Just a simple component: e.g. 
sklearn.tree.DecisionTreeClassifier + pipeline = remove_all_in_parentheses(name).split(".")[-1] + + if not _outer: + # Anything from parenthesis in inner calls should not be culled, so we use brackets + pipeline = pipeline.replace("(", "[").replace(")", "]") + else: + # Square brackets may be introduced with nested model_selection + pipeline = pipeline.replace("[", "(").replace("]", ")") + + return short_name.format(pipeline) + + @classmethod + def _min_dependency_str(cls, sklearn_version: str) -> str: + """Returns a string containing the minimum dependencies for the sklearn version passed. + + Parameters + ---------- + sklearn_version : str + A version string of the xx.xx.xx + + Returns + ------- + str + """ + # This explicit check is necessary to support existing entities on the OpenML servers + # that used the fixed dependency string (in the else block) + if Version(openml.__version__) > Version("0.11"): + # OpenML v0.11 onwards supports sklearn>=0.24 + # assumption: 0.24 onwards sklearn should contain a _min_dependencies.py file with + # variables declared for extracting minimum dependency for that version + if Version(sklearn_version) >= Version("0.24"): + from sklearn import _min_dependencies as _mindep + + dependency_list = { + "numpy": f"{_mindep.NUMPY_MIN_VERSION}", + "scipy": f"{_mindep.SCIPY_MIN_VERSION}", + "joblib": f"{_mindep.JOBLIB_MIN_VERSION}", + "threadpoolctl": f"{_mindep.THREADPOOLCTL_MIN_VERSION}", + } + elif Version(sklearn_version) >= Version("0.23"): + dependency_list = { + "numpy": "1.13.3", + "scipy": "0.19.1", + "joblib": "0.11", + "threadpoolctl": "2.0.0", + } + if Version(sklearn_version).micro == 0: + dependency_list.pop("threadpoolctl") + elif Version(sklearn_version) >= Version("0.21"): + dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"} + elif Version(sklearn_version) >= Version("0.19"): + dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"} + else: + dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} + else: + # this is INCORRECT for sklearn versions >= 0.19 and < 0.24 + # given that OpenML has existing flows uploaded with such dependency information, + # we change no behaviour for older sklearn version, however from 0.24 onwards + # the dependency list will be accurately updated for any flow uploaded to OpenML + dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} + + sklearn_dep = f"sklearn=={sklearn_version}" + dep_str = "\n".join([f"{k}>={v}" for k, v in dependency_list.items()]) + return "\n".join([sklearn_dep, dep_str]) + + def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0912 + self, + o: Any, + components: dict | None = None, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + recursion_depth: int = 0, + strict_version: bool = True, # noqa: FBT002, FBT001 + ) -> Any: + """Recursive function to deserialize a scikit-learn flow. + + This function inspects an object to deserialize and decides how to do so. This function + delegates all work to the respective functions to deserialize special data structures etc. + This function works on everything that has been serialized to OpenML: OpenMLFlow, + components (which are flows themselves), functions, hyperparameter distributions (for + random search) and the actual hyperparameter values themselves. + + Parameters + ---------- + o : mixed + the object to deserialize (can be flow object, or any serialized + parameter value that is accepted by) + + components : Optional[dict] + Components of the current flow being de-serialized. 
These will not be used when + de-serializing the actual flow, but when de-serializing a component reference. + + initialize_with_defaults : bool, optional (default=False) + If this flag is set, the hyperparameter values of flows will be + ignored and a flow with its defaults is returned. + + recursion_depth : int + The depth at which this flow is called, mostly for debugging + purposes + + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. + + Returns + ------- + mixed + """ + logger.info( + "-{} flow_to_sklearn START o={}, components={}, init_defaults={}".format( + "-" * recursion_depth, o, components, initialize_with_defaults + ), + ) + depth_pp = recursion_depth + 1 # shortcut var, depth plus plus + + # First, we need to check whether the presented object is a json string. + # JSON strings are used to encoder parameter values. By passing around + # json strings for parameters, we make sure that we can flow_to_sklearn + # the parameter values to the correct type. + + if isinstance(o, str): + with contextlib.suppress(JSONDecodeError): + o = json.loads(o) + + if isinstance(o, dict): + # Check if the dict encodes a 'special' object, which could not + # easily converted into a string, but rather the information to + # re-create the object were stored in a dictionary. + if "oml-python:serialized_object" in o: + serialized_type = o["oml-python:serialized_object"] + value = o["value"] + if serialized_type == "type": + rval = self._deserialize_type(value) + elif serialized_type == "rv_frozen": + rval = self._deserialize_rv_frozen(value) + elif serialized_type == "function": + rval = self._deserialize_function(value) + elif serialized_type in (COMPOSITION_STEP_CONSTANT, COMPONENT_REFERENCE): + if serialized_type == COMPOSITION_STEP_CONSTANT: + pass + elif serialized_type == COMPONENT_REFERENCE: + value = self._deserialize_sklearn( + value, + recursion_depth=depth_pp, + strict_version=strict_version, + ) + else: + raise NotImplementedError(serialized_type) + assert components is not None # Necessary for mypy + step_name = value["step_name"] + key = value["key"] + component = self._deserialize_sklearn( + components[key], + initialize_with_defaults=initialize_with_defaults, + recursion_depth=depth_pp, + strict_version=strict_version, + ) + # The component is now added to where it should be used + # later. It should not be passed to the constructor of the + # main flow object. 
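+                        # Illustrative example (hypothetical names) of ``value`` at this
+                        # point: {"key": "scaler", "step_name": "scaler", "argument_1": [0, 1]}
+                        # becomes the tuple ("scaler", <deserialized flow>, [0, 1]) below,
+                        # i.e. a ready-to-use pipeline / ColumnTransformer step.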
+ del components[key] + if step_name is None: + rval = component + elif "argument_1" not in value: + rval = (step_name, component) + else: + rval = (step_name, component, value["argument_1"]) + elif serialized_type == "cv_object": + rval = self._deserialize_cross_validator( + value, + recursion_depth=recursion_depth, + strict_version=strict_version, + ) + else: + raise ValueError(f"Cannot flow_to_sklearn {serialized_type}") + + else: + rval = OrderedDict( + ( + self._deserialize_sklearn( + o=key, + components=components, + initialize_with_defaults=initialize_with_defaults, + recursion_depth=depth_pp, + strict_version=strict_version, + ), + self._deserialize_sklearn( + o=value, + components=components, + initialize_with_defaults=initialize_with_defaults, + recursion_depth=depth_pp, + strict_version=strict_version, + ), + ) + for key, value in sorted(o.items()) + ) + elif isinstance(o, (list, tuple)): + rval = [ + self._deserialize_sklearn( + o=element, + components=components, + initialize_with_defaults=initialize_with_defaults, + recursion_depth=depth_pp, + strict_version=strict_version, + ) + for element in o + ] + if isinstance(o, tuple): + rval = tuple(rval) + elif isinstance(o, (bool, int, float, str)) or o is None: + rval = o + elif isinstance(o, OpenMLFlow): + if not self._is_sklearn_flow(o): + raise ValueError("Only sklearn flows can be reinstantiated") + rval = self._deserialize_model( + flow=o, + keep_defaults=initialize_with_defaults, + recursion_depth=recursion_depth, + strict_version=strict_version, + ) + else: + raise TypeError(o) + logger.info(f"-{'-' * recursion_depth} flow_to_sklearn END o={o}, rval={rval}") + return rval + + def _serialize_sklearn(self, o: Any, parent_model: Any | None = None) -> Any: # noqa: PLR0912, C901 + rval = None # type: Any + + # TODO: assert that only on first recursion lvl `parent_model` can be None + if self.is_estimator(o): + # is the main model or a submodel + rval = self._serialize_model(o) + elif ( + isinstance(o, (list, tuple)) + and len(o) == 2 + and o[1] in SKLEARN_PIPELINE_STRING_COMPONENTS + and isinstance(parent_model, sklearn.pipeline._BaseComposition) + ): + rval = o + elif isinstance(o, (list, tuple)): + # TODO: explain what type of parameter is here + rval = [self._serialize_sklearn(element, parent_model) for element in o] + if isinstance(o, tuple): + rval = tuple(rval) + elif isinstance(o, SIMPLE_TYPES) or o is None: + if isinstance(o, tuple(SIMPLE_NUMPY_TYPES)): + o = o.item() # type: ignore + # base parameter values + rval = o + elif isinstance(o, dict): + # TODO: explain what type of parameter is here + if not isinstance(o, OrderedDict): + o = OrderedDict(sorted(o.items())) + + rval = OrderedDict() + for key, value in o.items(): + if not isinstance(key, str): + raise TypeError( + "Can only use string as keys, you passed " + f"type {type(key)} for value {key!s}.", + ) + _key = self._serialize_sklearn(key, parent_model) + rval[_key] = self._serialize_sklearn(value, parent_model) + elif isinstance(o, type): + # TODO: explain what type of parameter is here + rval = self._serialize_type(o) + elif isinstance(o, scipy.stats.distributions.rv_frozen): + rval = self._serialize_rv_frozen(o) + # This only works for user-defined functions (and not even partial). 
+ # I think this is exactly what we want here as there shouldn't be any + # built-in or functool.partials in a pipeline + elif inspect.isfunction(o): + # TODO: explain what type of parameter is here + rval = self._serialize_function(o) + elif self._is_cross_validator(o): + # TODO: explain what type of parameter is here + rval = self._serialize_cross_validator(o) + else: + raise TypeError(o, type(o)) + + return rval + + def is_estimator(self, model: Any) -> bool: + """Check whether the given model is a scikit-learn estimator. + + This function is only required for backwards compatibility and will be removed in the + near future. + + Parameters + ---------- + model : Any + + Returns + ------- + bool + """ + o = model + return hasattr(o, "fit") and hasattr(o, "get_params") and hasattr(o, "set_params") + + def get_version_information(self) -> list[str]: + """List versions of libraries required by the flow. + + Libraries listed are ``Python``, ``scikit-learn``, ``numpy`` and ``scipy``. + + Returns + ------- + List + """ + # This can possibly be done by a package such as pyxb, but I could not get + # it to work properly. + import numpy + import scipy + import sklearn + + major, minor, micro, _, _ = sys.version_info + python_version = f"Python_{'.'.join([str(major), str(minor), str(micro)])}." + sklearn_version = f"Sklearn_{sklearn.__version__}." + numpy_version = f"NumPy_{numpy.__version__}." # type: ignore + scipy_version = f"SciPy_{scipy.__version__}." + + return [python_version, sklearn_version, numpy_version, scipy_version] + + def create_setup_string(self, model: Any) -> str: # noqa: ARG002 + """Create a string which can be used to reinstantiate the given model. + + Parameters + ---------- + model : Any + + Returns + ------- + str + """ + return " ".join(self.get_version_information()) + + def _is_cross_validator(self, o: Any) -> bool: + return isinstance(o, sklearn.model_selection.BaseCrossValidator) + + @classmethod + def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool: + sklearn_dependency = isinstance(flow.dependencies, str) and "sklearn" in flow.dependencies + sklearn_as_external = isinstance(flow.external_version, str) and ( + flow.external_version.startswith("sklearn==") or ",sklearn==" in flow.external_version + ) + return sklearn_dependency or sklearn_as_external + + def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str: + r"""Fetches the sklearn function docstring for the flow description + + Retrieves the sklearn docstring available and does the following: + * If length of docstring <= char_lim, then returns the complete docstring + * Else, trims the docstring till it encounters a 'Read more in the :ref:' + * Or till it encounters a 'Parameters\n----------\n' + The final string returned is at most of length char_lim with leading and + trailing whitespaces removed. + + Parameters + ---------- + model : sklearn model + char_lim : int + Specifying the max length of the returned string. + OpenML servers have a constraint of 1024 characters for the 'description' field. + + Returns + ------- + str + """ + + def match_format(s): + return f"{s}\n{len(s) * '-'}\n" + + s = inspect.getdoc(model) + if s is None: + return "" + try: + # trim till 'Read more' + pattern = "Read more in the :ref:" + index = s.index(pattern) + s = s[:index] + # trimming docstring to be within char_lim + if len(s) > char_lim: + s = f"{s[: char_lim - 3]}..." + return s.strip() + except ValueError: + logger.warning( + "'Read more' not found in descriptions. 
" + "Trying to trim till 'Parameters' if available in docstring.", + ) + try: + # if 'Read more' doesn't exist, trim till 'Parameters' + pattern = "Parameters" + index = s.index(match_format(pattern)) + except ValueError: + # returning full docstring + logger.warning("'Parameters' not found in docstring. Omitting docstring trimming.") + index = len(s) + s = s[:index] + # trimming docstring to be within char_lim + if len(s) > char_lim: + s = f"{s[: char_lim - 3]}..." + return s.strip() + + def _extract_sklearn_parameter_docstring(self, model) -> None | str: + """Extracts the part of sklearn docstring containing parameter information + + Fetches the entire docstring and trims just the Parameter section. + The assumption is that 'Parameters' is the first section in sklearn docstrings, + followed by other sections titled 'Attributes', 'See also', 'Note', 'References', + appearing in that order if defined. + Returns a None if no section with 'Parameters' can be found in the docstring. + + Parameters + ---------- + model : sklearn model + + Returns + ------- + str, or None + """ + + def match_format(s): + return f"{s}\n{len(s) * '-'}\n" + + s = inspect.getdoc(model) + if s is None: + return None + try: + index1 = s.index(match_format("Parameters")) + except ValueError as e: + # when sklearn docstring has no 'Parameters' section + logger.warning(f"{match_format('Parameters')} {e}") + return None + + headings = ["Attributes", "Notes", "See also", "Note", "References"] + for h in headings: + try: + # to find end of Parameters section + index2 = s.index(match_format(h)) + break + except ValueError: + logger.warning(f"{h} not available in docstring") + continue + else: + # in the case only 'Parameters' exist, trim till end of docstring + index2 = len(s) + s = s[index1:index2] + return s.strip() + + def _extract_sklearn_param_info(self, model, char_lim=1024) -> None | dict: + """Parses parameter type and description from sklearn dosctring + + Parameters + ---------- + model : sklearn model + char_lim : int + Specifying the max length of the returned string. + OpenML servers have a constraint of 1024 characters string fields. 
+ + Returns + ------- + Dict, or None + """ + docstring = self._extract_sklearn_parameter_docstring(model) + if docstring is None: + # when sklearn docstring has no 'Parameters' section + return None + + n = re.compile("[.]*\n", flags=IGNORECASE) + lines = n.split(docstring) + p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE) + # The above regular expression is designed to detect sklearn parameter names and type + # in the format of [variable_name][space]:[space][type] + # The expectation is that the parameter description for this detected parameter will + # be all the lines in the docstring till the regex finds another parameter match + + # collecting parameters and their descriptions + description = [] # type: List + for s in lines: + param = p.findall(s) + if param != []: + # a parameter definition is found by regex + # creating placeholder when parameter found which will be a list of strings + # string descriptions will be appended in subsequent iterations + # till another parameter is found and a new placeholder is created + placeholder = [""] # type: List[str] + description.append(placeholder) + elif len(description) > 0: # description=[] means no parameters found yet + # appending strings to the placeholder created when parameter found + description[-1].append(s) + for i in range(len(description)): + # concatenating parameter description strings + description[i] = "\n".join(description[i]).strip() + # limiting all parameter descriptions to accepted OpenML string length + if len(description[i]) > char_lim: + description[i] = f"{description[i][: char_lim - 3]}..." + + # collecting parameters and their types + parameter_docs = OrderedDict() + matches = p.findall(docstring) + for i, param in enumerate(matches): + key, value = str(param).split(":") + parameter_docs[key.strip()] = [value.strip(), description[i]] + + # to avoid KeyError for missing parameters + param_list_true = list(model.get_params().keys()) + param_list_found = list(parameter_docs.keys()) + for param in list(set(param_list_true) - set(param_list_found)): + parameter_docs[param] = [None, None] + + return parameter_docs + + def _serialize_model(self, model: Any) -> OpenMLFlow: + """Create an OpenMLFlow. + + Calls `sklearn_to_flow` recursively to properly serialize the + parameters to strings and the components (other models) to OpenMLFlows. + + Parameters + ---------- + model : sklearn estimator + + Returns + ------- + OpenMLFlow + + """ + # Get all necessary information about the model objects itself + ( + parameters, + parameters_meta_info, + subcomponents, + subcomponents_explicit, + ) = self._extract_information_from_model(model) + + # Check that a component does not occur multiple times in a flow as this + # is not supported by OpenML + self._check_multiple_occurence_of_component_in_flow(model, subcomponents) + + # Create a flow name, which contains all components in brackets, e.g.: + # RandomizedSearchCV(Pipeline(StandardScaler,AdaBoostClassifier(DecisionTreeClassifier)), + # StandardScaler,AdaBoostClassifier(DecisionTreeClassifier)) + class_name = model.__module__ + "." 
+ model.__class__.__name__ + + # will be part of the name (in brackets) + sub_components_names = "" + for key in subcomponents: + name_thing = subcomponents[key] + if isinstance(name_thing, OpenMLFlow): + name = name_thing.name + elif ( + isinstance(name_thing, str) + and subcomponents[key] in SKLEARN_PIPELINE_STRING_COMPONENTS + ): + name = name_thing + else: + raise TypeError(type(subcomponents[key])) + + if key in subcomponents_explicit: + sub_components_names += "," + key + "=" + name + else: + sub_components_names += "," + name + + # slice operation on string in order to get rid of leading comma + name = f"{class_name}({sub_components_names[1:]})" if sub_components_names else class_name + short_name = self.trim_flow_name(name) + + # Get the external versions of all sub-components + external_version = self._get_external_version_string(model, subcomponents) + dependencies = self._get_dependencies() + tags = self._get_tags() + + sklearn_description = self._get_sklearn_description(model) + return OpenMLFlow( + name=name, + class_name=class_name, + custom_name=short_name, + description=sklearn_description, + model=model, + components=subcomponents, + parameters=parameters, + parameters_meta_info=parameters_meta_info, + external_version=external_version, + tags=tags, + extension=self, + language="English", + dependencies=dependencies, + ) + + def _get_dependencies(self) -> str: + return self._min_dependency_str(sklearn.__version__) # type: ignore + + def _get_tags(self) -> list[str]: + sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore + sklearn_version_formatted = sklearn_version.replace("==", "_") + return [ + "openml-python", + "sklearn", + "scikit-learn", + "python", + sklearn_version_formatted, + # TODO: add more tags based on the scikit-learn + # module a flow is in? For example automatically + # annotate a class of sklearn.svm.SVC() with the + # tag svm? + ] + + def _get_external_version_string( + self, + model: Any, + sub_components: dict[str, OpenMLFlow], + ) -> str: + # Create external version string for a flow, given the model and the + # already parsed dictionary of sub_components. Retrieves the external + # version of all subcomponents, which themselves already contain all + # requirements for their subcomponents. The external version string is a + # sorted concatenation of all modules which are present in this run. 
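+        # For example (version numbers illustrative only), a plain scikit-learn
+        # pipeline typically yields something like "openml==0.15.0,sklearn==1.4.2".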
+ + external_versions = set() + + # The model is None if the flow is a placeholder flow such as 'passthrough' or 'drop' + if model is not None: + model_package_name = model.__module__.split(".")[0] + module = importlib.import_module(model_package_name) + model_package_version_number = module.__version__ # type: ignore + external_version = self._format_external_version( + model_package_name, + model_package_version_number, + ) + external_versions.add(external_version) + + openml_version = self._format_external_version("openml", openml.__version__) + sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore + external_versions.add(openml_version) + external_versions.add(sklearn_version) + for visitee in sub_components.values(): + if isinstance(visitee, str) and visitee in SKLEARN_PIPELINE_STRING_COMPONENTS: + continue + for external_version in visitee.external_version.split(","): + external_versions.add(external_version) + return ",".join(sorted(external_versions)) + + def _check_multiple_occurence_of_component_in_flow( + self, + model: Any, + sub_components: dict[str, OpenMLFlow], + ) -> None: + to_visit_stack: list[OpenMLFlow] = [] + to_visit_stack.extend(sub_components.values()) + known_sub_components: set[str] = set() + + while len(to_visit_stack) > 0: + visitee = to_visit_stack.pop() + if isinstance(visitee, str) and visitee in SKLEARN_PIPELINE_STRING_COMPONENTS: + known_sub_components.add(visitee) + elif visitee.name in known_sub_components: + raise ValueError( + f"Found a second occurence of component {visitee.name} when " + f"trying to serialize {model}.", + ) + else: + known_sub_components.add(visitee.name) + to_visit_stack.extend(visitee.components.values()) + + def _extract_information_from_model( # noqa: PLR0915, C901, PLR0912 + self, + model: Any, + ) -> tuple[ + OrderedDict[str, str | None], + OrderedDict[str, dict | None], + OrderedDict[str, OpenMLFlow], + set, + ]: + # This function contains four "global" states and is quite long and + # complicated. If it gets to complicated to ensure it's correctness, + # it would be best to make it a class with the four "global" states being + # the class attributes and the if/elif/else in the for-loop calls to + # separate class methods + + # stores all entities that should become subcomponents + sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow] + # stores the keys of all subcomponents that should become + sub_components_explicit = set() + parameters: OrderedDict[str, str | None] = OrderedDict() + parameters_meta_info: OrderedDict[str, dict | None] = OrderedDict() + parameters_docs = self._extract_sklearn_param_info(model) + + model_parameters = model.get_params(deep=False) + for k, v in sorted(model_parameters.items(), key=lambda t: t[0]): + rval = self._serialize_sklearn(v, model) + + def flatten_all(list_): + """Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]).""" + for el in list_: + if isinstance(el, (list, tuple)) and len(el) > 0: + yield from flatten_all(el) + else: + yield el + + # In case rval is a list of lists (or tuples), we need to identify two situations: + # - sklearn pipeline steps, feature union or base classifiers in voting classifier. + # They look like e.g. [("imputer", Imputer()), ("classifier", SVC())] + # - a list of lists with simple types (e.g. 
int or str), such as for an OrdinalEncoder + # where all possible values for each feature are described: [[0,1,2], [1,2,5]] + is_non_empty_list_of_lists_with_same_type = ( + isinstance(rval, (list, tuple)) + and len(rval) > 0 + and isinstance(rval[0], (list, tuple)) + and all(isinstance(rval_i, type(rval[0])) for rval_i in rval) + ) + + # Check that all list elements are of simple types. + nested_list_of_simple_types = ( + is_non_empty_list_of_lists_with_same_type + and all(isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval)) + and all( + len(rv) in (2, 3) and rv[1] not in SKLEARN_PIPELINE_STRING_COMPONENTS + for rv in rval + ) + ) + + if is_non_empty_list_of_lists_with_same_type and not nested_list_of_simple_types: + # If a list of lists is identified that include 'non-simple' types (e.g. objects), + # we assume they are steps in a pipeline, feature union, or base classifiers in + # a voting classifier. + parameter_value = [] # type: List + reserved_keywords = set(model.get_params(deep=False).keys()) + + for sub_component_tuple in rval: + identifier = sub_component_tuple[0] + sub_component = sub_component_tuple[1] + sub_component_type = type(sub_component_tuple) + if not 2 <= len(sub_component_tuple) <= 3: + # length 2 is for {VotingClassifier.estimators, + # Pipeline.steps, FeatureUnion.transformer_list} + # length 3 is for ColumnTransformer + raise ValueError( + f"Length of tuple of type {sub_component_type}" + " does not match assumptions" + ) + + if isinstance(sub_component, str): + if sub_component not in SKLEARN_PIPELINE_STRING_COMPONENTS: + msg = ( + "Second item of tuple does not match assumptions. " + "If string, can be only 'drop' or 'passthrough' but" + f"got {sub_component}" + ) + raise ValueError(msg) + elif sub_component is None: + msg = ( + "Cannot serialize objects of None type. Please use a valid " + "placeholder for None. Note that empty sklearn estimators can be " + "replaced with 'drop' or 'passthrough'." + ) + raise ValueError(msg) + elif not isinstance(sub_component, OpenMLFlow): + msg = ( + "Second item of tuple does not match assumptions. 
" + f"Expected OpenMLFlow, got {type(sub_component)}" + ) + raise TypeError(msg) + + if identifier in reserved_keywords: + parent_model = f"{model.__module__}.{model.__class__.__name__}" + msg = ( + "Found element shadowing official " + f"parameter for {parent_model}: {identifier}" + ) + raise PyOpenMLError(msg) + + # when deserializing the parameter + sub_components_explicit.add(identifier) + if isinstance(sub_component, str): + external_version = self._get_external_version_string(None, {}) + dependencies = self._get_dependencies() + tags = self._get_tags() + + sub_components[identifier] = OpenMLFlow( + name=sub_component, + description="Placeholder flow for scikit-learn's string pipeline " + "members", + components=OrderedDict(), + parameters=OrderedDict(), + parameters_meta_info=OrderedDict(), + external_version=external_version, + tags=tags, + language="English", + dependencies=dependencies, + model=None, + ) + component_reference: OrderedDict[str, str | dict] = OrderedDict() + component_reference["oml-python:serialized_object"] = ( + COMPOSITION_STEP_CONSTANT + ) + cr_value: dict[str, Any] = OrderedDict() + cr_value["key"] = identifier + cr_value["step_name"] = identifier + if len(sub_component_tuple) == 3: + cr_value["argument_1"] = sub_component_tuple[2] + component_reference["value"] = cr_value + else: + sub_components[identifier] = sub_component + component_reference = OrderedDict() + component_reference["oml-python:serialized_object"] = COMPONENT_REFERENCE + cr_value = OrderedDict() + cr_value["key"] = identifier + cr_value["step_name"] = identifier + if len(sub_component_tuple) == 3: + cr_value["argument_1"] = sub_component_tuple[2] + component_reference["value"] = cr_value + parameter_value.append(component_reference) + + # Here (and in the elif and else branch below) are the only + # places where we encode a value as json to make sure that all + # parameter values still have the same type after + # deserialization + if isinstance(rval, tuple): + parameter_json = json.dumps(tuple(parameter_value)) + else: + parameter_json = json.dumps(parameter_value) + parameters[k] = parameter_json + + elif isinstance(rval, OpenMLFlow): + # A subcomponent, for example the base model in + # AdaBoostClassifier + sub_components[k] = rval + sub_components_explicit.add(k) + component_reference = OrderedDict() + component_reference["oml-python:serialized_object"] = COMPONENT_REFERENCE + cr_value = OrderedDict() + cr_value["key"] = k + cr_value["step_name"] = None + component_reference["value"] = cr_value + cr = self._serialize_sklearn(component_reference, model) + parameters[k] = json.dumps(cr) + + elif not (hasattr(rval, "__len__") and len(rval) == 0): + rval = json.dumps(rval) + parameters[k] = rval + # a regular hyperparameter + else: + parameters[k] = None + + if parameters_docs is not None: + data_type, description = parameters_docs[k] + parameters_meta_info[k] = OrderedDict( + (("description", description), ("data_type", data_type)), + ) + else: + parameters_meta_info[k] = OrderedDict((("description", None), ("data_type", None))) + + return parameters, parameters_meta_info, sub_components, sub_components_explicit + + def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> tuple[dict, set]: + """ + Returns + ------- + i) a dict with all parameter names that have a default value, and + ii) a set with all parameter names that do not have a default + + Parameters + ---------- + fn_name : callable + The function of which we want to obtain the defaults + + Returns + ------- + 
params_with_defaults: dict + a dict mapping parameter name to the default value + params_without_defaults: set + a set with all parameters that do not have a default value + """ + # parameters with defaults are optional, all others are required. + parameters = inspect.signature(fn_name).parameters + required_params = set() + optional_params = {} + for param in parameters: + parameter = parameters.get(param) + default_val = parameter.default # type: ignore + if default_val is inspect.Signature.empty: + required_params.add(param) + else: + optional_params[param] = default_val + return optional_params, required_params + + def _deserialize_model( # noqa: C901 + self, + flow: OpenMLFlow, + keep_defaults: bool, # noqa: FBT001 + recursion_depth: int, + strict_version: bool = True, # noqa: FBT002, FBT001 + ) -> Any: + logger.info(f"-{'-' * recursion_depth} deserialize {flow.name}") + model_name = flow.class_name + self._check_dependencies(flow.dependencies, strict_version=strict_version) + + parameters = flow.parameters + components = flow.components + parameter_dict: dict[str, Any] = OrderedDict() + + # Do a shallow copy of the components dictionary so we can remove the + # components from this copy once we added them into the pipeline. This + # allows us to not consider them any more when looping over the + # components, but keeping the dictionary of components untouched in the + # original components dictionary. + components_ = copy.copy(components) + + for name in parameters: + value = parameters.get(name) + logger.info(f"--{'-' * recursion_depth} flow_parameter={name}, value={value}") + rval = self._deserialize_sklearn( + value, + components=components_, + initialize_with_defaults=keep_defaults, + recursion_depth=recursion_depth + 1, + strict_version=strict_version, + ) + parameter_dict[name] = rval + + for name in components: + if name in parameter_dict: + continue + if name not in components_: + continue + value = components[name] + logger.info(f"--{'-' * recursion_depth} flow_component={name}, value={value}") + rval = self._deserialize_sklearn( + value, + recursion_depth=recursion_depth + 1, + strict_version=strict_version, + ) + parameter_dict[name] = rval + + if model_name is None and flow.name in SKLEARN_PIPELINE_STRING_COMPONENTS: + return flow.name + + assert model_name is not None + module_name = model_name.rsplit(".", 1) + model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) + + if keep_defaults: + # obtain all params with a default + param_defaults, _ = self._get_fn_arguments_with_defaults(model_class.__init__) + + # delete the params that have a default from the dict, + # so they get initialized with their default value + # except [...] + for param in param_defaults: + # [...] the ones that also have a key in the components dict. + # As OpenML stores different flows for ensembles with different + # (base-)components, in OpenML terms, these are not considered + # hyperparameters but rather constants (i.e., changing them would + # result in a different flow) + if param not in components: + del parameter_dict[param] + + if not strict_version: + # Ignore incompatible parameters + allowed_parameter = list(inspect.signature(model_class.__init__).parameters) + for p in list(parameter_dict.keys()): + if p not in allowed_parameter: + warnings.warn( + f"While deserializing in a non-strict way, parameter {p} is not " + f"allowed for {model_class.__name__} likely due to a version mismatch. 
" + "We ignore the parameter.", + UserWarning, + stacklevel=2, + ) + del parameter_dict[p] + + return model_class(**parameter_dict) + + def _check_dependencies( + self, + dependencies: str, + strict_version: bool = True, # noqa: FBT001, FBT002 + ) -> None: + if not dependencies: + return + + dependencies_list = dependencies.split("\n") + for dependency_string in dependencies_list: + match = DEPENDENCIES_PATTERN.match(dependency_string) + if not match: + raise ValueError(f"Cannot parse dependency {dependency_string}") + + dependency_name = match.group("name") + operation = match.group("operation") + version = match.group("version") + + module = importlib.import_module(dependency_name) + required_version = Version(version) + installed_version = Version(module.__version__) # type: ignore + + if operation == "==": + check = required_version == installed_version + elif operation == ">": + check = installed_version > required_version + elif operation == ">=": + check = ( + installed_version > required_version or installed_version == required_version + ) + else: + raise NotImplementedError(f"operation '{operation}' is not supported") + message = ( + f"Trying to deserialize a model with dependency {dependency_string} not satisfied." + ) + if not check: + if strict_version: + raise ValueError(message) + + warnings.warn(message, category=UserWarning, stacklevel=2) + + def _serialize_type(self, o: Any) -> OrderedDict[str, str]: + mapping = { + float: "float", + np.float32: "np.float32", + np.float64: "np.float64", + int: "int", + np.int32: "np.int32", + np.int64: "np.int64", + } + if Version(np.__version__) < Version("1.24"): + mapping[float] = "np.float" + mapping[int] = "np.int" + + ret = OrderedDict() # type: 'OrderedDict[str, str]' + ret["oml-python:serialized_object"] = "type" + ret["value"] = mapping[o] + return ret + + def _deserialize_type(self, o: str) -> Any: + mapping = { + "float": float, + "np.float32": np.float32, + "np.float64": np.float64, + "int": int, + "np.int32": np.int32, + "np.int64": np.int64, + } + + # TODO(eddiebergman): Might be able to remove this + if Version(np.__version__) < Version("1.24"): + mapping["np.float"] = np.float # type: ignore # noqa: NPY001 + mapping["np.int"] = np.int # type: ignore # noqa: NPY001 + + return mapping[o] + + def _serialize_rv_frozen(self, o: Any) -> OrderedDict[str, str | dict]: + args = o.args + kwds = o.kwds + a = o.a + b = o.b + dist = o.dist.__class__.__module__ + "." + o.dist.__class__.__name__ + ret: OrderedDict[str, str | dict] = OrderedDict() + ret["oml-python:serialized_object"] = "rv_frozen" + ret["value"] = OrderedDict( + (("dist", dist), ("a", a), ("b", b), ("args", args), ("kwds", kwds)), + ) + return ret + + def _deserialize_rv_frozen(self, o: OrderedDict[str, str]) -> Any: + args = o["args"] + kwds = o["kwds"] + a = o["a"] + b = o["b"] + dist_name = o["dist"] + + module_name = dist_name.rsplit(".", 1) + try: + rv_class = getattr(importlib.import_module(module_name[0]), module_name[1]) + except AttributeError as e: + _tb = traceback.format_exc() + warnings.warn( + f"Cannot create model {dist_name} for flow. Reason is from error {type(e)}:{e}" + f"\nTraceback: {_tb}", + RuntimeWarning, + stacklevel=2, + ) + return None + + dist = scipy.stats.distributions.rv_frozen(rv_class(), *args, **kwds) # type: ignore + dist.a = a + dist.b = b + + return dist + + def _serialize_function(self, o: Callable) -> OrderedDict[str, str]: + name = o.__module__ + "." 
+ o.__name__ + ret = OrderedDict() # type: 'OrderedDict[str, str]' + ret["oml-python:serialized_object"] = "function" + ret["value"] = name + return ret + + def _deserialize_function(self, name: str) -> Callable: + module_name = name.rsplit(".", 1) + return getattr(importlib.import_module(module_name[0]), module_name[1]) + + def _serialize_cross_validator(self, o: Any) -> OrderedDict[str, str | dict]: + ret: OrderedDict[str, str | dict] = OrderedDict() + + parameters = OrderedDict() # type: 'OrderedDict[str, Any]' + + # XXX this is copied from sklearn.model_selection._split + cls = o.__class__ + init = getattr(cls.__init__, "deprecated_original", cls.__init__) + # Ignore varargs, kw and default values and pop self + init_signature = inspect.signature(init) # type: ignore + # Consider the constructor parameters excluding 'self' + if init is object.__init__: + args = [] # type: List + else: + args = sorted( + [ + p.name + for p in init_signature.parameters.values() + if p.name != "self" and p.kind != p.VAR_KEYWORD + ], + ) + + for key in args: + # We need deprecation warnings to always be on in order to + # catch deprecated param values. + # This is set in utils/__init__.py but it gets overwritten + # when running under python3 somehow. + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always", DeprecationWarning) + value = getattr(o, key, None) + if w is not None and len(w) and w[0].category is DeprecationWarning: + # if the parameter is deprecated, don't show it + continue + + if not (isinstance(value, Sized) and len(value) == 0): + value = json.dumps(value) + parameters[key] = value + else: + parameters[key] = None + + ret["oml-python:serialized_object"] = "cv_object" + name = o.__module__ + "." + o.__class__.__name__ + value = OrderedDict([("name", name), ("parameters", parameters)]) + ret["value"] = value + + return ret + + def _deserialize_cross_validator( + self, + value: OrderedDict[str, Any], + recursion_depth: int, + strict_version: bool = True, # noqa: FBT002, FBT001 + ) -> Any: + model_name = value["name"] + parameters = value["parameters"] + + module_name = model_name.rsplit(".", 1) + model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) + for parameter in parameters: + parameters[parameter] = self._deserialize_sklearn( + parameters[parameter], + recursion_depth=recursion_depth + 1, + strict_version=strict_version, + ) + return model_class(**parameters) + + def _format_external_version( + self, + model_package_name: str, + model_package_version_number: str, + ) -> str: + return f"{model_package_name}=={model_package_version_number}" + + @staticmethod + def _get_parameter_values_recursive( + param_grid: dict | list[dict], + parameter_name: str, + ) -> list[Any]: + """ + Returns a list of values for a given hyperparameter, encountered + recursively throughout the flow. 
(e.g., n_jobs can be defined + for various flows) + + Parameters + ---------- + param_grid: Union[Dict, List[Dict]] + Dict mapping from hyperparameter list to value, to a list of + such dicts + + parameter_name: str + The hyperparameter that needs to be inspected + + Returns + ------- + List + A list of all values of hyperparameters with this name + """ + if isinstance(param_grid, dict): + return [ + value + for param, value in param_grid.items() + if param.split("__")[-1] == parameter_name + ] + + if isinstance(param_grid, list): + result = [] + for sub_grid in param_grid: + result.extend( + SklearnSerializer._get_parameter_values_recursive(sub_grid, parameter_name), + ) + return result + + raise ValueError("Param_grid should either be a dict or list of dicts") + + def _prevent_optimize_n_jobs(self, model): + """ + Ensures that HPO classes will not optimize the n_jobs hyperparameter + + Parameters + ---------- + model: + The model that will be fitted + """ + if self._is_hpo_class(model): + if isinstance(model, sklearn.model_selection.GridSearchCV): + param_distributions = model.param_grid + elif isinstance(model, sklearn.model_selection.RandomizedSearchCV): + param_distributions = model.param_distributions + else: + if hasattr(model, "param_distributions"): + param_distributions = model.param_distributions + else: + raise AttributeError( + "Using subclass BaseSearchCV other than " + "{GridSearchCV, RandomizedSearchCV}. " + "Could not find attribute " + "param_distributions.", + ) + logger.warning( + "Warning! Using subclass BaseSearchCV other than " + "{GridSearchCV, RandomizedSearchCV}. " + "Should implement param check. ", + ) + n_jobs_vals = self._get_parameter_values_recursive( + param_distributions, + "n_jobs", + ) + if len(n_jobs_vals) > 0: + raise PyOpenMLError( + "openml-python should not be used to optimize the n_jobs parameter.", + ) From e86fab7fdabebb456799a476e9575613aa4a5558 Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Mon, 5 Jan 2026 16:25:41 +0530 Subject: [PATCH 08/15] add public function for serialisation and deserialisation --- openml/extensions/utils.py | 46 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 openml/extensions/utils.py diff --git a/openml/extensions/utils.py b/openml/extensions/utils.py new file mode 100644 index 000000000..e3480ef6b --- /dev/null +++ b/openml/extensions/utils.py @@ -0,0 +1,46 @@ +# License: BSD 3-Clause + +"""Utility functions for OpenML extensions.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from openml.extensions.registry import resolve_api_connector + +if TYPE_CHECKING: + from openml.flows import OpenMLFlow + + +def flow_to_estimator(flow: OpenMLFlow) -> Any: + """Convert an OpenML flow to an estimator instance. + + Parameters + ---------- + flow : openml.flows.OpenMLFlow + The OpenML flow to convert. + + Returns + ------- + estimator_instance : Any + The corresponding estimator instance. + """ + connector = resolve_api_connector(flow) + return connector.serializer().flow_to_model(flow) + + +def estimator_to_flow(estimator_instance: Any) -> OpenMLFlow: + """Convert an estimator instance to an OpenML flow. + + Parameters + ---------- + estimator_instance : Any + The estimator instance to convert. + + Returns + ------- + flow : openml.flows.OpenMLFlow + The corresponding OpenML flow. 
+ """ + connector = resolve_api_connector(estimator_instance) + return connector.serializer().model_to_flow(estimator_instance) From e92156aa4b4d94bf41ed009a2261e1c25864cb70 Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Mon, 5 Jan 2026 16:36:55 +0530 Subject: [PATCH 09/15] move the flow utils to flows/functions.py --- openml/extensions/utils.py | 46 -------------------------------------- openml/flows/functions.py | 35 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 46 deletions(-) delete mode 100644 openml/extensions/utils.py diff --git a/openml/extensions/utils.py b/openml/extensions/utils.py deleted file mode 100644 index e3480ef6b..000000000 --- a/openml/extensions/utils.py +++ /dev/null @@ -1,46 +0,0 @@ -# License: BSD 3-Clause - -"""Utility functions for OpenML extensions.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from openml.extensions.registry import resolve_api_connector - -if TYPE_CHECKING: - from openml.flows import OpenMLFlow - - -def flow_to_estimator(flow: OpenMLFlow) -> Any: - """Convert an OpenML flow to an estimator instance. - - Parameters - ---------- - flow : openml.flows.OpenMLFlow - The OpenML flow to convert. - - Returns - ------- - estimator_instance : Any - The corresponding estimator instance. - """ - connector = resolve_api_connector(flow) - return connector.serializer().flow_to_model(flow) - - -def estimator_to_flow(estimator_instance: Any) -> OpenMLFlow: - """Convert an estimator instance to an OpenML flow. - - Parameters - ---------- - estimator_instance : Any - The estimator instance to convert. - - Returns - ------- - flow : openml.flows.OpenMLFlow - The corresponding OpenML flow. - """ - connector = resolve_api_connector(estimator_instance) - return connector.serializer().model_to_flow(estimator_instance) diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 9906958e5..e51175acc 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -14,6 +14,7 @@ import openml._api_calls import openml.utils from openml.exceptions import OpenMLCacheException +from openml.extensions.registry import resolve_api_connector from . import OpenMLFlow @@ -552,3 +553,37 @@ def delete_flow(flow_id: int) -> bool: True if the deletion was successful. False otherwise. """ return openml.utils._delete_entity("flow", flow_id) + + +def flow_to_estimator(flow: OpenMLFlow) -> Any: + """Convert an OpenML flow to an estimator instance. + + Parameters + ---------- + flow : openml.flows.OpenMLFlow + The OpenML flow to convert. + + Returns + ------- + estimator_instance : Any + The corresponding estimator instance. + """ + connector = resolve_api_connector(flow) + return connector.serializer().flow_to_model(flow) + + +def estimator_to_flow(estimator_instance: Any) -> OpenMLFlow: + """Convert an estimator instance to an OpenML flow. + + Parameters + ---------- + estimator_instance : Any + The estimator instance to convert. + + Returns + ------- + flow : openml.flows.OpenMLFlow + The corresponding OpenML flow. 
+ """ + connector = resolve_api_connector(estimator_instance) + return connector.serializer().model_to_flow(estimator_instance) From 1945c58a456db0a684c27289026a71322f3b828b Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Mon, 5 Jan 2026 16:40:18 +0530 Subject: [PATCH 10/15] update flows --- openml/flows/functions.py | 35 ----------------------------- openml/flows/utils.py | 46 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 35 deletions(-) create mode 100644 openml/flows/utils.py diff --git a/openml/flows/functions.py b/openml/flows/functions.py index e51175acc..9906958e5 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -14,7 +14,6 @@ import openml._api_calls import openml.utils from openml.exceptions import OpenMLCacheException -from openml.extensions.registry import resolve_api_connector from . import OpenMLFlow @@ -553,37 +552,3 @@ def delete_flow(flow_id: int) -> bool: True if the deletion was successful. False otherwise. """ return openml.utils._delete_entity("flow", flow_id) - - -def flow_to_estimator(flow: OpenMLFlow) -> Any: - """Convert an OpenML flow to an estimator instance. - - Parameters - ---------- - flow : openml.flows.OpenMLFlow - The OpenML flow to convert. - - Returns - ------- - estimator_instance : Any - The corresponding estimator instance. - """ - connector = resolve_api_connector(flow) - return connector.serializer().flow_to_model(flow) - - -def estimator_to_flow(estimator_instance: Any) -> OpenMLFlow: - """Convert an estimator instance to an OpenML flow. - - Parameters - ---------- - estimator_instance : Any - The estimator instance to convert. - - Returns - ------- - flow : openml.flows.OpenMLFlow - The corresponding OpenML flow. - """ - connector = resolve_api_connector(estimator_instance) - return connector.serializer().model_to_flow(estimator_instance) diff --git a/openml/flows/utils.py b/openml/flows/utils.py new file mode 100644 index 000000000..e3480ef6b --- /dev/null +++ b/openml/flows/utils.py @@ -0,0 +1,46 @@ +# License: BSD 3-Clause + +"""Utility functions for OpenML extensions.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from openml.extensions.registry import resolve_api_connector + +if TYPE_CHECKING: + from openml.flows import OpenMLFlow + + +def flow_to_estimator(flow: OpenMLFlow) -> Any: + """Convert an OpenML flow to an estimator instance. + + Parameters + ---------- + flow : openml.flows.OpenMLFlow + The OpenML flow to convert. + + Returns + ------- + estimator_instance : Any + The corresponding estimator instance. + """ + connector = resolve_api_connector(flow) + return connector.serializer().flow_to_model(flow) + + +def estimator_to_flow(estimator_instance: Any) -> OpenMLFlow: + """Convert an estimator instance to an OpenML flow. + + Parameters + ---------- + estimator_instance : Any + The estimator instance to convert. + + Returns + ------- + flow : openml.flows.OpenMLFlow + The corresponding OpenML flow. 
+ """ + connector = resolve_api_connector(estimator_instance) + return connector.serializer().model_to_flow(estimator_instance) From 5a1ccd620bd4e9e2e112a303f2ac2d978106292b Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Mon, 5 Jan 2026 16:56:36 +0530 Subject: [PATCH 11/15] expose parameters of flow_to_model --- openml/flows/utils.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/openml/flows/utils.py b/openml/flows/utils.py index e3480ef6b..0e2c2020f 100644 --- a/openml/flows/utils.py +++ b/openml/flows/utils.py @@ -12,13 +12,23 @@ from openml.flows import OpenMLFlow -def flow_to_estimator(flow: OpenMLFlow) -> Any: - """Convert an OpenML flow to an estimator instance. +def flow_to_estimator( + flow: OpenMLFlow, + initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 + strict_version: bool = True, # noqa: FBT002, FBT001 +) -> Any: + """Instantiate a model from the flow representation. Parameters ---------- - flow : openml.flows.OpenMLFlow - The OpenML flow to convert. + flow : OpenMLFlow + + initialize_with_defaults : bool, optional (default=False) + If this flag is set, the hyperparameter values of flows will be + ignored and a flow with its defaults is returned. + + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. Returns ------- @@ -26,7 +36,11 @@ def flow_to_estimator(flow: OpenMLFlow) -> Any: The corresponding estimator instance. """ connector = resolve_api_connector(flow) - return connector.serializer().flow_to_model(flow) + return connector.serializer().flow_to_model( + flow, + initialize_with_defaults=initialize_with_defaults, + strict_version=strict_version, + ) def estimator_to_flow(estimator_instance: Any) -> OpenMLFlow: From c7e52e1e6b3cfdd6f5197a4958cc20efb09696a7 Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Mon, 5 Jan 2026 21:16:50 +0530 Subject: [PATCH 12/15] remove sklearn --- openml/extensions/.DS_Store | Bin 0 -> 10244 bytes openml/extensions/sklearn/__init__.py | 13 - openml/extensions/sklearn/connector.py | 68 - openml/extensions/sklearn/executor.py | 784 ------------ openml/extensions/sklearn/serializer.py | 1508 ----------------------- 5 files changed, 2373 deletions(-) create mode 100644 openml/extensions/.DS_Store delete mode 100644 openml/extensions/sklearn/__init__.py delete mode 100644 openml/extensions/sklearn/connector.py delete mode 100644 openml/extensions/sklearn/executor.py delete mode 100644 openml/extensions/sklearn/serializer.py diff --git a/openml/extensions/.DS_Store b/openml/extensions/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..0ee8fbd94f44119155f92a35af97354b7bcaca43 GIT binary patch literal 10244 zcmeHMTWl0n7(V~B&>1?=0SXk{i3^2r$x;dwT5e{0Q_8Ktwsc!A%k0iT2c|R2%L_ z9nS#_6AffMkOeLo6zAlp2MC}L95H~yaXd=ZQN{yV;F7}$a5#ZrWe83vz*a|Gl&KSl za~YN)10e$w8K7tPN_J*9i9PQ8KHOnBnY87kQG*6m3+B4NSi)Knl_kR@Lp03&#Gvz( zpstjl%1IivJz|X4gC!+;^KrHzyiG80S03)mL775nvp$y`V{01hrlDk~@NCIkGHT}# znL#X{$(sx(&{xWM3te3QIAT0Rx@alA*C<&{a7$1XsC|-Hh|~~ckGGS&fZCVFuvns< z9`v#S=hxF&%k=#E`maDJE1x#KqC%>utT>$LcLo!_=I6W?%{@q4U5;a=i*+r1z%&L^ zN_DGa`{b(#YvT0p? 
zEZVTKVRTdy=hd#+vOjgw8nB(SXaOb&Uc`iY(Aabi8zolhUdBq}E-9pB$nx^7+~BfA z(crHGgGFgZr94yZN%r;~lvB!_JX+S7^-ah24j8V7GNn{`$UsNNvG=+L9q-9FR&K!d zQp)U%W@U1g<{QnHl`u~m*jMFcT*tEZI-W_V=}r2^kWa@Z3?=(K$Msv;WJwd}=UPh6 zGxi%eshKz0aB8o5tm#+>>GG@v4=-J@dR^nzczeh0eYdKV*>mPr%W@x5ZTsdi({Q^7 zP2KPB)x3;h>!y9YCvSRY+A_LqZNQifF-f{rRkLu>LlISdkZLlQHc#bDe^{v%MYcwC zu__BLS6{YRRS(jUR@cB%4*)AysdATx#w%K&Shq%&yH28yC?4Gc zLf1FQYPY4O4NK7_zE0C7S?$Sl{*|3U=eBBDO=g^d0W{GI`kGr*^$1;dm*)AMS;NLH z-3-=#6vz6!e$6#H2hm4^-W}M>9x3)yP`hsW4jV)^_MjELbh>6;XErRVSy-he)NWDv zTudp7Y3EIh3w*3RJjB}Kqp6Lr;?>#@G6hy+MAydadAEeWJyf!ew2@upD8}pvIZvJ^ zm&jG}F}Xp$B;S&s$X)U~089r7W{zmo`Umm0iJ_b;8l1HF2S2{1+Kz1cn98xoA4=o24BD(_zu2@AK)(h0e`|h zVVbZ&s1X(lON5xPTBs8mg(hLMutnG-bP7p96HW+@FvJZIim`Z@w~M7PjJ1gY7rbGF z%aaFOjq_Z!VWtiWn@)bl!|b>O>4rMPN;NZDhTK zL8z4RW-QVui3;$>W_hcG;i#1JrYgrJNg=EeX^pgt7*dM(Px<<7a)8xj5$A%O?)Ykb#hakb#hakb$YiKp8hTO7H*MAN>FSsYOa? zZOFhBV*twA5^eN)uXv0swVz&$MKK@6OcT{BaLFK8P!-zGuH&&muj5w)eZeuCjmLw& f0+$SqI0i?_Z}b24e+Go-|1>_b@cci8&;Ne|0>mUZ literal 0 HcmV?d00001 diff --git a/openml/extensions/sklearn/__init__.py b/openml/extensions/sklearn/__init__.py deleted file mode 100644 index 4e22407be..000000000 --- a/openml/extensions/sklearn/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# License: BSD 3-Clause - -"""OpenML extension for Scikit-learn.""" - -from openml.extensions.sklearn.connector import SklearnAPIConnector -from openml.extensions.sklearn.executor import SklearnExecutor -from openml.extensions.sklearn.serializer import SklearnSerializer - -__all__ = [ - "SklearnAPIConnector", - "SklearnExecutor", - "SklearnSerializer", -] diff --git a/openml/extensions/sklearn/connector.py b/openml/extensions/sklearn/connector.py deleted file mode 100644 index 795baf640..000000000 --- a/openml/extensions/sklearn/connector.py +++ /dev/null @@ -1,68 +0,0 @@ -# License: BSD 3-Clause - -"""Connector for the Scikit-learn extension.""" - -from __future__ import annotations - -from openml.extensions.base import OpenMLAPIConnector -from openml.extensions.sklearn.executor import SklearnExecutor -from openml.extensions.sklearn.serializer import SklearnSerializer -from openml.flows import OpenMLFlow - - -class SklearnAPIConnector(OpenMLAPIConnector): - """ - Connector for the Scikit-learn extension. - - This class provides the interface to connect Scikit-learn models and flows - to the OpenML API, handling both serialization and execution compatibility checks. - """ - - def serializer(self) -> SklearnSerializer: - """ - Return the serializer for Scikit-learn estimators. - - Returns - ------- - SklearnSerializer - The serializer instance capable of handling Scikit-learn estimator. - """ - return SklearnSerializer() - - def executor(self) -> SklearnExecutor: - """ - Return the executor for Scikit-learn estimators. - - Returns - ------- - SklearnExecutor - The executor instance capable of running Scikit-learn estimators. - """ - return SklearnExecutor() - - @classmethod - def supports(cls, estimator) -> bool: - """ - Check if this connector supports the given model or flow. - - Parameters - ---------- - estimator : Any or OpenMLFlow - The Scikit-learn estimator instance or OpenMLFlow object. - - Returns - ------- - bool - True if both the serializer and executor can handle the provided - estimator or flow, False otherwise. 
- """ - serializer = SklearnSerializer() - SklearnExecutor() - - if isinstance(estimator, OpenMLFlow): - support = serializer.can_handle_flow(estimator) - - else: - support = serializer.can_handle_model(estimator) - - return support diff --git a/openml/extensions/sklearn/executor.py b/openml/extensions/sklearn/executor.py deleted file mode 100644 index 69d54a102..000000000 --- a/openml/extensions/sklearn/executor.py +++ /dev/null @@ -1,784 +0,0 @@ -# License: BSD 3-Clause -from __future__ import annotations - -import json -import logging -import time -import warnings -from collections import OrderedDict -from typing import TYPE_CHECKING, Any, List, cast - -import numpy as np -import pandas as pd -import sklearn.base -import sklearn.model_selection -import sklearn.pipeline - -import openml -from openml.exceptions import PyOpenMLError -from openml.extensions.base import ModelExecutor -from openml.flows import OpenMLFlow -from openml.runs.trace import PREFIX, OpenMLRunTrace, OpenMLTraceIteration -from openml.tasks import ( - OpenMLClassificationTask, - OpenMLClusteringTask, - OpenMLLearningCurveTask, - OpenMLRegressionTask, - OpenMLSupervisedTask, - OpenMLTask, -) - -if TYPE_CHECKING: - import scipy.sparse - -logger = logging.getLogger(__name__) - -SKLEARN_PIPELINE_STRING_COMPONENTS = ("drop", "passthrough") -COMPONENT_REFERENCE = "component_reference" -COMPOSITION_STEP_CONSTANT = "composition_step_constant" - - -class SklearnExecutor(ModelExecutor): - """Executor for Scikit-learn estimators.""" - - def seed_model(self, model: Any, seed: int | None = None) -> Any: # noqa: C901 - """Set the random state of all the unseeded components of a model and return the seeded - model. - - Required so that all seed information can be uploaded to OpenML for reproducible results. - - Models that are already seeded will maintain the seed. In this case, - only integer seeds are allowed (An exception is raised when a RandomState was used as - seed). - - Parameters - ---------- - model : sklearn model - The model to be seeded - seed : int - The seed to initialize the RandomState with. Unseeded subcomponents - will be seeded with a random number from the RandomState. - - Returns - ------- - Any - """ - - def _seed_current_object(current_value): - if isinstance(current_value, int): # acceptable behaviour - return False - - if isinstance(current_value, np.random.RandomState): - raise ValueError( - "Models initialized with a RandomState object are not " - "supported. Please seed with an integer. ", - ) - - if current_value is not None: - raise ValueError( - "Models should be seeded with int or None (this should never happen). ", - ) - - return True - - rs = np.random.RandomState(seed) - model_params = model.get_params() - random_states = {} - for param_name in sorted(model_params): - if "random_state" in param_name: - current_value = model_params[param_name] - # important to draw the value at this point (and not in the if - # statement) this way we guarantee that if a different set of - # subflows is seeded, the same number of the random generator is - # used - new_value = rs.randint(0, 2**16) - if _seed_current_object(current_value): - random_states[param_name] = new_value - - # Also seed CV objects! 
- elif isinstance(model_params[param_name], sklearn.model_selection.BaseCrossValidator): - if not hasattr(model_params[param_name], "random_state"): - continue - - current_value = model_params[param_name].random_state - new_value = rs.randint(0, 2**16) - if _seed_current_object(current_value): - model_params[param_name].random_state = new_value - - model.set_params(**random_states) - return model - - def check_if_model_fitted(self, model: Any) -> bool: - """Returns True/False denoting if the model has already been fitted/trained - - Parameters - ---------- - model : Any - - Returns - ------- - bool - """ - from sklearn.exceptions import NotFittedError - from sklearn.utils.validation import check_is_fitted - - try: - # check if model is fitted - check_is_fitted(model) - - # Creating random dummy data of arbitrary size - dummy_data = np.random.uniform(size=(10, 3)) # noqa: NPY002 - # Using 'predict' instead of 'sklearn.utils.validation.check_is_fitted' for a more - # robust check that works across sklearn versions and models. Internally, 'predict' - # should call 'check_is_fitted' for every concerned attribute, thus offering a more - # assured check than explicit calls to 'check_is_fitted' - model.predict(dummy_data) - # Will reach here if the model was fit on a dataset with 3 features - return True - except NotFittedError: # needs to be the first exception to be caught - # Model is not fitted, as is required - return False - except ValueError: - # Will reach here if the model was fit on a dataset with more or less than 3 features - return True - - def _run_model_on_fold( # noqa: PLR0915, PLR0913, C901, PLR0912 - self, - model: Any, - task: OpenMLTask, - X_train: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame, - rep_no: int, - fold_no: int, - y_train: np.ndarray | None = None, - X_test: np.ndarray | scipy.sparse.spmatrix | pd.DataFrame | None = None, - ) -> tuple[ - np.ndarray, - pd.DataFrame | None, - OrderedDict[str, float], - OpenMLRunTrace | None, - ]: - """Run a model on a repeat,fold,subsample triplet of the task and return prediction - information. - - Furthermore, it will measure run time measures in case multi-core behaviour allows this. - * exact user cpu time will be measured if the number of cores is set (recursive throughout - the model) exactly to 1 - * wall clock time will be measured if the number of cores is set (recursive throughout the - model) to any given number (but not when it is set to -1) - - Returns the data that is necessary to construct the OpenML Run object. Is used by - run_task_get_arff_content. Do not use this function unless you know what you are doing. - - Parameters - ---------- - model : Any - The UNTRAINED model to run. The model instance will be copied and not altered. - task : OpenMLTask - The task to run the model on. - X_train : array-like - Training data for the given repetition and fold. - rep_no : int - The repeat of the experiment (0-based; in case of 1 time CV, always 0) - fold_no : int - The fold nr of the experiment (0-based; in case of holdout, always 0) - y_train : Optional[np.ndarray] (default=None) - Target attributes for supervised tasks. In case of classification, these are integer - indices to the potential classes specified by dataset. - X_test : Optional, array-like (default=None) - Test attributes to test for generalization in supervised tasks. - - Returns - ------- - pred_y : np.ndarray - Predictions on the training/test set, depending on the task type. - For supervised tasks, predictions are on the test set. 
- For unsupervised tasks, predictions are on the training set. - proba_y : pd.DataFrame, optional - Predicted probabilities for the test set. - None, if task is not Classification or Learning Curve prediction. - user_defined_measures : OrderedDict[str, float] - User defined measures that were generated on this fold - trace : OpenMLRunTrace, optional - arff trace object from a fitted model and the trace content obtained by - repeatedly calling ``run_model_on_task`` - """ - - def _prediction_to_probabilities( - y: np.ndarray | list, - model_classes: list[Any], - class_labels: list[str] | None, - ) -> pd.DataFrame: - """Transforms predicted probabilities to match with OpenML class indices. - - Parameters - ---------- - y : np.ndarray - Predicted probabilities (possibly omitting classes if they were not present in the - training data). - model_classes : list - List of classes known_predicted by the model, ordered by their index. - class_labels : list - List of classes as stored in the task object fetched from server. - - Returns - ------- - pd.DataFrame - """ - if class_labels is None: - raise ValueError("The task has no class labels") - - if isinstance(y_train, np.ndarray) and isinstance(class_labels[0], str): - # mapping (decoding) the predictions to the categories - # creating a separate copy to not change the expected pred_y type - y = [class_labels[pred] for pred in y] # list or numpy array of predictions - - # model_classes: sklearn classifier mapping from original array id to - # prediction index id - if not isinstance(model_classes, list): - raise ValueError("please convert model classes to list prior to calling this fn") - - # DataFrame allows more accurate mapping of classes as column names - result = pd.DataFrame( - 0, - index=np.arange(len(y)), - columns=model_classes, - dtype=np.float32, - ) - for obs, prediction in enumerate(y): - result.loc[obs, prediction] = 1.0 - return result - - if isinstance(task, OpenMLSupervisedTask): - if y_train is None: - raise TypeError("argument y_train must not be of type None") - if X_test is None: - raise TypeError("argument X_test must not be of type None") - - model_copy = sklearn.base.clone(model, safe=True) - # sanity check: prohibit users from optimizing n_jobs - self._prevent_optimize_n_jobs(model_copy) - # measures and stores runtimes - user_defined_measures = OrderedDict() # type: 'OrderedDict[str, float]' - try: - # for measuring runtime. 
Only available since Python 3.3 - modelfit_start_cputime = time.process_time() - modelfit_start_walltime = time.time() - - if isinstance(task, OpenMLSupervisedTask): - model_copy.fit(X_train, y_train) # type: ignore - elif isinstance(task, OpenMLClusteringTask): - model_copy.fit(X_train) # type: ignore - - modelfit_dur_cputime = (time.process_time() - modelfit_start_cputime) * 1000 - modelfit_dur_walltime = (time.time() - modelfit_start_walltime) * 1000 - - user_defined_measures["usercpu_time_millis_training"] = modelfit_dur_cputime - refit_time = model_copy.refit_time_ * 1000 if hasattr(model_copy, "refit_time_") else 0 # type: ignore - user_defined_measures["wall_clock_time_millis_training"] = modelfit_dur_walltime - - except AttributeError as e: - # typically happens when training a regressor on classification task - raise PyOpenMLError(str(e)) from e - - if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): - # search for model classes_ (might differ depending on modeltype) - # first, pipelines are a special case (these don't have a classes_ - # object, but rather borrows it from the last step. We do this manually, - # because of the BaseSearch check) - if isinstance(model_copy, sklearn.pipeline.Pipeline): - used_estimator = model_copy.steps[-1][-1] - else: - used_estimator = model_copy - - if self._is_hpo_class(used_estimator): - model_classes = used_estimator.best_estimator_.classes_ - else: - model_classes = used_estimator.classes_ - - if not isinstance(model_classes, list): - model_classes = model_classes.tolist() - - # to handle the case when dataset is numpy and categories are encoded - # however the class labels stored in task are still categories - if isinstance(y_train, np.ndarray) and isinstance( - cast("List", task.class_labels)[0], - str, - ): - model_classes = [cast("List[str]", task.class_labels)[i] for i in model_classes] - - modelpredict_start_cputime = time.process_time() - modelpredict_start_walltime = time.time() - - # In supervised learning this returns the predictions for Y, in clustering - # it returns the clusters - if isinstance(task, OpenMLSupervisedTask): - pred_y = model_copy.predict(X_test) - elif isinstance(task, OpenMLClusteringTask): - pred_y = model_copy.predict(X_train) - else: - raise ValueError(task) - - modelpredict_duration_cputime = (time.process_time() - modelpredict_start_cputime) * 1000 - user_defined_measures["usercpu_time_millis_testing"] = modelpredict_duration_cputime - user_defined_measures["usercpu_time_millis"] = ( - modelfit_dur_cputime + modelpredict_duration_cputime - ) - modelpredict_duration_walltime = (time.time() - modelpredict_start_walltime) * 1000 - user_defined_measures["wall_clock_time_millis_testing"] = modelpredict_duration_walltime - user_defined_measures["wall_clock_time_millis"] = ( - modelfit_dur_walltime + modelpredict_duration_walltime + refit_time - ) - - if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): - try: - proba_y = model_copy.predict_proba(X_test) - proba_y = pd.DataFrame(proba_y, columns=model_classes) # handles X_test as numpy - except AttributeError: # predict_proba is not available when probability=False - proba_y = _prediction_to_probabilities(pred_y, model_classes, task.class_labels) - - if task.class_labels is not None: - if proba_y.shape[1] != len(task.class_labels): - # Remap the probabilities in case there was a class missing - # at training time. By default, the classification targets - # are mapped to be zero-based indices to the actual classes. 
- # Therefore, the model_classes contain the correct indices to - # the correct probability array. Example: - # classes in the dataset: 0, 1, 2, 3, 4, 5 - # classes in the training set: 0, 1, 2, 4, 5 - # then we need to add a column full of zeros into the probabilities - # for class 3 because the rest of the library expects that the - # probabilities are ordered the same way as the classes are ordered). - message = ( - f"Estimator only predicted for {proba_y.shape[1]}/{len(task.class_labels)}" - " classes!" - ) - warnings.warn(message, stacklevel=2) - openml.config.logger.warning(message) - - for _i, col in enumerate(task.class_labels): - # adding missing columns with 0 probability - if col not in model_classes: - proba_y[col] = 0 - # We re-order the columns to move possibly added missing columns into place. - proba_y = proba_y[task.class_labels] - else: - raise ValueError("The task has no class labels") - - if not np.all(set(proba_y.columns) == set(task.class_labels)): - missing_cols = list(set(task.class_labels) - set(proba_y.columns)) - raise ValueError("Predicted probabilities missing for the columns: ", missing_cols) - - elif isinstance(task, (OpenMLRegressionTask, OpenMLClusteringTask)): - proba_y = None - else: - raise TypeError(type(task)) - - if self._is_hpo_class(model_copy): - trace_data = self._extract_trace_data(model_copy, rep_no, fold_no) - trace: OpenMLRunTrace | None = self._obtain_arff_trace( - model_copy, - trace_data, - ) - else: - trace = None - - return pred_y, proba_y, user_defined_measures, trace - - def obtain_parameter_values( # noqa: C901, PLR0915 - self, - flow: OpenMLFlow, - model: Any = None, - ) -> list[dict[str, Any]]: - """Extracts all parameter settings required for the flow from the model. - - If no explicit model is provided, the parameters will be extracted from `flow.model` - instead. - - Parameters - ---------- - flow : OpenMLFlow - OpenMLFlow object (containing flow ids, i.e., it has to be downloaded from the server) - - model: Any, optional (default=None) - The model from which to obtain the parameter values. Must match the flow signature. - If None, use the model specified in ``OpenMLFlow.model``. - - Returns - ------- - list - A list of dicts, where each dict has the following entries: - - ``oml:name`` : str: The OpenML parameter name - - ``oml:value`` : mixed: A representation of the parameter value - - ``oml:component`` : int: flow id to which the parameter belongs - """ - openml.flows.functions._check_flow_for_server_id(flow) - - def get_flow_dict(_flow): - flow_map = {_flow.name: _flow.flow_id} - for subflow in _flow.components: - flow_map.update(get_flow_dict(_flow.components[subflow])) - return flow_map - - def extract_parameters( # noqa: PLR0915, PLR0912, C901 - _flow, - _flow_dict, - component_model, - _main_call=False, # noqa: FBT002 - main_id=None, - ): - def is_subcomponent_specification(values): - # checks whether the current value can be a specification of - # subcomponents, as for example the value for steps parameter - # (in Pipeline) or transformers parameter (in - # ColumnTransformer). - return ( - # Specification requires list/tuple of list/tuple with - # at least length 2. 
- isinstance(values, (tuple, list)) - and all(isinstance(item, (tuple, list)) and len(item) > 1 for item in values) - # And each component needs to be a flow or interpretable string - and all( - isinstance(item[1], openml.flows.OpenMLFlow) - or ( - isinstance(item[1], str) - and item[1] in SKLEARN_PIPELINE_STRING_COMPONENTS - ) - for item in values - ) - ) - - # _flow is openml flow object, _param dict maps from flow name to flow - # id for the main call, the param dict can be overridden (useful for - # unit tests / sentinels) this way, for flows without subflows we do - # not have to rely on _flow_dict - exp_parameters = set(_flow.parameters) - if ( - isinstance(component_model, str) - and component_model in SKLEARN_PIPELINE_STRING_COMPONENTS - ): - model_parameters = set() - else: - model_parameters = set(component_model.get_params(deep=False)) - if len(exp_parameters.symmetric_difference(model_parameters)) != 0: - flow_params = sorted(exp_parameters) - model_params = sorted(model_parameters) - raise ValueError( - "Parameters of the model do not match the " - "parameters expected by the " - "flow:\nexpected flow parameters: " - f"{flow_params}\nmodel parameters: {model_params}", - ) - exp_components = set(_flow.components) - if ( - isinstance(component_model, str) - and component_model in SKLEARN_PIPELINE_STRING_COMPONENTS - ): - model_components = set() - else: - _ = set(component_model.get_params(deep=False)) - model_components = { - mp - for mp in component_model.get_params(deep=True) - if "__" not in mp and mp not in _ - } - if len(exp_components.symmetric_difference(model_components)) != 0: - is_problem = True - if len(exp_components - model_components) > 0: - # If an expected component is not returned as a component by get_params(), - # this means that it is also a parameter -> we need to check that this is - # actually the case - difference = exp_components - model_components - component_in_model_parameters = [] - for component in difference: - if component in model_parameters: - component_in_model_parameters.append(True) - else: - component_in_model_parameters.append(False) - is_problem = not all(component_in_model_parameters) - if is_problem: - flow_components = sorted(exp_components) - model_components = sorted(model_components) - raise ValueError( - "Subcomponents of the model do not match the " - "parameters expected by the " - "flow:\nexpected flow subcomponents: " - f"{flow_components}\nmodel subcomponents: {model_components}", - ) - - _params = [] - for _param_name in _flow.parameters: - _current = OrderedDict() - _current["oml:name"] = _param_name - - current_param_values = self.model_to_flow(component_model.get_params()[_param_name]) - - # Try to filter out components (a.k.a. subflows) which are - # handled further down in the code (by recursively calling - # this function)! - if isinstance(current_param_values, openml.flows.OpenMLFlow): - continue - - if is_subcomponent_specification(current_param_values): - # complex parameter value, with subcomponents - parsed_values = [] - for subcomponent in current_param_values: - # scikit-learn stores usually tuples in the form - # (name (str), subcomponent (mixed), argument - # (mixed)). OpenML replaces the subcomponent by an - # OpenMLFlow object. - if len(subcomponent) < 2 or len(subcomponent) > 3: - raise ValueError("Component reference should be size {2,3}. 
") - - subcomponent_identifier = subcomponent[0] - subcomponent_flow = subcomponent[1] - if not isinstance(subcomponent_identifier, str): - raise TypeError( - "Subcomponent identifier should be of type string, " - f"but is {type(subcomponent_identifier)}", - ) - if not isinstance(subcomponent_flow, (openml.flows.OpenMLFlow, str)): - if ( - isinstance(subcomponent_flow, str) - and subcomponent_flow in SKLEARN_PIPELINE_STRING_COMPONENTS - ): - pass - else: - raise TypeError( - "Subcomponent flow should be of type flow, but is" - f" {type(subcomponent_flow)}", - ) - - current = { - "oml-python:serialized_object": COMPONENT_REFERENCE, - "value": { - "key": subcomponent_identifier, - "step_name": subcomponent_identifier, - }, - } - if len(subcomponent) == 3: - if not isinstance(subcomponent[2], list) and not isinstance( - subcomponent[2], - OrderedDict, - ): - raise TypeError( - "Subcomponent argument should be list or OrderedDict", - ) - current["value"]["argument_1"] = subcomponent[2] - parsed_values.append(current) - parsed_values = json.dumps(parsed_values) - else: - # vanilla parameter value - parsed_values = json.dumps(current_param_values) - - _current["oml:value"] = parsed_values - if _main_call: - _current["oml:component"] = main_id - else: - _current["oml:component"] = _flow_dict[_flow.name] - _params.append(_current) - - for _identifier in _flow.components: - subcomponent_model = component_model.get_params()[_identifier] - _params.extend( - extract_parameters( - _flow.components[_identifier], - _flow_dict, - subcomponent_model, - ), - ) - return _params - - flow_dict = get_flow_dict(flow) - model = model if model is not None else flow.model - return extract_parameters(flow, flow_dict, model, _main_call=True, main_id=flow.flow_id) - - def _openml_param_name_to_sklearn( - self, - openml_parameter: openml.setups.OpenMLParameter, - flow: OpenMLFlow, - ) -> str: - """ - Converts the name of an OpenMLParameter into the sklean name, given a flow. - - Parameters - ---------- - openml_parameter: OpenMLParameter - The parameter under consideration - - flow: OpenMLFlow - The flow that provides context. - - Returns - ------- - sklearn_parameter_name: str - The name the parameter will have once used in scikit-learn - """ - if not isinstance(openml_parameter, openml.setups.OpenMLParameter): - raise ValueError("openml_parameter should be an instance of OpenMLParameter") - if not isinstance(flow, OpenMLFlow): - raise ValueError("flow should be an instance of OpenMLFlow") - - flow_structure = flow.get_structure("name") - if openml_parameter.flow_name not in flow_structure: - raise ValueError("Obtained OpenMLParameter and OpenMLFlow do not correspond. ") - name = openml_parameter.flow_name # for PEP8 - return "__".join(flow_structure[name] + [openml_parameter.parameter_name]) - - ################################################################################################ - # Methods for hyperparameter optimization - - def _is_hpo_class(self, model: Any) -> bool: - """Check whether the model performs hyperparameter optimization. - - Used to check whether an optimization trace can be extracted from the model after - running it. - - Parameters - ---------- - model : Any - - Returns - ------- - bool - """ - return isinstance(model, sklearn.model_selection._search.BaseSearchCV) - - def instantiate_model_from_hpo_class( - self, - model: Any, - trace_iteration: OpenMLTraceIteration, - ) -> Any: - """Instantiate a ``base_estimator`` which can be searched over by the hyperparameter - optimization model. 
- - Parameters - ---------- - model : Any - A hyperparameter optimization model which defines the model to be instantiated. - trace_iteration : OpenMLTraceIteration - Describing the hyperparameter settings to instantiate. - - Returns - ------- - Any - """ - if not self._is_hpo_class(model): - raise AssertionError( - f"Flow model {model} is not an instance of" - " sklearn.model_selection._search.BaseSearchCV", - ) - base_estimator = model.estimator - base_estimator.set_params(**trace_iteration.get_parameters()) - return base_estimator - - def _extract_trace_data(self, model, rep_no, fold_no): - """Extracts data from a machine learning model's cross-validation results - and creates an ARFF (Attribute-Relation File Format) trace. - - Parameters - ---------- - model : Any - A fitted hyperparameter optimization model. - rep_no : int - The repetition number. - fold_no : int - The fold number. - - Returns - ------- - A list of ARFF tracecontent. - """ - arff_tracecontent = [] - for itt_no in range(len(model.cv_results_["mean_test_score"])): - # we use the string values for True and False, as it is defined in - # this way by the OpenML server - selected = "false" - if itt_no == model.best_index_: - selected = "true" - test_score = model.cv_results_["mean_test_score"][itt_no] - arff_line = [rep_no, fold_no, itt_no, test_score, selected] - for key in model.cv_results_: - if key.startswith("param_"): - value = model.cv_results_[key][itt_no] - # Built-in serializer does not convert all numpy types, - # these methods convert them to built-in types instead. - if isinstance(value, np.generic): - # For scalars it actually returns scalars, not a list - value = value.tolist() - serialized_value = json.dumps(value) if value is not np.ma.masked else np.nan - arff_line.append(serialized_value) - arff_tracecontent.append(arff_line) - return arff_tracecontent - - def _obtain_arff_trace( - self, - model: Any, - trace_content: list, - ) -> OpenMLRunTrace: - """Create arff trace object from a fitted model and the trace content obtained by - repeatedly calling ``run_model_on_task``. - - Parameters - ---------- - model : Any - A fitted hyperparameter optimization model. - - trace_content : List[List] - Trace content obtained by ``openml.runs.run_flow_on_task``. 
- - Returns - ------- - OpenMLRunTrace - """ - if not self._is_hpo_class(model): - raise AssertionError( - f"Flow model {model} is not an instance of " - "sklearn.model_selection._search.BaseSearchCV", - ) - if not hasattr(model, "cv_results_"): - raise ValueError("model should contain `cv_results_`") - - # attributes that will be in trace arff, regardless of the model - trace_attributes = [ - ("repeat", "NUMERIC"), - ("fold", "NUMERIC"), - ("iteration", "NUMERIC"), - ("evaluation", "NUMERIC"), - ("selected", ["true", "false"]), - ] - - # model dependent attributes for trace arff - for key in model.cv_results_: - if key.startswith("param_"): - # supported types should include all types, including bool, - # int float - supported_basic_types = (bool, int, float, str) - for param_value in model.cv_results_[key]: - if isinstance(param_value, np.generic): - param_value = param_value.tolist() # noqa: PLW2901 - if ( - isinstance(param_value, supported_basic_types) - or param_value is None - or param_value is np.ma.masked - ): - # basic string values - type = "STRING" # noqa: A001 - elif isinstance(param_value, (list, tuple)) and all( - isinstance(i, int) for i in param_value - ): - # list of integers (usually for selecting features) - # hyperparameter layer_sizes of MLPClassifier - type = "STRING" # noqa: A001 - else: - raise TypeError(f"Unsupported param type in param grid: {key}") - - # renamed the attribute param to parameter, as this is a required - # OpenML convention - this also guards against name collisions - # with the required trace attributes - attribute = (PREFIX + key[6:], type) # type: ignore - trace_attributes.append(attribute) - - return OpenMLRunTrace.generate( - trace_attributes, - trace_content, - ) diff --git a/openml/extensions/sklearn/serializer.py b/openml/extensions/sklearn/serializer.py deleted file mode 100644 index 26413afec..000000000 --- a/openml/extensions/sklearn/serializer.py +++ /dev/null @@ -1,1508 +0,0 @@ -# License: BSD 3-Clause - -"""Serializer for the Scikit-learn estimators.""" - -from __future__ import annotations - -import contextlib -import copy -import importlib -import inspect -import json -import logging -import re -import sys -import traceback -import warnings -from collections import OrderedDict -from json.decoder import JSONDecodeError -from re import IGNORECASE -from typing import Any, Callable, Sized - -import numpy as np -import scipy.stats -import sklearn.base -import sklearn.model_selection -import sklearn.pipeline -from packaging.version import Version - -import openml -from openml.exceptions import PyOpenMLError -from openml.extensions.base import ModelSerializer -from openml.flows import OpenMLFlow - -logger = logging.getLogger(__name__) - - -DEPENDENCIES_PATTERN = re.compile( - r"^(?P[\w\-]+)((?P==|>=|>)" - r"(?P(\d+\.)?(\d+\.)?(\d+)?(dev)?[0-9]*))?$", -) - -# NOTE(eddiebergman): This was imported before but became deprecated, -# as a result I just enumerated them manually by copy-ing and pasting, -# recommended solution in Numpy 2.0 guide was to explicitly list them. 
-SIMPLE_NUMPY_TYPES = [ - np.int8, - np.int16, - np.int32, - np.int64, - np.longlong, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - np.ulonglong, - np.float16, - np.float32, - np.float64, - np.longdouble, - np.complex64, - np.complex128, - np.clongdouble, -] -SIMPLE_TYPES = (bool, int, float, str, *SIMPLE_NUMPY_TYPES) - - -SKLEARN_PIPELINE_STRING_COMPONENTS = ("drop", "passthrough") -COMPONENT_REFERENCE = "component_reference" -COMPOSITION_STEP_CONSTANT = "composition_step_constant" - - -class SklearnSerializer(ModelSerializer): - """Serializer for Scikit-learn estimators.""" - - @classmethod - def can_handle_flow(cls, flow: OpenMLFlow) -> bool: - """Check whether a given describes a scikit-learn estimator. - - This is done by parsing the ``external_version`` field. - - Parameters - ---------- - flow : OpenMLFlow - - Returns - ------- - bool - """ - return cls._is_sklearn_flow(flow) - - @classmethod - def can_handle_model(cls, model: Any) -> bool: - """Check whether a model is an instance of ``sklearn.base.BaseEstimator``. - - Parameters - ---------- - model : Any - - Returns - ------- - bool - """ - return isinstance(model, sklearn.base.BaseEstimator) - - def flow_to_model( - self, - flow: OpenMLFlow, - initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 - strict_version: bool = True, # noqa: FBT001, FBT002 - ) -> Any: - """Initializes a sklearn model based on a flow. - - Parameters - ---------- - flow : mixed - the object to deserialize (can be flow object, or any serialized - parameter value that is accepted by) - - initialize_with_defaults : bool, optional (default=False) - If this flag is set, the hyperparameter values of flows will be - ignored and a flow with its defaults is returned. - - strict_version : bool, default=True - Whether to fail if version requirements are not fulfilled. - - Returns - ------- - mixed - """ - return self._deserialize_sklearn( - flow, - initialize_with_defaults=initialize_with_defaults, - strict_version=strict_version, - ) - - def model_to_flow(self, model: Any) -> OpenMLFlow: - """Transform a scikit-learn model to a flow for uploading it to OpenML. - - Parameters - ---------- - model : Any - - Returns - ------- - OpenMLFlow - """ - # Necessary to make pypy not complain about all the different possible return types - return self._serialize_sklearn(model) - - @classmethod - def trim_flow_name( # noqa: C901 - cls, - long_name: str, - extra_trim_length: int = 100, - _outer: bool = True, # noqa: FBT001, FBT002 - ) -> str: - """Shorten generated sklearn flow name to at most ``max_length`` characters. - - Flows are assumed to have the following naming structure: - ``(model_selection)? (pipeline)? (steps)+`` - and will be shortened to: - ``sklearn.(selection.)?(pipeline.)?(steps)+`` - e.g. (white spaces and newlines added for readability) - - .. code :: - - sklearn.pipeline.Pipeline( - columntransformer=sklearn.compose._column_transformer.ColumnTransformer( - numeric=sklearn.pipeline.Pipeline( - imputer=sklearn.preprocessing.imputation.Imputer, - standardscaler=sklearn.preprocessing.data.StandardScaler), - nominal=sklearn.pipeline.Pipeline( - simpleimputer=sklearn.impute.SimpleImputer, - onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), - variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, - svc=sklearn.svm.classes.SVC) - - -> - ``sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)`` - - Parameters - ---------- - long_name : str - The full flow name generated by the scikit-learn extension. 
- extra_trim_length: int (default=100) - If the trimmed name would exceed `extra_trim_length` characters, additional trimming - of the short name is performed. This reduces the produced short name length. - There is no guarantee the end result will not exceed `extra_trim_length`. - _outer : bool (default=True) - For internal use only. Specifies if the function is called recursively. - - Returns - ------- - str - - """ - - def remove_all_in_parentheses(string: str) -> str: - string, removals = re.subn(r"\([^()]*\)", "", string) - while removals > 0: - string, removals = re.subn(r"\([^()]*\)", "", string) - return string - - # Generally, we want to trim all hyperparameters, the exception to that is for model - # selection, as the `estimator` hyperparameter is very indicative of what is in the flow. - # So we first trim name of the `estimator` specified in mode selection. For reference, in - # the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and - # keep it in the final trimmed flow name: - # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer, - # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: ERA001, E501 - # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator= - # sklearn.tree.tree.DecisionTreeClassifier)) - if "sklearn.model_selection" in long_name: - start_index = long_name.index("sklearn.model_selection") - estimator_start = ( - start_index + long_name[start_index:].index("estimator=") + len("estimator=") - ) - - model_select_boilerplate = long_name[start_index:estimator_start] - # above is .g. "sklearn.model_selection._search.RandomizedSearchCV(estimator=" - model_selection_class = model_select_boilerplate.split("(")[0].split(".")[-1] - - # Now we want to also find and parse the `estimator`, for this we find the closing - # parenthesis to the model selection technique: - closing_parenthesis_expected = 1 - for char in long_name[estimator_start:]: - if char == "(": - closing_parenthesis_expected += 1 - if char == ")": - closing_parenthesis_expected -= 1 - if closing_parenthesis_expected == 0: - break - - _end: int = estimator_start + len(long_name[estimator_start:]) - 1 - model_select_pipeline = long_name[estimator_start:_end] - - trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False) - _, trimmed_pipeline = trimmed_pipeline.split(".", maxsplit=1) # trim module prefix - model_select_short = f"sklearn.{model_selection_class}[{trimmed_pipeline}]" - name = long_name[:start_index] + model_select_short + long_name[_end + 1 :] - else: - name = long_name - - module_name = long_name.split(".")[0] - short_name = module_name + ".{}" - - if name.startswith("sklearn.pipeline"): - full_pipeline_class, pipeline = name[:-1].split("(", maxsplit=1) - pipeline_class = full_pipeline_class.split(".")[-1] - # We don't want nested pipelines in the short name, so we trim all complicated - # subcomponents, i.e. those with parentheses: - pipeline = remove_all_in_parentheses(pipeline) - - # then the pipeline steps are formatted e.g.: - # step1name=sklearn.submodule.ClassName,step2name... - components = [component.split(".")[-1] for component in pipeline.split(",")] - pipeline = f"{pipeline_class}({','.join(components)})" - if len(short_name.format(pipeline)) > extra_trim_length: - pipeline = f"{pipeline_class}(...,{components[-1]})" - else: - # Just a simple component: e.g. 
sklearn.tree.DecisionTreeClassifier - pipeline = remove_all_in_parentheses(name).split(".")[-1] - - if not _outer: - # Anything from parenthesis in inner calls should not be culled, so we use brackets - pipeline = pipeline.replace("(", "[").replace(")", "]") - else: - # Square brackets may be introduced with nested model_selection - pipeline = pipeline.replace("[", "(").replace("]", ")") - - return short_name.format(pipeline) - - @classmethod - def _min_dependency_str(cls, sklearn_version: str) -> str: - """Returns a string containing the minimum dependencies for the sklearn version passed. - - Parameters - ---------- - sklearn_version : str - A version string of the xx.xx.xx - - Returns - ------- - str - """ - # This explicit check is necessary to support existing entities on the OpenML servers - # that used the fixed dependency string (in the else block) - if Version(openml.__version__) > Version("0.11"): - # OpenML v0.11 onwards supports sklearn>=0.24 - # assumption: 0.24 onwards sklearn should contain a _min_dependencies.py file with - # variables declared for extracting minimum dependency for that version - if Version(sklearn_version) >= Version("0.24"): - from sklearn import _min_dependencies as _mindep - - dependency_list = { - "numpy": f"{_mindep.NUMPY_MIN_VERSION}", - "scipy": f"{_mindep.SCIPY_MIN_VERSION}", - "joblib": f"{_mindep.JOBLIB_MIN_VERSION}", - "threadpoolctl": f"{_mindep.THREADPOOLCTL_MIN_VERSION}", - } - elif Version(sklearn_version) >= Version("0.23"): - dependency_list = { - "numpy": "1.13.3", - "scipy": "0.19.1", - "joblib": "0.11", - "threadpoolctl": "2.0.0", - } - if Version(sklearn_version).micro == 0: - dependency_list.pop("threadpoolctl") - elif Version(sklearn_version) >= Version("0.21"): - dependency_list = {"numpy": "1.11.0", "scipy": "0.17.0", "joblib": "0.11"} - elif Version(sklearn_version) >= Version("0.19"): - dependency_list = {"numpy": "1.8.2", "scipy": "0.13.3"} - else: - dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} - else: - # this is INCORRECT for sklearn versions >= 0.19 and < 0.24 - # given that OpenML has existing flows uploaded with such dependency information, - # we change no behaviour for older sklearn version, however from 0.24 onwards - # the dependency list will be accurately updated for any flow uploaded to OpenML - dependency_list = {"numpy": "1.6.1", "scipy": "0.9"} - - sklearn_dep = f"sklearn=={sklearn_version}" - dep_str = "\n".join([f"{k}>={v}" for k, v in dependency_list.items()]) - return "\n".join([sklearn_dep, dep_str]) - - def _deserialize_sklearn( # noqa: PLR0915, C901, PLR0912 - self, - o: Any, - components: dict | None = None, - initialize_with_defaults: bool = False, # noqa: FBT001, FBT002 - recursion_depth: int = 0, - strict_version: bool = True, # noqa: FBT002, FBT001 - ) -> Any: - """Recursive function to deserialize a scikit-learn flow. - - This function inspects an object to deserialize and decides how to do so. This function - delegates all work to the respective functions to deserialize special data structures etc. - This function works on everything that has been serialized to OpenML: OpenMLFlow, - components (which are flows themselves), functions, hyperparameter distributions (for - random search) and the actual hyperparameter values themselves. - - Parameters - ---------- - o : mixed - the object to deserialize (can be flow object, or any serialized - parameter value that is accepted by) - - components : Optional[dict] - Components of the current flow being de-serialized. 
These will not be used when - de-serializing the actual flow, but when de-serializing a component reference. - - initialize_with_defaults : bool, optional (default=False) - If this flag is set, the hyperparameter values of flows will be - ignored and a flow with its defaults is returned. - - recursion_depth : int - The depth at which this flow is called, mostly for debugging - purposes - - strict_version : bool, default=True - Whether to fail if version requirements are not fulfilled. - - Returns - ------- - mixed - """ - logger.info( - "-{} flow_to_sklearn START o={}, components={}, init_defaults={}".format( - "-" * recursion_depth, o, components, initialize_with_defaults - ), - ) - depth_pp = recursion_depth + 1 # shortcut var, depth plus plus - - # First, we need to check whether the presented object is a json string. - # JSON strings are used to encoder parameter values. By passing around - # json strings for parameters, we make sure that we can flow_to_sklearn - # the parameter values to the correct type. - - if isinstance(o, str): - with contextlib.suppress(JSONDecodeError): - o = json.loads(o) - - if isinstance(o, dict): - # Check if the dict encodes a 'special' object, which could not - # easily converted into a string, but rather the information to - # re-create the object were stored in a dictionary. - if "oml-python:serialized_object" in o: - serialized_type = o["oml-python:serialized_object"] - value = o["value"] - if serialized_type == "type": - rval = self._deserialize_type(value) - elif serialized_type == "rv_frozen": - rval = self._deserialize_rv_frozen(value) - elif serialized_type == "function": - rval = self._deserialize_function(value) - elif serialized_type in (COMPOSITION_STEP_CONSTANT, COMPONENT_REFERENCE): - if serialized_type == COMPOSITION_STEP_CONSTANT: - pass - elif serialized_type == COMPONENT_REFERENCE: - value = self._deserialize_sklearn( - value, - recursion_depth=depth_pp, - strict_version=strict_version, - ) - else: - raise NotImplementedError(serialized_type) - assert components is not None # Necessary for mypy - step_name = value["step_name"] - key = value["key"] - component = self._deserialize_sklearn( - components[key], - initialize_with_defaults=initialize_with_defaults, - recursion_depth=depth_pp, - strict_version=strict_version, - ) - # The component is now added to where it should be used - # later. It should not be passed to the constructor of the - # main flow object. 
- del components[key] - if step_name is None: - rval = component - elif "argument_1" not in value: - rval = (step_name, component) - else: - rval = (step_name, component, value["argument_1"]) - elif serialized_type == "cv_object": - rval = self._deserialize_cross_validator( - value, - recursion_depth=recursion_depth, - strict_version=strict_version, - ) - else: - raise ValueError(f"Cannot flow_to_sklearn {serialized_type}") - - else: - rval = OrderedDict( - ( - self._deserialize_sklearn( - o=key, - components=components, - initialize_with_defaults=initialize_with_defaults, - recursion_depth=depth_pp, - strict_version=strict_version, - ), - self._deserialize_sklearn( - o=value, - components=components, - initialize_with_defaults=initialize_with_defaults, - recursion_depth=depth_pp, - strict_version=strict_version, - ), - ) - for key, value in sorted(o.items()) - ) - elif isinstance(o, (list, tuple)): - rval = [ - self._deserialize_sklearn( - o=element, - components=components, - initialize_with_defaults=initialize_with_defaults, - recursion_depth=depth_pp, - strict_version=strict_version, - ) - for element in o - ] - if isinstance(o, tuple): - rval = tuple(rval) - elif isinstance(o, (bool, int, float, str)) or o is None: - rval = o - elif isinstance(o, OpenMLFlow): - if not self._is_sklearn_flow(o): - raise ValueError("Only sklearn flows can be reinstantiated") - rval = self._deserialize_model( - flow=o, - keep_defaults=initialize_with_defaults, - recursion_depth=recursion_depth, - strict_version=strict_version, - ) - else: - raise TypeError(o) - logger.info(f"-{'-' * recursion_depth} flow_to_sklearn END o={o}, rval={rval}") - return rval - - def _serialize_sklearn(self, o: Any, parent_model: Any | None = None) -> Any: # noqa: PLR0912, C901 - rval = None # type: Any - - # TODO: assert that only on first recursion lvl `parent_model` can be None - if self.is_estimator(o): - # is the main model or a submodel - rval = self._serialize_model(o) - elif ( - isinstance(o, (list, tuple)) - and len(o) == 2 - and o[1] in SKLEARN_PIPELINE_STRING_COMPONENTS - and isinstance(parent_model, sklearn.pipeline._BaseComposition) - ): - rval = o - elif isinstance(o, (list, tuple)): - # TODO: explain what type of parameter is here - rval = [self._serialize_sklearn(element, parent_model) for element in o] - if isinstance(o, tuple): - rval = tuple(rval) - elif isinstance(o, SIMPLE_TYPES) or o is None: - if isinstance(o, tuple(SIMPLE_NUMPY_TYPES)): - o = o.item() # type: ignore - # base parameter values - rval = o - elif isinstance(o, dict): - # TODO: explain what type of parameter is here - if not isinstance(o, OrderedDict): - o = OrderedDict(sorted(o.items())) - - rval = OrderedDict() - for key, value in o.items(): - if not isinstance(key, str): - raise TypeError( - "Can only use string as keys, you passed " - f"type {type(key)} for value {key!s}.", - ) - _key = self._serialize_sklearn(key, parent_model) - rval[_key] = self._serialize_sklearn(value, parent_model) - elif isinstance(o, type): - # TODO: explain what type of parameter is here - rval = self._serialize_type(o) - elif isinstance(o, scipy.stats.distributions.rv_frozen): - rval = self._serialize_rv_frozen(o) - # This only works for user-defined functions (and not even partial). 
- # I think this is exactly what we want here as there shouldn't be any - # built-in or functool.partials in a pipeline - elif inspect.isfunction(o): - # TODO: explain what type of parameter is here - rval = self._serialize_function(o) - elif self._is_cross_validator(o): - # TODO: explain what type of parameter is here - rval = self._serialize_cross_validator(o) - else: - raise TypeError(o, type(o)) - - return rval - - def is_estimator(self, model: Any) -> bool: - """Check whether the given model is a scikit-learn estimator. - - This function is only required for backwards compatibility and will be removed in the - near future. - - Parameters - ---------- - model : Any - - Returns - ------- - bool - """ - o = model - return hasattr(o, "fit") and hasattr(o, "get_params") and hasattr(o, "set_params") - - def get_version_information(self) -> list[str]: - """List versions of libraries required by the flow. - - Libraries listed are ``Python``, ``scikit-learn``, ``numpy`` and ``scipy``. - - Returns - ------- - List - """ - # This can possibly be done by a package such as pyxb, but I could not get - # it to work properly. - import numpy - import scipy - import sklearn - - major, minor, micro, _, _ = sys.version_info - python_version = f"Python_{'.'.join([str(major), str(minor), str(micro)])}." - sklearn_version = f"Sklearn_{sklearn.__version__}." - numpy_version = f"NumPy_{numpy.__version__}." # type: ignore - scipy_version = f"SciPy_{scipy.__version__}." - - return [python_version, sklearn_version, numpy_version, scipy_version] - - def create_setup_string(self, model: Any) -> str: # noqa: ARG002 - """Create a string which can be used to reinstantiate the given model. - - Parameters - ---------- - model : Any - - Returns - ------- - str - """ - return " ".join(self.get_version_information()) - - def _is_cross_validator(self, o: Any) -> bool: - return isinstance(o, sklearn.model_selection.BaseCrossValidator) - - @classmethod - def _is_sklearn_flow(cls, flow: OpenMLFlow) -> bool: - sklearn_dependency = isinstance(flow.dependencies, str) and "sklearn" in flow.dependencies - sklearn_as_external = isinstance(flow.external_version, str) and ( - flow.external_version.startswith("sklearn==") or ",sklearn==" in flow.external_version - ) - return sklearn_dependency or sklearn_as_external - - def _get_sklearn_description(self, model: Any, char_lim: int = 1024) -> str: - r"""Fetches the sklearn function docstring for the flow description - - Retrieves the sklearn docstring available and does the following: - * If length of docstring <= char_lim, then returns the complete docstring - * Else, trims the docstring till it encounters a 'Read more in the :ref:' - * Or till it encounters a 'Parameters\n----------\n' - The final string returned is at most of length char_lim with leading and - trailing whitespaces removed. - - Parameters - ---------- - model : sklearn model - char_lim : int - Specifying the max length of the returned string. - OpenML servers have a constraint of 1024 characters for the 'description' field. - - Returns - ------- - str - """ - - def match_format(s): - return f"{s}\n{len(s) * '-'}\n" - - s = inspect.getdoc(model) - if s is None: - return "" - try: - # trim till 'Read more' - pattern = "Read more in the :ref:" - index = s.index(pattern) - s = s[:index] - # trimming docstring to be within char_lim - if len(s) > char_lim: - s = f"{s[: char_lim - 3]}..." - return s.strip() - except ValueError: - logger.warning( - "'Read more' not found in descriptions. 
" - "Trying to trim till 'Parameters' if available in docstring.", - ) - try: - # if 'Read more' doesn't exist, trim till 'Parameters' - pattern = "Parameters" - index = s.index(match_format(pattern)) - except ValueError: - # returning full docstring - logger.warning("'Parameters' not found in docstring. Omitting docstring trimming.") - index = len(s) - s = s[:index] - # trimming docstring to be within char_lim - if len(s) > char_lim: - s = f"{s[: char_lim - 3]}..." - return s.strip() - - def _extract_sklearn_parameter_docstring(self, model) -> None | str: - """Extracts the part of sklearn docstring containing parameter information - - Fetches the entire docstring and trims just the Parameter section. - The assumption is that 'Parameters' is the first section in sklearn docstrings, - followed by other sections titled 'Attributes', 'See also', 'Note', 'References', - appearing in that order if defined. - Returns a None if no section with 'Parameters' can be found in the docstring. - - Parameters - ---------- - model : sklearn model - - Returns - ------- - str, or None - """ - - def match_format(s): - return f"{s}\n{len(s) * '-'}\n" - - s = inspect.getdoc(model) - if s is None: - return None - try: - index1 = s.index(match_format("Parameters")) - except ValueError as e: - # when sklearn docstring has no 'Parameters' section - logger.warning(f"{match_format('Parameters')} {e}") - return None - - headings = ["Attributes", "Notes", "See also", "Note", "References"] - for h in headings: - try: - # to find end of Parameters section - index2 = s.index(match_format(h)) - break - except ValueError: - logger.warning(f"{h} not available in docstring") - continue - else: - # in the case only 'Parameters' exist, trim till end of docstring - index2 = len(s) - s = s[index1:index2] - return s.strip() - - def _extract_sklearn_param_info(self, model, char_lim=1024) -> None | dict: - """Parses parameter type and description from sklearn dosctring - - Parameters - ---------- - model : sklearn model - char_lim : int - Specifying the max length of the returned string. - OpenML servers have a constraint of 1024 characters string fields. 
- - Returns - ------- - Dict, or None - """ - docstring = self._extract_sklearn_parameter_docstring(model) - if docstring is None: - # when sklearn docstring has no 'Parameters' section - return None - - n = re.compile("[.]*\n", flags=IGNORECASE) - lines = n.split(docstring) - p = re.compile("[a-z0-9_ ]+ : [a-z0-9_']+[a-z0-9_ ]*", flags=IGNORECASE) - # The above regular expression is designed to detect sklearn parameter names and type - # in the format of [variable_name][space]:[space][type] - # The expectation is that the parameter description for this detected parameter will - # be all the lines in the docstring till the regex finds another parameter match - - # collecting parameters and their descriptions - description = [] # type: List - for s in lines: - param = p.findall(s) - if param != []: - # a parameter definition is found by regex - # creating placeholder when parameter found which will be a list of strings - # string descriptions will be appended in subsequent iterations - # till another parameter is found and a new placeholder is created - placeholder = [""] # type: List[str] - description.append(placeholder) - elif len(description) > 0: # description=[] means no parameters found yet - # appending strings to the placeholder created when parameter found - description[-1].append(s) - for i in range(len(description)): - # concatenating parameter description strings - description[i] = "\n".join(description[i]).strip() - # limiting all parameter descriptions to accepted OpenML string length - if len(description[i]) > char_lim: - description[i] = f"{description[i][: char_lim - 3]}..." - - # collecting parameters and their types - parameter_docs = OrderedDict() - matches = p.findall(docstring) - for i, param in enumerate(matches): - key, value = str(param).split(":") - parameter_docs[key.strip()] = [value.strip(), description[i]] - - # to avoid KeyError for missing parameters - param_list_true = list(model.get_params().keys()) - param_list_found = list(parameter_docs.keys()) - for param in list(set(param_list_true) - set(param_list_found)): - parameter_docs[param] = [None, None] - - return parameter_docs - - def _serialize_model(self, model: Any) -> OpenMLFlow: - """Create an OpenMLFlow. - - Calls `sklearn_to_flow` recursively to properly serialize the - parameters to strings and the components (other models) to OpenMLFlows. - - Parameters - ---------- - model : sklearn estimator - - Returns - ------- - OpenMLFlow - - """ - # Get all necessary information about the model objects itself - ( - parameters, - parameters_meta_info, - subcomponents, - subcomponents_explicit, - ) = self._extract_information_from_model(model) - - # Check that a component does not occur multiple times in a flow as this - # is not supported by OpenML - self._check_multiple_occurence_of_component_in_flow(model, subcomponents) - - # Create a flow name, which contains all components in brackets, e.g.: - # RandomizedSearchCV(Pipeline(StandardScaler,AdaBoostClassifier(DecisionTreeClassifier)), - # StandardScaler,AdaBoostClassifier(DecisionTreeClassifier)) - class_name = model.__module__ + "." 
+ model.__class__.__name__ - - # will be part of the name (in brackets) - sub_components_names = "" - for key in subcomponents: - name_thing = subcomponents[key] - if isinstance(name_thing, OpenMLFlow): - name = name_thing.name - elif ( - isinstance(name_thing, str) - and subcomponents[key] in SKLEARN_PIPELINE_STRING_COMPONENTS - ): - name = name_thing - else: - raise TypeError(type(subcomponents[key])) - - if key in subcomponents_explicit: - sub_components_names += "," + key + "=" + name - else: - sub_components_names += "," + name - - # slice operation on string in order to get rid of leading comma - name = f"{class_name}({sub_components_names[1:]})" if sub_components_names else class_name - short_name = self.trim_flow_name(name) - - # Get the external versions of all sub-components - external_version = self._get_external_version_string(model, subcomponents) - dependencies = self._get_dependencies() - tags = self._get_tags() - - sklearn_description = self._get_sklearn_description(model) - return OpenMLFlow( - name=name, - class_name=class_name, - custom_name=short_name, - description=sklearn_description, - model=model, - components=subcomponents, - parameters=parameters, - parameters_meta_info=parameters_meta_info, - external_version=external_version, - tags=tags, - extension=self, - language="English", - dependencies=dependencies, - ) - - def _get_dependencies(self) -> str: - return self._min_dependency_str(sklearn.__version__) # type: ignore - - def _get_tags(self) -> list[str]: - sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore - sklearn_version_formatted = sklearn_version.replace("==", "_") - return [ - "openml-python", - "sklearn", - "scikit-learn", - "python", - sklearn_version_formatted, - # TODO: add more tags based on the scikit-learn - # module a flow is in? For example automatically - # annotate a class of sklearn.svm.SVC() with the - # tag svm? - ] - - def _get_external_version_string( - self, - model: Any, - sub_components: dict[str, OpenMLFlow], - ) -> str: - # Create external version string for a flow, given the model and the - # already parsed dictionary of sub_components. Retrieves the external - # version of all subcomponents, which themselves already contain all - # requirements for their subcomponents. The external version string is a - # sorted concatenation of all modules which are present in this run. 
- - external_versions = set() - - # The model is None if the flow is a placeholder flow such as 'passthrough' or 'drop' - if model is not None: - model_package_name = model.__module__.split(".")[0] - module = importlib.import_module(model_package_name) - model_package_version_number = module.__version__ # type: ignore - external_version = self._format_external_version( - model_package_name, - model_package_version_number, - ) - external_versions.add(external_version) - - openml_version = self._format_external_version("openml", openml.__version__) - sklearn_version = self._format_external_version("sklearn", sklearn.__version__) # type: ignore - external_versions.add(openml_version) - external_versions.add(sklearn_version) - for visitee in sub_components.values(): - if isinstance(visitee, str) and visitee in SKLEARN_PIPELINE_STRING_COMPONENTS: - continue - for external_version in visitee.external_version.split(","): - external_versions.add(external_version) - return ",".join(sorted(external_versions)) - - def _check_multiple_occurence_of_component_in_flow( - self, - model: Any, - sub_components: dict[str, OpenMLFlow], - ) -> None: - to_visit_stack: list[OpenMLFlow] = [] - to_visit_stack.extend(sub_components.values()) - known_sub_components: set[str] = set() - - while len(to_visit_stack) > 0: - visitee = to_visit_stack.pop() - if isinstance(visitee, str) and visitee in SKLEARN_PIPELINE_STRING_COMPONENTS: - known_sub_components.add(visitee) - elif visitee.name in known_sub_components: - raise ValueError( - f"Found a second occurence of component {visitee.name} when " - f"trying to serialize {model}.", - ) - else: - known_sub_components.add(visitee.name) - to_visit_stack.extend(visitee.components.values()) - - def _extract_information_from_model( # noqa: PLR0915, C901, PLR0912 - self, - model: Any, - ) -> tuple[ - OrderedDict[str, str | None], - OrderedDict[str, dict | None], - OrderedDict[str, OpenMLFlow], - set, - ]: - # This function contains four "global" states and is quite long and - # complicated. If it gets to complicated to ensure it's correctness, - # it would be best to make it a class with the four "global" states being - # the class attributes and the if/elif/else in the for-loop calls to - # separate class methods - - # stores all entities that should become subcomponents - sub_components = OrderedDict() # type: OrderedDict[str, OpenMLFlow] - # stores the keys of all subcomponents that should become - sub_components_explicit = set() - parameters: OrderedDict[str, str | None] = OrderedDict() - parameters_meta_info: OrderedDict[str, dict | None] = OrderedDict() - parameters_docs = self._extract_sklearn_param_info(model) - - model_parameters = model.get_params(deep=False) - for k, v in sorted(model_parameters.items(), key=lambda t: t[0]): - rval = self._serialize_sklearn(v, model) - - def flatten_all(list_): - """Flattens arbitrary depth lists of lists (e.g. [[1,2],[3,[1]]] -> [1,2,3,1]).""" - for el in list_: - if isinstance(el, (list, tuple)) and len(el) > 0: - yield from flatten_all(el) - else: - yield el - - # In case rval is a list of lists (or tuples), we need to identify two situations: - # - sklearn pipeline steps, feature union or base classifiers in voting classifier. - # They look like e.g. [("imputer", Imputer()), ("classifier", SVC())] - # - a list of lists with simple types (e.g. 
int or str), such as for an OrdinalEncoder - # where all possible values for each feature are described: [[0,1,2], [1,2,5]] - is_non_empty_list_of_lists_with_same_type = ( - isinstance(rval, (list, tuple)) - and len(rval) > 0 - and isinstance(rval[0], (list, tuple)) - and all(isinstance(rval_i, type(rval[0])) for rval_i in rval) - ) - - # Check that all list elements are of simple types. - nested_list_of_simple_types = ( - is_non_empty_list_of_lists_with_same_type - and all(isinstance(el, SIMPLE_TYPES) for el in flatten_all(rval)) - and all( - len(rv) in (2, 3) and rv[1] not in SKLEARN_PIPELINE_STRING_COMPONENTS - for rv in rval - ) - ) - - if is_non_empty_list_of_lists_with_same_type and not nested_list_of_simple_types: - # If a list of lists is identified that include 'non-simple' types (e.g. objects), - # we assume they are steps in a pipeline, feature union, or base classifiers in - # a voting classifier. - parameter_value = [] # type: List - reserved_keywords = set(model.get_params(deep=False).keys()) - - for sub_component_tuple in rval: - identifier = sub_component_tuple[0] - sub_component = sub_component_tuple[1] - sub_component_type = type(sub_component_tuple) - if not 2 <= len(sub_component_tuple) <= 3: - # length 2 is for {VotingClassifier.estimators, - # Pipeline.steps, FeatureUnion.transformer_list} - # length 3 is for ColumnTransformer - raise ValueError( - f"Length of tuple of type {sub_component_type}" - " does not match assumptions" - ) - - if isinstance(sub_component, str): - if sub_component not in SKLEARN_PIPELINE_STRING_COMPONENTS: - msg = ( - "Second item of tuple does not match assumptions. " - "If string, can be only 'drop' or 'passthrough' but" - f"got {sub_component}" - ) - raise ValueError(msg) - elif sub_component is None: - msg = ( - "Cannot serialize objects of None type. Please use a valid " - "placeholder for None. Note that empty sklearn estimators can be " - "replaced with 'drop' or 'passthrough'." - ) - raise ValueError(msg) - elif not isinstance(sub_component, OpenMLFlow): - msg = ( - "Second item of tuple does not match assumptions. 
" - f"Expected OpenMLFlow, got {type(sub_component)}" - ) - raise TypeError(msg) - - if identifier in reserved_keywords: - parent_model = f"{model.__module__}.{model.__class__.__name__}" - msg = ( - "Found element shadowing official " - f"parameter for {parent_model}: {identifier}" - ) - raise PyOpenMLError(msg) - - # when deserializing the parameter - sub_components_explicit.add(identifier) - if isinstance(sub_component, str): - external_version = self._get_external_version_string(None, {}) - dependencies = self._get_dependencies() - tags = self._get_tags() - - sub_components[identifier] = OpenMLFlow( - name=sub_component, - description="Placeholder flow for scikit-learn's string pipeline " - "members", - components=OrderedDict(), - parameters=OrderedDict(), - parameters_meta_info=OrderedDict(), - external_version=external_version, - tags=tags, - language="English", - dependencies=dependencies, - model=None, - ) - component_reference: OrderedDict[str, str | dict] = OrderedDict() - component_reference["oml-python:serialized_object"] = ( - COMPOSITION_STEP_CONSTANT - ) - cr_value: dict[str, Any] = OrderedDict() - cr_value["key"] = identifier - cr_value["step_name"] = identifier - if len(sub_component_tuple) == 3: - cr_value["argument_1"] = sub_component_tuple[2] - component_reference["value"] = cr_value - else: - sub_components[identifier] = sub_component - component_reference = OrderedDict() - component_reference["oml-python:serialized_object"] = COMPONENT_REFERENCE - cr_value = OrderedDict() - cr_value["key"] = identifier - cr_value["step_name"] = identifier - if len(sub_component_tuple) == 3: - cr_value["argument_1"] = sub_component_tuple[2] - component_reference["value"] = cr_value - parameter_value.append(component_reference) - - # Here (and in the elif and else branch below) are the only - # places where we encode a value as json to make sure that all - # parameter values still have the same type after - # deserialization - if isinstance(rval, tuple): - parameter_json = json.dumps(tuple(parameter_value)) - else: - parameter_json = json.dumps(parameter_value) - parameters[k] = parameter_json - - elif isinstance(rval, OpenMLFlow): - # A subcomponent, for example the base model in - # AdaBoostClassifier - sub_components[k] = rval - sub_components_explicit.add(k) - component_reference = OrderedDict() - component_reference["oml-python:serialized_object"] = COMPONENT_REFERENCE - cr_value = OrderedDict() - cr_value["key"] = k - cr_value["step_name"] = None - component_reference["value"] = cr_value - cr = self._serialize_sklearn(component_reference, model) - parameters[k] = json.dumps(cr) - - elif not (hasattr(rval, "__len__") and len(rval) == 0): - rval = json.dumps(rval) - parameters[k] = rval - # a regular hyperparameter - else: - parameters[k] = None - - if parameters_docs is not None: - data_type, description = parameters_docs[k] - parameters_meta_info[k] = OrderedDict( - (("description", description), ("data_type", data_type)), - ) - else: - parameters_meta_info[k] = OrderedDict((("description", None), ("data_type", None))) - - return parameters, parameters_meta_info, sub_components, sub_components_explicit - - def _get_fn_arguments_with_defaults(self, fn_name: Callable) -> tuple[dict, set]: - """ - Returns - ------- - i) a dict with all parameter names that have a default value, and - ii) a set with all parameter names that do not have a default - - Parameters - ---------- - fn_name : callable - The function of which we want to obtain the defaults - - Returns - ------- - 
params_with_defaults: dict - a dict mapping parameter name to the default value - params_without_defaults: set - a set with all parameters that do not have a default value - """ - # parameters with defaults are optional, all others are required. - parameters = inspect.signature(fn_name).parameters - required_params = set() - optional_params = {} - for param in parameters: - parameter = parameters.get(param) - default_val = parameter.default # type: ignore - if default_val is inspect.Signature.empty: - required_params.add(param) - else: - optional_params[param] = default_val - return optional_params, required_params - - def _deserialize_model( # noqa: C901 - self, - flow: OpenMLFlow, - keep_defaults: bool, # noqa: FBT001 - recursion_depth: int, - strict_version: bool = True, # noqa: FBT002, FBT001 - ) -> Any: - logger.info(f"-{'-' * recursion_depth} deserialize {flow.name}") - model_name = flow.class_name - self._check_dependencies(flow.dependencies, strict_version=strict_version) - - parameters = flow.parameters - components = flow.components - parameter_dict: dict[str, Any] = OrderedDict() - - # Do a shallow copy of the components dictionary so we can remove the - # components from this copy once we added them into the pipeline. This - # allows us to not consider them any more when looping over the - # components, but keeping the dictionary of components untouched in the - # original components dictionary. - components_ = copy.copy(components) - - for name in parameters: - value = parameters.get(name) - logger.info(f"--{'-' * recursion_depth} flow_parameter={name}, value={value}") - rval = self._deserialize_sklearn( - value, - components=components_, - initialize_with_defaults=keep_defaults, - recursion_depth=recursion_depth + 1, - strict_version=strict_version, - ) - parameter_dict[name] = rval - - for name in components: - if name in parameter_dict: - continue - if name not in components_: - continue - value = components[name] - logger.info(f"--{'-' * recursion_depth} flow_component={name}, value={value}") - rval = self._deserialize_sklearn( - value, - recursion_depth=recursion_depth + 1, - strict_version=strict_version, - ) - parameter_dict[name] = rval - - if model_name is None and flow.name in SKLEARN_PIPELINE_STRING_COMPONENTS: - return flow.name - - assert model_name is not None - module_name = model_name.rsplit(".", 1) - model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - - if keep_defaults: - # obtain all params with a default - param_defaults, _ = self._get_fn_arguments_with_defaults(model_class.__init__) - - # delete the params that have a default from the dict, - # so they get initialized with their default value - # except [...] - for param in param_defaults: - # [...] the ones that also have a key in the components dict. - # As OpenML stores different flows for ensembles with different - # (base-)components, in OpenML terms, these are not considered - # hyperparameters but rather constants (i.e., changing them would - # result in a different flow) - if param not in components: - del parameter_dict[param] - - if not strict_version: - # Ignore incompatible parameters - allowed_parameter = list(inspect.signature(model_class.__init__).parameters) - for p in list(parameter_dict.keys()): - if p not in allowed_parameter: - warnings.warn( - f"While deserializing in a non-strict way, parameter {p} is not " - f"allowed for {model_class.__name__} likely due to a version mismatch. 
" - "We ignore the parameter.", - UserWarning, - stacklevel=2, - ) - del parameter_dict[p] - - return model_class(**parameter_dict) - - def _check_dependencies( - self, - dependencies: str, - strict_version: bool = True, # noqa: FBT001, FBT002 - ) -> None: - if not dependencies: - return - - dependencies_list = dependencies.split("\n") - for dependency_string in dependencies_list: - match = DEPENDENCIES_PATTERN.match(dependency_string) - if not match: - raise ValueError(f"Cannot parse dependency {dependency_string}") - - dependency_name = match.group("name") - operation = match.group("operation") - version = match.group("version") - - module = importlib.import_module(dependency_name) - required_version = Version(version) - installed_version = Version(module.__version__) # type: ignore - - if operation == "==": - check = required_version == installed_version - elif operation == ">": - check = installed_version > required_version - elif operation == ">=": - check = ( - installed_version > required_version or installed_version == required_version - ) - else: - raise NotImplementedError(f"operation '{operation}' is not supported") - message = ( - f"Trying to deserialize a model with dependency {dependency_string} not satisfied." - ) - if not check: - if strict_version: - raise ValueError(message) - - warnings.warn(message, category=UserWarning, stacklevel=2) - - def _serialize_type(self, o: Any) -> OrderedDict[str, str]: - mapping = { - float: "float", - np.float32: "np.float32", - np.float64: "np.float64", - int: "int", - np.int32: "np.int32", - np.int64: "np.int64", - } - if Version(np.__version__) < Version("1.24"): - mapping[float] = "np.float" - mapping[int] = "np.int" - - ret = OrderedDict() # type: 'OrderedDict[str, str]' - ret["oml-python:serialized_object"] = "type" - ret["value"] = mapping[o] - return ret - - def _deserialize_type(self, o: str) -> Any: - mapping = { - "float": float, - "np.float32": np.float32, - "np.float64": np.float64, - "int": int, - "np.int32": np.int32, - "np.int64": np.int64, - } - - # TODO(eddiebergman): Might be able to remove this - if Version(np.__version__) < Version("1.24"): - mapping["np.float"] = np.float # type: ignore # noqa: NPY001 - mapping["np.int"] = np.int # type: ignore # noqa: NPY001 - - return mapping[o] - - def _serialize_rv_frozen(self, o: Any) -> OrderedDict[str, str | dict]: - args = o.args - kwds = o.kwds - a = o.a - b = o.b - dist = o.dist.__class__.__module__ + "." + o.dist.__class__.__name__ - ret: OrderedDict[str, str | dict] = OrderedDict() - ret["oml-python:serialized_object"] = "rv_frozen" - ret["value"] = OrderedDict( - (("dist", dist), ("a", a), ("b", b), ("args", args), ("kwds", kwds)), - ) - return ret - - def _deserialize_rv_frozen(self, o: OrderedDict[str, str]) -> Any: - args = o["args"] - kwds = o["kwds"] - a = o["a"] - b = o["b"] - dist_name = o["dist"] - - module_name = dist_name.rsplit(".", 1) - try: - rv_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - except AttributeError as e: - _tb = traceback.format_exc() - warnings.warn( - f"Cannot create model {dist_name} for flow. Reason is from error {type(e)}:{e}" - f"\nTraceback: {_tb}", - RuntimeWarning, - stacklevel=2, - ) - return None - - dist = scipy.stats.distributions.rv_frozen(rv_class(), *args, **kwds) # type: ignore - dist.a = a - dist.b = b - - return dist - - def _serialize_function(self, o: Callable) -> OrderedDict[str, str]: - name = o.__module__ + "." 
+ o.__name__ - ret = OrderedDict() # type: 'OrderedDict[str, str]' - ret["oml-python:serialized_object"] = "function" - ret["value"] = name - return ret - - def _deserialize_function(self, name: str) -> Callable: - module_name = name.rsplit(".", 1) - return getattr(importlib.import_module(module_name[0]), module_name[1]) - - def _serialize_cross_validator(self, o: Any) -> OrderedDict[str, str | dict]: - ret: OrderedDict[str, str | dict] = OrderedDict() - - parameters = OrderedDict() # type: 'OrderedDict[str, Any]' - - # XXX this is copied from sklearn.model_selection._split - cls = o.__class__ - init = getattr(cls.__init__, "deprecated_original", cls.__init__) - # Ignore varargs, kw and default values and pop self - init_signature = inspect.signature(init) # type: ignore - # Consider the constructor parameters excluding 'self' - if init is object.__init__: - args = [] # type: List - else: - args = sorted( - [ - p.name - for p in init_signature.parameters.values() - if p.name != "self" and p.kind != p.VAR_KEYWORD - ], - ) - - for key in args: - # We need deprecation warnings to always be on in order to - # catch deprecated param values. - # This is set in utils/__init__.py but it gets overwritten - # when running under python3 somehow. - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always", DeprecationWarning) - value = getattr(o, key, None) - if w is not None and len(w) and w[0].category is DeprecationWarning: - # if the parameter is deprecated, don't show it - continue - - if not (isinstance(value, Sized) and len(value) == 0): - value = json.dumps(value) - parameters[key] = value - else: - parameters[key] = None - - ret["oml-python:serialized_object"] = "cv_object" - name = o.__module__ + "." + o.__class__.__name__ - value = OrderedDict([("name", name), ("parameters", parameters)]) - ret["value"] = value - - return ret - - def _deserialize_cross_validator( - self, - value: OrderedDict[str, Any], - recursion_depth: int, - strict_version: bool = True, # noqa: FBT002, FBT001 - ) -> Any: - model_name = value["name"] - parameters = value["parameters"] - - module_name = model_name.rsplit(".", 1) - model_class = getattr(importlib.import_module(module_name[0]), module_name[1]) - for parameter in parameters: - parameters[parameter] = self._deserialize_sklearn( - parameters[parameter], - recursion_depth=recursion_depth + 1, - strict_version=strict_version, - ) - return model_class(**parameters) - - def _format_external_version( - self, - model_package_name: str, - model_package_version_number: str, - ) -> str: - return f"{model_package_name}=={model_package_version_number}" - - @staticmethod - def _get_parameter_values_recursive( - param_grid: dict | list[dict], - parameter_name: str, - ) -> list[Any]: - """ - Returns a list of values for a given hyperparameter, encountered - recursively throughout the flow. 
(e.g., n_jobs can be defined - for various flows) - - Parameters - ---------- - param_grid: Union[Dict, List[Dict]] - Dict mapping from hyperparameter list to value, to a list of - such dicts - - parameter_name: str - The hyperparameter that needs to be inspected - - Returns - ------- - List - A list of all values of hyperparameters with this name - """ - if isinstance(param_grid, dict): - return [ - value - for param, value in param_grid.items() - if param.split("__")[-1] == parameter_name - ] - - if isinstance(param_grid, list): - result = [] - for sub_grid in param_grid: - result.extend( - SklearnSerializer._get_parameter_values_recursive(sub_grid, parameter_name), - ) - return result - - raise ValueError("Param_grid should either be a dict or list of dicts") - - def _prevent_optimize_n_jobs(self, model): - """ - Ensures that HPO classes will not optimize the n_jobs hyperparameter - - Parameters - ---------- - model: - The model that will be fitted - """ - if self._is_hpo_class(model): - if isinstance(model, sklearn.model_selection.GridSearchCV): - param_distributions = model.param_grid - elif isinstance(model, sklearn.model_selection.RandomizedSearchCV): - param_distributions = model.param_distributions - else: - if hasattr(model, "param_distributions"): - param_distributions = model.param_distributions - else: - raise AttributeError( - "Using subclass BaseSearchCV other than " - "{GridSearchCV, RandomizedSearchCV}. " - "Could not find attribute " - "param_distributions.", - ) - logger.warning( - "Warning! Using subclass BaseSearchCV other than " - "{GridSearchCV, RandomizedSearchCV}. " - "Should implement param check. ", - ) - n_jobs_vals = self._get_parameter_values_recursive( - param_distributions, - "n_jobs", - ) - if len(n_jobs_vals) > 0: - raise PyOpenMLError( - "openml-python should not be used to optimize the n_jobs parameter.", - ) From 12df955a3af5a9b3ec117f921207de73d34ba67a Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Mon, 5 Jan 2026 21:21:02 +0530 Subject: [PATCH 13/15] remove .DS_Store --- openml/extensions/.DS_Store | Bin 10244 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 openml/extensions/.DS_Store diff --git a/openml/extensions/.DS_Store b/openml/extensions/.DS_Store deleted file mode 100644 index 0ee8fbd94f44119155f92a35af97354b7bcaca43..0000000000000000000000000000000000000000 GIT binary patch
From 9e5e752fe43f57c436c8b1f876bcb4bf7c924fc9 Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Mon, 5 Jan 2026 21:24:51 +0530 Subject: [PATCH 14/15] add flow functions to __init__.py --- openml/flows/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py index ce32fec7d..e0bd979da 100644 --- a/openml/flows/__init__.py +++ b/openml/flows/__init__.py @@ -1,7 +1,7 @@ # License: BSD 3-Clause -from .flow import OpenMLFlow -from .functions import ( +from openml.flows.flow import OpenMLFlow +from openml.flows.functions import ( assert_flows_equal, delete_flow, flow_exists, @@ -9,6 +9,7 @@ get_flow_id, list_flows, ) +from openml.flows.utils import estimator_to_flow, flow_to_estimator __all__ = [ "OpenMLFlow", @@ -18,4 +19,6 @@ "flow_exists", "assert_flows_equal", "delete_flow", + "estimator_to_flow", + "flow_to_estimator", ] From bf9a0aa823e3e16654550e93683d887e4a98e279 Mon Sep 17 00:00:00 2001 From: Jigyasu Date: Tue, 6 Jan 2026 11:18:45 +0530 Subject: [PATCH 15/15] add tests for extension base classes and registry --- tests/test_extensions/test_base.py | 143 +++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 tests/test_extensions/test_base.py diff --git a/tests/test_extensions/test_base.py b/tests/test_extensions/test_base.py new file mode 100644 index 000000000..46abcf4ca --- /dev/null +++ b/tests/test_extensions/test_base.py @@ -0,0 +1,143 @@ +# License: BSD 3-Clause + +"""Test OpenML extension base classes and registry.""" + +import pytest +from collections import OrderedDict + +from openml.exceptions import PyOpenMLError +from openml.extensions.base import ( + ModelSerializer, + ModelExecutor, + OpenMLAPIConnector, +) +from openml.extensions.registry import resolve_api_connector + + +class TestModelSerializer: + """Test ModelSerializer abstract base class.""" + + def test_is_abstract(self): + """ModelSerializer should not be instantiable.""" + with pytest.raises(TypeError): + ModelSerializer() # noqa: B024 + + class DummySerializer(ModelSerializer): + @classmethod + def can_handle_model(cls, model): + return True + + def model_to_flow(self, model): + return "dummy_flow" + + def flow_to_model(self, flow, initialize_with_defaults=False, strict_version=True): + return "dummy_model" + + def get_version_information(self): + return ["dummy>=0.1"] + + def test_concrete_implementation(self): + serializer = self.DummySerializer() + + assert serializer.can_handle_model(object()) is True + assert
serializer.model_to_flow("model") == "dummy_flow" + assert serializer.flow_to_model("flow") == "dummy_model" + assert serializer.get_version_information() == ["dummy>=0.1"] + + +class TestModelExecutor: + """Test ModelExecutor abstract base class.""" + + def test_is_abstract(self): + """ModelExecutor should not be instantiable.""" + with pytest.raises(TypeError): + ModelExecutor() # noqa: B024 + + class DummyExecutor(ModelExecutor): + def seed_model(self, model, seed): + return model + + def _run_model_on_fold( + self, + model, + task, + X_train, + rep_no, + fold_no, + y_train=None, + X_test=None, + ): + return ( + [], # predictions + None, # probabilities + OrderedDict(), # user_defined_measures + None, # trace + ) + + def check_if_model_fitted(self, model): + return False + + def obtain_parameter_values(self, flow, model=None): + return [] + + def instantiate_model_from_hpo_class(self, model, trace_iteration): + return model + + def test_concrete_implementation(self): + executor = self.DummyExecutor() + + assert executor.seed_model("model", 42) == "model" + assert executor.check_if_model_fitted("model") is False + assert executor.obtain_parameter_values("flow") == [] + + +class TestOpenMLAPIConnector: + """Test OpenMLAPIConnector abstract base class.""" + + def test_is_abstract(self): + """OpenMLAPIConnector should not be instantiable.""" + with pytest.raises(TypeError): + OpenMLAPIConnector() # noqa: B024 + + class DummySerializer: + pass + + class DummyExecutor: + pass + + class DummyConnector(OpenMLAPIConnector): + def serializer(self): + return TestOpenMLAPIConnector.DummySerializer() + + def executor(self): + return TestOpenMLAPIConnector.DummyExecutor() + + @classmethod + def supports(cls, estimator): + return estimator == "supported" + + def test_concrete_implementation(self): + connector = self.DummyConnector() + + assert isinstance(connector.serializer(), self.DummySerializer) + assert isinstance(connector.executor(), self.DummyExecutor) + assert self.DummyConnector.supports("supported") is True + assert self.DummyConnector.supports("unsupported") is False + + def test_resolve_api_connector_success(self, monkeypatch): + monkeypatch.setattr( + "openml.extensions.registry.API_CONNECTOR_REGISTRY", + [self.DummyConnector], + ) + + connector = resolve_api_connector("supported") + assert isinstance(connector, self.DummyConnector) + + def test_resolve_api_connector_no_match(self, monkeypatch): + monkeypatch.setattr( + "openml.extensions.registry.API_CONNECTOR_REGISTRY", + [], + ) + + with pytest.raises(PyOpenMLError, match="No OpenML API connector supports"): + resolve_api_connector("anything")