From d29bd71fb1faea69902e13ad5b27fd8d370e8d15 Mon Sep 17 00:00:00 2001 From: Omswastik-11 Date: Thu, 1 Jan 2026 18:38:38 +0530 Subject: [PATCH 1/3] remove redundant __init__ by adding ClassVar --- openml/tasks/functions.py | 3 + openml/tasks/task.py | 180 ++++++++------------------------------ 2 files changed, 41 insertions(+), 142 deletions(-) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index d2bf5e946..d59de155e 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -424,6 +424,9 @@ def get_task( # Including class labels as part of task meta data handles # the case where data download was initially disabled if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): + assert ( + task.target_name is not None + ), "Supervised tasks must define a target feature before retrieving class labels." task.class_labels = dataset.retrieve_class_labels(task.target_name) # Clustering tasks do not have class labels # and do not offer download_split diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 395b52482..2355eb594 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -1,13 +1,11 @@ # License: BSD 3-Clause -# TODO(eddbergman): Seems like a lot of the subclasses could just get away with setting -# a `ClassVar` for whatever changes as their `__init__` defaults, less duplicated code. from __future__ import annotations import warnings from abc import ABC from enum import Enum from pathlib import Path -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any, ClassVar, Sequence from typing_extensions import TypedDict import openml._api_calls @@ -70,31 +68,45 @@ class OpenMLTask(OpenMLBase): Refers to the URL of the data splits used for the OpenML task. """ + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1 + def __init__( # noqa: PLR0913 self, - task_id: int | None, task_type_id: TaskType, task_type: str, data_set_id: int, - estimation_procedure_id: int = 1, + task_id: int | None = None, + estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, + target_name: str | None = None, ): self.task_id = int(task_id) if task_id is not None else None self.task_type_id = task_type_id self.task_type = task_type self.dataset_id = int(data_set_id) + self.target_name = target_name + resolved_estimation_procedure_id = self._resolve_estimation_procedure_id( + estimation_procedure_id, + ) self.evaluation_measure = evaluation_measure self.estimation_procedure: _EstimationProcedure = { "type": estimation_procedure_type, "parameters": estimation_parameters, "data_splits_url": data_splits_url, } - self.estimation_procedure_id = estimation_procedure_id + self.estimation_procedure_id = resolved_estimation_procedure_id self.split: OpenMLSplit | None = None + def _resolve_estimation_procedure_id(self, estimation_procedure_id: int | None) -> int: + return ( + estimation_procedure_id + if estimation_procedure_id is not None + else self.DEFAULT_ESTIMATION_PROCEDURE_ID + ) + @classmethod def _entity_letter(cls) -> str: return "t" @@ -128,7 +140,8 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: if class_labels is not None: fields["# of Classes"] = len(class_labels) - if hasattr(self, "cost_matrix"): + cost_matrix = getattr(self, "cost_matrix", None) + if cost_matrix is not None: fields["Cost Matrix"] = "Available" # determines the order in which the information will be printed @@ -249,32 +262,43 @@ class OpenMLSupervisedTask(OpenMLTask, ABC): Refers to the unique identifier of task. """ + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1 + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, - estimation_procedure_id: int = 1, + estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None, + class_labels: list[str] | None = None, + cost_matrix: np.ndarray | None = None, ): + resolved_estimation_procedure_id = self._resolve_estimation_procedure_id( + estimation_procedure_id, + ) super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, data_set_id=data_set_id, - estimation_procedure_id=estimation_procedure_id, + estimation_procedure_id=resolved_estimation_procedure_id, estimation_procedure_type=estimation_procedure_type, estimation_parameters=estimation_parameters, evaluation_measure=evaluation_measure, data_splits_url=data_splits_url, + target_name=target_name, ) - self.target_name = target_name + self.class_labels = class_labels + self.cost_matrix = cost_matrix + if cost_matrix is not None: + raise NotImplementedError("Costmatrix") def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]: """Get data associated with the current task. @@ -325,64 +349,13 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): Parameters ---------- - task_type_id : TaskType - ID of the Classification task type. - task_type : str - Name of the Classification task type. - data_set_id : int - ID of the OpenML dataset associated with the Classification task. - target_name : str - Name of the target variable. - estimation_procedure_id : int, default=None - ID of the estimation procedure for the Classification task. - estimation_procedure_type : str, default=None - Type of the estimation procedure. - estimation_parameters : dict, default=None - Estimation parameters for the Classification task. - evaluation_measure : str, default=None - Name of the evaluation measure. - data_splits_url : str, default=None - URL of the data splits for the Classification task. - task_id : Union[int, None] - ID of the Classification task (if it already exists on OpenML). class_labels : List of str, default=None A list of class labels (for classification tasks). cost_matrix : array, default=None A cost matrix (for classification tasks). """ - def __init__( # noqa: PLR0913 - self, - task_type_id: TaskType, - task_type: str, - data_set_id: int, - target_name: str, - estimation_procedure_id: int = 1, - estimation_procedure_type: str | None = None, - estimation_parameters: dict[str, str] | None = None, - evaluation_measure: str | None = None, - data_splits_url: str | None = None, - task_id: int | None = None, - class_labels: list[str] | None = None, - cost_matrix: np.ndarray | None = None, - ): - super().__init__( - task_id=task_id, - task_type_id=task_type_id, - task_type=task_type, - data_set_id=data_set_id, - estimation_procedure_id=estimation_procedure_id, - estimation_procedure_type=estimation_procedure_type, - estimation_parameters=estimation_parameters, - evaluation_measure=evaluation_measure, - target_name=target_name, - data_splits_url=data_splits_url, - ) - self.class_labels = class_labels - self.cost_matrix = cost_matrix - - if cost_matrix is not None: - raise NotImplementedError("Costmatrix") + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1 class OpenMLRegressionTask(OpenMLSupervisedTask): @@ -412,31 +385,7 @@ class OpenMLRegressionTask(OpenMLSupervisedTask): Evaluation measure used in the Regression task. """ - def __init__( # noqa: PLR0913 - self, - task_type_id: TaskType, - task_type: str, - data_set_id: int, - target_name: str, - estimation_procedure_id: int = 7, - estimation_procedure_type: str | None = None, - estimation_parameters: dict[str, str] | None = None, - data_splits_url: str | None = None, - task_id: int | None = None, - evaluation_measure: str | None = None, - ): - super().__init__( - task_id=task_id, - task_type_id=task_type_id, - task_type=task_type, - data_set_id=data_set_id, - estimation_procedure_id=estimation_procedure_id, - estimation_procedure_type=estimation_procedure_type, - estimation_parameters=estimation_parameters, - evaluation_measure=evaluation_measure, - target_name=target_name, - data_splits_url=data_splits_url, - ) + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 7 class OpenMLClusteringTask(OpenMLTask): @@ -467,32 +416,7 @@ class OpenMLClusteringTask(OpenMLTask): feature set for the clustering task. """ - def __init__( # noqa: PLR0913 - self, - task_type_id: TaskType, - task_type: str, - data_set_id: int, - estimation_procedure_id: int = 17, - task_id: int | None = None, - estimation_procedure_type: str | None = None, - estimation_parameters: dict[str, str] | None = None, - data_splits_url: str | None = None, - evaluation_measure: str | None = None, - target_name: str | None = None, - ): - super().__init__( - task_id=task_id, - task_type_id=task_type_id, - task_type=task_type, - data_set_id=data_set_id, - evaluation_measure=evaluation_measure, - estimation_procedure_id=estimation_procedure_id, - estimation_procedure_type=estimation_procedure_type, - estimation_parameters=estimation_parameters, - data_splits_url=data_splits_url, - ) - - self.target_name = target_name + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 17 def get_X(self) -> pd.DataFrame: """Get data associated with the current task. @@ -554,32 +478,4 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): Cost matrix for Learning Curve tasks. """ - def __init__( # noqa: PLR0913 - self, - task_type_id: TaskType, - task_type: str, - data_set_id: int, - target_name: str, - estimation_procedure_id: int = 13, - estimation_procedure_type: str | None = None, - estimation_parameters: dict[str, str] | None = None, - data_splits_url: str | None = None, - task_id: int | None = None, - evaluation_measure: str | None = None, - class_labels: list[str] | None = None, - cost_matrix: np.ndarray | None = None, - ): - super().__init__( - task_id=task_id, - task_type_id=task_type_id, - task_type=task_type, - data_set_id=data_set_id, - estimation_procedure_id=estimation_procedure_id, - estimation_procedure_type=estimation_procedure_type, - estimation_parameters=estimation_parameters, - evaluation_measure=evaluation_measure, - target_name=target_name, - data_splits_url=data_splits_url, - class_labels=class_labels, - cost_matrix=cost_matrix, - ) + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 13 From 08efa66a50000b3185f70f8e9f310b5351547c57 Mon Sep 17 00:00:00 2001 From: Omswastik-11 Date: Thu, 1 Jan 2026 19:30:37 +0530 Subject: [PATCH 2/3] keeping old API --- openml/tasks/functions.py | 1 + openml/tasks/task.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 5822db901..2a8f98e0e 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -590,6 +590,7 @@ def create_task( raise NotImplementedError(f"Task type {task_type:d} not supported.") return task_cls( + task_id=None, task_type_id=task_type, task_type="None", # TODO: refactor to get task type string from ID. data_set_id=dataset_id, diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 2355eb594..5e8b95001 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -72,10 +72,10 @@ class OpenMLTask(OpenMLBase): def __init__( # noqa: PLR0913 self, + task_id: int | None, task_type_id: TaskType, task_type: str, data_set_id: int, - task_id: int | None = None, estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, From c5bd7edacf0eef92622d34d76d362c0145e0eb84 Mon Sep 17 00:00:00 2001 From: Omswastik-11 Date: Thu, 1 Jan 2026 22:58:09 +0530 Subject: [PATCH 3/3] keeping old API --- openml/tasks/task.py | 77 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 5e8b95001..98eba950e 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -276,8 +276,6 @@ def __init__( # noqa: PLR0913 evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None, - class_labels: list[str] | None = None, - cost_matrix: np.ndarray | None = None, ): resolved_estimation_procedure_id = self._resolve_estimation_procedure_id( estimation_procedure_id, @@ -295,11 +293,6 @@ def __init__( # noqa: PLR0913 target_name=target_name, ) - self.class_labels = class_labels - self.cost_matrix = cost_matrix - if cost_matrix is not None: - raise NotImplementedError("Costmatrix") - def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]: """Get data associated with the current task. @@ -349,6 +342,26 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): Parameters ---------- + task_id : Union[int, None] + ID of the Classification task (if it already exists on OpenML). + task_type_id : TaskType + ID of the Classification task type. + task_type : str + Name of the Classification task type. + data_set_id : int + ID of the OpenML dataset associated with the Classification task. + target_name : str + Name of the target variable. + estimation_procedure_id : int, default=1 + ID of the estimation procedure for the Classification task. + estimation_procedure_type : str, default=None + Type of the estimation procedure. + estimation_parameters : dict, default=None + Estimation parameters for the Classification task. + evaluation_measure : str, default=None + Name of the evaluation measure. + data_splits_url : str, default=None + URL of the data splits for the Classification task. class_labels : List of str, default=None A list of class labels (for classification tasks). cost_matrix : array, default=None @@ -357,12 +370,46 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1 + def __init__( # noqa: PLR0913 + self, + task_type_id: TaskType, + task_type: str, + data_set_id: int, + target_name: str, + estimation_procedure_id: int | None = None, + estimation_procedure_type: str | None = None, + estimation_parameters: dict[str, str] | None = None, + evaluation_measure: str | None = None, + data_splits_url: str | None = None, + task_id: int | None = None, + class_labels: list[str] | None = None, + cost_matrix: np.ndarray | None = None, + ): + super().__init__( + task_type_id=task_type_id, + task_type=task_type, + data_set_id=data_set_id, + target_name=target_name, + estimation_procedure_id=estimation_procedure_id, + estimation_procedure_type=estimation_procedure_type, + estimation_parameters=estimation_parameters, + evaluation_measure=evaluation_measure, + data_splits_url=data_splits_url, + task_id=task_id, + ) + self.class_labels = class_labels + self.cost_matrix = cost_matrix + if cost_matrix is not None: + raise NotImplementedError("Costmatrix") + class OpenMLRegressionTask(OpenMLSupervisedTask): """OpenML Regression object. Parameters ---------- + task_id : Union[int, None] + ID of the OpenML Regression task. task_type_id : TaskType Task type ID of the OpenML Regression task. task_type : str @@ -371,7 +418,7 @@ class OpenMLRegressionTask(OpenMLSupervisedTask): ID of the OpenML dataset. target_name : str Name of the target feature used in the Regression task. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=7 ID of the OpenML estimation procedure. estimation_procedure_type : str, default=None Type of the OpenML estimation procedure. @@ -379,8 +426,6 @@ class OpenMLRegressionTask(OpenMLSupervisedTask): Parameters used by the OpenML estimation procedure. data_splits_url : str, default=None URL of the OpenML data splits for the Regression task. - task_id : Union[int, None] - ID of the OpenML Regression task. evaluation_measure : str, default=None Evaluation measure used in the Regression task. """ @@ -393,16 +438,16 @@ class OpenMLClusteringTask(OpenMLTask): Parameters ---------- + task_id : Union[int, None] + ID of the OpenML clustering task. task_type_id : TaskType Task type ID of the OpenML clustering task. task_type : str Task type of the OpenML clustering task. data_set_id : int ID of the OpenML dataset used in clustering the task. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=17 ID of the OpenML estimation procedure. - task_id : Union[int, None] - ID of the OpenML clustering task. estimation_procedure_type : str, default=None Type of the OpenML estimation procedure used in the clustering task. estimation_parameters : dict, default=None @@ -452,6 +497,8 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): Parameters ---------- + task_id : Union[int, None] + ID of the Learning Curve task. task_type_id : TaskType ID of the Learning Curve task. task_type : str @@ -460,7 +507,7 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): ID of the dataset that this task is associated with. target_name : str Name of the target feature in the dataset. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=13 ID of the estimation procedure to use for evaluating models. estimation_procedure_type : str, default=None Type of the estimation procedure. @@ -468,8 +515,6 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): Additional parameters for the estimation procedure. data_splits_url : str, default=None URL of the file containing the data splits for Learning Curve task. - task_id : Union[int, None] - ID of the Learning Curve task. evaluation_measure : str, default=None Name of the evaluation measure to use for evaluating models. class_labels : list of str, default=None