diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index e9b879ae4..2a8f98e0e 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -425,6 +425,9 @@ def get_task( # Including class labels as part of task meta data handles # the case where data download was initially disabled if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): + assert ( + task.target_name is not None + ), "Supervised tasks must define a target feature before retrieving class labels." task.class_labels = dataset.retrieve_class_labels(task.target_name) # Clustering tasks do not have class labels # and do not offer download_split @@ -587,6 +590,7 @@ def create_task( raise NotImplementedError(f"Task type {task_type:d} not supported.") return task_cls( + task_id=None, task_type_id=task_type, task_type="None", # TODO: refactor to get task type string from ID. data_set_id=dataset_id, diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 395b52482..98eba950e 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -1,13 +1,11 @@ # License: BSD 3-Clause -# TODO(eddbergman): Seems like a lot of the subclasses could just get away with setting -# a `ClassVar` for whatever changes as their `__init__` defaults, less duplicated code. from __future__ import annotations import warnings from abc import ABC from enum import Enum from pathlib import Path -from typing import TYPE_CHECKING, Any, Sequence +from typing import TYPE_CHECKING, Any, ClassVar, Sequence from typing_extensions import TypedDict import openml._api_calls @@ -70,31 +68,45 @@ class OpenMLTask(OpenMLBase): Refers to the URL of the data splits used for the OpenML task. """ + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1 + def __init__( # noqa: PLR0913 self, task_id: int | None, task_type_id: TaskType, task_type: str, data_set_id: int, - estimation_procedure_id: int = 1, + estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, + target_name: str | None = None, ): self.task_id = int(task_id) if task_id is not None else None self.task_type_id = task_type_id self.task_type = task_type self.dataset_id = int(data_set_id) + self.target_name = target_name + resolved_estimation_procedure_id = self._resolve_estimation_procedure_id( + estimation_procedure_id, + ) self.evaluation_measure = evaluation_measure self.estimation_procedure: _EstimationProcedure = { "type": estimation_procedure_type, "parameters": estimation_parameters, "data_splits_url": data_splits_url, } - self.estimation_procedure_id = estimation_procedure_id + self.estimation_procedure_id = resolved_estimation_procedure_id self.split: OpenMLSplit | None = None + def _resolve_estimation_procedure_id(self, estimation_procedure_id: int | None) -> int: + return ( + estimation_procedure_id + if estimation_procedure_id is not None + else self.DEFAULT_ESTIMATION_PROCEDURE_ID + ) + @classmethod def _entity_letter(cls) -> str: return "t" @@ -128,7 +140,8 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]: if class_labels is not None: fields["# of Classes"] = len(class_labels) - if hasattr(self, "cost_matrix"): + cost_matrix = getattr(self, "cost_matrix", None) + if cost_matrix is not None: fields["Cost Matrix"] = "Available" # determines the order in which the information will be printed @@ -249,33 +262,37 @@ class OpenMLSupervisedTask(OpenMLTask, ABC): Refers to the unique identifier of task. """ + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1 + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, - estimation_procedure_id: int = 1, + estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None, ): + resolved_estimation_procedure_id = self._resolve_estimation_procedure_id( + estimation_procedure_id, + ) super().__init__( task_id=task_id, task_type_id=task_type_id, task_type=task_type, data_set_id=data_set_id, - estimation_procedure_id=estimation_procedure_id, + estimation_procedure_id=resolved_estimation_procedure_id, estimation_procedure_type=estimation_procedure_type, estimation_parameters=estimation_parameters, evaluation_measure=evaluation_measure, data_splits_url=data_splits_url, + target_name=target_name, ) - self.target_name = target_name - def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]: """Get data associated with the current task. @@ -325,6 +342,8 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): Parameters ---------- + task_id : Union[int, None] + ID of the Classification task (if it already exists on OpenML). task_type_id : TaskType ID of the Classification task type. task_type : str @@ -333,7 +352,7 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): ID of the OpenML dataset associated with the Classification task. target_name : str Name of the target variable. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=1 ID of the estimation procedure for the Classification task. estimation_procedure_type : str, default=None Type of the estimation procedure. @@ -343,21 +362,21 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): Name of the evaluation measure. data_splits_url : str, default=None URL of the data splits for the Classification task. - task_id : Union[int, None] - ID of the Classification task (if it already exists on OpenML). class_labels : List of str, default=None A list of class labels (for classification tasks). cost_matrix : array, default=None A cost matrix (for classification tasks). """ + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1 + def __init__( # noqa: PLR0913 self, task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, - estimation_procedure_id: int = 1, + estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, @@ -367,20 +386,19 @@ def __init__( # noqa: PLR0913 cost_matrix: np.ndarray | None = None, ): super().__init__( - task_id=task_id, task_type_id=task_type_id, task_type=task_type, data_set_id=data_set_id, + target_name=target_name, estimation_procedure_id=estimation_procedure_id, estimation_procedure_type=estimation_procedure_type, estimation_parameters=estimation_parameters, evaluation_measure=evaluation_measure, - target_name=target_name, data_splits_url=data_splits_url, + task_id=task_id, ) self.class_labels = class_labels self.cost_matrix = cost_matrix - if cost_matrix is not None: raise NotImplementedError("Costmatrix") @@ -390,6 +408,8 @@ class OpenMLRegressionTask(OpenMLSupervisedTask): Parameters ---------- + task_id : Union[int, None] + ID of the OpenML Regression task. task_type_id : TaskType Task type ID of the OpenML Regression task. task_type : str @@ -398,7 +418,7 @@ class OpenMLRegressionTask(OpenMLSupervisedTask): ID of the OpenML dataset. target_name : str Name of the target feature used in the Regression task. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=7 ID of the OpenML estimation procedure. estimation_procedure_type : str, default=None Type of the OpenML estimation procedure. @@ -406,37 +426,11 @@ class OpenMLRegressionTask(OpenMLSupervisedTask): Parameters used by the OpenML estimation procedure. data_splits_url : str, default=None URL of the OpenML data splits for the Regression task. - task_id : Union[int, None] - ID of the OpenML Regression task. evaluation_measure : str, default=None Evaluation measure used in the Regression task. """ - def __init__( # noqa: PLR0913 - self, - task_type_id: TaskType, - task_type: str, - data_set_id: int, - target_name: str, - estimation_procedure_id: int = 7, - estimation_procedure_type: str | None = None, - estimation_parameters: dict[str, str] | None = None, - data_splits_url: str | None = None, - task_id: int | None = None, - evaluation_measure: str | None = None, - ): - super().__init__( - task_id=task_id, - task_type_id=task_type_id, - task_type=task_type, - data_set_id=data_set_id, - estimation_procedure_id=estimation_procedure_id, - estimation_procedure_type=estimation_procedure_type, - estimation_parameters=estimation_parameters, - evaluation_measure=evaluation_measure, - target_name=target_name, - data_splits_url=data_splits_url, - ) + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 7 class OpenMLClusteringTask(OpenMLTask): @@ -444,16 +438,16 @@ class OpenMLClusteringTask(OpenMLTask): Parameters ---------- + task_id : Union[int, None] + ID of the OpenML clustering task. task_type_id : TaskType Task type ID of the OpenML clustering task. task_type : str Task type of the OpenML clustering task. data_set_id : int ID of the OpenML dataset used in clustering the task. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=17 ID of the OpenML estimation procedure. - task_id : Union[int, None] - ID of the OpenML clustering task. estimation_procedure_type : str, default=None Type of the OpenML estimation procedure used in the clustering task. estimation_parameters : dict, default=None @@ -467,32 +461,7 @@ class OpenMLClusteringTask(OpenMLTask): feature set for the clustering task. """ - def __init__( # noqa: PLR0913 - self, - task_type_id: TaskType, - task_type: str, - data_set_id: int, - estimation_procedure_id: int = 17, - task_id: int | None = None, - estimation_procedure_type: str | None = None, - estimation_parameters: dict[str, str] | None = None, - data_splits_url: str | None = None, - evaluation_measure: str | None = None, - target_name: str | None = None, - ): - super().__init__( - task_id=task_id, - task_type_id=task_type_id, - task_type=task_type, - data_set_id=data_set_id, - evaluation_measure=evaluation_measure, - estimation_procedure_id=estimation_procedure_id, - estimation_procedure_type=estimation_procedure_type, - estimation_parameters=estimation_parameters, - data_splits_url=data_splits_url, - ) - - self.target_name = target_name + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 17 def get_X(self) -> pd.DataFrame: """Get data associated with the current task. @@ -528,6 +497,8 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): Parameters ---------- + task_id : Union[int, None] + ID of the Learning Curve task. task_type_id : TaskType ID of the Learning Curve task. task_type : str @@ -536,7 +507,7 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): ID of the dataset that this task is associated with. target_name : str Name of the target feature in the dataset. - estimation_procedure_id : int, default=None + estimation_procedure_id : int, default=13 ID of the estimation procedure to use for evaluating models. estimation_procedure_type : str, default=None Type of the estimation procedure. @@ -544,8 +515,6 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): Additional parameters for the estimation procedure. data_splits_url : str, default=None URL of the file containing the data splits for Learning Curve task. - task_id : Union[int, None] - ID of the Learning Curve task. evaluation_measure : str, default=None Name of the evaluation measure to use for evaluating models. class_labels : list of str, default=None @@ -554,32 +523,4 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask): Cost matrix for Learning Curve tasks. """ - def __init__( # noqa: PLR0913 - self, - task_type_id: TaskType, - task_type: str, - data_set_id: int, - target_name: str, - estimation_procedure_id: int = 13, - estimation_procedure_type: str | None = None, - estimation_parameters: dict[str, str] | None = None, - data_splits_url: str | None = None, - task_id: int | None = None, - evaluation_measure: str | None = None, - class_labels: list[str] | None = None, - cost_matrix: np.ndarray | None = None, - ): - super().__init__( - task_id=task_id, - task_type_id=task_type_id, - task_type=task_type, - data_set_id=data_set_id, - estimation_procedure_id=estimation_procedure_id, - estimation_procedure_type=estimation_procedure_type, - estimation_parameters=estimation_parameters, - evaluation_measure=evaluation_measure, - target_name=target_name, - data_splits_url=data_splits_url, - class_labels=class_labels, - cost_matrix=cost_matrix, - ) + DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 13