Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions openml/tasks/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,9 @@ def get_task(
# Including class labels as part of task meta data handles
# the case where data download was initially disabled
if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
assert (
task.target_name is not None
), "Supervised tasks must define a target feature before retrieving class labels."
task.class_labels = dataset.retrieve_class_labels(task.target_name)
# Clustering tasks do not have class labels
# and do not offer download_split
Expand Down Expand Up @@ -587,6 +590,7 @@ def create_task(
raise NotImplementedError(f"Task type {task_type:d} not supported.")

return task_cls(
task_id=None,
task_type_id=task_type,
task_type="None", # TODO: refactor to get task type string from ID.
data_set_id=dataset_id,
Expand Down
153 changes: 47 additions & 106 deletions openml/tasks/task.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# License: BSD 3-Clause
# TODO(eddbergman): Seems like a lot of the subclasses could just get away with setting
# a `ClassVar` for whatever changes as their `__init__` defaults, less duplicated code.
from __future__ import annotations

import warnings
from abc import ABC
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any, Sequence
from typing import TYPE_CHECKING, Any, ClassVar, Sequence
from typing_extensions import TypedDict

import openml._api_calls
Expand Down Expand Up @@ -70,31 +68,45 @@ class OpenMLTask(OpenMLBase):
Refers to the URL of the data splits used for the OpenML task.
"""

DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1

def __init__(  # noqa: PLR0913
    self,
    task_id: int | None,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    estimation_procedure_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    target_name: str | None = None,
):
    """Store the task metadata on the instance.

    Parameters
    ----------
    task_id : int or None
        Task identifier; coerced with ``int`` unless it is ``None``.
    task_type_id : TaskType
        Stored unchanged as ``self.task_type_id``.
    task_type : str
        Stored unchanged as ``self.task_type``.
    data_set_id : int
        Coerced with ``int`` and stored as ``self.dataset_id``.
    estimation_procedure_id : int, optional
        ``None`` selects the class-level ``DEFAULT_ESTIMATION_PROCEDURE_ID``
        (via ``_resolve_estimation_procedure_id``); any other value is kept.
    estimation_procedure_type : str, optional
        Stored under the ``"type"`` key of ``self.estimation_procedure``.
    estimation_parameters : dict, optional
        Stored under the ``"parameters"`` key of ``self.estimation_procedure``.
    evaluation_measure : str, optional
        Stored unchanged as ``self.evaluation_measure``.
    data_splits_url : str, optional
        Stored under the ``"data_splits_url"`` key of
        ``self.estimation_procedure``.
    target_name : str, optional
        Stored unchanged as ``self.target_name``.
    """
    self.task_id = int(task_id) if task_id is not None else None
    self.task_type_id = task_type_id
    self.task_type = task_type
    self.dataset_id = int(data_set_id)
    self.target_name = target_name
    self.evaluation_measure = evaluation_measure
    self.estimation_procedure: _EstimationProcedure = {
        "type": estimation_procedure_type,
        "parameters": estimation_parameters,
        "data_splits_url": data_splits_url,
    }
    # Assigned exactly once, and always with the resolved value, so the
    # attribute is never left as None when the caller omitted the argument.
    self.estimation_procedure_id = self._resolve_estimation_procedure_id(
        estimation_procedure_id,
    )
    # No split is attached at construction time.
    self.split: OpenMLSplit | None = None

def _resolve_estimation_procedure_id(self, estimation_procedure_id: int | None) -> int:
return (
estimation_procedure_id
if estimation_procedure_id is not None
else self.DEFAULT_ESTIMATION_PROCEDURE_ID
)

@classmethod
def _entity_letter(cls) -> str:
return "t"
Expand Down Expand Up @@ -128,7 +140,8 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str]]]:
if class_labels is not None:
fields["# of Classes"] = len(class_labels)

if hasattr(self, "cost_matrix"):
cost_matrix = getattr(self, "cost_matrix", None)
if cost_matrix is not None:
fields["Cost Matrix"] = "Available"

# determines the order in which the information will be printed
Expand Down Expand Up @@ -249,33 +262,37 @@ class OpenMLSupervisedTask(OpenMLTask, ABC):
Refers to the unique identifier of task.
"""

DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
):
    """Initialise a supervised task by delegating to the parent constructor.

    Parameters
    ----------
    task_type_id : TaskType
        Forwarded unchanged to the parent constructor.
    task_type : str
        Forwarded unchanged to the parent constructor.
    data_set_id : int
        Forwarded unchanged to the parent constructor.
    target_name : str
        Forwarded to the parent constructor, which stores it as
        ``self.target_name``.
    estimation_procedure_id : int, optional
        ``None`` selects this class's ``DEFAULT_ESTIMATION_PROCEDURE_ID``
        (via ``_resolve_estimation_procedure_id``).
    estimation_procedure_type : str, optional
        Forwarded unchanged to the parent constructor.
    estimation_parameters : dict, optional
        Forwarded unchanged to the parent constructor.
    evaluation_measure : str, optional
        Forwarded unchanged to the parent constructor.
    data_splits_url : str, optional
        Forwarded unchanged to the parent constructor.
    task_id : int, optional
        Forwarded unchanged to the parent constructor.
    """
    # Resolve here so the parent always receives a concrete ID, and pass it
    # exactly once: a repeated keyword argument is a SyntaxError.
    resolved_estimation_procedure_id = self._resolve_estimation_procedure_id(
        estimation_procedure_id,
    )
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=resolved_estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        data_splits_url=data_splits_url,
        target_name=target_name,
    )

def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
"""Get data associated with the current task.

Expand Down Expand Up @@ -325,6 +342,8 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):

Parameters
----------
task_id : Union[int, None]
ID of the Classification task (if it already exists on OpenML).
task_type_id : TaskType
ID of the Classification task type.
task_type : str
Expand All @@ -333,7 +352,7 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):
ID of the OpenML dataset associated with the Classification task.
target_name : str
Name of the target variable.
estimation_procedure_id : int, default=None
estimation_procedure_id : int, default=1
ID of the estimation procedure for the Classification task.
estimation_procedure_type : str, default=None
Type of the estimation procedure.
Expand All @@ -343,21 +362,21 @@ class OpenMLClassificationTask(OpenMLSupervisedTask):
Name of the evaluation measure.
data_splits_url : str, default=None
URL of the data splits for the Classification task.
task_id : Union[int, None]
ID of the Classification task (if it already exists on OpenML).
class_labels : List of str, default=None
A list of class labels (for classification tasks).
cost_matrix : array, default=None
A cost matrix (for classification tasks).
"""

DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 1

def __init__( # noqa: PLR0913
self,
task_type_id: TaskType,
task_type: str,
data_set_id: int,
target_name: str,
estimation_procedure_id: int = 1,
estimation_procedure_id: int | None = None,
estimation_procedure_type: str | None = None,
estimation_parameters: dict[str, str] | None = None,
evaluation_measure: str | None = None,
Expand All @@ -367,20 +386,19 @@ def __init__( # noqa: PLR0913
cost_matrix: np.ndarray | None = None,
):
super().__init__(
task_id=task_id,
task_type_id=task_type_id,
task_type=task_type,
data_set_id=data_set_id,
target_name=target_name,
estimation_procedure_id=estimation_procedure_id,
estimation_procedure_type=estimation_procedure_type,
estimation_parameters=estimation_parameters,
evaluation_measure=evaluation_measure,
target_name=target_name,
data_splits_url=data_splits_url,
task_id=task_id,
)
self.class_labels = class_labels
self.cost_matrix = cost_matrix

if cost_matrix is not None:
raise NotImplementedError("Costmatrix")

Expand All @@ -390,6 +408,8 @@ class OpenMLRegressionTask(OpenMLSupervisedTask):

Parameters
----------
task_id : Union[int, None]
ID of the OpenML Regression task.
task_type_id : TaskType
Task type ID of the OpenML Regression task.
task_type : str
Expand All @@ -398,62 +418,36 @@ class OpenMLRegressionTask(OpenMLSupervisedTask):
ID of the OpenML dataset.
target_name : str
Name of the target feature used in the Regression task.
estimation_procedure_id : int, default=None
estimation_procedure_id : int, default=7
ID of the OpenML estimation procedure.
estimation_procedure_type : str, default=None
Type of the OpenML estimation procedure.
estimation_parameters : dict, default=None
Parameters used by the OpenML estimation procedure.
data_splits_url : str, default=None
URL of the OpenML data splits for the Regression task.
task_id : Union[int, None]
ID of the OpenML Regression task.
evaluation_measure : str, default=None
Evaluation measure used in the Regression task.
"""

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 7,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    evaluation_measure: str | None = None,
):
    """Initialise a regression task.

    Every argument is passed by keyword, unchanged, to the parent
    constructor. The only thing this override adds is the default
    ``estimation_procedure_id`` of 7.
    """
    super().__init__(
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        data_splits_url=data_splits_url,
        task_id=task_id,
        evaluation_measure=evaluation_measure,
    )
DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 7


class OpenMLClusteringTask(OpenMLTask):
"""OpenML Clustering object.

Parameters
----------
task_id : Union[int, None]
ID of the OpenML clustering task.
task_type_id : TaskType
Task type ID of the OpenML clustering task.
task_type : str
Task type of the OpenML clustering task.
data_set_id : int
ID of the OpenML dataset used in clustering the task.
estimation_procedure_id : int, default=None
estimation_procedure_id : int, default=17
ID of the OpenML estimation procedure.
task_id : Union[int, None]
ID of the OpenML clustering task.
estimation_procedure_type : str, default=None
Type of the OpenML estimation procedure used in the clustering task.
estimation_parameters : dict, default=None
Expand All @@ -467,32 +461,7 @@ class OpenMLClusteringTask(OpenMLTask):
feature set for the clustering task.
"""

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    estimation_procedure_id: int = 17,
    task_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    evaluation_measure: str | None = None,
    target_name: str | None = None,
):
    """Initialise a clustering task.

    All arguments except ``target_name`` are passed by keyword, unchanged,
    to the parent constructor. ``target_name`` is stored on the instance
    afterwards. ``estimation_procedure_id`` defaults to 17.
    """
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        data_splits_url=data_splits_url,
    )

    # Not forwarded above, so it is set after the parent has run.
    self.target_name = target_name
DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 17

def get_X(self) -> pd.DataFrame:
"""Get data associated with the current task.
Expand Down Expand Up @@ -528,6 +497,8 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):

Parameters
----------
task_id : Union[int, None]
ID of the Learning Curve task.
task_type_id : TaskType
ID of the Learning Curve task.
task_type : str
Expand All @@ -536,16 +507,14 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):
ID of the dataset that this task is associated with.
target_name : str
Name of the target feature in the dataset.
estimation_procedure_id : int, default=None
estimation_procedure_id : int, default=13
ID of the estimation procedure to use for evaluating models.
estimation_procedure_type : str, default=None
Type of the estimation procedure.
estimation_parameters : dict, default=None
Additional parameters for the estimation procedure.
data_splits_url : str, default=None
URL of the file containing the data splits for Learning Curve task.
task_id : Union[int, None]
ID of the Learning Curve task.
evaluation_measure : str, default=None
Name of the evaluation measure to use for evaluating models.
class_labels : list of str, default=None
Expand All @@ -554,32 +523,4 @@ class OpenMLLearningCurveTask(OpenMLClassificationTask):
Cost matrix for Learning Curve tasks.
"""

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 13,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    evaluation_measure: str | None = None,
    class_labels: list[str] | None = None,
    cost_matrix: np.ndarray | None = None,
):
    """Initialise a learning-curve task.

    Every argument is passed by keyword, unchanged, to the parent
    constructor. The only thing this override adds is the default
    ``estimation_procedure_id`` of 13.
    """
    super().__init__(
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        data_splits_url=data_splits_url,
        task_id=task_id,
        evaluation_measure=evaluation_measure,
        class_labels=class_labels,
        cost_matrix=cost_matrix,
    )
DEFAULT_ESTIMATION_PROCEDURE_ID: ClassVar[int] = 13