Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,8 @@ def initialize_model_from_run(run_id: int, *, strict_version: bool = True) -> An
run = get_run(run_id)
# TODO(eddiebergman): I imagine this is None if it's not published,
# might need to raise an explicit error for that
assert run.setup_id is not None
if run.setup_id is None:
raise ValueError(f"Run {run_id} has no associated setup_id. Cannot initialize model.")
return initialize_model(setup_id=run.setup_id, strict_version=strict_version)


Expand Down Expand Up @@ -416,7 +417,8 @@ def initialize_model_from_trace(
run = get_run(run_id)
# TODO(eddiebergman): I imagine this is None if it's not published,
# might need to raise an explicit error for that
assert run.flow_id is not None
if run.flow_id is None:
raise ValueError(f"Run {run_id} has no associated flow_id. Cannot initialize model.")

flow = get_flow(run.flow_id)
run_trace = get_run_trace(run_id)
Expand Down Expand Up @@ -576,8 +578,10 @@ def _calculate_local_measure( # type: ignore
_user_defined_measures_fold[openml_name] = sklearn_fn(_test_y, _pred_y)

if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
assert test_y is not None
assert proba_y is not None
if test_y is None:
raise ValueError("test_y cannot be None for classification tasks.")
if proba_y is None:
raise ValueError("proba_y cannot be None for classification tasks.")

for i, tst_idx in enumerate(test_indices):
if task.class_labels is not None:
Expand Down Expand Up @@ -622,7 +626,8 @@ def _calculate_local_measure( # type: ignore
)

elif isinstance(task, OpenMLRegressionTask):
assert test_y is not None
if test_y is None:
raise ValueError("test_y cannot be None for regression tasks.")
for i, _ in enumerate(test_indices):
truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i]
arff_line = format_prediction(
Expand Down Expand Up @@ -743,7 +748,8 @@ def _run_task_get_arffcontent_parallel_helper( # noqa: PLR0913

if isinstance(task, OpenMLSupervisedTask):
x, y = task.get_X_and_y()
assert isinstance(y, (pd.Series, pd.DataFrame))
if not isinstance(y, (pd.Series, pd.DataFrame)):
raise TypeError(f"y must be a pandas Series or DataFrame, got {type(y).__name__}")
train_x = x.iloc[train_indices]
train_y = y.iloc[train_indices]
test_x = x.iloc[test_indices]
Expand Down Expand Up @@ -1202,7 +1208,11 @@ def __list_runs(api_call: str) -> pd.DataFrame:
f'"http://openml.org/openml": {runs_dict}',
)

assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type(runs_dict["oml:runs"])
if not isinstance(runs_dict["oml:runs"]["oml:run"], list):
raise TypeError(
f"Expected runs_dict['oml:runs']['oml:run'] to be a list, "
f"got {type(runs_dict['oml:runs']).__name__}"
)

runs = {
int(r["oml:run_id"]): {
Expand Down
114 changes: 49 additions & 65 deletions openml/runs/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,48 @@ def to_filesystem(
if self.trace is not None:
self.trace._to_filesystem(directory)

def _get_arff_attributes_for_task(self, task: OpenMLTask) -> list[tuple[str, Any]]:
    """Get ARFF attributes based on task type.

    Every attribute list starts with ``repeat`` and ``fold`` and contains
    ``row_id``; the remaining columns depend on the type of ``task``.

    Parameters
    ----------
    task : OpenMLTask
        The task for which to generate attributes.

    Returns
    -------
    list[tuple[str, Any]]
        List of attribute tuples (name, type). The type is either the string
        ``"NUMERIC"`` or, for the ``prediction`` and ``correct`` columns of
        classification and learning curve tasks, the task's class labels.

    Raises
    ------
    ValueError
        If a classification or learning curve task has no class labels.
    NotImplementedError
        If ``task`` is not a learning curve, classification, regression or
        clustering task.
    """
    # Annotated explicitly: the second tuple element is not always a string
    # (class labels are used as the type of nominal attributes below).
    attributes: list[tuple[str, Any]] = [("repeat", "NUMERIC"), ("fold", "NUMERIC")]

    if isinstance(task, (OpenMLLearningCurveTask, OpenMLClassificationTask)):
        class_labels = task.class_labels
        if class_labels is None:
            raise ValueError("The task has no class labels")

        # Column order: instance identifiers (including "sample"), then the
        # predicted and true label, then one confidence column per label.
        return [
            *attributes,
            ("sample", "NUMERIC"),
            ("row_id", "NUMERIC"),
            ("prediction", class_labels),
            ("correct", class_labels),
            *(("confidence." + label, "NUMERIC") for label in class_labels),
        ]

    # The remaining task types have no "sample" column.
    attributes.append(("row_id", "NUMERIC"))

    if isinstance(task, OpenMLRegressionTask):
        return [*attributes, ("prediction", "NUMERIC"), ("truth", "NUMERIC")]

    if isinstance(task, OpenMLClusteringTask):
        return [*attributes, ("cluster", "NUMERIC")]

    raise NotImplementedError(f"Task type {task.task_type!s} is not yet supported.")

def _generate_arff_dict(self) -> OrderedDict[str, Any]:
"""Generates the arff dictionary for uploading predictions to the
server.
Expand All @@ -407,7 +449,8 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]:
if self.data_content is None:
raise ValueError("Run has not been executed.")
if self.flow is None:
assert self.flow_id is not None, "Run has no associated flow id!"
if self.flow_id is None:
raise ValueError("Run has no associated flow id!")
self.flow = get_flow(self.flow_id)

if self.description_text is None:
Expand All @@ -418,69 +461,7 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]:
arff_dict["data"] = self.data_content
arff_dict["description"] = self.description_text
arff_dict["relation"] = f"openml_task_{task.task_id}_predictions"

if isinstance(task, OpenMLLearningCurveTask):
class_labels = task.class_labels
instance_specifications = [
("repeat", "NUMERIC"),
("fold", "NUMERIC"),
("sample", "NUMERIC"),
("row_id", "NUMERIC"),
]

arff_dict["attributes"] = instance_specifications
if class_labels is not None:
arff_dict["attributes"] = (
arff_dict["attributes"]
+ [("prediction", class_labels), ("correct", class_labels)]
+ [
("confidence." + class_labels[i], "NUMERIC")
for i in range(len(class_labels))
]
)
else:
raise ValueError("The task has no class labels")

elif isinstance(task, OpenMLClassificationTask):
class_labels = task.class_labels
instance_specifications = [
("repeat", "NUMERIC"),
("fold", "NUMERIC"),
("sample", "NUMERIC"), # Legacy
("row_id", "NUMERIC"),
]

arff_dict["attributes"] = instance_specifications
if class_labels is not None:
prediction_confidences = [
("confidence." + class_labels[i], "NUMERIC") for i in range(len(class_labels))
]
prediction_and_true = [("prediction", class_labels), ("correct", class_labels)]
arff_dict["attributes"] = (
arff_dict["attributes"] + prediction_and_true + prediction_confidences
)
else:
raise ValueError("The task has no class labels")

elif isinstance(task, OpenMLRegressionTask):
arff_dict["attributes"] = [
("repeat", "NUMERIC"),
("fold", "NUMERIC"),
("row_id", "NUMERIC"),
("prediction", "NUMERIC"),
("truth", "NUMERIC"),
]

elif isinstance(task, OpenMLClusteringTask):
arff_dict["attributes"] = [
("repeat", "NUMERIC"),
("fold", "NUMERIC"),
("row_id", "NUMERIC"),
("cluster", "NUMERIC"),
]

else:
raise NotImplementedError(f"Task type {task.task_type!s} is not yet supported.")
arff_dict["attributes"] = self._get_arff_attributes_for_task(task)

return arff_dict

Expand Down Expand Up @@ -637,7 +618,10 @@ def _get_file_elements(self) -> dict:

if self.parameter_settings is None:
if self.flow is None:
assert self.flow_id is not None # for mypy
if self.flow_id is None:
raise ValueError(
"Run has no associated flow_id and cannot obtain parameter values."
)
self.flow = openml.flows.get_flow(self.flow_id)
self.parameter_settings = self.flow.extension.obtain_parameter_values(
self.flow,
Expand Down
15 changes: 12 additions & 3 deletions openml/runs/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def get_parameters(self) -> dict[str, Any]:
for param, value in self.setup_string.items()
}

assert self.parameters is not None
if self.parameters is None:
raise ValueError("Parameters must be set before calling get_parameters().")
return {param[len(PREFIX) :]: value for param, value in self.parameters.items()}


Expand Down Expand Up @@ -492,13 +493,21 @@ def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace:
for iteration in trace:
key = (iteration.repeat, iteration.fold, iteration.iteration)

assert iteration.parameters is not None
if iteration.parameters is None:
raise ValueError(
f"Iteration parameters cannot be None for repeat {iteration.repeat}, "
f"fold {iteration.fold}, iteration {iteration.iteration}"
)
param_keys = iteration.parameters.keys()

if previous_iteration is not None:
trace_itr = merged_trace[previous_iteration]

assert trace_itr.parameters is not None
if trace_itr.parameters is None:
raise ValueError(
f"Trace iteration parameters cannot be None "
f"for iteration {previous_iteration}"
)
trace_itr_keys = trace_itr.parameters.keys()

if list(param_keys) != list(trace_itr_keys):
Expand Down
Loading