diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 666b75c37..13e2a6711 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -376,7 +376,8 @@ def initialize_model_from_run(run_id: int, *, strict_version: bool = True) -> An run = get_run(run_id) # TODO(eddiebergman): I imagine this is None if it's not published, # might need to raise an explicit error for that - assert run.setup_id is not None + if run.setup_id is None: + raise ValueError(f"Run {run_id} has no associated setup_id. Cannot initialize model.") return initialize_model(setup_id=run.setup_id, strict_version=strict_version) @@ -416,7 +417,8 @@ def initialize_model_from_trace( run = get_run(run_id) # TODO(eddiebergman): I imagine this is None if it's not published, # might need to raise an explicit error for that - assert run.flow_id is not None + if run.flow_id is None: + raise ValueError(f"Run {run_id} has no associated flow_id. Cannot initialize model.") flow = get_flow(run.flow_id) run_trace = get_run_trace(run_id) @@ -576,8 +578,10 @@ def _calculate_local_measure( # type: ignore _user_defined_measures_fold[openml_name] = sklearn_fn(_test_y, _pred_y) if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): - assert test_y is not None - assert proba_y is not None + if test_y is None: + raise ValueError("test_y cannot be None for classification tasks.") + if proba_y is None: + raise ValueError("proba_y cannot be None for classification tasks.") for i, tst_idx in enumerate(test_indices): if task.class_labels is not None: @@ -622,7 +626,8 @@ def _calculate_local_measure( # type: ignore ) elif isinstance(task, OpenMLRegressionTask): - assert test_y is not None + if test_y is None: + raise ValueError("test_y cannot be None for regression tasks.") for i, _ in enumerate(test_indices): truth = test_y.iloc[i] if isinstance(test_y, pd.Series) else test_y[i] arff_line = format_prediction( @@ -743,7 +748,8 @@ def _run_task_get_arffcontent_parallel_helper( # 
noqa: PLR0913 if isinstance(task, OpenMLSupervisedTask): x, y = task.get_X_and_y() - assert isinstance(y, (pd.Series, pd.DataFrame)) + if not isinstance(y, (pd.Series, pd.DataFrame)): + raise TypeError(f"y must be a pandas Series or DataFrame, got {type(y).__name__}") train_x = x.iloc[train_indices] train_y = y.iloc[train_indices] test_x = x.iloc[test_indices] @@ -1202,7 +1208,11 @@ def __list_runs(api_call: str) -> pd.DataFrame: f'"http://openml.org/openml": {runs_dict}', ) - assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type(runs_dict["oml:runs"]) + if not isinstance(runs_dict["oml:runs"]["oml:run"], list): + raise TypeError( + "Expected runs_dict['oml:runs']['oml:run'] to be a list, " + f"got {type(runs_dict['oml:runs']['oml:run']).__name__}" + ) runs = { int(r["oml:run_id"]): { diff --git a/openml/runs/run.py b/openml/runs/run.py index 945264131..337b740ea 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -390,6 +390,48 @@ def to_filesystem( if self.trace is not None: self.trace._to_filesystem(directory) + def _get_arff_attributes_for_task(self, task: OpenMLTask) -> list[tuple[str, Any]]: + """Get ARFF attributes based on task type. + + Parameters + ---------- + task : OpenMLTask + The task for which to generate attributes. + + Returns + ------- + list[tuple[str, Any]] + List of attribute tuples (name, type). + """ + instance_specifications = [ + ("repeat", "NUMERIC"), + ("fold", "NUMERIC"), + ] + + if isinstance(task, (OpenMLLearningCurveTask, OpenMLClassificationTask)): + instance_specifications.append(("sample", "NUMERIC")) + + instance_specifications.append(("row_id", "NUMERIC")) + + if isinstance(task, (OpenMLLearningCurveTask, OpenMLClassificationTask)): + class_labels = task.class_labels + if class_labels is None: + raise ValueError("The task has no class labels") + + prediction_confidences = [ + ("confidence."
+ class_labels[i], "NUMERIC") for i in range(len(class_labels)) + ] + prediction_and_true = [("prediction", class_labels), ("correct", class_labels)] + return instance_specifications + prediction_and_true + prediction_confidences + + if isinstance(task, OpenMLRegressionTask): + return [*instance_specifications, ("prediction", "NUMERIC"), ("truth", "NUMERIC")] + + if isinstance(task, OpenMLClusteringTask): + return [*instance_specifications, ("cluster", "NUMERIC")] + + raise NotImplementedError(f"Task type {task.task_type!s} is not yet supported.") + def _generate_arff_dict(self) -> OrderedDict[str, Any]: """Generates the arff dictionary for uploading predictions to the server. @@ -407,7 +449,8 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]: if self.data_content is None: raise ValueError("Run has not been executed.") if self.flow is None: - assert self.flow_id is not None, "Run has no associated flow id!" + if self.flow_id is None: + raise ValueError("Run has no associated flow id!") self.flow = get_flow(self.flow_id) if self.description_text is None: @@ -418,69 +461,7 @@ def _generate_arff_dict(self) -> OrderedDict[str, Any]: arff_dict["data"] = self.data_content arff_dict["description"] = self.description_text arff_dict["relation"] = f"openml_task_{task.task_id}_predictions" - - if isinstance(task, OpenMLLearningCurveTask): - class_labels = task.class_labels - instance_specifications = [ - ("repeat", "NUMERIC"), - ("fold", "NUMERIC"), - ("sample", "NUMERIC"), - ("row_id", "NUMERIC"), - ] - - arff_dict["attributes"] = instance_specifications - if class_labels is not None: - arff_dict["attributes"] = ( - arff_dict["attributes"] - + [("prediction", class_labels), ("correct", class_labels)] - + [ - ("confidence." 
+ class_labels[i], "NUMERIC") - for i in range(len(class_labels)) - ] - ) - else: - raise ValueError("The task has no class labels") - - elif isinstance(task, OpenMLClassificationTask): - class_labels = task.class_labels - instance_specifications = [ - ("repeat", "NUMERIC"), - ("fold", "NUMERIC"), - ("sample", "NUMERIC"), # Legacy - ("row_id", "NUMERIC"), - ] - - arff_dict["attributes"] = instance_specifications - if class_labels is not None: - prediction_confidences = [ - ("confidence." + class_labels[i], "NUMERIC") for i in range(len(class_labels)) - ] - prediction_and_true = [("prediction", class_labels), ("correct", class_labels)] - arff_dict["attributes"] = ( - arff_dict["attributes"] + prediction_and_true + prediction_confidences - ) - else: - raise ValueError("The task has no class labels") - - elif isinstance(task, OpenMLRegressionTask): - arff_dict["attributes"] = [ - ("repeat", "NUMERIC"), - ("fold", "NUMERIC"), - ("row_id", "NUMERIC"), - ("prediction", "NUMERIC"), - ("truth", "NUMERIC"), - ] - - elif isinstance(task, OpenMLClusteringTask): - arff_dict["attributes"] = [ - ("repeat", "NUMERIC"), - ("fold", "NUMERIC"), - ("row_id", "NUMERIC"), - ("cluster", "NUMERIC"), - ] - - else: - raise NotImplementedError(f"Task type {task.task_type!s} is not yet supported.") + arff_dict["attributes"] = self._get_arff_attributes_for_task(task) return arff_dict @@ -637,7 +618,10 @@ def _get_file_elements(self) -> dict: if self.parameter_settings is None: if self.flow is None: - assert self.flow_id is not None # for mypy + if self.flow_id is None: + raise ValueError( + "Run has no associated flow_id and cannot obtain parameter values." 
+ ) self.flow = openml.flows.get_flow(self.flow_id) self.parameter_settings = self.flow.extension.obtain_parameter_values( self.flow, diff --git a/openml/runs/trace.py b/openml/runs/trace.py index bc9e1b5d6..861bd060a 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -93,7 +93,8 @@ def get_parameters(self) -> dict[str, Any]: for param, value in self.setup_string.items() } - assert self.parameters is not None + if self.parameters is None: + raise ValueError("Parameters must be set before calling get_parameters().") return {param[len(PREFIX) :]: value for param, value in self.parameters.items()} @@ -492,13 +493,21 @@ def merge_traces(cls, traces: list[OpenMLRunTrace]) -> OpenMLRunTrace: for iteration in trace: key = (iteration.repeat, iteration.fold, iteration.iteration) - assert iteration.parameters is not None + if iteration.parameters is None: + raise ValueError( + f"Iteration parameters cannot be None for repeat {iteration.repeat}, " + f"fold {iteration.fold}, iteration {iteration.iteration}" + ) param_keys = iteration.parameters.keys() if previous_iteration is not None: trace_itr = merged_trace[previous_iteration] - assert trace_itr.parameters is not None + if trace_itr.parameters is None: + raise ValueError( + f"Trace iteration parameters cannot be None " + f"for iteration {previous_iteration}" + ) trace_itr_keys = trace_itr.parameters.keys() if list(param_keys) != list(trace_itr_keys):