9 changes: 9 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
@@ -280,6 +280,7 @@ def test_dataset_by_name_cannot_access_private_data(self):
self.use_production_server()
self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_get_dataset_lazy_all_functions(self):
"""Test that all expected functionality is available without downloading the dataset."""
dataset = openml.datasets.get_dataset(1)
@@ -664,6 +665,7 @@ def test_attributes_arff_from_df_unknown_dtype(self):
with pytest.raises(ValueError, match=err_msg):
attributes_arff_from_df(df)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_create_dataset_numpy(self):
data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T

@@ -751,6 +753,7 @@ def test_create_dataset_list(self):
), "Uploaded ARFF does not match original one"
assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_create_dataset_sparse(self):
# test the scipy.sparse.coo_matrix
sparse_data = scipy.sparse.coo_matrix(
@@ -868,6 +871,7 @@ def test_get_online_dataset_arff(self):
return_type=arff.DENSE if d_format == "arff" else arff.COO,
), "ARFF files are not equal"

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_topic_api_error(self):
# Check server exception when non-admin accesses apis
self.assertRaisesRegex(
@@ -895,6 +899,7 @@ def test_get_online_dataset_format(self):
dataset_id
), "The format of the ARFF files is different"

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_create_dataset_pandas(self):
data = [
["a", "sunny", 85.0, 85.0, "FALSE", "no"],
@@ -1119,6 +1124,7 @@ def test_ignore_attributes_dataset(self):
paper_url=paper_url,
)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_publish_fetch_ignore_attribute(self):
"""Test to upload and retrieve dataset and check ignore_attributes"""
data = [
@@ -1237,6 +1243,7 @@ def test_create_dataset_row_id_attribute_error(self):
paper_url=paper_url,
)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_create_dataset_row_id_attribute_inference(self):
# meta-information
name = f"{self._get_sentinel()}-pandas_testing_dataset"
@@ -1400,6 +1407,7 @@ def test_data_edit_non_critical_field(self):
edited_dataset = openml.datasets.get_dataset(did)
assert edited_dataset.description == desc

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_data_edit_critical_field(self):
# Case 2
# only owners (or admin) can edit all critical fields of datasets
@@ -1448,6 +1456,7 @@ def test_data_edit_requires_valid_dataset(self):
description="xor operation dataset",
)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self):
# Need to own a dataset to be able to edit meta-data
# Will be creating a forked version of an existing dataset to allow the unit test user
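Note on the marker added throughout this diff: pytest.mark.xfail with strict=False reports a failing test as XFAIL and an unexpectedly passing one as XPASS, so neither outcome breaks the test run while the failures referenced by the reason string (apparently issue 1544) are investigated. A minimal sketch of that behaviour, independent of this repository (the test name and the assertion below are illustrative only):

import pytest

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_known_failure():
    # A failure here is reported as XFAIL rather than failing the run.
    # If the test unexpectedly passes, strict=False reports XPASS
    # instead of turning the pass into an error, as strict=True would.
    assert 1 == 2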
5 changes: 5 additions & 0 deletions tests/test_flows/test_flow.py
@@ -178,6 +178,7 @@ def test_to_xml_from_xml(self):
assert new_flow is not flow

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_publish_flow(self):
flow = openml.OpenMLFlow(
name="sklearn.dummy.DummyClassifier",
@@ -219,6 +220,7 @@ def test_publish_existing_flow(self, flow_exists_mock):
)

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_publish_flow_with_similar_components(self):
clf = sklearn.ensemble.VotingClassifier(
[("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))],
@@ -269,6 +271,7 @@ def test_publish_flow_with_similar_components(self):
TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}")

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_semi_legal_flow(self):
# TODO: Test if parameters are set correctly!
# should not throw error as it contains two differentiable forms of
@@ -377,6 +380,7 @@ def get_sentinel():
assert not flow_id

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_existing_flow_exists(self):
# create a flow
nb = sklearn.naive_bayes.GaussianNB()
@@ -417,6 +421,7 @@ def test_existing_flow_exists(self):
assert downloaded_flow_id == flow.flow_id

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_sklearn_to_upload_to_flow(self):
iris = sklearn.datasets.load_iris()
X = iris.data
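The pytest.mark.sklearn() marker used in these files is a project-specific marker; pytest only recognises custom markers once they are registered, typically in the test configuration, so that tests carrying them can be selected or deselected with -m on the command line. A hedged sketch of one way such a marker could be registered (the conftest.py location and the marker description are assumptions, not taken from this PR):

# conftest.py (assumed location; the project may register its markers elsewhere)
def pytest_configure(config):
    # Declare the custom "sklearn" marker so pytest does not emit
    # an unknown-marker warning when the decorator is used.
    config.addinivalue_line(
        "markers", "sklearn: tests that exercise the scikit-learn extension"
    )

Tests carrying the marker could then be run with pytest -m sklearn, or excluded with pytest -m "not sklearn".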
2 changes: 2 additions & 0 deletions tests/test_flows/test_flow_functions.py
@@ -274,6 +274,7 @@ def test_are_flows_equal_ignore_if_older(self):
assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None)

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="OrdinalEncoder introduced in 0.20. "
@@ -388,6 +389,7 @@ def test_get_flow_reinstantiate_flow_not_strict_pre_023(self):
assert "sklearn==0.19.1" not in flow.dependencies

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_get_flow_id(self):
if self.long_version:
list_all = openml.utils._list_all
5 changes: 5 additions & 0 deletions tests/test_runs/test_run.py
@@ -118,6 +118,7 @@ def _check_array(array, type_):
assert run_prime_trace_content is None

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_to_from_filesystem_vanilla(self):
model = Pipeline(
[
@@ -153,6 +154,7 @@ def test_to_from_filesystem_vanilla(self):

@pytest.mark.sklearn()
@pytest.mark.flaky()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_to_from_filesystem_search(self):
model = Pipeline(
[
@@ -187,6 +189,7 @@ )
)

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_to_from_filesystem_no_model(self):
model = Pipeline(
[("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())],
@@ -292,6 +295,7 @@ def assert_run_prediction_data(task, run, model):
assert_method(y_test, saved_y_test)

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_publish_with_local_loaded_flow(self):
"""
Publish a run tied to a local flow after it has first been saved to
@@ -335,6 +339,7 @@ def test_publish_with_local_loaded_flow(self):
openml.runs.get_run(loaded_run.run_id)

@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_offline_and_online_run_identical(self):
extension = SklearnExtension()

22 changes: 22 additions & 0 deletions tests/test_runs/test_run_functions.py
@@ -413,6 +413,7 @@ def test_run_regression_on_classif_task(self):
task=task,
)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_check_erronous_sklearn_flow_fails(self):
task_id = 115 # diabetes; crossvalidation
@@ -881,6 +882,7 @@ def test_run_and_upload_maskedarrays(self):

##########################################################################

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_learning_curve_task_1(self):
task_id = 801 # diabetes dataset
@@ -905,6 +907,7 @@ def test_learning_curve_task_1(self):
)
self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_learning_curve_task_2(self):
task_id = 801 # diabetes dataset
@@ -941,6 +944,7 @@ def test_learning_curve_task_2(self):
)
self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
@@ -1019,6 +1023,7 @@ def _test_local_evaluations(self, run):
assert alt_scores[idx] >= 0
assert alt_scores[idx] <= 1

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_local_run_swapped_parameter_order_model(self):
clf = DecisionTreeClassifier()
@@ -1034,6 +1039,7 @@ def test_local_run_swapped_parameter_order_model(self):

self._test_local_evaluations(run)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1062,6 +1068,7 @@ def test_local_run_swapped_parameter_order_flow(self):

self._test_local_evaluations(run)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1099,6 +1106,7 @@ def test_online_run_metric_score(self):

self._test_local_evaluations(run)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1160,6 +1168,7 @@ def test_initialize_model_from_run(self):
assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"'
assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05"

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1219,6 +1228,7 @@ def test__run_exists(self):
run_ids = run_exists(task.task_id, setup_exists)
assert run_ids, (run_ids, clf)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_run_with_illegal_flow_id(self):
# check the case where the user adds an illegal flow id to a
@@ -1238,6 +1248,7 @@ def test_run_with_illegal_flow_id(self):
avoid_duplicate_runs=True,
)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_run_with_illegal_flow_id_after_load(self):
# Same as `test_run_with_illegal_flow_id`, but test this error is also
@@ -1294,6 +1305,7 @@ def test_run_with_illegal_flow_id_1(self):
avoid_duplicate_runs=True,
)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_run_with_illegal_flow_id_1_after_load(self):
# Same as `test_run_with_illegal_flow_id_1`, but test this error is
@@ -1332,6 +1344,7 @@ def test_run_with_illegal_flow_id_1_after_load(self):
loaded_run.publish,
)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1559,6 +1572,7 @@ def test_get_runs_list_by_tag(self):
runs = openml.runs.list_runs(tag="curves", size=2)
assert len(runs) >= 1

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1595,6 +1609,7 @@ def test_run_on_dataset_with_missing_labels_dataframe(self):
# repeat, fold, row_id, 6 confidences, prediction and correct label
assert len(row) == 12

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
@@ -1647,6 +1662,7 @@ def test_get_uncached_run(self):
with pytest.raises(openml.exceptions.OpenMLCacheException):
openml.runs.functions._get_cached_run(10)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_run_flow_on_task_downloaded_flow(self):
model = sklearn.ensemble.RandomForestClassifier(n_estimators=33)
@@ -1687,6 +1703,7 @@ def test_format_prediction_classification_no_probabilities(self):
with pytest.raises(ValueError, match="`proba` is required for classification task"):
format_prediction(classification, *ignored_input, proba=None)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_format_prediction_classification_incomplete_probabilities(self):
classification = openml.tasks.get_task(
self.TEST_SERVER_TASK_SIMPLE["task_id"],
@@ -1707,13 +1724,15 @@ def test_format_prediction_task_without_classlabels_set(self):
with pytest.raises(ValueError, match="The classification task must have class labels set"):
format_prediction(classification, *ignored_input, proba={})

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_format_prediction_task_learning_curve_sample_not_set(self):
learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation
probabilities = {c: 0.2 for c in learning_curve.class_labels}
ignored_input = [0] * 5
with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"):
format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_format_prediction_task_regression(self):
task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"]
_task_id = check_task_existence(**task_meta_data)
@@ -1743,6 +1762,7 @@ def test_format_prediction_task_regression(self):



@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.20"),
reason="SimpleImputer doesn't handle mixed type DataFrame as input",
@@ -1843,6 +1863,7 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key):


@pytest.mark.sklearn()
@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
reason="couldn't perform local tests successfully w/o bloating RAM",
@@ -1919,6 +1940,7 @@ def test__run_task_get_arffcontent_2(parallel_mock):
)


@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
@unittest.skipIf(
Version(sklearn.__version__) < Version("0.21"),
4 changes: 4 additions & 0 deletions tests/test_setups/test_setup_functions.py
@@ -34,6 +34,7 @@ def setUp(self):
self.extension = SklearnExtension()
super().setUp()

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_nonexisting_setup_exists(self):
# first publish a non-existing flow
@@ -81,6 +82,7 @@ def _existing_setup_exists(self, classif):
setup_id = openml.setups.setup_exists(flow)
assert setup_id == run.setup_id

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_existing_setup_exists_1(self):
def side_effect(self):
@@ -96,11 +98,13 @@ def side_effect(self):
nb = sklearn.naive_bayes.GaussianNB()
self._existing_setup_exists(nb)

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_exisiting_setup_exists_2(self):
# Check a flow with one hyperparameter
self._existing_setup_exists(sklearn.naive_bayes.GaussianNB())

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.sklearn()
def test_existing_setup_exists_3(self):
# Check a flow with many hyperparameters
3 changes: 3 additions & 0 deletions tests/test_tasks/test_classification_task.py
@@ -18,18 +18,21 @@ def setUp(self, n_levels: int = 1):
self.task_type = TaskType.SUPERVISED_CLASSIFICATION
self.estimation_procedure = 5

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_download_task(self):
task = super().test_download_task()
assert task.task_id == self.task_id
assert task.task_type_id == TaskType.SUPERVISED_CLASSIFICATION
assert task.dataset_id == 20
assert task.estimation_procedure_id == self.estimation_procedure

@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
def test_class_labels(self):
task = get_task(self.task_id)
assert task.class_labels == ["tested_negative", "tested_positive"]


@pytest.mark.xfail(reason="failures_issue_1544", strict=False)
@pytest.mark.server()
def test_get_X_and_Y():
task = get_task(119)