From e89ed3f303431d2e7e308dfe368f5cf03da01992 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Mon, 29 Dec 2025 11:07:47 +0530 Subject: [PATCH 1/8] Added __repr__ --- openml/tasks/split.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/openml/tasks/split.py b/openml/tasks/split.py index 4e781df35..073a378c8 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -63,6 +63,29 @@ def __init__( self.folds = len(self.split[0]) self.samples = len(self.split[0][0]) + def __repr__(self) -> str: + header = "OpenML Split" + header = f"{header}\n{'=' * len(header)}\n" + + fields = { + "Name": self.name, + "Description": ( + self.description if len(self.description) <= 80 else self.description[:77] + "..." + ), + "Repeats": self.repeats, + "Folds": self.folds, + "Samples": self.samples, + } + + order = ["Name", "Description", "Repeats", "Folds", "Samples"] + + _fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, _ in _fields) + field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" + body = "\n".join(field_line_format.format(name, value) for name, value in _fields) + return header + body + def __eq__(self, other: Any) -> bool: if ( (not isinstance(self, type(other))) From e948f2e61996edd4fc80820458c3219c82ebef03 Mon Sep 17 00:00:00 2001 From: Eman Abdelhaleem <101830347+EmanAbdelhaleem@users.noreply.github.com> Date: Thu, 1 Jan 2026 13:18:21 +0200 Subject: [PATCH 2/8] [BUG] Temporarily fix issue #1586 by marking some failed tests as non-strict expected fail. (#1587) #### Metadata * Reference Issue: Temporarily fix issue #1586 #### Details - Running the pytest locally, I found only one failed test which is: `tests/test_runs/test_run_functions.py::test__run_task_get_arffcontent_2` - However, when trying to go through the failed tests in the recent runed jobs in different recent PRs, I found many other failed tests, I picked some of them and tried to make a kind of analysis, and here are my findings: ##### Primary Failure Patterns 1. OpenML Test Server Issues (Most Common) The majority of failures are caused by: - `OpenMLServerError: Unexpected server error when calling https://test.openml.org/... with Status code: 500` - Database connection errors: `Database connection error. Usually due to high server load. Please wait N seconds and try again.` - Timeout errors: `TIMEOUT: Failed to fetch uploaded dataset` 2. Cache/Filesystem Issues - `ValueError: Cannot remove faulty tasks cache directory ... Please do this manually!` - `FileNotFoundError: No such file or directory` 3. 
Data Format Issues - `KeyError: ['type'] not found in axis` - `KeyError: ['class'] not found in axis` - `KeyError: ['Class'] not found in axis` --- tests/test_datasets/test_dataset_functions.py | 9 ++++++++ tests/test_flows/test_flow.py | 5 +++++ tests/test_flows/test_flow_functions.py | 2 ++ tests/test_runs/test_run.py | 5 +++++ tests/test_runs/test_run_functions.py | 22 +++++++++++++++++++ tests/test_setups/test_setup_functions.py | 4 ++++ tests/test_tasks/test_classification_task.py | 3 +++ tests/test_tasks/test_learning_curve_task.py | 3 +++ tests/test_tasks/test_regression_task.py | 2 ++ tests/test_tasks/test_task.py | 3 +++ tests/test_tasks/test_task_functions.py | 1 + 11 files changed, 59 insertions(+) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 266a6f6f7..f8cb1943c 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -280,6 +280,7 @@ def test_dataset_by_name_cannot_access_private_data(self): self.use_production_server() self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE") + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_get_dataset_lazy_all_functions(self): """Test that all expected functionality is available without downloading the dataset.""" dataset = openml.datasets.get_dataset(1) @@ -664,6 +665,7 @@ def test_attributes_arff_from_df_unknown_dtype(self): with pytest.raises(ValueError, match=err_msg): attributes_arff_from_df(df) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_create_dataset_numpy(self): data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T @@ -751,6 +753,7 @@ def test_create_dataset_list(self): ), "Uploaded ARFF does not match original one" assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix sparse_data = scipy.sparse.coo_matrix( @@ -868,6 +871,7 @@ def test_get_online_dataset_arff(self): return_type=arff.DENSE if d_format == "arff" else arff.COO, ), "ARFF files are not equal" + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_topic_api_error(self): # Check server exception when non-admin accessses apis self.assertRaisesRegex( @@ -895,6 +899,7 @@ def test_get_online_dataset_format(self): dataset_id ), "The format of the ARFF files is different" + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_create_dataset_pandas(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -1119,6 +1124,7 @@ def test_ignore_attributes_dataset(self): paper_url=paper_url, ) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_publish_fetch_ignore_attribute(self): """Test to upload and retrieve dataset and check ignore_attributes""" data = [ @@ -1237,6 +1243,7 @@ def test_create_dataset_row_id_attribute_error(self): paper_url=paper_url, ) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_create_dataset_row_id_attribute_inference(self): # meta-information name = f"{self._get_sentinel()}-pandas_testing_dataset" @@ -1400,6 +1407,7 @@ def test_data_edit_non_critical_field(self): edited_dataset = openml.datasets.get_dataset(did) assert edited_dataset.description == desc + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_data_edit_critical_field(self): # Case 2 # only owners 
(or admin) can edit all critical fields of datasets @@ -1448,6 +1456,7 @@ def test_data_edit_requires_valid_dataset(self): description="xor operation dataset", ) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): # Need to own a dataset to be able to edit meta-data # Will be creating a forked version of an existing dataset to allow the unit test user diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 0b034c3b4..da719d058 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -178,6 +178,7 @@ def test_to_xml_from_xml(self): assert new_flow is not flow @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_publish_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", @@ -219,6 +220,7 @@ def test_publish_existing_flow(self, flow_exists_mock): ) @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_publish_flow_with_similar_components(self): clf = sklearn.ensemble.VotingClassifier( [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))], @@ -269,6 +271,7 @@ def test_publish_flow_with_similar_components(self): TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}") @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! # should not throw error as it contains two differentiable forms of @@ -377,6 +380,7 @@ def get_sentinel(): assert not flow_id @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_existing_flow_exists(self): # create a flow nb = sklearn.naive_bayes.GaussianNB() @@ -417,6 +421,7 @@ def test_existing_flow_exists(self): assert downloaded_flow_id == flow.flow_id @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_sklearn_to_upload_to_flow(self): iris = sklearn.datasets.load_iris() X = iris.data diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 9f8ec5e36..0be65ceac 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -274,6 +274,7 @@ def test_are_flows_equal_ignore_if_older(self): assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None) @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="OrdinalEncoder introduced in 0.20. 
" @@ -388,6 +389,7 @@ def test_get_flow_reinstantiate_flow_not_strict_pre_023(self): assert "sklearn==0.19.1" not in flow.dependencies @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_get_flow_id(self): if self.long_version: list_all = openml.utils._list_all diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 034b731aa..71651d431 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -118,6 +118,7 @@ def _check_array(array, type_): assert run_prime_trace_content is None @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_to_from_filesystem_vanilla(self): model = Pipeline( [ @@ -153,6 +154,7 @@ def test_to_from_filesystem_vanilla(self): @pytest.mark.sklearn() @pytest.mark.flaky() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_to_from_filesystem_search(self): model = Pipeline( [ @@ -187,6 +189,7 @@ def test_to_from_filesystem_search(self): ) @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_to_from_filesystem_no_model(self): model = Pipeline( [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())], @@ -292,6 +295,7 @@ def assert_run_prediction_data(task, run, model): assert_method(y_test, saved_y_test) @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_publish_with_local_loaded_flow(self): """ Publish a run tied to a local flow after it has first been saved to @@ -335,6 +339,7 @@ def test_publish_with_local_loaded_flow(self): openml.runs.get_run(loaded_run.run_id) @pytest.mark.sklearn() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_offline_and_online_run_identical(self): extension = SklearnExtension() diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index e4cec56ab..305d859d9 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -413,6 +413,7 @@ def test_run_regression_on_classif_task(self): task=task, ) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_check_erronous_sklearn_flow_fails(self): task_id = 115 # diabetes; crossvalidation @@ -881,6 +882,7 @@ def test_run_and_upload_maskedarrays(self): ########################################################################## + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_learning_curve_task_1(self): task_id = 801 # diabates dataset @@ -905,6 +907,7 @@ def test_learning_curve_task_1(self): ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_learning_curve_task_2(self): task_id = 801 # diabates dataset @@ -941,6 +944,7 @@ def test_learning_curve_task_2(self): ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), @@ -1019,6 +1023,7 @@ def _test_local_evaluations(self, run): assert alt_scores[idx] >= 0 assert alt_scores[idx] <= 1 + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_local_run_swapped_parameter_order_model(self): clf = DecisionTreeClassifier() @@ -1034,6 +1039,7 @@ def 
test_local_run_swapped_parameter_order_model(self): self._test_local_evaluations(run) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1062,6 +1068,7 @@ def test_local_run_swapped_parameter_order_flow(self): self._test_local_evaluations(run) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1099,6 +1106,7 @@ def test_online_run_metric_score(self): self._test_local_evaluations(run) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1160,6 +1168,7 @@ def test_initialize_model_from_run(self): assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"' assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05" + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1219,6 +1228,7 @@ def test__run_exists(self): run_ids = run_exists(task.task_id, setup_exists) assert run_ids, (run_ids, clf) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_run_with_illegal_flow_id(self): # check the case where the user adds an illegal flow id to a @@ -1238,6 +1248,7 @@ def test_run_with_illegal_flow_id(self): avoid_duplicate_runs=True, ) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_run_with_illegal_flow_id_after_load(self): # Same as `test_run_with_illegal_flow_id`, but test this error is also @@ -1294,6 +1305,7 @@ def test_run_with_illegal_flow_id_1(self): avoid_duplicate_runs=True, ) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_run_with_illegal_flow_id_1_after_load(self): # Same as `test_run_with_illegal_flow_id_1`, but test this error is @@ -1332,6 +1344,7 @@ def test_run_with_illegal_flow_id_1_after_load(self): loaded_run.publish, ) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1559,6 +1572,7 @@ def test_get_runs_list_by_tag(self): runs = openml.runs.list_runs(tag="curves", size=2) assert len(runs) >= 1 + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1595,6 +1609,7 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): # repeat, fold, row_id, 6 confidences, prediction and correct label assert len(row) == 12 + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -1647,6 +1662,7 @@ def test_get_uncached_run(self): with pytest.raises(openml.exceptions.OpenMLCacheException): openml.runs.functions._get_cached_run(10) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_run_flow_on_task_downloaded_flow(self): model = sklearn.ensemble.RandomForestClassifier(n_estimators=33) @@ -1687,6 +1703,7 @@ def test_format_prediction_classification_no_probabilities(self): with pytest.raises(ValueError, match="`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) + 
@pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_format_prediction_classification_incomplete_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1707,6 +1724,7 @@ def test_format_prediction_task_without_classlabels_set(self): with pytest.raises(ValueError, match="The classification task must have class labels set"): format_prediction(classification, *ignored_input, proba={}) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} @@ -1714,6 +1732,7 @@ def test_format_prediction_task_learning_curve_sample_not_set(self): with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"): format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_format_prediction_task_regression(self): task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"] _task_id = check_task_existence(**task_meta_data) @@ -1743,6 +1762,7 @@ def test_format_prediction_task_regression(self): + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", @@ -1843,6 +1863,7 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): @pytest.mark.sklearn() +@pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", @@ -1919,6 +1940,7 @@ def test__run_task_get_arffcontent_2(parallel_mock): ) +@pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 42af5362b..a3b698a37 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -34,6 +34,7 @@ def setUp(self): self.extension = SklearnExtension() super().setUp() + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_nonexisting_setup_exists(self): # first publish a non-existing flow @@ -81,6 +82,7 @@ def _existing_setup_exists(self, classif): setup_id = openml.setups.setup_exists(flow) assert setup_id == run.setup_id + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_existing_setup_exists_1(self): def side_effect(self): @@ -96,11 +98,13 @@ def side_effect(self): nb = sklearn.naive_bayes.GaussianNB() self._existing_setup_exists(nb) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_exisiting_setup_exists_2(self): # Check a flow with one hyperparameter self._existing_setup_exists(sklearn.naive_bayes.GaussianNB()) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() def test_existing_setup_exists_3(self): # Check a flow with many hyperparameters diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index d4f2ed9d7..5528cabf2 100644 --- a/tests/test_tasks/test_classification_task.py +++ 
b/tests/test_tasks/test_classification_task.py @@ -18,6 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_CLASSIFICATION self.estimation_procedure = 5 + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id @@ -25,11 +26,13 @@ def test_download_task(self): assert task.dataset_id == 20 assert task.estimation_procedure_id == self.estimation_procedure + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] +@pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.server() def test_get_X_and_Y(): task = get_task(119) diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index 4a3dede4e..5f4b3e0ab 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -18,6 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.LEARNING_CURVE self.estimation_procedure = 13 + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (768, 8) @@ -26,12 +27,14 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_categorical_dtype(Y) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id assert task.task_type_id == TaskType.LEARNING_CURVE assert task.dataset_id == 20 + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index 3e324c4f8..0cd2d96e2 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -49,6 +49,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_REGRESSION + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (194, 32) @@ -57,6 +58,7 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_numeric_dtype(Y) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index e4c9418f2..67f715d2b 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -4,6 +4,8 @@ import unittest from random import randint, shuffle +import pytest + from openml.datasets import ( get_dataset, list_datasets, @@ -33,6 +35,7 @@ def setUp(self, n_levels: int = 1): def test_download_task(self): return get_task(self.task_id) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_upload_task(self): # We don't know if the task in question already exists, so we try a few times. 
Checking # beforehand would not be an option because a concurrent unit test could potentially diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 0aa2dcc9b..110459711 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -152,6 +152,7 @@ def test_get_task(self): os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") ) + @pytest.mark.xfail(reason="failures_issue_1544", strict=False) def test_get_task_lazy(self): task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation assert isinstance(task, OpenMLTask) From 4a3aae6378dad069a6ee3484ad60b78b11686044 Mon Sep 17 00:00:00 2001 From: Eman Abdelhaleem <101830347+EmanAbdelhaleem@users.noreply.github.com> Date: Thu, 1 Jan 2026 15:13:18 +0200 Subject: [PATCH 3/8] [BUG] Fix Sklearn Models detection by safely importing openml-sklearn (#1556) #### Metadata * Reference Issue: Fixes #1542 #### Details Fixed sklearn models detection by safely importing openml-sklearn at `openml/runs/__init__.py` --- openml/extensions/functions.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/openml/extensions/functions.py b/openml/extensions/functions.py index 7a944c997..06902325e 100644 --- a/openml/extensions/functions.py +++ b/openml/extensions/functions.py @@ -1,6 +1,7 @@ # License: BSD 3-Clause from __future__ import annotations +import importlib.util from typing import TYPE_CHECKING, Any # Need to implement the following by its full path because otherwise it won't be possible to @@ -16,8 +17,9 @@ SKLEARN_HINT = ( "But it looks related to scikit-learn. " "Please install the OpenML scikit-learn extension (openml-sklearn) and try again. " + "You can use `pip install openml-sklearn` for installation." "For more information, see " - "https://github.com/openml/openml-sklearn?tab=readme-ov-file#installation" + "https://docs.openml.org/python/extensions/" ) @@ -58,6 +60,10 @@ def get_extension_by_flow( ------- Extension or None """ + # import openml_sklearn to register SklearnExtension + if importlib.util.find_spec("openml_sklearn"): + import openml_sklearn # noqa: F401 + candidates = [] for extension_class in openml.extensions.extensions: if extension_class.can_handle_flow(flow): @@ -103,6 +109,10 @@ def get_extension_by_model( ------- Extension or None """ + # import openml_sklearn to register SklearnExtension + if importlib.util.find_spec("openml_sklearn"): + import openml_sklearn # noqa: F401 + candidates = [] for extension_class in openml.extensions.extensions: if extension_class.can_handle_model(model): From 8bbed4361a5ef1b800dff35a72a399982fe87684 Mon Sep 17 00:00:00 2001 From: Rohan Sen Date: Fri, 2 Jan 2026 15:53:57 +0530 Subject: [PATCH 4/8] refactor: updated OpenMLEvaluation to use dataclass decorator (#1559) I have Refactored the `OpenMLEvaluation` class from a traditional Python class to use the `@dataclass` decorator to reduce boilerplate code and improve code maintainability. #### Metadata * Reference Issue: #1540 * New Tests Added: No * Documentation Updated: No * Change Log Entry: Refactored the `OpenMLEvaluation` class to use the `@dataclass` #### Details Edited the `OpenMLEvaluation` class in `openml\evaluations\evaluation.py` to use `@dataclass` decorator. 
This significantly reduces the boilerplate code in the following places: - Instance Variable Definitions **Before:** ```python def __init__( self, run_id: int, task_id: int, setup_id: int, flow_id: int, flow_name: str, data_id: int, data_name: str, function: str, upload_time: str, uploader: int, uploader_name: str, value: float | None, values: list[float] | None, array_data: str | None = None, ): self.run_id = run_id self.task_id = task_id self.setup_id = setup_id self.flow_id = flow_id self.flow_name = flow_name self.data_id = data_id self.data_name = data_name self.function = function self.upload_time = upload_time self.uploader = uploader self.uploader_name = uploader_name self.value = value self.values = values self.array_data = array_data ``` **After:** ```python run_id: int task_id: int setup_id: int flow_id: int flow_name: str data_id: int data_name: str function: str upload_time: str uploader: int uploader_name: str value: float | None values: list[float] | None array_data: str | None = None ``` - _to_dict Method Simplification **Before:** ```python def _to_dict(self) -> dict: return { "run_id": self.run_id, "task_id": self.task_id, "setup_id": self.setup_id, "flow_id": self.flow_id, "flow_name": self.flow_name, "data_id": self.data_id, "data_name": self.data_name, "function": self.function, "upload_time": self.upload_time, "uploader": self.uploader, "uploader_name": self.uploader_name, "value": self.value, "values": self.values, "array_data": self.array_data, } ``` **After:** ```python def _to_dict(self) -> dict: return asdict(self) ``` All tests are passing with accordnce to the changes: ```bash PS C:\Users\ASUS\Documents\work\opensource\openml-python> pytest tests/test_evaluations/ ======================================= test session starts ======================================= platform win32 -- Python 3.14.0, pytest-9.0.2, pluggy-1.6.0 rootdir: C:\Users\ASUS\Documents\work\opensource\openml-python configfile: pyproject.toml plugins: anyio-4.12.0, flaky-3.8.1, asyncio-1.3.0, cov-7.0.0, mock-3.15.1, rerunfailures-16.1, timeout-2.4.0, xdist-3.8.0, requests-mock-1.12.1 asyncio: mode=Mode.STRICT, debug=False, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function collected 13 items tests\test_evaluations\test_evaluation_functions.py ............ [ 92%] tests\test_evaluations\test_evaluations_example.py . 
[100%] ================================= 13 passed in 274.80s (0:04:34) ================================== ``` --- openml/evaluations/evaluation.py | 72 ++++++++++---------------------- 1 file changed, 21 insertions(+), 51 deletions(-) diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py index 6d69d377e..5db087024 100644 --- a/openml/evaluations/evaluation.py +++ b/openml/evaluations/evaluation.py @@ -1,6 +1,8 @@ # License: BSD 3-Clause from __future__ import annotations +from dataclasses import asdict, dataclass + import openml.config import openml.datasets import openml.flows @@ -8,8 +10,7 @@ import openml.tasks -# TODO(eddiebergman): A lot of this class is automatically -# handled by a dataclass +@dataclass class OpenMLEvaluation: """ Contains all meta-information about a run / evaluation combination, @@ -48,55 +49,23 @@ class OpenMLEvaluation: (e.g., in case of precision, auroc, recall) """ - def __init__( # noqa: PLR0913 - self, - run_id: int, - task_id: int, - setup_id: int, - flow_id: int, - flow_name: str, - data_id: int, - data_name: str, - function: str, - upload_time: str, - uploader: int, - uploader_name: str, - value: float | None, - values: list[float] | None, - array_data: str | None = None, - ): - self.run_id = run_id - self.task_id = task_id - self.setup_id = setup_id - self.flow_id = flow_id - self.flow_name = flow_name - self.data_id = data_id - self.data_name = data_name - self.function = function - self.upload_time = upload_time - self.uploader = uploader - self.uploader_name = uploader_name - self.value = value - self.values = values - self.array_data = array_data + run_id: int + task_id: int + setup_id: int + flow_id: int + flow_name: str + data_id: int + data_name: str + function: str + upload_time: str + uploader: int + uploader_name: str + value: float | None + values: list[float] | None + array_data: str | None = None def _to_dict(self) -> dict: - return { - "run_id": self.run_id, - "task_id": self.task_id, - "setup_id": self.setup_id, - "flow_id": self.flow_id, - "flow_name": self.flow_name, - "data_id": self.data_id, - "data_name": self.data_name, - "function": self.function, - "upload_time": self.upload_time, - "uploader": self.uploader, - "uploader_name": self.uploader_name, - "value": self.value, - "values": self.values, - "array_data": self.array_data, - } + return asdict(self) def __repr__(self) -> str: header = "OpenML Evaluation" @@ -119,11 +88,12 @@ def __repr__(self) -> str: } order = [ - "Uploader Date", + "Upload Date", "Run ID", "OpenML Run URL", "Task ID", - "OpenML Task URL" "Flow ID", + "OpenML Task URL", + "Flow ID", "OpenML Flow URL", "Setup ID", "Data ID", From fa589ac7f9e0b06572b20d6f102bde0bf17e5ba9 Mon Sep 17 00:00:00 2001 From: DDiyash <149958769+DDiyash@users.noreply.github.com> Date: Fri, 2 Jan 2026 18:34:10 +0530 Subject: [PATCH 5/8] [MNT] Update Python version support and CI to include Python 3.14 (#1566) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #### Metadata * Reference Issue: Fixes #1531 * New Tests Added: No * Documentation Updated: Yes * Change Log Entry: Update supported Python version range to 3.10–3.14 and extend CI testing to Python 3.14 #### Details This pull request updates the officially supported Python version range for openml-python from 3.8–3.13 to 3.10–3.14, in line with currently supported Python releases. The following changes were made: Updated pyproject.toml to reflect the new supported Python range (3.10–3.14). 
Extended GitHub Actions CI workflows (test.yml, dist.yaml, docs.yaml) to include Python 3.14. Updated documentation (README.md) wherever Python version support is mentioned. No new functionality or tests were introduced; this is a maintenance update to keep Python version support and CI configuration up to date. This change ensures that users and contributors can use and test openml-python on the latest supported Python versions. --- .github/workflows/dist.yaml | 2 +- .github/workflows/docs.yaml | 2 +- .github/workflows/test.yml | 15 +++++++++++++-- .gitignore | 12 +++++++++++- README.md | 4 ++-- pyproject.toml | 2 +- 6 files changed, 29 insertions(+), 8 deletions(-) diff --git a/.github/workflows/dist.yaml b/.github/workflows/dist.yaml index 0d2adc9ee..ecf6f0a7f 100644 --- a/.github/workflows/dist.yaml +++ b/.github/workflows/dist.yaml @@ -27,7 +27,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: "3.10" - name: Build dist run: | pip install build diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index acce766ea..1a5a36a87 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -28,7 +28,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: "3.10" - name: Install dependencies run: | pip install -e .[docs,examples] diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b77cfd38c..850abdfe7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,3 +1,4 @@ +--- name: Tests on: @@ -21,13 +22,13 @@ concurrency: jobs: test: - name: (${{ matrix.os }}, Py${{ matrix.python-version }}, sk${{ matrix.scikit-learn }}, sk-only:${{ matrix.sklearn-only }}) + name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }},sk-only:${{ matrix.sklearn-only }}) runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - python-version: ["3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] scikit-learn: ["1.3.*", "1.4.*", "1.5.*", "1.6.*", "1.7.*"] os: [ubuntu-latest] sklearn-only: ["true"] @@ -38,8 +39,18 @@ jobs: scikit-learn: "1.3.*" - python-version: "3.13" scikit-learn: "1.4.*" + - python-version: "3.14" + scikit-learn: "1.3.*" + - python-version: "3.14" + scikit-learn: "1.4.*" include: + # Full test run on ubuntu, 3.14 + - os: ubuntu-latest + python-version: "3.14" + scikit-learn: "1.7.*" + sklearn-only: "false" + # Full test run on Windows - os: windows-latest python-version: "3.12" diff --git a/.gitignore b/.gitignore index 92679e5ca..d512c0ee6 100644 --- a/.gitignore +++ b/.gitignore @@ -98,7 +98,17 @@ dmypy.sock # Tests .pytest_cache + +# Virtual environments +oenv/ +venv/ +.env/ .venv +.venv/ + +# Python cache +__pycache__/ +*.pyc # Ruff -.ruff-cache/ \ No newline at end of file +.ruff-cache/ diff --git a/README.md b/README.md index e8df97ad6..c44e42981 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ ## The Python API for a World of Data and More :dizzy: [![Latest Release](https://img.shields.io/github/v/release/openml/openml-python)](https://github.com/openml/openml-python/releases) -[![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)](https://pypi.org/project/openml/) +[![Python Versions](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13%20%7C%203.14-blue)](https://pypi.org/project/openml/) 
[![Downloads](https://static.pepy.tech/badge/openml)](https://pepy.tech/project/openml) [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) @@ -60,7 +60,7 @@ for task_id in suite.tasks: ## :magic_wand: Installation -OpenML-Python is supported on Python 3.8 - 3.13 and is available on Linux, MacOS, and Windows. +OpenML-Python is supported on Python 3.10 - 3.14 and is available on Linux, MacOS, and Windows. You can install OpenML-Python with: diff --git a/pyproject.toml b/pyproject.toml index ede204ca0..14309c2d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "pyarrow", "tqdm", # For MinIO download progress bars ] -requires-python = ">=3.8" +requires-python = ">=3.10,<3.15" maintainers = [ { name = "Pieter Gijsbers", email="p.gijsbers@tue.nl"}, { name = "Lennart Purucker"}, From 5dfa47d0c315a28150e49994eb51d75cdabf1b18 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Sat, 3 Jan 2026 10:28:49 +0530 Subject: [PATCH 6/8] Added RepeMixin in utils --- openml/utils.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/openml/utils.py b/openml/utils.py index 7e72e7aee..b19f9e698 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -2,11 +2,23 @@ from __future__ import annotations import contextlib +import re import shutil import warnings +from abc import ABC, abstractmethod from functools import wraps from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Mapping, Sized, TypeVar, overload +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Iterable, + Mapping, + Sequence, + Sized, + TypeVar, + overload, +) from typing_extensions import Literal, ParamSpec import numpy as np @@ -469,3 +481,57 @@ def update(self, length: int) -> None: self._progress_bar.update(length) if self._progress_bar.total <= self._progress_bar.n: self._progress_bar.close() + + +class ReprMixin(ABC): + """A mixin class that provides a customizable string representation for OpenML objects. + + This mixin standardizes the __repr__ output format across OpenML classes. + Classes inheriting from this mixin should implement the + _get_repr_body_fields method to specify which fields to display. + """ + + def __repr__(self) -> str: + body_fields = self._get_repr_body_fields() + return self._apply_repr_template(body_fields) + + @abstractmethod + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: + """Collect all information to display in the __repr__ body. + + Returns + ------- + body_fields : List[Tuple[str, Union[str, int, List[str]]]] + A list of (name, value) pairs to display in the body of the __repr__. + E.g.: [('metric', 'accuracy'), ('dataset', 'iris')] + If value is a List of str, then each item of the list will appear in a separate row. + """ + # Should be implemented in the base class. + + def _apply_repr_template( + self, + body_fields: Iterable[tuple[str, str | int | list[str] | None]], + ) -> str: + """Generates the header and formats the body for string representation of the object. + + Parameters + ---------- + body_fields: List[Tuple[str, str]] + A list of (name, value) pairs to display in the body of the __repr__. + """ + # We add spaces between capitals, e.g. 
ClassificationTask -> Classification Task + name_with_spaces = re.sub( + r"(\w)([A-Z])", + r"\1 \2", + self.__class__.__name__[len("OpenML") :], + ) + header_text = f"OpenML {name_with_spaces}" + header = f"{header_text}\n{'=' * len(header_text)}\n" + + _body_fields: list[tuple[str, str | int | list[str]]] = [ + (k, "None" if v is None else v) for k, v in body_fields + ] + longest_field_name_length = max(len(name) for name, _ in _body_fields) + field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" + body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields) + return header + body From a76333eae13b7d07640e8bf1e544a9b786826520 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Thu, 8 Jan 2026 04:07:23 +0500 Subject: [PATCH 7/8] [MNT] add pytest marker to tests requiring test server (#1599) Fixes https://github.com/openml/openml-python/issues/1598 This PR adds the `@pytest.mark.uses_test_server()` marker to tests that depend on the OpenML test server. Changes * added `uses_test_server` on the relevant test sets. * replaced all the `server` markers with `uses_test_server` marker * removed all the `@pytest.mark.xfail(reason="failures_issue_1544", strict=False)` where the failure was due to race-conditions or server connectivity --- .github/workflows/test.yml | 10 +-- tests/test_datasets/test_dataset.py | 9 ++- tests/test_datasets/test_dataset_functions.py | 67 ++++++++++++++++--- .../test_evaluation_functions.py | 2 + tests/test_flows/test_flow.py | 13 ++-- tests/test_flows/test_flow_functions.py | 7 +- tests/test_openml/test_api_calls.py | 3 + tests/test_runs/test_run.py | 11 +-- tests/test_runs/test_run_functions.py | 65 +++++++++--------- tests/test_setups/test_setup_functions.py | 11 +-- tests/test_study/test_study_functions.py | 5 ++ tests/test_tasks/test_classification_task.py | 7 +- tests/test_tasks/test_clustering_task.py | 2 + tests/test_tasks/test_learning_curve_task.py | 6 +- tests/test_tasks/test_regression_task.py | 4 +- tests/test_tasks/test_supervised_task.py | 1 + tests/test_tasks/test_task.py | 3 +- tests/test_tasks/test_task_functions.py | 18 ++++- tests/test_tasks/test_task_methods.py | 2 + tests/test_utils/test_utils.py | 20 +++--- 20 files changed, 183 insertions(+), 83 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 850abdfe7..d65cc3796 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -98,9 +98,9 @@ jobs: fi if [ "${{ matrix.sklearn-only }}" = "true" ]; then - marks="sklearn and not production" + marks="sklearn and not production and not uses_test_server" else - marks="not production" + marks="not production and not uses_test_server" fi pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" @@ -113,9 +113,9 @@ jobs: fi if [ "${{ matrix.sklearn-only }}" = "true" ]; then - marks="sklearn and production" + marks="sklearn and production and not uses_test_server" else - marks="production" + marks="production and not uses_test_server" fi pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" @@ -123,7 +123,7 @@ jobs: - name: Run tests on Windows if: matrix.os == 'windows-latest' run: | # we need a separate step because of the bash-specific if-statement in the previous one. 
- pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 + pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not uses_test_server" - name: Check for files left behind by test if: matrix.os != 'windows-latest' && always() diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 66e9b8554..6dc4c7d5d 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -278,6 +278,7 @@ def test_equality_comparison(self): self.assertNotEqual(self.titanic, "Wrong_object") +@pytest.mark.uses_test_server() def test_tagging(): dataset = openml.datasets.get_dataset(125, download_data=False) @@ -294,7 +295,7 @@ def test_tagging(): datasets = openml.datasets.list_datasets(tag=tag) assert datasets.empty -@pytest.mark.xfail(reason="failures_issue_1544") +@pytest.mark.uses_test_server() def test_get_feature_with_ontology_data_id_11(): # test on car dataset, which has built-in ontology references dataset = openml.datasets.get_dataset(11) @@ -303,6 +304,7 @@ def test_get_feature_with_ontology_data_id_11(): assert len(dataset.features[2].ontologies) >= 1 assert len(dataset.features[3].ontologies) >= 1 +@pytest.mark.uses_test_server() def test_add_remove_ontology_to_dataset(): did = 1 feature_index = 1 @@ -310,6 +312,7 @@ def test_add_remove_ontology_to_dataset(): openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology) openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology) +@pytest.mark.uses_test_server() def test_add_same_ontology_multiple_features(): did = 1 ontology = "https://www.openml.org/unittest/" + str(time()) @@ -318,6 +321,7 @@ def test_add_same_ontology_multiple_features(): openml.datasets.functions.data_feature_add_ontology(did, i, ontology) +@pytest.mark.uses_test_server() def test_add_illegal_long_ontology(): did = 1 ontology = "http://www.google.com/" + ("a" * 257) @@ -329,6 +333,7 @@ def test_add_illegal_long_ontology(): +@pytest.mark.uses_test_server() def test_add_illegal_url_ontology(): did = 1 ontology = "not_a_url" + str(time()) @@ -400,6 +405,7 @@ def test_get_sparse_categorical_data_id_395(self): assert len(feature.nominal_values) == 25 +@pytest.mark.uses_test_server() def test__read_features(mocker, workdir, static_cache_dir): """Test we read the features from the xml if no cache pickle is available. This test also does some simple checks to verify that the features are read correctly @@ -431,6 +437,7 @@ def test__read_features(mocker, workdir, static_cache_dir): assert pickle_mock.dump.call_count == 1 +@pytest.mark.uses_test_server() def test__read_qualities(static_cache_dir, workdir, mocker): """Test we read the qualities from the xml if no cache pickle is available. This test also does some minor checks to ensure that the qualities are read correctly. 
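Note on PATCH 6/8 above (which ships without a description): a class opts into `ReprMixin` by implementing only `_get_repr_body_fields()`; the mixin then renders the same header-plus-dotted-field layout that PATCH 1/8 writes by hand for `OpenMLSplit`. The sketch below is illustrative only, assumes that patch is applied, and uses an invented class and invented field names.

```python
from openml.utils import ReprMixin  # added by PATCH 6/8


class OpenMLExample(ReprMixin):  # header renders as "OpenML Example"
    def __init__(self, name: str, folds: int):
        self.name = name
        self.folds = folds

    def _get_repr_body_fields(self):
        # Each (label, value) pair becomes one "Label...: value" row in the body.
        return [("Name", self.name), ("Folds", self.folds)]


print(OpenMLExample("demo", 10))
# OpenML Example
# ==============
# Name.: demo
# Folds: 10
```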
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index f8cb1943c..c41664ba7 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -107,6 +107,7 @@ def _check_datasets(self, datasets): for did in datasets: self._check_dataset(datasets[did]) + @pytest.mark.uses_test_server() def test_tag_untag_dataset(self): tag = "test_tag_%d" % random.randint(1, 1000000) all_tags = _tag_entity("data", 1, tag) @@ -114,10 +115,12 @@ def test_tag_untag_dataset(self): all_tags = _tag_entity("data", 1, tag, untag=True) assert tag not in all_tags + @pytest.mark.uses_test_server() def test_list_datasets_length(self): datasets = openml.datasets.list_datasets() assert len(datasets) >= 100 + @pytest.mark.uses_test_server() def test_list_datasets_paginate(self): size = 10 max = 100 @@ -132,6 +135,7 @@ def test_list_datasets_paginate(self): categories=["in_preparation", "active", "deactivated"], ) + @pytest.mark.uses_test_server() def test_list_datasets_empty(self): datasets = openml.datasets.list_datasets(tag="NoOneWouldUseThisTagAnyway") assert datasets.empty @@ -155,6 +159,7 @@ def test_check_datasets_active(self): ) openml.config.server = self.test_server + @pytest.mark.uses_test_server() def test_illegal_character_tag(self): dataset = openml.datasets.get_dataset(1) tag = "illegal_tag&" @@ -164,6 +169,7 @@ def test_illegal_character_tag(self): except openml.exceptions.OpenMLServerException as e: assert e.code == 477 + @pytest.mark.uses_test_server() def test_illegal_length_tag(self): dataset = openml.datasets.get_dataset(1) tag = "a" * 65 @@ -205,6 +211,7 @@ def test__name_to_id_with_multiple_active_error(self): error_if_multiple=True, ) + @pytest.mark.uses_test_server() def test__name_to_id_name_does_not_exist(self): """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( @@ -214,6 +221,7 @@ def test__name_to_id_name_does_not_exist(self): dataset_name="does_not_exist", ) + @pytest.mark.uses_test_server() def test__name_to_id_version_does_not_exist(self): """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( @@ -224,6 +232,7 @@ def test__name_to_id_version_does_not_exist(self): version=100000, ) + @pytest.mark.uses_test_server() def test_get_datasets_by_name(self): # did 1 and 2 on the test server: dids = ["anneal", "kr-vs-kp"] @@ -231,6 +240,7 @@ def test_get_datasets_by_name(self): assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) + @pytest.mark.uses_test_server() def test_get_datasets_by_mixed(self): # did 1 and 2 on the test server: dids = ["anneal", 2] @@ -238,12 +248,14 @@ def test_get_datasets_by_mixed(self): assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) + @pytest.mark.uses_test_server() def test_get_datasets(self): dids = [1, 2] datasets = openml.datasets.get_datasets(dids) assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) + @pytest.mark.uses_test_server() def test_get_dataset_by_name(self): dataset = openml.datasets.get_dataset("anneal") assert type(dataset) == OpenMLDataset @@ -262,6 +274,7 @@ def test_get_dataset_download_all_files(self): # test_get_dataset_lazy raise NotImplementedError + @pytest.mark.uses_test_server() def test_get_dataset_uint8_dtype(self): dataset = openml.datasets.get_dataset(1) assert type(dataset) == OpenMLDataset @@ -280,7 +293,7 @@ def test_dataset_by_name_cannot_access_private_data(self): 
self.use_production_server() self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE") - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_dataset_lazy_all_functions(self): """Test that all expected functionality is available without downloading the dataset.""" dataset = openml.datasets.get_dataset(1) @@ -310,24 +323,28 @@ def ensure_absence_of_real_data(): assert classes == ["1", "2", "3", "4", "5", "U"] ensure_absence_of_real_data() + @pytest.mark.uses_test_server() def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) X, *_ = dataset.get_data() assert isinstance(X, pd.DataFrame) assert all(isinstance(col, pd.SparseDtype) for col in X.dtypes) + @pytest.mark.uses_test_server() def test_download_rowid(self): # Smoke test which checks that the dataset has the row-id set correctly did = 44 dataset = openml.datasets.get_dataset(did) assert dataset.row_id_attribute == "Counter" + @pytest.mark.uses_test_server() def test__get_dataset_description(self): description = _get_dataset_description(self.workdir, 2) assert isinstance(description, dict) description_xml_path = os.path.join(self.workdir, "description.xml") assert os.path.exists(description_xml_path) + @pytest.mark.uses_test_server() def test__getarff_path_dataset_arff(self): openml.config.set_root_cache_directory(self.static_cache_dir) description = _get_dataset_description(self.workdir, 2) @@ -391,6 +408,7 @@ def test__download_minio_file_works_with_bucket_subdirectory(self): @mock.patch("openml._api_calls._download_minio_file") + @pytest.mark.uses_test_server() def test__get_dataset_parquet_is_cached(self, patch): openml.config.set_root_cache_directory(self.static_cache_dir) patch.side_effect = RuntimeError( @@ -431,18 +449,21 @@ def test__getarff_md5_issue(self): openml.config.connection_n_retries = n + @pytest.mark.uses_test_server() def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) assert isinstance(features_file, Path) features_xml_path = self.workdir / "features.xml" assert features_xml_path.exists() + @pytest.mark.uses_test_server() def test__get_dataset_qualities(self): qualities = _get_dataset_qualities_file(self.workdir, 2) assert isinstance(qualities, Path) qualities_xml_path = self.workdir / "qualities.xml" assert qualities_xml_path.exists() + @pytest.mark.uses_test_server() def test_get_dataset_force_refresh_cache(self): did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, @@ -465,6 +486,7 @@ def test_get_dataset_force_refresh_cache(self): did_cache_dir, ) + @pytest.mark.uses_test_server() def test_get_dataset_force_refresh_cache_clean_start(self): did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, @@ -501,12 +523,14 @@ def test_deletion_of_cache_dir(self): # get_dataset_description is the only data guaranteed to be downloaded @mock.patch("openml.datasets.functions._get_dataset_description") + @pytest.mark.uses_test_server() def test_deletion_of_cache_dir_faulty_download(self, patch): patch.side_effect = Exception("Boom!") self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1) datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets") assert len(os.listdir(datasets_cache_dir)) == 0 + @pytest.mark.uses_test_server() def test_publish_dataset(self): # lazy loading not possible as we need the arff-file. 
openml.datasets.get_dataset(3, download_data=True) @@ -532,6 +556,7 @@ def test_publish_dataset(self): ) assert isinstance(dataset.dataset_id, int) + @pytest.mark.uses_test_server() def test__retrieve_class_labels(self): openml.config.set_root_cache_directory(self.static_cache_dir) labels = openml.datasets.get_dataset(2).retrieve_class_labels() @@ -548,6 +573,7 @@ def test__retrieve_class_labels(self): labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name) assert labels == ["COIL", "SHEET"] + @pytest.mark.uses_test_server() def test_upload_dataset_with_url(self): dataset = OpenMLDataset( f"{self._get_sentinel()}-UploadTestWithURL", @@ -574,6 +600,7 @@ def _assert_status_of_dataset(self, *, did: int, status: str): assert result[did]["status"] == status @pytest.mark.flaky() + @pytest.mark.uses_test_server() def test_data_status(self): dataset = OpenMLDataset( f"{self._get_sentinel()}-UploadTestWithURL", @@ -665,7 +692,7 @@ def test_attributes_arff_from_df_unknown_dtype(self): with pytest.raises(ValueError, match=err_msg): attributes_arff_from_df(df) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_numpy(self): data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T @@ -699,6 +726,7 @@ def test_create_dataset_numpy(self): ), "Uploaded arff does not match original one" assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" + @pytest.mark.uses_test_server() def test_create_dataset_list(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -753,7 +781,7 @@ def test_create_dataset_list(self): ), "Uploaded ARFF does not match original one" assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix sparse_data = scipy.sparse.coo_matrix( @@ -856,6 +884,7 @@ def test_create_invalid_dataset(self): param["data"] = data[0] self.assertRaises(ValueError, create_dataset, **param) + @pytest.mark.uses_test_server() def test_get_online_dataset_arff(self): dataset_id = 100 # Australian # lazy loading not used as arff file is checked. 
@@ -871,7 +900,7 @@ def test_get_online_dataset_arff(self): return_type=arff.DENSE if d_format == "arff" else arff.COO, ), "ARFF files are not equal" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_topic_api_error(self): # Check server exception when non-admin accessses apis self.assertRaisesRegex( @@ -890,6 +919,7 @@ def test_topic_api_error(self): topic="business", ) + @pytest.mark.uses_test_server() def test_get_online_dataset_format(self): # Phoneme dataset dataset_id = 77 @@ -899,7 +929,7 @@ def test_get_online_dataset_format(self): dataset_id ), "The format of the ARFF files is different" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_pandas(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -1124,7 +1154,7 @@ def test_ignore_attributes_dataset(self): paper_url=paper_url, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_fetch_ignore_attribute(self): """Test to upload and retrieve dataset and check ignore_attributes""" data = [ @@ -1243,7 +1273,7 @@ def test_create_dataset_row_id_attribute_error(self): paper_url=paper_url, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_row_id_attribute_inference(self): # meta-information name = f"{self._get_sentinel()}-pandas_testing_dataset" @@ -1334,11 +1364,13 @@ def test_create_dataset_attributes_auto_without_df(self): paper_url=paper_url, ) + @pytest.mark.uses_test_server() def test_list_qualities(self): qualities = openml.datasets.list_qualities() assert isinstance(qualities, list) is True assert all(isinstance(q, str) for q in qualities) is True + @pytest.mark.uses_test_server() def test_get_dataset_cache_format_pickle(self): dataset = openml.datasets.get_dataset(1) dataset.get_data() @@ -1354,6 +1386,7 @@ def test_get_dataset_cache_format_pickle(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] + @pytest.mark.uses_test_server() def test_get_dataset_cache_format_feather(self): # This test crashed due to using the parquet file by default, which is downloaded # from minio. However, there is a mismatch between OpenML test server and minio IDs. 
@@ -1386,6 +1419,7 @@ def test_get_dataset_cache_format_feather(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] + @pytest.mark.uses_test_server() def test_data_edit_non_critical_field(self): # Case 1 # All users can edit non-critical fields of datasets @@ -1407,7 +1441,7 @@ def test_data_edit_non_critical_field(self): edited_dataset = openml.datasets.get_dataset(did) assert edited_dataset.description == desc - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_data_edit_critical_field(self): # Case 2 # only owners (or admin) can edit all critical fields of datasets @@ -1434,6 +1468,7 @@ def test_data_edit_critical_field(self): os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)), ) + @pytest.mark.uses_test_server() def test_data_edit_requires_field(self): # Check server exception when no field to edit is provided self.assertRaisesRegex( @@ -1446,6 +1481,7 @@ def test_data_edit_requires_field(self): data_id=64, # blood-transfusion-service-center ) + @pytest.mark.uses_test_server() def test_data_edit_requires_valid_dataset(self): # Check server exception when unknown dataset is provided self.assertRaisesRegex( @@ -1456,7 +1492,7 @@ def test_data_edit_requires_valid_dataset(self): description="xor operation dataset", ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): # Need to own a dataset to be able to edit meta-data # Will be creating a forked version of an existing dataset to allow the unit test user @@ -1483,6 +1519,7 @@ def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): default_target_attribute="y", ) + @pytest.mark.uses_test_server() def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self): # Check server exception when a non-owner or non-admin tries to edit critical fields self.assertRaisesRegex( @@ -1494,6 +1531,7 @@ def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self): default_target_attribute="y", ) + @pytest.mark.uses_test_server() def test_data_fork(self): did = 1 result = fork_dataset(did) @@ -1785,6 +1823,7 @@ def all_datasets(): return openml.datasets.list_datasets() +@pytest.mark.uses_test_server() def test_list_datasets(all_datasets: pd.DataFrame): # We can only perform a smoke test here because we test on dynamic # data from the internet... 
@@ -1793,42 +1832,49 @@ def test_list_datasets(all_datasets: pd.DataFrame): _assert_datasets_have_id_and_valid_status(all_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_tag(all_datasets: pd.DataFrame): tag_datasets = openml.datasets.list_datasets(tag="study_14") assert 0 < len(tag_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(tag_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_size(): datasets = openml.datasets.list_datasets(size=5) assert len(datasets) == 5 _assert_datasets_have_id_and_valid_status(datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): small_datasets = openml.datasets.list_datasets(number_instances="5..100") assert 0 < len(small_datasets) <= len(all_datasets) _assert_datasets_have_id_and_valid_status(small_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): wide_datasets = openml.datasets.list_datasets(number_features="50..100") assert 8 <= len(wide_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(wide_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): five_class_datasets = openml.datasets.list_datasets(number_classes="5") assert 3 <= len(five_class_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(five_class_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): na_datasets = openml.datasets.list_datasets(number_missing_values="5..100") assert 5 <= len(na_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(na_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_combined_filters(all_datasets: pd.DataFrame): combined_filter_datasets = openml.datasets.list_datasets( tag="study_14", @@ -1901,6 +1947,7 @@ def isolate_for_test(): ("with_data", "with_qualities", "with_features"), itertools.product([True, False], repeat=3), ) +@pytest.mark.uses_test_server() def test_get_dataset_lazy_behavior( isolate_for_test, with_data: bool, with_qualities: bool, with_features: bool ): @@ -1927,6 +1974,7 @@ def test_get_dataset_lazy_behavior( ) +@pytest.mark.uses_test_server() def test_get_dataset_with_invalid_id() -> None: INVALID_ID = 123819023109238 # Well, at some point this will probably be valid... with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e: @@ -1954,6 +2002,7 @@ def test_read_features_from_xml_with_whitespace() -> None: assert dict[1].nominal_values == [" - 50000.", " 50000+."] +@pytest.mark.uses_test_server() def test_get_dataset_parquet(requests_mock, test_files_directory): # Parquet functionality is disabled on the test server # There is no parquet-copy of the test server yet. 
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index ffd3d9f78..7009217d6 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -155,6 +155,7 @@ def test_evaluation_list_limit(self): ) assert len(evaluations) == 100 + @pytest.mark.uses_test_server() def test_list_evaluations_empty(self): evaluations = openml.evaluations.list_evaluations("unexisting_measure") if len(evaluations) > 0: @@ -232,6 +233,7 @@ def test_evaluation_list_sort(self): test_output = sorted(unsorted_output, reverse=True) assert test_output[:size] == sorted_output + @pytest.mark.uses_test_server() def test_list_evaluation_measures(self): measures = openml.evaluations.list_evaluation_measures() assert isinstance(measures, list) is True diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index da719d058..99cee6f87 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -102,6 +102,7 @@ def test_get_structure(self): subflow = flow.get_subflow(structure) assert subflow.flow_id == sub_flow_id + @pytest.mark.uses_test_server() def test_tagging(self): flows = openml.flows.list_flows(size=1) flow_id = flows["id"].iloc[0] @@ -119,6 +120,7 @@ def test_tagging(self): flows = openml.flows.list_flows(tag=tag) assert len(flows) == 0 + @pytest.mark.uses_test_server() def test_from_xml_to_xml(self): # Get the raw xml thing # TODO maybe get this via get_flow(), which would have to be refactored @@ -178,7 +180,7 @@ def test_to_xml_from_xml(self): assert new_flow is not flow @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", @@ -220,7 +222,7 @@ def test_publish_existing_flow(self, flow_exists_mock): ) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_flow_with_similar_components(self): clf = sklearn.ensemble.VotingClassifier( [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))], @@ -271,7 +273,7 @@ def test_publish_flow_with_similar_components(self): TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}") @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! # should not throw error as it contains two differentiable forms of @@ -363,6 +365,7 @@ def test_illegal_flow(self): ) self.assertRaises(ValueError, self.extension.model_to_flow, illegal) + @pytest.mark.uses_test_server() def test_nonexisting_flow_exists(self): def get_sentinel(): # Create a unique prefix for the flow. 
Necessary because the flow @@ -380,7 +383,7 @@ def get_sentinel(): assert not flow_id @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_existing_flow_exists(self): # create a flow nb = sklearn.naive_bayes.GaussianNB() @@ -421,7 +424,7 @@ def test_existing_flow_exists(self): assert downloaded_flow_id == flow.flow_id @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_sklearn_to_upload_to_flow(self): iris = sklearn.datasets.load_iris() X = iris.data diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 0be65ceac..46bc36a94 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -274,12 +274,12 @@ def test_are_flows_equal_ignore_if_older(self): assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="OrdinalEncoder introduced in 0.20. " "No known models with list of lists parameters in older versions.", ) + @pytest.mark.uses_test_server() def test_sklearn_to_flow_list_of_lists(self): from sklearn.preprocessing import OrdinalEncoder @@ -308,6 +308,7 @@ def test_get_flow1(self): assert flow.external_version is None @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_get_flow_reinstantiate_model(self): model = ensemble.RandomForestClassifier(n_estimators=33) extension = openml.extensions.get_extension_by_model(model) @@ -319,6 +320,7 @@ def test_get_flow_reinstantiate_model(self): downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) + @pytest.mark.uses_test_server() def test_get_flow_reinstantiate_model_no_extension(self): # Flow 10 is a WEKA flow self.assertRaisesRegex( @@ -389,7 +391,7 @@ def test_get_flow_reinstantiate_flow_not_strict_pre_023(self): assert "sklearn==0.19.1" not in flow.dependencies @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_flow_id(self): if self.long_version: list_all = openml.utils._list_all @@ -424,6 +426,7 @@ def test_get_flow_id(self): pytest.skip(reason="Not sure why there should only be one version of this flow.") assert flow_ids_exact_version_True == flow_ids_exact_version_False + @pytest.mark.uses_test_server() def test_delete_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index da6857b6e..a295259ef 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -15,12 +15,14 @@ class TestConfig(openml.testing.TestBase): + @pytest.mark.uses_test_server() def test_too_long_uri(self): with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"): openml.datasets.list_datasets(data_id=list(range(10000))) @unittest.mock.patch("time.sleep") @unittest.mock.patch("requests.Session") + @pytest.mark.uses_test_server() def test_retry_on_database_error(self, Session_class_mock, _): response_mock = unittest.mock.Mock() response_mock.text = ( @@ -115,6 +117,7 @@ def test_download_minio_failure(mock_minio, tmp_path: Path) -> None: ("task/42", "delete"), # 460 ], ) 
+@pytest.mark.uses_test_server() def test_authentication_endpoints_requiring_api_key_show_relevant_help_link( endpoint: str, method: str, diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 71651d431..1a66b76c0 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -25,6 +25,7 @@ class TestRun(TestBase): # Splitting not helpful, these test's don't rely on the server and take # less than 1 seconds + @pytest.mark.uses_test_server() def test_tagging(self): runs = openml.runs.list_runs(size=1) assert not runs.empty, "Test server state is incorrect" @@ -118,7 +119,7 @@ def _check_array(array, type_): assert run_prime_trace_content is None @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_to_from_filesystem_vanilla(self): model = Pipeline( [ @@ -154,7 +155,7 @@ def test_to_from_filesystem_vanilla(self): @pytest.mark.sklearn() @pytest.mark.flaky() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_to_from_filesystem_search(self): model = Pipeline( [ @@ -189,7 +190,7 @@ def test_to_from_filesystem_search(self): ) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_to_from_filesystem_no_model(self): model = Pipeline( [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())], @@ -295,7 +296,7 @@ def assert_run_prediction_data(task, run, model): assert_method(y_test, saved_y_test) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_with_local_loaded_flow(self): """ Publish a run tied to a local flow after it has first been saved to @@ -339,7 +340,7 @@ def test_publish_with_local_loaded_flow(self): openml.runs.get_run(loaded_run.run_id) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_offline_and_online_run_identical(self): extension = SklearnExtension() diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 305d859d9..db54151d1 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -398,6 +398,7 @@ def _check_sample_evaluations( assert evaluation < max_time_allowed @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_regression_on_classif_task(self): task_id = 259 # collins; crossvalidation; has numeric targets @@ -413,8 +414,8 @@ def test_run_regression_on_classif_task(self): task=task, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_check_erronous_sklearn_flow_fails(self): task_id = 115 # diabetes; crossvalidation task = openml.tasks.get_task(task_id) @@ -626,8 +627,8 @@ def _run_and_upload_regression( sentinel=sentinel, ) - @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_and_upload_logistic_regression(self): lr = LogisticRegression(solver="lbfgs", max_iter=1000) task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"] @@ -635,8 +636,8 @@ def test_run_and_upload_logistic_regression(self): n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501") - @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() + 
@pytest.mark.uses_test_server() def test_run_and_upload_linear_regression(self): lr = LinearRegression() task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"] @@ -666,8 +667,8 @@ def test_run_and_upload_linear_regression(self): n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"] self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501") - @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_and_upload_pipeline_dummy_pipeline(self): pipeline1 = Pipeline( steps=[ @@ -680,12 +681,12 @@ def test_run_and_upload_pipeline_dummy_pipeline(self): n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501") - @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) + @pytest.mark.uses_test_server() def test_run_and_upload_column_transformer_pipeline(self): import sklearn.compose import sklearn.impute @@ -745,7 +746,6 @@ def get_ct_cf(nominal_indices, numeric_indices): sentinel=sentinel, ) - @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() @unittest.skip("https://github.com/openml/OpenML/issues/1180") @unittest.skipIf( @@ -798,8 +798,8 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock): call_count += 1 assert call_count == 3 - @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_and_upload_gridsearch(self): estimator_name = ( "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator" @@ -822,7 +822,7 @@ def test_run_and_upload_gridsearch(self): assert len(run.trace.trace_iterations) == 9 @pytest.mark.sklearn() - @pytest.mark.skip(reason="failures_issue_1544") + @pytest.mark.uses_test_server() def test_run_and_upload_randomsearch(self): randomsearch = RandomizedSearchCV( RandomForestClassifier(n_estimators=5), @@ -854,8 +854,8 @@ def test_run_and_upload_randomsearch(self): trace = openml.runs.get_run_trace(run.run_id) assert len(trace.trace_iterations) == 5 - @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_and_upload_maskedarrays(self): # This testcase is important for 2 reasons: # 1) it verifies the correct handling of masked arrays (not all @@ -882,8 +882,8 @@ def test_run_and_upload_maskedarrays(self): ########################################################################## - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_learning_curve_task_1(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -907,8 +907,8 @@ def test_learning_curve_task_1(self): ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_learning_curve_task_2(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -944,12 +944,12 @@ def test_learning_curve_task_2(self): ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), 
reason="Pipelines don't support indexing (used for the assert check)", ) + @pytest.mark.uses_test_server() def test_initialize_cv_from_run(self): randomsearch = Pipeline( [ @@ -1023,8 +1023,8 @@ def _test_local_evaluations(self, run): assert alt_scores[idx] >= 0 assert alt_scores[idx] <= 1 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_local_run_swapped_parameter_order_model(self): clf = DecisionTreeClassifier() australian_task = 595 # Australian; crossvalidation @@ -1039,12 +1039,12 @@ def test_local_run_swapped_parameter_order_model(self): self._test_local_evaluations(run) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test_local_run_swapped_parameter_order_flow(self): # construct sci-kit learn classifier clf = Pipeline( @@ -1068,12 +1068,12 @@ def test_local_run_swapped_parameter_order_flow(self): self._test_local_evaluations(run) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test_local_run_metric_score(self): # construct sci-kit learn classifier clf = Pipeline( @@ -1106,12 +1106,12 @@ def test_online_run_metric_score(self): self._test_local_evaluations(run) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test_initialize_model_from_run(self): clf = sklearn.pipeline.Pipeline( steps=[ @@ -1168,12 +1168,12 @@ def test_initialize_model_from_run(self): assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"' assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test__run_exists(self): # would be better to not sentinel these clfs, # so we do not have to perform the actual runs @@ -1228,8 +1228,8 @@ def test__run_exists(self): run_ids = run_exists(task.task_id, setup_exists) assert run_ids, (run_ids, clf) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_with_illegal_flow_id(self): # check the case where the user adds an illegal flow id to a # non-existing flo @@ -1248,8 +1248,8 @@ def test_run_with_illegal_flow_id(self): avoid_duplicate_runs=True, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_with_illegal_flow_id_after_load(self): # Same as `test_run_with_illegal_flow_id`, but test this error is also # caught if the run is stored to and loaded from disk first. 
@@ -1281,6 +1281,7 @@ def test_run_with_illegal_flow_id_after_load(self): TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}") @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_with_illegal_flow_id_1(self): # Check the case where the user adds an illegal flow id to an existing # flow. Comes to a different value error than the previous test @@ -1305,8 +1306,8 @@ def test_run_with_illegal_flow_id_1(self): avoid_duplicate_runs=True, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_with_illegal_flow_id_1_after_load(self): # Same as `test_run_with_illegal_flow_id_1`, but test this error is # also caught if the run is stored to and loaded from disk first. @@ -1344,12 +1345,12 @@ def test_run_with_illegal_flow_id_1_after_load(self): loaded_run.publish, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="OneHotEncoder cannot handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test__run_task_get_arffcontent(self): task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation num_instances = 3196 @@ -1450,6 +1451,7 @@ def test_get_runs_list(self): for run in runs.to_dict(orient="index").values(): self._check_run(run) + @pytest.mark.uses_test_server() def test_list_runs_empty(self): runs = openml.runs.list_runs(task=[0]) assert runs.empty @@ -1572,12 +1574,12 @@ def test_get_runs_list_by_tag(self): runs = openml.runs.list_runs(tag="curves", size=2) assert len(runs) >= 1 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) + @pytest.mark.uses_test_server() def test_run_on_dataset_with_missing_labels_dataframe(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the @@ -1609,12 +1611,12 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): # repeat, fold, row_id, 6 confidences, prediction and correct label assert len(row) == 12 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) + @pytest.mark.uses_test_server() def test_run_on_dataset_with_missing_labels_array(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the @@ -1653,6 +1655,7 @@ def test_run_on_dataset_with_missing_labels_array(self): # repeat, fold, row_id, 6 confidences, prediction and correct label assert len(row) == 12 + @pytest.mark.uses_test_server() def test_get_cached_run(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.runs.functions._get_cached_run(1) @@ -1662,8 +1665,8 @@ def test_get_uncached_run(self): with pytest.raises(openml.exceptions.OpenMLCacheException): openml.runs.functions._get_cached_run(10) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_flow_on_task_downloaded_flow(self): model = sklearn.ensemble.RandomForestClassifier(n_estimators=33) flow = self.extension.model_to_flow(model) @@ -1694,6 +1697,7 @@ def test_format_prediction_non_supervised(self): ): 
format_prediction(clustering, *ignored_input) + @pytest.mark.uses_test_server() def test_format_prediction_classification_no_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1703,7 +1707,7 @@ def test_format_prediction_classification_no_probabilities(self): with pytest.raises(ValueError, match="`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_format_prediction_classification_incomplete_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1714,6 +1718,7 @@ def test_format_prediction_classification_incomplete_probabilities(self): with pytest.raises(ValueError, match="Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) + @pytest.mark.uses_test_server() def test_format_prediction_task_without_classlabels_set(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1724,7 +1729,7 @@ def test_format_prediction_task_without_classlabels_set(self): with pytest.raises(ValueError, match="The classification task must have class labels set"): format_prediction(classification, *ignored_input, proba={}) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} @@ -1732,7 +1737,7 @@ def test_format_prediction_task_learning_curve_sample_not_set(self): with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"): format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_format_prediction_task_regression(self): task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"] _task_id = check_task_existence(**task_meta_data) @@ -1762,12 +1767,12 @@ def test_format_prediction_task_regression(self): - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_delete_run(self): rs = np.random.randint(1, 2**31 - 1) clf = sklearn.pipeline.Pipeline( @@ -1863,12 +1868,12 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): @pytest.mark.sklearn() -@pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", ) @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs") +@pytest.mark.uses_test_server() def test__run_task_get_arffcontent_2(parallel_mock): """Tests if a run executed in parallel is collated correctly.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp @@ -1940,7 +1945,6 @@ def test__run_task_get_arffcontent_2(parallel_mock): ) -@pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), @@ -1960,6 +1964,7 @@ 
def test__run_task_get_arffcontent_2(parallel_mock): (-1, "threading", 10), # the threading backend does preserve mocks even with parallelizing ] ) +@pytest.mark.uses_test_server() def test_joblib_backends(parallel_mock, n_jobs, backend, call_count): """Tests evaluation of a run using various joblib backends and n_jobs.""" if backend is None: diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index a3b698a37..a0469f9a5 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -34,8 +34,8 @@ def setUp(self): self.extension = SklearnExtension() super().setUp() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_nonexisting_setup_exists(self): # first publish a non-existing flow sentinel = get_sentinel() @@ -82,8 +82,8 @@ def _existing_setup_exists(self, classif): setup_id = openml.setups.setup_exists(flow) assert setup_id == run.setup_id - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_existing_setup_exists_1(self): def side_effect(self): self.var_smoothing = 1e-9 @@ -98,14 +98,14 @@ def side_effect(self): nb = sklearn.naive_bayes.GaussianNB() self._existing_setup_exists(nb) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_exisiting_setup_exists_2(self): # Check a flow with one hyperparameter self._existing_setup_exists(sklearn.naive_bayes.GaussianNB()) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_existing_setup_exists_3(self): # Check a flow with many hyperparameters self._existing_setup_exists( @@ -147,6 +147,7 @@ def test_setup_list_filter_flow(self): for setup_id in setups: assert setups[setup_id].flow_id == flow_id + @pytest.mark.uses_test_server() def test_list_setups_empty(self): setups = openml.setups.list_setups(setup=[0]) if len(setups) > 0: @@ -167,6 +168,7 @@ def test_list_setups_output_format(self): assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 + @pytest.mark.uses_test_server() def test_setuplist_offset(self): size = 10 setups = openml.setups.list_setups(offset=0, size=size) @@ -178,6 +180,7 @@ def test_setuplist_offset(self): assert len(all) == size * 2 + @pytest.mark.uses_test_server() def test_get_cached_setup(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.setups.functions._get_cached_setup(1) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 40026592f..839e74cf3 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -73,6 +73,7 @@ def test_get_suite_error(self): ): openml.study.get_suite(123) + @pytest.mark.uses_test_server() def test_publish_benchmark_suite(self): fixture_alias = None fixture_name = "unit tested benchmark suite" @@ -141,13 +142,16 @@ def _test_publish_empty_study_is_allowed(self, explicit: bool): assert study_downloaded.main_entity_type == "run" assert study_downloaded.runs is None + @pytest.mark.uses_test_server() def test_publish_empty_study_explicit(self): self._test_publish_empty_study_is_allowed(explicit=True) + @pytest.mark.uses_test_server() def test_publish_empty_study_implicit(self): self._test_publish_empty_study_is_allowed(explicit=False) @pytest.mark.flaky() + 
@pytest.mark.uses_test_server() def test_publish_study(self): # get some random runs to attach run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10) @@ -217,6 +221,7 @@ def test_publish_study(self): res = openml.study.delete_study(study.id) assert res + @pytest.mark.uses_test_server() def test_study_attach_illegal(self): run_list = openml.runs.list_runs(size=10) assert len(run_list) == 10 diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index 5528cabf2..fed0c0a00 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -18,7 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_CLASSIFICATION self.estimation_procedure = 5 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id @@ -26,14 +26,13 @@ def test_download_task(self): assert task.dataset_id == 20 assert task.estimation_procedure_id == self.estimation_procedure - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] -@pytest.mark.xfail(reason="failures_issue_1544", strict=False) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_get_X_and_Y(): task = get_task(119) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py index dcc024388..2bbb015c6 100644 --- a/tests/test_tasks/test_clustering_task.py +++ b/tests/test_tasks/test_clustering_task.py @@ -28,6 +28,7 @@ def test_get_dataset(self): task.get_dataset() @pytest.mark.production() + @pytest.mark.uses_test_server() def test_download_task(self): # no clustering tasks on test server self.use_production_server() @@ -36,6 +37,7 @@ def test_download_task(self): assert task.task_type_id == TaskType.CLUSTERING assert task.dataset_id == 36 + @pytest.mark.uses_test_server() def test_upload_task(self): compatible_datasets = self._get_compatible_rand_dataset() for i in range(100): diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index 5f4b3e0ab..fbcbfe9bf 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -18,7 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.LEARNING_CURVE self.estimation_procedure = 13 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (768, 8) @@ -27,14 +27,14 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_categorical_dtype(Y) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id assert task.task_type_id == TaskType.LEARNING_CURVE assert task.dataset_id == 20 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py 
index 0cd2d96e2..a834cdf0f 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -49,7 +49,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_REGRESSION - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (194, 32) @@ -58,7 +58,7 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_numeric_dtype(Y) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py index e5a17a72b..3f7b06ee4 100644 --- a/tests/test_tasks/test_supervised_task.py +++ b/tests/test_tasks/test_supervised_task.py @@ -28,6 +28,7 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() + @pytest.mark.uses_test_server() def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]: task = get_task(self.task_id) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index 67f715d2b..b77782847 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -32,10 +32,11 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() + @pytest.mark.uses_test_server() def test_download_task(self): return get_task(self.task_id) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_upload_task(self): # We don't know if the task in question already exists, so we try a few times. 
Checking # beforehand would not be an option because a concurrent unit test could potentially diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 110459711..3a2b9ea0a 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -26,6 +26,7 @@ def setUp(self): def tearDown(self): super().tearDown() + @pytest.mark.uses_test_server() def test__get_cached_tasks(self): openml.config.set_root_cache_directory(self.static_cache_dir) tasks = openml.tasks.functions._get_cached_tasks() @@ -33,6 +34,7 @@ def test__get_cached_tasks(self): assert len(tasks) == 3 assert isinstance(next(iter(tasks.values())), OpenMLTask) + @pytest.mark.uses_test_server() def test__get_cached_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.functions._get_cached_task(1) @@ -47,6 +49,7 @@ def test__get_cached_task_not_cached(self): 2, ) + @pytest.mark.uses_test_server() def test__get_estimation_procedure_list(self): estimation_procedures = openml.tasks.functions._get_estimation_procedure_list() assert isinstance(estimation_procedures, list) @@ -69,6 +72,7 @@ def _check_task(self, task): assert isinstance(task["status"], str) assert task["status"] in ["in_preparation", "active", "deactivated"] + @pytest.mark.uses_test_server() def test_list_tasks_by_type(self): num_curves_tasks = 198 # number is flexible, check server if fails ttid = TaskType.LEARNING_CURVE @@ -78,15 +82,18 @@ def test_list_tasks_by_type(self): assert ttid == task["ttid"] self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks_length(self): ttid = TaskType.LEARNING_CURVE tasks = openml.tasks.list_tasks(task_type=ttid) assert len(tasks) > 100 + @pytest.mark.uses_test_server() def test_list_tasks_empty(self): tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag") assert tasks.empty + @pytest.mark.uses_test_server() def test_list_tasks_by_tag(self): num_basic_tasks = 100 # number is flexible, check server if fails tasks = openml.tasks.list_tasks(tag="OpenML100") @@ -94,12 +101,14 @@ def test_list_tasks_by_tag(self): for task in tasks.to_dict(orient="index").values(): self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks(self): tasks = openml.tasks.list_tasks() assert len(tasks) >= 900 for task in tasks.to_dict(orient="index").values(): self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks_paginate(self): size = 10 max = 100 @@ -109,6 +118,7 @@ def test_list_tasks_paginate(self): for task in tasks.to_dict(orient="index").values(): self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks_per_type_paginate(self): size = 40 max = 100 @@ -125,6 +135,7 @@ def test_list_tasks_per_type_paginate(self): assert j == task["ttid"] self._check_task(task) + @pytest.mark.uses_test_server() def test__get_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.tasks.get_task(1882) @@ -139,6 +150,7 @@ def test__get_task_live(self): # https://github.com/openml/openml-python/issues/378 openml.tasks.get_task(34536) + @pytest.mark.uses_test_server() def test_get_task(self): task = openml.tasks.get_task(1, download_data=True) # anneal; crossvalidation assert isinstance(task, OpenMLTask) @@ -152,7 +164,7 @@ def test_get_task(self): os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def 
test_get_task_lazy(self): task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation assert isinstance(task, OpenMLTask) @@ -175,7 +187,7 @@ def test_get_task_lazy(self): ) @mock.patch("openml.tasks.functions.get_dataset") - @pytest.mark.xfail(reason="failures_issue_1544") + @pytest.mark.uses_test_server() def test_removal_upon_download_failure(self, get_dataset): class WeirdException(Exception): pass @@ -193,6 +205,7 @@ def assert_and_raise(*args, **kwargs): # Now the file should no longer exist assert not os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml")) + @pytest.mark.uses_test_server() def test_get_task_with_cache(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1) @@ -208,6 +221,7 @@ def test_get_task_different_types(self): # Issue 538, get_task failing with clustering task. openml.tasks.functions.get_task(126033) + @pytest.mark.uses_test_server() def test_download_split(self): task = openml.tasks.get_task(1) # anneal; crossvalidation split = task.download_split() diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 540c43de0..6b8804b9f 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -16,6 +16,7 @@ def setUp(self): def tearDown(self): super().tearDown() + @pytest.mark.uses_test_server() def test_tagging(self): task = openml.tasks.get_task(1) # anneal; crossvalidation # tags can be at most 64 alphanumeric (+ underscore) chars @@ -31,6 +32,7 @@ def test_tagging(self): tasks = openml.tasks.list_tasks(tag=tag) assert len(tasks) == 0 + @pytest.mark.uses_test_server() def test_get_train_and_test_split_indices(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1882) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 35be84903..a1cdb55ea 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -48,18 +48,18 @@ def _mocked_perform_api_call(call, request_method): return openml._api_calls._download_text_file(url) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all(): openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_for_tasks(min_number_tasks_on_test_server): tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server) assert min_number_tasks_on_test_server == len(tasks) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): # By setting the batch size one lower than the minimum we guarantee at least two # batches and at the same time do as few batches (roundtrips) as possible. 
@@ -72,7 +72,7 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_for_datasets(min_number_datasets_on_test_server): datasets = openml.datasets.list_datasets( size=min_number_datasets_on_test_server, @@ -83,29 +83,29 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server): _check_dataset(dataset) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_for_flows(min_number_flows_on_test_server): flows = openml.flows.list_flows(size=min_number_flows_on_test_server) assert min_number_flows_on_test_server == len(flows) -@pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first +@pytest.mark.uses_test_server() def test_list_all_for_setups(min_number_setups_on_test_server): # TODO apparently list_setups function does not support kwargs setups = openml.setups.list_setups(size=min_number_setups_on_test_server) assert min_number_setups_on_test_server == len(setups) -@pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first +@pytest.mark.uses_test_server() def test_list_all_for_runs(min_number_runs_on_test_server): runs = openml.runs.list_runs(size=min_number_runs_on_test_server) assert min_number_runs_on_test_server == len(runs) -@pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first +@pytest.mark.uses_test_server() def test_list_all_for_evaluations(min_number_evaluations_on_test_server): # TODO apparently list_evaluations function does not support kwargs evaluations = openml.evaluations.list_evaluations( @@ -115,8 +115,8 @@ def test_list_all_for_evaluations(min_number_evaluations_on_test_server): assert min_number_evaluations_on_test_server == len(evaluations) -@pytest.mark.server() @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call) +@pytest.mark.uses_test_server() def test_list_all_few_results_available(_perform_api_call): datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1) assert len(datasets) == 1, "only one iris dataset version 1 should be present" @@ -141,7 +141,7 @@ def test__create_cache_directory(config_mock, tmp_path): openml.utils._create_cache_directory("ghi") -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_correct_test_server_download_state(): """This test verifies that the test server downloads the data from the correct source. 
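The hunks above replace the blanket `@pytest.mark.xfail(reason="failures_issue_1544", strict=False)` / `@pytest.mark.skip(...)` decorators with an explicit `@pytest.mark.uses_test_server()` marker on every test that talks to https://test.openml.org. The registration of that marker is not part of these hunks; purely as an illustration, assuming it is declared as an ordinary custom pytest marker, a `conftest.py` along the following lines could register it and let the server-dependent tests be switched off when the test server is unstable. The `OPENML_SKIP_SERVER_TESTS` environment variable is a hypothetical name used only for this sketch.

```python
# Sketch of a conftest.py backing the marker; OPENML_SKIP_SERVER_TESTS is a
# hypothetical variable name chosen for this illustration only.
import os

import pytest


def pytest_configure(config):
    # Register the custom marker so pytest (and --strict-markers) knows about it.
    config.addinivalue_line(
        "markers",
        "uses_test_server: test requires https://test.openml.org and may fail on server errors",
    )


def pytest_collection_modifyitems(config, items):
    # Optionally skip every server-dependent test when the environment asks for it.
    if not os.environ.get("OPENML_SKIP_SERVER_TESTS"):
        return
    skip_marker = pytest.mark.skip(reason="test server tests disabled via OPENML_SKIP_SERVER_TESTS")
    for item in items:
        if "uses_test_server" in item.keywords:
            item.add_marker(skip_marker)
```

Once the marker is registered, the same tests can also be deselected directly on the command line with `pytest -m "not uses_test_server"`, which is standard pytest behaviour.
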
From d755d4c40c2d23dd996414fa6bf794a5b86c241c Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Thu, 8 Jan 2026 10:50:49 +0530 Subject: [PATCH 8/8] Updated OpenML classes which require repr --- openml/base.py | 38 +++--------------------------- openml/datasets/data_feature.py | 20 +++++++++++++--- openml/setups/setup.py | 41 +++++++++++---------------------- openml/tasks/split.py | 19 ++++++--------- 4 files changed, 40 insertions(+), 78 deletions(-) diff --git a/openml/base.py b/openml/base.py index fbfb9dfc8..927ba8fed 100644 --- a/openml/base.py +++ b/openml/base.py @@ -1,26 +1,22 @@ # License: BSD 3-Clause from __future__ import annotations -import re import webbrowser from abc import ABC, abstractmethod -from typing import Iterable, Sequence +from typing import Sequence import xmltodict import openml._api_calls import openml.config +from openml.utils import ReprMixin from .utils import _get_rest_api_type_alias, _tag_openml_base -class OpenMLBase(ABC): +class OpenMLBase(ReprMixin, ABC): """Base object for functionality that is shared across entities.""" - def __repr__(self) -> str: - body_fields = self._get_repr_body_fields() - return self._apply_repr_template(body_fields) - @property @abstractmethod def id(self) -> int | None: @@ -60,34 +56,6 @@ def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | N """ # Should be implemented in the base class. - def _apply_repr_template( - self, - body_fields: Iterable[tuple[str, str | int | list[str] | None]], - ) -> str: - """Generates the header and formats the body for string representation of the object. - - Parameters - ---------- - body_fields: List[Tuple[str, str]] - A list of (name, value) pairs to display in the body of the __repr__. - """ - # We add spaces between capitals, e.g. ClassificationTask -> Classification Task - name_with_spaces = re.sub( - r"(\w)([A-Z])", - r"\1 \2", - self.__class__.__name__[len("OpenML") :], - ) - header_text = f"OpenML {name_with_spaces}" - header = f"{header_text}\n{'=' * len(header_text)}\n" - - _body_fields: list[tuple[str, str | int | list[str]]] = [ - (k, "None" if v is None else v) for k, v in body_fields - ] - longest_field_name_length = max(len(name) for name, _ in _body_fields) - field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" - body = "\n".join(field_line_format.format(name, value) for name, value in _body_fields) - return header + body - @abstractmethod def _to_dict(self) -> dict[str, dict]: """Creates a dictionary representation of self. diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py index 218b0066d..5026d27e8 100644 --- a/openml/datasets/data_feature.py +++ b/openml/datasets/data_feature.py @@ -6,8 +6,10 @@ if TYPE_CHECKING: from IPython.lib import pretty +from openml.utils import ReprMixin -class OpenMLDataFeature: + +class OpenMLDataFeature(ReprMixin): """ Data Feature (a.k.a. Attribute) object. 
@@ -74,8 +76,20 @@ def __init__( # noqa: PLR0913 self.number_missing_values = number_missing_values self.ontologies = ontologies - def __repr__(self) -> str: - return "[%d - %s (%s)]" % (self.index, self.name, self.data_type) + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: + """Collect all information to display in the __repr__ body.""" + fields: dict[str, int | str | None] = { + "Index": self.index, + "Name": self.name, + "Data Type": self.data_type, + } + + order = [ + "Index", + "Name", + "Data Type", + ] + return [(key, fields[key]) for key in order if key in fields] def __eq__(self, other: Any) -> bool: return isinstance(other, OpenMLDataFeature) and self.__dict__ == other.__dict__ diff --git a/openml/setups/setup.py b/openml/setups/setup.py index 0960ad4c1..bf65cecf9 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -1,13 +1,14 @@ # License: BSD 3-Clause from __future__ import annotations -from typing import Any +from typing import Any, Sequence import openml.config import openml.flows +from openml.utils import ReprMixin -class OpenMLSetup: +class OpenMLSetup(ReprMixin): """Setup object (a.k.a. Configuration). Parameters @@ -43,30 +44,21 @@ def _to_dict(self) -> dict[str, Any]: else None, } - def __repr__(self) -> str: - header = "OpenML Setup" - header = f"{header}\n{'=' * len(header)}\n" - - fields = { + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: + """Collect all information to display in the __repr__ body.""" + fields: dict[str, int | str | None] = { "Setup ID": self.setup_id, "Flow ID": self.flow_id, "Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id), - "# of Parameters": ( - len(self.parameters) if self.parameters is not None else float("nan") - ), + "# of Parameters": (len(self.parameters) if self.parameters is not None else "nan"), } # determines the order in which the information will be printed order = ["Setup ID", "Flow ID", "Flow URL", "# of Parameters"] - _fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, _ in _fields) - field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" - body = "\n".join(field_line_format.format(name, value) for name, value in _fields) - return header + body + return [(key, fields[key]) for key in order if key in fields] -class OpenMLParameter: +class OpenMLParameter(ReprMixin): """Parameter object (used in setup). 
Parameters @@ -123,11 +115,9 @@ def _to_dict(self) -> dict[str, Any]: "value": self.value, } - def __repr__(self) -> str: - header = "OpenML Parameter" - header = f"{header}\n{'=' * len(header)}\n" - - fields = { + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: + """Collect all information to display in the __repr__ body.""" + fields: dict[str, int | str | None] = { "ID": self.id, "Flow ID": self.flow_id, # "Flow Name": self.flow_name, @@ -156,9 +146,4 @@ def __repr__(self) -> str: parameter_default, parameter_value, ] - _fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, _ in _fields) - field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" - body = "\n".join(field_line_format.format(name, value) for name, value in _fields) - return header + body + return [(key, fields[key]) for key in order if key in fields] diff --git a/openml/tasks/split.py b/openml/tasks/split.py index 073a378c8..ece41c2ac 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -4,12 +4,14 @@ import pickle from collections import OrderedDict from pathlib import Path -from typing import Any +from typing import Any, Sequence from typing_extensions import NamedTuple import arff # type: ignore import numpy as np +from openml.utils import ReprMixin + class Split(NamedTuple): """A single split of a dataset.""" @@ -18,7 +20,7 @@ class Split(NamedTuple): test: np.ndarray -class OpenMLSplit: +class OpenMLSplit(ReprMixin): """OpenML Split object. This class manages train-test splits for a dataset across multiple @@ -63,10 +65,8 @@ def __init__( self.folds = len(self.split[0]) self.samples = len(self.split[0][0]) - def __repr__(self) -> str: - header = "OpenML Split" - header = f"{header}\n{'=' * len(header)}\n" - + def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]: + """Collect all information to display in the __repr__ body.""" fields = { "Name": self.name, "Description": ( @@ -79,12 +79,7 @@ def __repr__(self) -> str: order = ["Name", "Description", "Repeats", "Folds", "Samples"] - _fields = [(key, fields[key]) for key in order if key in fields] - - longest_field_name_length = max(len(name) for name, _ in _fields) - field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}" - body = "\n".join(field_line_format.format(name, value) for name, value in _fields) - return header + body + return [(key, fields[key]) for key in order if key in fields] def __eq__(self, other: Any) -> bool: if (
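
The classes patched above (`OpenMLBase`, `OpenMLDataFeature`, `OpenMLSetup`, `OpenMLParameter`, `OpenMLSplit`) now delegate their `__repr__` to a shared `ReprMixin` imported from `openml.utils` and only implement `_get_repr_body_fields()`. The mixin itself is not included in this patch's diffstat; a minimal sketch of what it presumably looks like, reconstructed from the `__repr__`/`_apply_repr_template` code removed from `openml/base.py` above, is:

```python
# Sketch only: the real ReprMixin lives in openml/utils.py and is not shown in
# this patch; its shape is reconstructed from the __repr__/_apply_repr_template
# code removed from openml/base.py above.
from __future__ import annotations

import re
from typing import Sequence


class ReprMixin:
    """Build a uniform __repr__ from the fields a subclass reports."""

    def _get_repr_body_fields(self) -> Sequence[tuple[str, str | int | list[str] | None]]:
        # Subclasses return an ordered list of (field name, value) pairs.
        raise NotImplementedError

    def __repr__(self) -> str:
        # "OpenMLDataFeature" -> "OpenML Data Feature", "OpenMLSplit" -> "OpenML Split", ...
        name_with_spaces = re.sub(
            r"(\w)([A-Z])", r"\1 \2", self.__class__.__name__[len("OpenML") :]
        )
        header_text = f"OpenML {name_with_spaces}"
        header = f"{header_text}\n{'=' * len(header_text)}\n"

        fields = [(k, "None" if v is None else v) for k, v in self._get_repr_body_fields()]
        longest_name = max(len(name) for name, _ in fields)
        line_format = f"{{:.<{longest_name}}}: {{}}"
        body = "\n".join(line_format.format(name, value) for name, value in fields)
        return header + body
```

With this in place a subclass only reports its (name, value) pairs; for example, the `OpenMLSplit._get_repr_body_fields` hunk above yields a header `OpenML Split` underlined with `=`, followed by one dotted `Name....: value` line per field, matching the output of the per-class `__repr__` methods this patch removes.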