
Commit e54ec2e

Carl Hvarfner authored and facebook-github-bot committed
Inference trace and Best Point Recommendation (BPR) bugfix (#4128)
Summary:
This diff addresses two issues in the computation of the inference trace:

1. The generation strategy is copied inside `run_optimization_with_orchestrator` --> the traces are retrieved from an unused generation strategy --> `get_best_point` defaults to the best raw observation over ALL observations.
2. Relevant data is not filtered in the fallback option for `get_best_parameters_from_model_predictions_with_trial_index`.

Each of these on its own makes the inference trace incorrect: the first reports the best raw value across ALL trials, the second the best predicted value across ALL trials.

Changes:
- Moved the copying of the generation strategy up to the level of `benchmark_replication`, since results need to be computed on the used `generation_strategy` and not an empty copy. This means that `run_optimization_with_orchestrator` no longer `clone_reset`'s the GS.
- Clearer sequencing in `get_best_parameters_from_model_predictions_with_trial_index`.
- Removed the model fit quality check as part of BPR.

Previous, now-redacted changes:
- Added argument `use_model_only_if_good` to force model-based BPR even if the model fit is bad.

Differential Revision: D80019803
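The mechanics of the first bug are easiest to see in miniature. Below is a hypothetical, simplified sketch (toy classes and function names, not the actual Ax API) of why cloning the generation strategy inside the runner broke the inference trace, and how resetting once at the replication level fixes it:

# Hypothetical, minimal sketch of the bug and fix; names are
# illustrative stand-ins, not the actual Ax implementation.

class GenerationStrategy:
    def __init__(self) -> None:
        self.generated_trials: list[int] = []

    def clone_reset(self) -> "GenerationStrategy":
        # Returns a fresh, unused copy with no accumulated state.
        return GenerationStrategy()


def run_optimization(gs: GenerationStrategy, num_trials: int) -> None:
    # The buggy version called gs.clone_reset() here, so the caller's
    # `gs` never accumulated state, and trace computation on it fell
    # back to the best raw observation over ALL trials.
    for i in range(num_trials):
        gs.generated_trials.append(i)


def benchmark_replication(gs: GenerationStrategy) -> GenerationStrategy:
    # Fixed flow: reset once up front, then thread the *same* object
    # through optimization and trace computation.
    gs = gs.clone_reset()
    run_optimization(gs, num_trials=3)
    return gs


used_gs = benchmark_replication(GenerationStrategy())
assert used_gs.generated_trials == [0, 1, 2]  # state visible downstream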
1 parent 2da0640 commit e54ec2e

File tree

6 files changed (+357, -283 lines)


ax/benchmark/benchmark.py

Lines changed: 16 additions & 14 deletions
@@ -326,9 +326,8 @@ def get_best_parameters(
     trial_indices: Iterable[int] | None = None,
 ) -> TParameterization | None:
     """
-    Get the most promising point.
-
-    Only SOO is supported. It will return None if no best point can be found.
+    Get the most promising point. Returns None if no point is predicted to
+    satisfy all outcome constraints.
 
     Args:
         experiment: The experiment to get the data from. This should contain
@@ -421,16 +420,17 @@ def get_benchmark_result_from_experiment_and_gs(
             optimization_config=problem.optimization_config,
         )
     )
-    inference_trace = get_inference_trace(
-        trial_completion_order=trial_completion_order,
-        experiment=experiment,
-        problem=problem,
-        generation_strategy=generation_strategy,
-    )
-
-    optimization_trace = (
-        inference_trace if problem.report_inference_value_as_trace else oracle_trace
-    )
+    if problem.report_inference_value_as_trace:
+        inference_trace = get_inference_trace(
+            trial_completion_order=trial_completion_order,
+            experiment=experiment,
+            problem=problem,
+            generation_strategy=generation_strategy,
+        )
+        optimization_trace = inference_trace
+    else:
+        optimization_trace = oracle_trace
+        inference_trace = None
 
     score_trace = compute_score_trace(
         optimization_trace=optimization_trace,
@@ -507,7 +507,7 @@ def run_optimization_with_orchestrator(
 
     orchestrator = Orchestrator(
         experiment=experiment,
-        generation_strategy=method.generation_strategy.clone_reset(),
+        generation_strategy=method.generation_strategy,
         options=orchestrator_options,
     )
 
@@ -562,6 +562,8 @@ def benchmark_replication(
     Return:
         ``BenchmarkResult`` object.
     """
+    # Reset the generation strategy to ensure that it is in an unused state.
+    method.generation_strategy = method.generation_strategy.clone_reset()
     experiment = run_optimization_with_orchestrator(
         problem=problem,
         method=method,

ax/benchmark/benchmark_result.py

Lines changed: 3 additions & 2 deletions
@@ -52,7 +52,8 @@ class BenchmarkResult(Base):
         points based only on data that would be observable in realistic
         settings, as specified by `BenchmarkMethod.get_best_parameters`, and
         then evaluating the oracle objective value of that point according
-        to the problem's `OptimizationConfig`.
+        to the problem's `OptimizationConfig`. Only reported if
+        report_inference_value_as_trace is enabled in the BenchmarkProblem.
 
         By default, if it is not overridden,
         `BenchmarkMethod.get_best_parameters` uses the empirical best point
@@ -96,7 +97,7 @@ class BenchmarkResult(Base):
     seed: int
 
     oracle_trace: npt.NDArray
-    inference_trace: npt.NDArray
+    inference_trace: npt.NDArray | None
     optimization_trace: npt.NDArray
     score_trace: npt.NDArray
     cost_trace: npt.NDArray
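Since `inference_trace` is now optional, downstream code reading a `BenchmarkResult` needs a `None` guard. A minimal consumer sketch; the helper name is hypothetical and not part of this diff:

import numpy as np
import numpy.typing as npt


def summarize_inference_trace(inference_trace: npt.NDArray | None) -> str:
    # inference_trace is None unless the problem was created with
    # report_inference_value_as_trace=True.
    if inference_trace is None:
        return "inference trace not reported"
    return f"final inference value: {inference_trace[-1]:.3f}"


print(summarize_inference_trace(None))
print(summarize_inference_trace(np.array([1.0, 2.0, 3.0])))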

ax/benchmark/testing/benchmark_stubs.py

Lines changed: 2 additions & 0 deletions
@@ -352,6 +352,7 @@ def get_async_benchmark_problem(
     step_runtime_fn: TBenchmarkStepRuntimeFunction | None = None,
     n_steps: int = 1,
     lower_is_better: bool = False,
+    report_inference_value_as_trace: bool = False,
 ) -> BenchmarkProblem:
     search_space = get_discrete_search_space()
     test_function = IdentityTestFunction(n_steps=n_steps)
@@ -371,6 +372,7 @@ def get_async_benchmark_problem(
         baseline_value=19 if lower_is_better else 0,
         optimal_value=0 if lower_is_better else 19,
         step_runtime_function=step_runtime_fn,
+        report_inference_value_as_trace=report_inference_value_as_trace,
     )
 
ax/benchmark/tests/test_benchmark.py

Lines changed: 58 additions & 92 deletions
@@ -17,7 +17,6 @@
 
 import numpy as np
 import torch
-from ax.adapter.factory import get_sobol
 from ax.adapter.registry import Generators
 from ax.benchmark.benchmark import (
     _get_oracle_value_of_params,
@@ -38,7 +37,6 @@
 from ax.benchmark.benchmark_problem import (
     BenchmarkProblem,
     create_problem_from_botorch,
-    get_continuous_search_space,
     get_moo_opt_config,
     get_soo_opt_config,
 )
@@ -82,7 +80,10 @@
 from ax.utils.common.mock import mock_patch_method_original
 from ax.utils.common.testutils import TestCase
 
-from ax.utils.testing.core_stubs import get_experiment_with_observations
+from ax.utils.testing.core_stubs import (
+    get_branin_experiment,
+    get_branin_experiment_with_multi_objective,
+)
 from ax.utils.testing.mock import mock_botorch_optimize
 from botorch.acquisition.knowledge_gradient import qKnowledgeGradient
 from botorch.acquisition.logei import qLogNoisyExpectedImprovement
@@ -261,7 +262,9 @@ def test_replication_sobol_surrogate(self) -> None:
         self.assertTrue(np.isfinite(res.score_trace).all())
         self.assertTrue(np.all(res.score_trace <= 100))
 
-    def _test_replication_async(self, map_data: bool) -> None:
+    def _test_replication_async(
+        self, map_data: bool, report_inference_value_as_trace: bool
+    ) -> None:
         """
         The test function is the identity function, higher is better, observed
         to be noiseless, and the same at every point on the trajectory. And the
@@ -349,6 +352,7 @@ def _test_replication_async(self, map_data: bool) -> None:
         problem = get_async_benchmark_problem(
             map_data=map_data,
             step_runtime_fn=step_runtime_fn,
+            report_inference_value_as_trace=report_inference_value_as_trace,
         )
 
         with mock_patch_method_original(
@@ -417,12 +421,15 @@ def _test_replication_async(self, map_data: bool) -> None:
                 },
                 f"Failure for trial {trial_index} with {case_name}",
             )
-            self.assertFalse(np.isnan(result.inference_trace).any())
-            self.assertEqual(
-                result.inference_trace.tolist(),
-                expected_traces[case_name],
-                msg=case_name,
-            )
+            if report_inference_value_as_trace:
+                self.assertFalse(np.isnan(result.inference_trace).any())
+                self.assertEqual(
+                    result.inference_trace.tolist(),
+                    expected_traces[case_name],
+                    msg=case_name,
+                )
+            else:
+                self.assertIsNone(result.inference_trace)
             self.assertEqual(
                 result.oracle_trace.tolist(),
                 expected_traces[case_name],
@@ -466,8 +473,15 @@ def _test_replication_async(self, map_data: bool) -> None:
         self.assertEqual(completed_times, expected_completed_times)
 
     def test_replication_async(self) -> None:
-        self._test_replication_async(map_data=False)
-        self._test_replication_async(map_data=True)
+        self._test_replication_async(
+            map_data=False, report_inference_value_as_trace=False
+        )
+        self._test_replication_async(
+            map_data=True, report_inference_value_as_trace=False
+        )
+        self._test_replication_async(
+            map_data=False, report_inference_value_as_trace=True
+        )
 
     def test_run_optimization_with_orchestrator(self) -> None:
         method = get_async_benchmark_method()
@@ -491,6 +505,7 @@ def test_run_optimization_with_orchestrator(self) -> None:
             none_throws(runner.simulated_backend_runner).simulator._verbose_logging
         )
 
+        method.generation_strategy = method.generation_strategy.clone_reset()
         with self.subTest("Logs not produced by default"), self.assertNoLogs(
             level=logging.INFO, logger=logger
         ), self.assertNoLogs(logger=logger):
@@ -618,9 +633,9 @@ def test_early_stopping(self) -> None:
         self.assertEqual(max_run, {0: 4, 1: 2, 2: 2, 3: 2})
 
     def test_replication_variable_runtime(self) -> None:
-        method = get_async_benchmark_method(max_pending_trials=1)
         for map_data in [False, True]:
             with self.subTest(map_data=map_data):
+                method = get_async_benchmark_method(max_pending_trials=1)
                 problem = get_async_benchmark_problem(
                     map_data=map_data,
                     step_runtime_fn=lambda params: params["x0"] + 1,
@@ -652,9 +667,7 @@ def test_replication_variable_runtime(self) -> None:
         self.assertEqual(start_times, expected_start_times)
 
     @mock_botorch_optimize
-    def _test_replication_with_inference_value(
-        self, batch_size: int, report_inference_value_as_trace: bool
-    ) -> None:
+    def _test_replication_with_inference_value(self, batch_size: int) -> None:
         seed = 1
         method = get_sobol_botorch_modular_acquisition(
             model_cls=SingleTaskGP,
@@ -667,35 +680,29 @@ def _test_replication_with_inference_value(
         num_trials = 4
         problem = get_single_objective_benchmark_problem(
             num_trials=num_trials,
-            report_inference_value_as_trace=report_inference_value_as_trace,
+            report_inference_value_as_trace=True,
             noise_std=100.0,
         )
         res = self.benchmark_replication(problem=problem, method=method, seed=seed)
         # The inference trace could coincide with the oracle trace, but it won't
         # happen in this example with high noise and a seed
-        self.assertEqual(
-            np.equal(res.inference_trace, res.optimization_trace).all(),
-            report_inference_value_as_trace,
+        self.assertTrue(
+            np.equal(none_throws(res.inference_trace), res.optimization_trace).all(),
         )
-        self.assertEqual(
+        self.assertFalse(
             np.equal(res.oracle_trace, res.optimization_trace).all(),
-            not report_inference_value_as_trace,
         )
 
         self.assertEqual(res.optimization_trace.shape, (problem.num_trials,))
-        self.assertTrue((res.inference_trace >= res.oracle_trace).all())
+        self.assertTrue((none_throws(res.inference_trace) >= res.oracle_trace).all())
 
     def test_replication_with_inference_value(self) -> None:
-        for batch_size, report_inference_value_as_trace in product(
-            [1, 2], [False, True]
-        ):
+        for batch_size in [1, 2]:
             with self.subTest(
                 batch_size=batch_size,
-                report_inference_value_as_trace=report_inference_value_as_trace,
             ):
                 self._test_replication_with_inference_value(
                     batch_size=batch_size,
-                    report_inference_value_as_trace=report_inference_value_as_trace,
                 )
 
         with self.assertRaisesRegex(
@@ -793,7 +800,11 @@ def test_replication_mbm(self) -> None:
                 acquisition_cls=qLogNoisyExpectedImprovement,
                 distribute_replications=False,
             ),
-            get_augmented_branin_problem(fidelity_or_task="fidelity"),
+            get_single_objective_benchmark_problem(
+                observe_noise_sd=False,
+                num_trials=6,
+                report_inference_value_as_trace=True,
+            ),
             "MBM::SingleTaskGP_qLogNEI",
         ),
     ]:
@@ -827,9 +838,7 @@ def test_replication_moo_sobol(self) -> None:
 
         self.assertTrue(np.all(res.score_trace <= 100))
         self.assertEqual(len(res.cost_trace), problem.num_trials)
-        self.assertEqual(len(res.inference_trace), problem.num_trials)
-        # since inference trace is not supported for MOO, it should be all NaN
-        self.assertTrue(np.isnan(res.inference_trace).all())
+        self.assertIsNone(res.inference_trace)
 
     def test_benchmark_one_method_problem(self) -> None:
         problem = get_single_objective_benchmark_problem()
@@ -1196,6 +1205,7 @@ def test_get_opt_trace_by_cumulative_epochs(self) -> None:
         ):
             get_opt_trace_by_steps(experiment=experiment)
 
+        method.generation_strategy = method.generation_strategy.clone_reset()
         with self.subTest("Constrained"):
             problem = get_benchmark_problem("constrained_gramacy_observed_noise")
             experiment = self.run_optimization_with_orchestrator(
@@ -1237,72 +1247,28 @@ def test_get_benchmark_result_with_cumulative_steps(self) -> None:
         self.assertLessEqual(transformed.score_trace.min(), result.score_trace.min())
 
     def test_get_best_parameters(self) -> None:
-        """
-        Whether this produces the correct values is tested more thoroughly in
-        other tests such as `test_replication_with_inference_value` and
-        `test_get_inference_trace_from_params`. Setting up an experiment with
-        data and trials without just running a benchmark is a pain, so in those
-        tests, we just run a benchmark.
-        """
-        gs = get_sobol_generation_strategy()
-
-        search_space = get_continuous_search_space(bounds=[(0, 1)])
-        moo_config = get_moo_opt_config(outcome_names=["a", "b"], ref_point=[0, 0])
-        experiment = Experiment(
-            name="test",
-            is_test=True,
-            search_space=search_space,
-            optimization_config=moo_config,
+        experiment = get_branin_experiment()
+        generation_strategy = get_sobol_generation_strategy()
+        mock_function = (
+            "ax.service.utils.best_point."
+            "get_best_parameters_from_model_predictions_with_trial_index"
         )
 
-        with self.subTest("MOO not supported"), self.assertRaisesRegex(
-            NotImplementedError, "Please use `get_pareto_optimal_parameters`"
-        ):
-            get_best_parameters(experiment=experiment, generation_strategy=gs)
-
-        soo_config = get_soo_opt_config(outcome_names=["a"])
-        with self.subTest("Empty experiment"):
-            result = get_best_parameters(
-                experiment=experiment.clone_with(optimization_config=soo_config),
-                generation_strategy=gs,
-            )
+        with patch(mock_function, return_value=None):
+            result = get_best_parameters(experiment, generation_strategy)
             self.assertIsNone(result)
 
-        with self.subTest("All constraints violated"):
-            experiment = get_experiment_with_observations(
-                observations=[[1, -1], [2, -1]],
-                constrained=True,
-            )
-            best_point = get_best_parameters(
-                experiment=experiment, generation_strategy=gs
-            )
-            self.assertIsNone(best_point)
-
-        with self.subTest("No completed trials"):
-            experiment = get_experiment_with_observations(observations=[])
-            sobol_generator = get_sobol(search_space=experiment.search_space)
-            for _ in range(3):
-                trial = experiment.new_trial(generator_run=sobol_generator.gen(n=1))
-                trial.run()
-            best_point = get_best_parameters(
-                experiment=experiment, generation_strategy=gs
-            )
-            self.assertIsNone(best_point)
+        with patch(mock_function, return_value=(0, {"x": 1.0}, None)):
+            result = get_best_parameters(experiment, generation_strategy)
            self.assertEqual(result, {"x": 1.0})
 
-        experiment = get_experiment_with_observations(
-            observations=[[1], [2]], constrained=False
-        )
-        with self.subTest("Working case"):
-            best_point = get_best_parameters(
-                experiment=experiment, generation_strategy=gs
-            )
-            self.assertEqual(best_point, experiment.trials[1].arms[0].parameters)
-
-        with self.subTest("Trial indices"):
-            best_point = get_best_parameters(
-                experiment=experiment, generation_strategy=gs, trial_indices=[0]
+        with self.subTest("MOO not supported"), self.assertRaisesRegex(
+            NotImplementedError, "Please use `get_pareto_optimal_parameters`"
+        ):
+            experiment = get_branin_experiment_with_multi_objective()
+            get_best_parameters(
+                experiment=experiment, generation_strategy=generation_strategy
             )
-            self.assertEqual(best_point, experiment.trials[0].arms[0].parameters)
 
     def test_get_benchmark_result_from_experiment_and_gs(self) -> None:
         problem = get_single_objective_benchmark_problem()
