Metaculus · lsabor · Nov 15, 2025 · Nov 20, 2025
diff --git a/tests/unit/test_questions/conftest.py b/tests/unit/test_questions/conftest.py
@@ -9,6 +9,7 @@
 
 __all__ = [
     "question_binary",
+    "question_multiple_choice",
     "question_numeric",
     "conditional_1",
     "question_binary_with_forecast_user_1",

diff --git a/tests/unit/test_utils/test_the_math/conftest.py b/tests/unit/test_utils/test_the_math/conftest.py
@@ -1 +1,4 @@
-from tests.unit.test_questions.conftest import question_binary  # noqa
+from tests.unit.test_questions.conftest import (
+    question_binary,
+    question_multiple_choice,
+)  # noqa
diff --git a/tests/unit/test_utils/test_the_math/test_aggregations.py b/tests/unit/test_utils/test_the_math/test_aggregations.py
@@ -23,6 +23,12 @@
     GoldMedalistsAggregation,
     JoinedBeforeDateAggregation,
     SingleAggregation,
+    compute_weighted_semi_standard_deviations,
+)
+from utils.typing import (
+    ForecastValues,
+    ForecastsValues,
+    Weights,
 )
 
 
@@ -46,6 +52,64 @@ def test_summarize_array(array, max_size, expceted_array):
 
 class TestAggregations:
 
+    @pytest.mark.parametrize(
+        "forecasts_values, weights, expected",
+        [
+            (
+                [[0.5, 0.5]],
+                None,
+                ([0.0, 0.0], [0.0, 0.0]),
+            ),  # Trivial
+            (
+                [
+                    [0.5, 0.5],
+                    [0.5, 0.5],
+                    [0.5, 0.5],
+                ],
+                None,
+                ([0.0, 0.0], [0.0, 0.0]),
+            ),  # 3 unwavering forecasts
+            (
+                [
+                    [0.2, 0.8],
+                    [0.5, 0.5],
+                    [0.8, 0.2],
+                ],
+                None,
+                ([0.3, 0.3], [0.3, 0.3]),
+            ),  # 3 forecasts spread symmetrically around 0.5
+            (
+                [
+                    [0.6, 0.15, None, 0.25],
+                    [0.6, 0.15, None, 0.25],
+                ],
+                None,
+                ([0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]),
+            ),  # identical forecasts with placeholders
+            (
+                [
+                    [0.4, 0.25, None, 0.35],
+                    [0.6, 0.15, None, 0.25],
+                ],
+                None,
+                ([0.1, 0.05, 0.0, 0.05], [0.1, 0.05, 0.0, 0.05]),
+            ),  # slightly different forecasts with placeholders
+        ],
+    )
+    def test_compute_weighted_semi_standard_deviations(
+        self,
+        forecasts_values: ForecastsValues,
+        weights: Weights | None,
+        expected: tuple[ForecastValues, ForecastValues],
+    ):
+        """Compare the returned pair of arrays against the expected pair."""
+        result = compute_weighted_semi_standard_deviations(forecasts_values, weights)
+        lower, upper = result
+        expected_lower, expected_upper = expected
+        # Whole-array checks catch length mismatches; atol tolerates noise near 0.0.
+        np.testing.assert_allclose(lower, expected_lower, rtol=1e-6, atol=1e-9)
+        np.testing.assert_allclose(upper, expected_upper, rtol=1e-6, atol=1e-9)
+
     @pytest.mark.parametrize("aggregation_name", [Agg.method for Agg in AGGREGATIONS])
     def test_aggregations_initialize(
         self, question_binary: Question, aggregation_name: str
@@ -241,46 +305,120 @@ def test_aggregations_initialize(
                     histogram=None,
                 ),
             ),
+            # Multiple choice with placeholders
+            (
+                {},
+                ForecastSet(
+                    forecasts_values=[
+                        [0.6, 0.15, None, 0.25],
+                        [0.6, 0.25, None, 0.15],
+                    ],
+                    timestep=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    forecaster_ids=[1, 2],
+                    timesteps=[
+                        datetime(2022, 1, 1, tzinfo=dt_timezone.utc),
+                        datetime(2023, 1, 1, tzinfo=dt_timezone.utc),
+                    ],
+                ),
+                True,
+                False,
+                AggregateForecast(
+                    start_time=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    method=AggregationMethod.UNWEIGHTED,
+                    forecast_values=[0.6, 0.20, None, 0.20],
+                    interval_lower_bounds=[0.6, 0.15, None, 0.15],
+                    centers=[0.6, 0.20, None, 0.20],
+                    interval_upper_bounds=[0.6, 0.25, None, 0.25],
+                    means=[0.6, 0.20, None, 0.20],
+                    forecaster_count=2,
+                ),
+            ),
+            (
+                {},
+                ForecastSet(
+                    forecasts_values=[
+                        [0.6, 0.15, None, 0.25],
+                        [0.6, 0.25, None, 0.15],
+                        [0.4, 0.35, None, 0.25],
+                    ],
+                    timestep=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    forecaster_ids=[1, 2],
+                    timesteps=[
+                        datetime(2022, 1, 1, tzinfo=dt_timezone.utc),
+                        datetime(2023, 1, 1, tzinfo=dt_timezone.utc),
+                    ],
+                ),
+                True,
+                False,
+                AggregateForecast(
+                    start_time=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    method=AggregationMethod.UNWEIGHTED,
+                    forecast_values=[
+                        0.5453965360072925,
+                        0.22730173199635367,
+                        None,
+                        0.22730173199635367,
+                    ],
+                    interval_lower_bounds=[
+                        0.3635976906715284,
+                        0.1363810391978122,
+                        None,
+                        0.1363810391978122,
+                    ],
+                    centers=[
+                        0.5453965360072925,
+                        0.22730173199635367,
+                        None,
+                        0.22730173199635367,
+                    ],
+                    interval_upper_bounds=[
+                        0.5453965360072925,
+                        0.3182224247948951,
+                        None,
+                        0.22730173199635367,
+                    ],
+                    means=[
+                        0.5333333333333333,
+                        0.25,
+                        None,
+                        0.21666666666666667,
+                    ],
+                    forecaster_count=3,
+                ),
+            ),
         ],
     )
     def test_UnweightedAggregation(
         self,
         question_binary: Question,
+        question_multiple_choice: Question,
         init_params: dict,
         forecast_set: ForecastSet,
         include_stats: bool,
         histogram: bool,
         expected: AggregateForecast,
     ):
-        aggregation = UnweightedAggregation(question=question_binary, **init_params)
-        new_aggregation = aggregation.calculate_aggregation_entry(
+        is_binary = len(forecast_set.forecasts_values[0]) == 2  # else multiple choice
+        question = question_binary if is_binary else question_multiple_choice
+        aggregation = UnweightedAggregation(question=question, **init_params)
+        new_aggregation: AggregateForecast = aggregation.calculate_aggregation_entry(
             forecast_set, include_stats, histogram
         )
 
         assert new_aggregation.start_time == expected.start_time
-        assert (
-            new_aggregation.forecast_values == expected.forecast_values
-        ) or np.allclose(new_aggregation.forecast_values, expected.forecast_values)
         assert new_aggregation.forecaster_count == expected.forecaster_count
-        assert (
-            new_aggregation.interval_lower_bounds == expected.interval_lower_bounds
-        ) or np.allclose(
-            new_aggregation.interval_lower_bounds, expected.interval_lower_bounds
-        )
-        assert (new_aggregation.centers == expected.centers) or np.allclose(
-            new_aggregation.centers, expected.centers
-        )
-        assert (
-            new_aggregation.interval_upper_bounds == expected.interval_upper_bounds
-        ) or np.allclose(
-            new_aggregation.interval_upper_bounds, expected.interval_upper_bounds
-        )
-        assert (new_aggregation.means == expected.means) or np.allclose(
-            new_aggregation.means, expected.means
-        )
-        assert (new_aggregation.histogram == expected.histogram) or np.allclose(
-            new_aggregation.histogram, expected.histogram
-        )
+        for r, e in [
+            (new_aggregation.forecast_values, expected.forecast_values),
+            (new_aggregation.interval_lower_bounds, expected.interval_lower_bounds),
+            (new_aggregation.centers, expected.centers),
+            (new_aggregation.interval_upper_bounds, expected.interval_upper_bounds),
+            (new_aggregation.means, expected.means),
+            (new_aggregation.histogram, expected.histogram),
+        ]:
+            # None (placeholder or unset field) becomes NaN so equal_nan can match it.
+            r = np.where(np.equal(r, None), np.nan, r).astype(float)
+            e = np.where(np.equal(e, None), np.nan, e).astype(float)
+            np.testing.assert_allclose(r, e, equal_nan=True)
 
     @pytest.mark.parametrize(
         "init_params, forecast_set, include_stats, histogram, expected",
@@ -468,20 +606,52 @@ def test_UnweightedAggregation(
                     histogram=None,
                 ),
             ),
+            # Multiple choice with placeholders
+            (
+                {},
+                ForecastSet(
+                    forecasts_values=[
+                        [0.6, 0.15, None, 0.25],
+                        [0.6, 0.25, None, 0.15],
+                    ],
+                    timestep=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    forecaster_ids=[1, 2],
+                    timesteps=[
+                        datetime(2022, 1, 1, tzinfo=dt_timezone.utc),
+                        datetime(2023, 1, 1, tzinfo=dt_timezone.utc),
+                    ],
+                ),
+                True,
+                False,
+                AggregateForecast(
+                    start_time=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    method=AggregationMethod.UNWEIGHTED,
+                    forecast_values=[0.6, 0.20, None, 0.20],
+                    interval_lower_bounds=[0.6, 0.15, None, 0.15],
+                    centers=[0.6, 0.20, None, 0.20],
+                    interval_upper_bounds=[0.6, 0.25, None, 0.25],
+                    means=[0.6, 0.20, None, 0.20],
+                    forecaster_count=2,
+                ),
+            ),
         ],
     )
     def test_RecencyWeightedAggregation(
         self,
         question_binary: Question,
+        question_multiple_choice: Question,
         init_params: dict,
         forecast_set: ForecastSet,
         include_stats: bool,
         histogram: bool,
         expected: AggregateForecast,
     ):
-        aggregation = RecencyWeightedAggregation(
-            question=question_binary, **init_params
-        )
+        if len(forecast_set.forecasts_values[0]) == 2:
+            question = question_binary
+        else:
+            question = question_multiple_choice
+
+        aggregation = RecencyWeightedAggregation(question=question, **init_params)
         new_aggregation = aggregation.calculate_aggregation_entry(
             forecast_set, include_stats, histogram
         )