diff --git a/tests/unit/test_questions/conftest.py b/tests/unit/test_questions/conftest.py
index 9c75696c2..57ebbb3d2 100644
--- a/tests/unit/test_questions/conftest.py
+++ b/tests/unit/test_questions/conftest.py
@@ -9,6 +9,7 @@ __all__ = [
     "question_binary",
+    "question_multiple_choice",
     "question_numeric",
     "conditional_1",
     "question_binary_with_forecast_user_1",
diff --git a/tests/unit/test_utils/test_the_math/conftest.py b/tests/unit/test_utils/test_the_math/conftest.py
index b048040bb..8f150a381 100644
--- a/tests/unit/test_utils/test_the_math/conftest.py
+++ b/tests/unit/test_utils/test_the_math/conftest.py
@@ -1 +1,4 @@
-from tests.unit.test_questions.conftest import question_binary  # noqa
+from tests.unit.test_questions.conftest import (
+    question_binary,
+    question_multiple_choice,
+)  # noqa
diff --git a/tests/unit/test_utils/test_the_math/test_aggregations.py b/tests/unit/test_utils/test_the_math/test_aggregations.py
index 73aaa5119..911c9b459 100644
--- a/tests/unit/test_utils/test_the_math/test_aggregations.py
+++ b/tests/unit/test_utils/test_the_math/test_aggregations.py
@@ -23,6 +23,12 @@
     GoldMedalistsAggregation,
     JoinedBeforeDateAggregation,
     SingleAggregation,
+    compute_weighted_semi_standard_deviations,
+)
+from utils.typing import (
+    ForecastValues,
+    ForecastsValues,
+    Weights,
 )
@@ -46,6 +52,64 @@ def test_summarize_array(array, max_size, expceted_array):
 
 
 class TestAggregations:
+    @pytest.mark.parametrize(
+        "forecasts_values, weights, expected",
+        [
+            (
+                [[0.5, 0.5]],
+                None,
+                ([0.0, 0.0], [0.0, 0.0]),
+            ),  # Trivial
+            (
+                [
+                    [0.5, 0.5],
+                    [0.5, 0.5],
+                    [0.5, 0.5],
+                ],
+                None,
+                ([0.0, 0.0], [0.0, 0.0]),
+            ),  # 3 identical forecasts
+            (
+                [
+                    [0.2, 0.8],
+                    [0.5, 0.5],
+                    [0.8, 0.2],
+                ],
+                None,
+                ([0.3, 0.3], [0.3, 0.3]),
+            ),  # 3 symmetrically spread forecasts
+            (
+                [
+                    [0.6, 0.15, None, 0.25],
+                    [0.6, 0.15, None, 0.25],
+                ],
+                None,
+                ([0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]),
+            ),  # identical forecasts with placeholders
+            (
+                [
+                    [0.4, 0.25, None, 0.35],
+                    [0.6, 0.15, None, 0.25],
+                ],
+                None,
+                ([0.1, 0.05, 0.0, 0.05], [0.1, 0.05, 0.0, 0.05]),
+            ),  # slightly different forecasts with placeholders
+        ],
+    )
+    def test_compute_weighted_semi_standard_deviations(
+        self,
+        forecasts_values: ForecastsValues,
+        weights: Weights | None,
+        expected: tuple[ForecastValues, ForecastValues],
+    ):
+        result = compute_weighted_semi_standard_deviations(forecasts_values, weights)
+        rl, ru = result
+        el, eu = expected
+        for v, e in zip(rl, el):
+            np.testing.assert_approx_equal(v, e)
+        for v, e in zip(ru, eu):
+            np.testing.assert_approx_equal(v, e)
+
     @pytest.mark.parametrize("aggregation_name", [Agg.method for Agg in AGGREGATIONS])
     def test_aggregations_initialize(
         self, question_binary: Question, aggregation_name: str
@@ -241,46 +305,123 @@ def test_aggregations_initialize(
                     histogram=None,
                 ),
             ),
+            # Multiple choice with placeholders
+            (
+                {},
+                ForecastSet(
+                    forecasts_values=[
+                        [0.6, 0.15, None, 0.25],
+                        [0.6, 0.25, None, 0.15],
+                    ],
+                    timestep=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    forecaster_ids=[1, 2],
+                    timesteps=[
+                        datetime(2022, 1, 1, tzinfo=dt_timezone.utc),
+                        datetime(2023, 1, 1, tzinfo=dt_timezone.utc),
+                    ],
+                ),
+                True,
+                False,
+                AggregateForecast(
+                    start_time=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    method=AggregationMethod.UNWEIGHTED,
+                    forecast_values=[0.6, 0.20, None, 0.20],
+                    interval_lower_bounds=[0.6, 0.15, None, 0.15],
+                    centers=[0.6, 0.20, None, 0.20],
+                    interval_upper_bounds=[0.6, 0.25, None, 0.25],
+                    means=[0.6, 0.20, None, 0.20],
+                    forecaster_count=2,
+                ),
+            ),
+            (
+                {},
+                ForecastSet(
+                    forecasts_values=[
+                        [0.6, 0.15, None, 0.25],
+                        [0.6, 0.25, None, 0.15],
+                        [0.4, 0.35, None, 0.25],
+                    ],
+                    timestep=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    forecaster_ids=[1, 2, 3],
+                    timesteps=[
+                        datetime(2022, 1, 1, tzinfo=dt_timezone.utc),
+                        datetime(2023, 1, 1, tzinfo=dt_timezone.utc),
+                        datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    ],
+                ),
+                True,
+                False,
+                AggregateForecast(
+                    start_time=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    method=AggregationMethod.UNWEIGHTED,
+                    forecast_values=[
+                        0.5453965360072925,
+                        0.22730173199635367,
+                        None,
+                        0.22730173199635367,
+                    ],
+                    interval_lower_bounds=[
+                        0.3635976906715284,
+                        0.1363810391978122,
+                        None,
+                        0.1363810391978122,
+                    ],
+                    centers=[
+                        0.5453965360072925,
+                        0.22730173199635367,
+                        None,
+                        0.22730173199635367,
+                    ],
+                    interval_upper_bounds=[
+                        0.5453965360072925,
+                        0.3182224247948951,
+                        None,
+                        0.22730173199635367,
+                    ],
+                    means=[
+                        0.5333333333333333,
+                        0.25,
+                        None,
+                        0.21666666666666667,
+                    ],
+                    forecaster_count=3,
+                ),
+            ),
         ],
     )
     def test_UnweightedAggregation(
         self,
         question_binary: Question,
+        question_multiple_choice: Question,
         init_params: dict,
         forecast_set: ForecastSet,
         include_stats: bool,
         histogram: bool,
         expected: AggregateForecast,
     ):
-        aggregation = UnweightedAggregation(question=question_binary, **init_params)
-        new_aggregation = aggregation.calculate_aggregation_entry(
+        if len(forecast_set.forecasts_values[0]) == 2:
+            question = question_binary
+        else:
+            question = question_multiple_choice
+
+        aggregation = UnweightedAggregation(question=question, **init_params)
+        new_aggregation: AggregateForecast = aggregation.calculate_aggregation_entry(
             forecast_set, include_stats, histogram
         )
-        assert new_aggregation.start_time == expected.start_time
-        assert (
-            new_aggregation.forecast_values == expected.forecast_values
-        ) or np.allclose(new_aggregation.forecast_values, expected.forecast_values)
-        assert new_aggregation.forecaster_count == expected.forecaster_count
-        assert (
-            new_aggregation.interval_lower_bounds == expected.interval_lower_bounds
-        ) or np.allclose(
-            new_aggregation.interval_lower_bounds, expected.interval_lower_bounds
-        )
-        assert (new_aggregation.centers == expected.centers) or np.allclose(
-            new_aggregation.centers, expected.centers
-        )
-        assert (
-            new_aggregation.interval_upper_bounds == expected.interval_upper_bounds
-        ) or np.allclose(
-            new_aggregation.interval_upper_bounds, expected.interval_upper_bounds
-        )
-        assert (new_aggregation.means == expected.means) or np.allclose(
-            new_aggregation.means, expected.means
-        )
-        assert (new_aggregation.histogram == expected.histogram) or np.allclose(
-            new_aggregation.histogram, expected.histogram
-        )
+        assert new_aggregation.start_time == expected.start_time
+        assert new_aggregation.forecaster_count == expected.forecaster_count
+        for r, e in [
+            (new_aggregation.forecast_values, expected.forecast_values),
+            (new_aggregation.interval_lower_bounds, expected.interval_lower_bounds),
+            (new_aggregation.centers, expected.centers),
+            (new_aggregation.interval_upper_bounds, expected.interval_upper_bounds),
+            (new_aggregation.means, expected.means),
+            (new_aggregation.histogram, expected.histogram),
+        ]:
+            r = np.where(np.equal(r, None), np.nan, r).astype(float)
+            e = np.where(np.equal(e, None), np.nan, e).astype(float)
+            np.testing.assert_allclose(r, e, equal_nan=True)
 
     @pytest.mark.parametrize(
         "init_params, forecast_set, include_stats, histogram, expected",
         [
@@ -468,20 +609,52 @@ def test_UnweightedAggregation(
                     histogram=None,
                 ),
             ),
+            # Multiple choice with placeholders
+            (
+                {},
+                ForecastSet(
+                    forecasts_values=[
+                        [0.6, 0.15, None, 0.25],
+                        [0.6, 0.25, None, 0.15],
+                    ],
+                    timestep=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    forecaster_ids=[1, 2],
+                    timesteps=[
+                        datetime(2022, 1, 1, tzinfo=dt_timezone.utc),
+                        datetime(2023, 1, 1, tzinfo=dt_timezone.utc),
+                    ],
+                ),
+                True,
+                False,
+                AggregateForecast(
+                    start_time=datetime(2024, 1, 1, tzinfo=dt_timezone.utc),
+                    method=AggregationMethod.RECENCY_WEIGHTED,
+                    forecast_values=[0.6, 0.20, None, 0.20],
+                    interval_lower_bounds=[0.6, 0.15, None, 0.15],
+                    centers=[0.6, 0.20, None, 0.20],
+                    interval_upper_bounds=[0.6, 0.25, None, 0.25],
+                    means=[0.6, 0.20, None, 0.20],
+                    forecaster_count=2,
+                ),
+            ),
         ],
     )
     def test_RecencyWeightedAggregation(
         self,
         question_binary: Question,
+        question_multiple_choice: Question,
         init_params: dict,
         forecast_set: ForecastSet,
         include_stats: bool,
         histogram: bool,
         expected: AggregateForecast,
     ):
-        aggregation = RecencyWeightedAggregation(
-            question=question_binary, **init_params
-        )
+        if len(forecast_set.forecasts_values[0]) == 2:
+            question = question_binary
+        else:
+            question = question_multiple_choice
+
+        aggregation = RecencyWeightedAggregation(question=question, **init_params)
         new_aggregation = aggregation.calculate_aggregation_entry(
             forecast_set, include_stats, histogram
         )
diff --git a/tests/unit/test_utils/test_the_math/test_measures.py b/tests/unit/test_utils/test_the_math/test_measures.py
index b5ee3c835..ab2273d2f 100644
--- a/tests/unit/test_utils/test_the_math/test_measures.py
+++ b/tests/unit/test_utils/test_the_math/test_measures.py
@@ -56,14 +56,26 @@
     (
         [
             [0.33, 0.33, 0.34],
-            [0.0, 0.5, 0.5],
+            [0.01, 0.49, 0.5],
            [0.4, 0.2, 0.4],
            [0.2, 0.6, 0.2],
        ],
        [0.1, 0.2, 0.3, 0.4],
        [50.0],
-        [[0.2, 0.5, 0.37]],
+        [[0.2, 0.49, 0.37]],
+    ),
+    (
+        [
+            [0.33, 0.33, None, 0.34],
+            [0.01, 0.49, None, 0.5],
+            [0.4, 0.2, None, 0.4],
+            [0.2, 0.6, None, 0.2],
+        ],
+        [0.1, 0.2, 0.3, 0.4],
+        [50.0],
+        [[0.2, 0.49, None, 0.37]],
     ),
+    # multiple choice options with placeholder values
 ],
 )
@@ -73,7 +85,11 @@ def test_weighted_percentile_2d(values, weights, percentiles, expected_result):
     result = weighted_percentile_2d(
         values=values, weights=weights, percentiles=percentiles
     )
-    np.testing.assert_allclose(result, expected_result)
+    result = np.where(np.equal(result, None), np.nan, result).astype(float)
+    expected_result = np.where(
+        np.equal(expected_result, None), np.nan, expected_result
+    ).astype(float)
+    np.testing.assert_allclose(result, expected_result, equal_nan=True)
     if weights is None and [percentiles] == [50.0]:
         # should behave like np.median
         numpy_medians = np.median(values, axis=0)
         np.testing.assert_allclose(result, [numpy_medians])
@@ -95,6 +111,7 @@ def test_percent_point_function(cdf, percentiles, expected_result):
 @pytest.mark.parametrize(
     "p1, p2, question, expected_result",
     [
+        # binary
         (
             [0.5, 0.5],
             [0.5, 0.5],
             Question(type="binary"),
             0.0,
         ),
         (
             [0.4, 0.6],
             [0.5, 0.5],
             Question(type="binary"),
             sum([-0.1 * np.log2(0.5 / 0.6), 0.1 * np.log2(0.5 / 0.4)]),  # 0.05849625
         ),
+        # multiple choice
         (
             [0.5, 0.5],
             [0.5, 0.5],
             Question(type="multiple_choice"),
             0.0,
         ),
@@ -138,6 +156,54 @@
             ]
         ),  # 1.3169925
     ),
+        (
+            [0.2, 0.3, 0.5],
+            [0.2, 0.2, 0.6],
+            Question(type="multiple_choice"),
+            sum(
+                [
+                    0,
+                    (0.3 - 0.2) * np.log2(0.3 / 0.2),
+                    (0.5 - 0.6) * np.log2(0.5 / 0.6),
+                ]
+            ),  # 0.0847996
+        ),
+        (
+            [0.2, 0.3, None, 0.5],
+            [0.2, 0.3, None, 0.5],
+            Question(type="multiple_choice"),
+            0.0,
+        ),  # handles Nones gracefully
+        (
+            [0.2, 0.3, None, 0.5],
+            [0.2,
 0.3, 0.1, 0.4],
+            Question(type="multiple_choice"),
+            0.0,
+        ),  # no difference when an option is added
+        (
+            [0.2, 0.3, None, 0.5],
+            [0.2, 0.2, 0.1, 0.5],
+            Question(type="multiple_choice"),
+            sum(
+                [
+                    0,
+                    (0.3 - 0.2) * np.log2(0.3 / 0.2),
+                    (0.5 - 0.6) * np.log2(0.5 / 0.6),
+                ]
+            ),  # 0.0847996
+        ),  # difference when an option is added
+        (
+            [0.2, 0.3, None, 0.5],
+            [0.1, None, 0.7, 0.2],
+            Question(type="multiple_choice"),
+            sum(
+                [
+                    (0.2 - 0.1) * np.log2(0.2 / 0.1),
+                    (0.8 - 0.9) * np.log2(0.8 / 0.9),
+                ]
+            ),  # 0.1169925
+        ),  # difference when options are removed and added
+        # continuous
         (
             [0.01, 0.5, 0.99],
             [0.01, 0.5, 0.99],
@@ -214,6 +280,7 @@ def test_prediction_difference_for_sorting(p1, p2, question, expected_result):
 @pytest.mark.parametrize(
     "p1, p2, question, expected_result",
     [
+        # binary
         (
             [0.5, 0.5],
             [0.5, 0.5],
             Question(type="binary"),
             [
                 (0.0, 1.0),
             ],
         ),
         (
             [0.4, 0.6],
             [0.5, 0.5],
             Question(type="binary"),
             [
@@ -230,6 +297,7 @@ def test_prediction_difference_for_sorting(p1, p2, question, expected_result):
                 (-0.1, (2 / 3) / 1),
             ],
         ),
+        # multiple choice
         (
             [0.5, 0.5],
             [0.5, 0.5],
@@ -270,6 +338,61 @@
             (-0.3, (1 / 9) / (4 / 6)),
         ],
     ),
+        (
+            [0.2, 0.3, 0.5],
+            [0.2, 0.2, 0.6],
+            Question(type="multiple_choice"),
+            [
+                (0.0, (2 / 8) / (2 / 8)),
+                (-0.1, (2 / 8) / (3 / 7)),
+                (0.1, (6 / 4) / (5 / 5)),
+            ],
+        ),
+        (
+            [0.2, 0.3, None, 0.5],
+            [0.2, 0.3, None, 0.5],
+            Question(type="multiple_choice"),
+            [
+                (0.0, (2 / 8) / (2 / 8)),
+                (0.0, (3 / 7) / (3 / 7)),
+                (0.0, 1.0),
+                (0.0, (5 / 5) / (5 / 5)),
+            ],
+        ),  # handles Nones gracefully
+        (
+            [0.2, 0.3, None, 0.5],
+            [0.2, 0.3, 0.1, 0.4],
+            Question(type="multiple_choice"),
+            [
+                (0.0, (2 / 8) / (2 / 8)),
+                (0.0, (3 / 7) / (3 / 7)),
+                (0.0, 1.0),
+                (0.0, (5 / 5) / (5 / 5)),
+            ],
+        ),  # no difference when an option is added
+        (
+            [0.2, 0.3, None, 0.5],
+            [0.2, 0.2, 0.1, 0.5],
+            Question(type="multiple_choice"),
+            [
+                (0.0, (2 / 8) / (2 / 8)),
+                (-0.1, (2 / 8) / (3 / 7)),
+                (0.0, 1.0),
+                (0.1, (6 / 4) / (5 / 5)),
+            ],
+        ),  # difference when an option is added
+        (
+            [0.2, 0.3, None, 0.5],
+            [0.1, None, 0.7, 0.2],
+            Question(type="multiple_choice"),
+            [
+                (-0.1, (1 / 9) / (2 / 8)),
+                (0.0, 1.0),
+                (0.0, 1.0),
+                (0.1, (9 / 1) / (8 / 2)),
+            ],
+        ),  # difference when options are removed and added
+        # continuous
         (
             [0.0, 0.5, 1.0],
             [0.0, 0.5, 1.0],
diff --git a/utils/the_math/aggregations.py b/utils/the_math/aggregations.py
index 40c0193de..60cfa26ae 100644
--- a/utils/the_math/aggregations.py
+++ b/utils/the_math/aggregations.py
@@ -489,6 +489,9 @@ def get_range_values(
         forecasts_values, weights, [25.0, 50.0, 75.0]
     )
     centers_array = np.array(centers)
+    centers_array[np.equal(centers_array, 0.0)] = (
+        1.0  # avoid divide by zero
+    )
     normalized_centers = np.array(aggregation_forecast_values)
     normalized_lowers = np.array(lowers)
     normalized_lowers[non_nones] = (
@@ -498,7 +501,7 @@
     )
     normalized_uppers = np.array(uppers)
     normalized_uppers[non_nones] = (
-        normalized_lowers[non_nones]
+        normalized_uppers[non_nones]
         * normalized_centers[non_nones]
         / centers_array[non_nones]
     )
@@ -641,9 +644,18 @@ def calculate_aggregation_entry(
             Question.QuestionType.BINARY,
             Question.QuestionType.MULTIPLE_CHOICE,
         ]:
-            aggregation.means = np.average(
-                forecast_set.forecasts_values, weights=weights, axis=0
-            ).tolist()
+            forecasts_values = np.array(forecast_set.forecasts_values)
+            nones = (
+                np.equal(forecasts_values[0], None)
+                if forecasts_values.size
+                else np.array([], dtype=bool)
+            )
+            forecasts_values[:, nones] = np.nan
+            means = np.average(forecasts_values, weights=weights, axis=0).astype(
+                object
+            )
+            means[np.isnan(means.astype(float))] = None
+            aggregation.means = means.tolist()
 
         if histogram and self.question.type in [
             Question.QuestionType.BINARY,
diff --git a/utils/the_math/measures.py b/utils/the_math/measures.py
index e20bd381b..50fe1f21a 100644
--- a/utils/the_math/measures.py
+++ b/utils/the_math/measures.py
@@ -17,16 +17,17 @@ def weighted_percentile_2d(
     percentiles: Percentiles = None,
 ) -> Percentiles:
     values = np.array(values)
+    sorted_values = values.copy()  # avoid side effects
+    # replace None with np.nan for calculations (return to None at the end)
+    sorted_values[np.equal(sorted_values, None)] = np.nan
+
     if weights is None:
         ordered_weights = np.ones_like(values)
     else:
         weights = np.array(weights)
-        ordered_weights = weights[values.argsort(axis=0)]
+        ordered_weights = weights[sorted_values.argsort(axis=0)]
     percentiles = np.array(percentiles or [50.0])
 
-    sorted_values = values.copy()  # avoid side effects
-    # replace None with -1.0 for calculations (return to None at the end)
-    sorted_values[np.equal(sorted_values, None)] = -1.0
     sorted_values.sort(axis=0)
 
     # get the normalized cumulative weights
@@ -52,10 +53,10 @@
             + sorted_values[right_indexes, column_indicies]
         )
     )
-    # replace -1.0 back to None
+    # replace np.nan back to None
     weighted_percentiles = np.array(weighted_percentiles)
     weighted_percentiles = np.where(
-        weighted_percentiles == -1.0, None, weighted_percentiles
+        np.isnan(weighted_percentiles.astype(float)), None, weighted_percentiles
     )
     return weighted_percentiles.tolist()
@@ -104,10 +105,22 @@ def prediction_difference_for_sorting(
     """for binary and multiple choice, takes pmfs
     for continuous takes cdfs"""
     p1, p2 = np.array(p1), np.array(p2)
-    p1[np.equal(p1, None)] = -1.0  # replace None with -1.0 for calculations
-    p2[np.equal(p2, None)] = -1.0  # replace None with -1.0 for calculations
     # Uses Jeffrey's Divergence
-    if question_type in ["binary", "multiple_choice"]:
+    if question_type == Question.QuestionType.MULTIPLE_CHOICE:
+        # cover for Nones: fold one-sided options into the last shared option
+        p1_nones = np.equal(p1, None)
+        p2_nones = np.equal(p2, None)
+        never_nones = np.logical_not(p1_nones | p2_nones)
+        p1_new = p1[never_nones]
+        p2_new = p2[never_nones]
+        p1_new[-1] += sum(p1[~p1_nones & p2_nones])
+        p2_new[-1] += sum(p2[~p2_nones & p1_nones])
+        p1 = p1_new
+        p2 = p2_new
+    if question_type in [
+        Question.QuestionType.BINARY,
+        Question.QuestionType.MULTIPLE_CHOICE,
+    ]:
         return sum([(p - q) * np.log2(p / q) for p, q in zip(p1, p2)])
     cdf1 = np.array([1 - np.array(p1), p1])
     cdf2 = np.array([1 - np.array(p2), p2])
@@ -123,14 +136,22 @@ def prediction_difference_for_display(
     """for binary and multiple choice, takes pmfs
     for continuous takes cdfs"""
     p1, p2 = np.array(p1), np.array(p2)
-    p1[np.equal(p1, None)] = -1.0  # replace None with -1.0 for calculations
-    p2[np.equal(p2, None)] = -1.0  # replace None with -1.0 for calculations
     if question.type == "binary":
         # single-item list of (pred diff, ratio of odds)
         return [(p2[1] - p1[1], (p2[1] / (1 - p2[1])) / (p1[1] / (1 - p1[1])))]
     elif question.type == "multiple_choice":
         # list of (pred diff, ratio of odds)
-        return [(q - p, (q / (1 - q)) / (p / (1 - p))) for p, q in zip(p1, p2)]
+        for p, q in zip(p1[:-1], p2[:-1]):
+            if p is None or q is None:
+                p1[-1] = (p1[-1] or 0.0) + (p or 0.0)
+                p2[-1] = (p2[-1] or 0.0) + (q or 0.0)
+        arr = []
+        for p, q in zip(p1, p2):
+            if p is None or q is None:
+                arr.append((0.0, 1.0))
+            else:
+                arr.append((q - p, (q / (1 - q)) / (p / (1 - p))))
+        return arr
     # total earth mover's distance, assymmetric earth mover's distance
     x_locations = unscaled_location_to_scaled_location(
         np.linspace(0, 1, len(p1)), question
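
Reviewer note (not part of the diff): the expected values in
test_compute_weighted_semi_standard_deviations are consistent with taking, per
option, the weighted root-mean-square deviation below/above the weighted mean,
with placeholder (None) options reporting 0.0. The sketch below is a reference
implementation under that assumption only — the repository's actual
compute_weighted_semi_standard_deviations may differ in detail, and semi_stds
is a hypothetical name.

    import numpy as np

    def semi_stds(forecasts_values, weights=None):
        # None placeholders become NaN so they drop out of every sum below
        values = np.array(
            [[np.nan if v is None else v for v in row] for row in forecasts_values],
            dtype=float,
        )
        w = np.ones(len(values)) if weights is None else np.asarray(weights, dtype=float)
        mean = np.nansum(values * w[:, None], axis=0) / w.sum()
        devs = values - mean  # NaN for placeholder entries

        def one_side(mask):
            # weighted RMS of the deviations on one side of the mean;
            # options with no deviations on that side report 0.0
            num = np.sum(np.where(mask, devs**2, 0.0) * w[:, None], axis=0)
            den = np.sum(np.where(mask, w[:, None], 0.0), axis=0)
            return np.sqrt(np.divide(num, den, out=np.zeros_like(num), where=den > 0))

        return one_side(devs < 0), one_side(devs > 0)  # NaN compares as False

    lowers, uppers = semi_stds([[0.2, 0.8], [0.5, 0.5], [0.8, 0.2]])
    print(lowers, uppers)  # [0.3 0.3] [0.3 0.3], matching the test case above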
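
A second note: prediction_difference_for_sorting and
prediction_difference_for_display now share one folding rule for multiple
choice — options that are None in both predictions are ignored, and mass on
options forecast by only one side is folded into the last option both sides
forecast. A minimal standalone sketch of that rule (fold_placeholders is a
hypothetical name):

    import numpy as np

    def fold_placeholders(p1, p2):
        p1 = np.array(p1, dtype=object)
        p2 = np.array(p2, dtype=object)
        p1_nones = np.equal(p1, None)
        p2_nones = np.equal(p2, None)
        shared = ~(p1_nones | p2_nones)  # options both sides forecast
        p1_new, p2_new = p1[shared], p2[shared]
        # mass on one-sided options moves into the last shared option
        p1_new[-1] += sum(p1[~p1_nones & p2_nones])
        p2_new[-1] += sum(p2[~p2_nones & p1_nones])
        return p1_new.astype(float), p2_new.astype(float)

    # [0.2, 0.3, None, 0.5] vs [0.1, None, 0.7, 0.2] reduces to
    # [0.2, 0.8] vs [0.1, 0.9], reproducing the 0.1169925 expected above
    q1, q2 = fold_placeholders([0.2, 0.3, None, 0.5], [0.1, None, 0.7, 0.2])
    print(sum((p - q) * np.log2(p / q) for p, q in zip(q1, q2)))  # ~0.1169925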