diff --git a/questions/models.py b/questions/models.py
index 845b88e07c..60edd13fda 100644
--- a/questions/models.py
+++ b/questions/models.py
@@ -637,14 +637,16 @@ def get_prediction_values(self) -> list[float | None]:
             return self.probability_yes_per_category
         return self.continuous_cdf
 
-    def get_pmf(self) -> list[float]:
+    def get_pmf(self, replace_none: bool = False) -> list[float | None]:
         """
-        gets the PMF for this forecast, replacing None values with 0.0
-        Not for serialization use (keep None values in that case)
+        gets the PMF for this forecast
+        replaces None values with 0.0 if replace_none is True
         """
         if self.probability_yes:
             return [1 - self.probability_yes, self.probability_yes]
         if self.probability_yes_per_category:
+            if not replace_none:
+                return self.probability_yes_per_category
             return [
                 v or 0.0 for v in self.probability_yes_per_category
             ]  # replace None with 0.0
@@ -719,18 +721,20 @@ def get_cdf(self) -> list[float | None] | None:
             return self.forecast_values
         return None
 
-    def get_pmf(self) -> list[float]:
+    def get_pmf(self, replace_none: bool = False) -> list[float | None]:
         """
-        gets the PMF for this forecast, replacing None values with 0.0
-        Not for serialization use (keep None values in that case)
+        gets the PMF for this forecast
+        replaces None values with 0.0 if replace_none is True
         """
         # grab annotation if it exists for efficiency
         question_type = getattr(self, "question_type", self.question.type)
-        forecast_values = [
-            v or 0.0 for v in self.forecast_values
-        ]  # replace None with 0.0
+        forecast_values = self.forecast_values
+        if question_type == Question.QuestionType.MULTIPLE_CHOICE:
+            if not replace_none:
+                return forecast_values
+            return [v or 0.0 for v in forecast_values]  # replace None with 0.0
         if question_type in QUESTION_CONTINUOUS_TYPES:
-            cdf: list[float] = forecast_values
+            cdf: list[float] = forecast_values  # type: ignore
             pmf = [cdf[0]]
             for i in range(1, len(cdf)):
                 pmf.append(cdf[i] - cdf[i - 1])
diff --git a/scoring/score_math.py b/scoring/score_math.py
index fada04f0d1..546b19d310 100644
--- a/scoring/score_math.py
+++ b/scoring/score_math.py
@@ -20,7 +20,7 @@
 
 @dataclass
 class AggregationEntry:
-    pmf: np.ndarray | list[float]
+    pmf: np.ndarray | list[float | None]
     num_forecasters: int
     timestamp: float
 
@@ -36,7 +36,7 @@ def get_geometric_means(
             timesteps.add(forecast.end_time.timestamp())
     for timestep in sorted(timesteps):
        prediction_values = [
-            f.get_pmf()
+            f.get_pmf(replace_none=True)
            for f in forecasts
            if f.start_time.timestamp() <= timestep
            and (f.end_time is None or f.end_time.timestamp() > timestep)
@@ -84,9 +84,12 @@ def evaluate_forecasts_baseline_accuracy(
         forecast_coverage = forecast_duration / total_duration
         pmf = forecast.get_pmf()
         if question_type in ["binary", "multiple_choice"]:
-            forecast_score = (
-                100 * np.log(pmf[resolution_bucket] * len(pmf)) / np.log(len(pmf))
-            )
+            # forecasts always have `None` assigned to MC options that aren't
+            # available at the time. Detecting these allows us to avoid trying to
+            # follow the question's options_history.
+            options_at_time = len([p for p in pmf if p is not None])
+            p = pmf[resolution_bucket] or pmf[-1]  # if None, read from Other
+            forecast_score = 100 * np.log(p * options_at_time) / np.log(options_at_time)
         else:
             if resolution_bucket in [0, len(pmf) - 1]:
                 baseline = 0.05
@@ -116,8 +119,13 @@
         if start <= spot_forecast_timestamp < end:
             pmf = forecast.get_pmf()
             if question_type in ["binary", "multiple_choice"]:
+                # forecasts always have `None` assigned to MC options that aren't
+                # available at the time. Detecting these allows us to avoid trying to
+                # follow the question's options_history.
+                options_at_time = len([p for p in pmf if p is not None])
+                p = pmf[resolution_bucket] or pmf[-1]  # if None, read from Other
                 forecast_score = (
-                    100 * np.log(pmf[resolution_bucket] * len(pmf)) / np.log(len(pmf))
+                    100 * np.log(p * options_at_time) / np.log(options_at_time)
                 )
             else:
                 if resolution_bucket in [0, len(pmf) - 1]:
@@ -159,17 +167,21 @@ def evaluate_forecasts_peer_accuracy(
             continue
 
         pmf = forecast.get_pmf()
+        p = pmf[resolution_bucket] or pmf[-1]  # if None, read from Other
         interval_scores: list[float | None] = []
         for gm in geometric_mean_forecasts:
             if forecast_start <= gm.timestamp < forecast_end:
-                score = (
+                gmp = (
+                    gm.pmf[resolution_bucket] or gm.pmf[-1]
+                )  # if None, read from Other
+                interval_score = (
                     100
                     * (gm.num_forecasters / (gm.num_forecasters - 1))
-                    * np.log(pmf[resolution_bucket] / gm.pmf[resolution_bucket])
+                    * np.log(p / gmp)
                 )
                 if question_type in QUESTION_CONTINUOUS_TYPES:
-                    score /= 2
-                interval_scores.append(score)
+                    interval_score /= 2
+                interval_scores.append(interval_score)
             else:
                 interval_scores.append(None)
 
@@ -218,10 +230,10 @@ def evaluate_forecasts_peer_spot_forecast(
         )
         if start <= spot_forecast_timestamp < end:
             pmf = forecast.get_pmf()
+            p = pmf[resolution_bucket] or pmf[-1]  # if None, read from Other
+            gmp = gm.pmf[resolution_bucket] or gm.pmf[-1]  # if None, read from Other
             forecast_score = (
-                100
-                * (gm.num_forecasters / (gm.num_forecasters - 1))
-                * np.log(pmf[resolution_bucket] / gm.pmf[resolution_bucket])
+                100 * (gm.num_forecasters / (gm.num_forecasters - 1)) * np.log(p / gmp)
             )
             if question_type in QUESTION_CONTINUOUS_TYPES:
                 forecast_score /= 2
@@ -260,11 +272,15 @@ def evaluate_forecasts_legacy_relative(
             continue
 
         pmf = forecast.get_pmf()
+        p = pmf[resolution_bucket] or pmf[-1]  # if None, read from Other
         interval_scores: list[float | None] = []
         for bf in baseline_forecasts:
             if forecast_start <= bf.timestamp < forecast_end:
-                score = np.log2(pmf[resolution_bucket] / bf.pmf[resolution_bucket])
-                interval_scores.append(score)
+                bfp = (
+                    bf.pmf[resolution_bucket] or bf.pmf[-1]
+                )  # if None, read from Other
+                interval_score = np.log2(p / bfp)
+                interval_scores.append(interval_score)
             else:
                 interval_scores.append(None)
 
@@ -316,7 +332,7 @@ def evaluate_question(
     if spot_forecast_time:
         spot_forecast_timestamp = min(spot_forecast_time.timestamp(), actual_close_time)
 
-    # We need all user forecasts to calculated GeoMean even
+    # We need all user forecasts to calculate GeoMean even
     # if we're only scoring some or none of the users
     user_forecasts = question.user_forecasts.all()
     if only_include_user_ids:
diff --git a/tests/unit/test_scoring/test_score_math.py b/tests/unit/test_scoring/test_score_math.py
index 23f5f78c71..652dcd9be3 100644
--- a/tests/unit/test_scoring/test_score_math.py
+++ b/tests/unit/test_scoring/test_score_math.py
@@ -47,7 +47,7 @@ def F(q=None, v=None, s=None, e=None):
     return forecast
 
-def A(p: list[float] | None = None, n: int = 0, t: int | None = None):
+def A(p: list[float | None] | None = None, n: int = 0, t: int | None = None):
     # Create an AggregationEntry object with basic values
     # p: pmf
     # n: number of forecasters
     # t: timestamp
@@ -75,6 +75,11 @@ class TestScoreMath:
             ([F()] * 100, [A(n=100)]),
             # maths
             ([F(v=0.7), F(v=0.8), F(v=0.9)], [A(p=[0.18171206, 0.79581144], n=3)]),
+            # multiple choice forecasts with `None` placeholders (aggregated as 0.0)
+            (
+                [F(q=QT.MULTIPLE_CHOICE, v=[0.6, 0.15, None, 0.25])] * 2,
+                [A(n=2, p=[0.6, 0.15, 0.0, 0.25])],
+            ),
             # start times
             ([F(), F(s=1)], [A(), A(t=1, n=2)]),
             ([F(), F(s=1), F(s=2)], [A(), A(t=1, n=2), A(t=2, n=3)]),
@@ -85,7 +90,7 @@ class TestScoreMath:
             # numeric
             (
                 [F(q=QT.NUMERIC), F(q=QT.NUMERIC)],
-                [A(p=[0] + [1 / 200] * 200 + [0], n=2)],
+                [A(p=[0.0] + [1 / 200] * 200 + [0.0], n=2)],
             ),
             (
                 [
@@ -103,7 +108,10 @@ def test_get_geometric_means(
         result = get_geometric_means(forecasts)
         assert len(result) == len(expected)
         for ra, ea in zip(result, expected):
-            assert all(round(r, 8) == round(e, 8) for r, e in zip(ra.pmf, ea.pmf))
+            assert all(
+                ((r == e) or (round(r, 8) == round(e, 8)))
+                for r, e in zip(ra.pmf, ea.pmf)
+            )
             assert ra.num_forecasters == ea.num_forecasters
             assert ra.timestamp == ea.timestamp
 
@@ -131,6 +139,37 @@ def test_get_geometric_means(
             ([F(v=0.9, s=5)], {}, [S(v=84.79969066 / 2, c=0.5)]),  # half coverage
             ([F(v=2 ** (-1 / 2))], {}, [S(v=50)]),
             ([F(v=2 ** (-3 / 2))], {}, [S(v=-50)]),
+            # multiple choice w/ placeholder at index 2
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 - 3 ** (-0.5) - 1 / 3, None, 3 ** (-0.5)],
+                    )
+                ],
+                {"resolution_bucket": 0, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=0.0)],
+            ),  # chosen to have a score of 0 for simplicity
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 - 3 ** (-0.5) - 1 / 3, None, 3 ** (-0.5)],
+                    )
+                ],
+                {"resolution_bucket": 2, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=50)],
+            ),  # same score as index == 3 since None should read from "Other"
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 - 3 ** (-0.5) - 1 / 3, None, 3 ** (-0.5)],
+                    )
+                ],
+                {"resolution_bucket": 3, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=50)],
+            ),  # chosen to have a score of 50 for simplicity
             # numeric
             (
                 [F(q=QT.NUMERIC)],
@@ -199,6 +238,37 @@ def test_evaluate_forecasts_baseline_accuracy(self, forecasts, args, expected):
             ([F(v=0.9, s=5)], {}, [S(v=84.79969066, c=1)]),
             ([F(v=2 ** (-1 / 2))], {}, [S(v=50)]),
             ([F(v=2 ** (-3 / 2))], {}, [S(v=-50)]),
+            # multiple choice w/ placeholder at index 2
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 - 3 ** (-0.5) - 1 / 3, None, 3 ** (-0.5)],
+                    )
+                ],
+                {"resolution_bucket": 0, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=0.0)],
+            ),  # chosen to have a score of 0 for simplicity
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 - 3 ** (-0.5) - 1 / 3, None, 3 ** (-0.5)],
+                    )
+                ],
+                {"resolution_bucket": 2, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=50)],
+            ),  # same score as index == 3 since None should read from "Other"
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 - 3 ** (-0.5) - 1 / 3, None, 3 ** (-0.5)],
+                    )
+                ],
+                {"resolution_bucket": 3, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=50)],
+            ),  # chosen to have a score of 50 for simplicity
             # numeric
             (
                 [F(q=QT.NUMERIC)],
@@ -319,6 +389,64 @@ def test_evaluate_forecasts_baseline_spot_forecast(self, forecasts, args, expect
                     S(v=100 * (0.5 * 0 + 0.5 * np.log(0.9 / gmean([0.1, 0.5]))), c=0.5),
                 ],
             ),
+            # multiple choice w/ placeholder at index 2
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[
+                            1 / 3,
+                            1 - (np.e ** (0.25) / 3) - 1 / 3,
+                            None,
+                            np.e ** (0.25) / 3,
+                        ],
+                    ),
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 / 3, None, 1 / 3],
+                    ),
+                ],
+                {"resolution_bucket": 0, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=0), S(v=0)],
+            ),  # chosen to have a score of 0 for simplicity
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[
+                            1 / 3,
+                            1 - (np.e ** (0.25) / 3) - 1 / 3,
+                            None,
+                            np.e ** (0.25) / 3,
+                        ],
+                    ),
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 / 3, None, 1 / 3],
+                    ),
+                ],
+                {"resolution_bucket": 2, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=25), S(v=-25)],
+            ),  # same score as index == 3 since None should read from "Other"
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[
+                            1 / 3,
+                            1 - (np.e ** (0.25) / 3) - 1 / 3,
+                            None,
+                            np.e ** (0.25) / 3,
+                        ],
+                    ),
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 / 3, None, 1 / 3],
+                    ),
+                ],
+                {"resolution_bucket": 3, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=25), S(v=-25)],
+            ),  # chosen to have a score of 25 for simplicity
             # TODO: add tests with base forecasts different from forecasts
         ],
     )
@@ -403,6 +531,64 @@ def test_evaluate_forecasts_peer_accuracy(self, forecasts, args, expected):
                 {},
                 [S(v=100 * np.log(0.1 / 0.5)), S(v=100 * np.log(0.5 / 0.1)), S(c=0)],
             ),
+            # multiple choice w/ placeholder at index 2
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[
+                            1 / 3,
+                            1 - (np.e ** (0.25) / 3) - 1 / 3,
+                            None,
+                            np.e ** (0.25) / 3,
+                        ],
+                    ),
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 / 3, None, 1 / 3],
+                    ),
+                ],
+                {"resolution_bucket": 0, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=0), S(v=0)],
+            ),  # chosen to have a score of 0 for simplicity
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[
+                            1 / 3,
+                            1 - (np.e ** (0.25) / 3) - 1 / 3,
+                            None,
+                            np.e ** (0.25) / 3,
+                        ],
+                    ),
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 / 3, None, 1 / 3],
+                    ),
+                ],
+                {"resolution_bucket": 2, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=25), S(v=-25)],
+            ),  # same score as index == 3 since None should read from "Other"
+            (
+                [
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[
+                            1 / 3,
+                            1 - (np.e ** (0.25) / 3) - 1 / 3,
+                            None,
+                            np.e ** (0.25) / 3,
+                        ],
+                    ),
+                    F(
+                        q=QT.MULTIPLE_CHOICE,
+                        v=[1 / 3, 1 / 3, None, 1 / 3],
+                    ),
+                ],
+                {"resolution_bucket": 3, "question_type": QT.MULTIPLE_CHOICE},
+                [S(v=25), S(v=-25)],
+            ),  # chosen to have a score of 25 for simplicity
             # TODO: add tests with base forecasts different from forecasts
         ],
     )
diff --git a/tests/unit/test_utils/test_the_math/test_formulas.py b/tests/unit/test_utils/test_the_math/test_formulas.py
index 54f78dd357..30bb3d3e13 100644
--- a/tests/unit/test_utils/test_the_math/test_formulas.py
+++ b/tests/unit/test_utils/test_the_math/test_formulas.py
@@ -15,7 +15,12 @@ class TestFormulas:
     binary_details = {"type": Question.QuestionType.BINARY}
     multiple_choice_details = {
         "type": Question.QuestionType.MULTIPLE_CHOICE,
-        "options": ["A", "B", "C"],
+        "options": ["a", "c", "Other"],
+        "options_history": [
+            (0, ["a", "b", "Other"]),
+            (100, ["a", "Other"]),
+            (200, ["a", "c", "Other"]),
+        ],
     }
     numeric_details = {
         "type": Question.QuestionType.NUMERIC,
@@ -57,8 +62,10 @@ class TestFormulas:
             ("", binary_details, None),
             (None, binary_details, None),
             # Multiple choice questions
-            ("A", multiple_choice_details, 0),
-            ("C", multiple_choice_details, 2),
+            ("a", multiple_choice_details, 0),
+            ("b", multiple_choice_details, 1),
+            ("c", multiple_choice_details, 2),
+            ("Other", multiple_choice_details, 3),
             # Numeric questions
             ("below_lower_bound", numeric_details, 0),
             ("-2", numeric_details, 0),
diff --git a/utils/the_math/formulas.py b/utils/the_math/formulas.py
index 999444794c..d582039269 100644
--- a/utils/the_math/formulas.py
+++ b/utils/the_math/formulas.py
@@ -5,6 +5,7 @@
 from questions.constants import UnsuccessfulResolutionType
 from questions.models import Question
+from questions.services.multiple_choice_handlers import get_all_options_from_history
 from utils.typing import ForecastValues
 
 logger = logging.getLogger(__name__)
 
@@ -33,7 +34,8 @@ def string_location_to_scaled_location(
     if question.type == Question.QuestionType.BINARY:
         return 1.0 if string_location == "yes" else 0.0
     if question.type == Question.QuestionType.MULTIPLE_CHOICE:
-        return float(question.options.index(string_location))
+        list_of_all_options = get_all_options_from_history(question.options_history)
+        return float(list_of_all_options.index(string_location))
     # continuous
     if string_location == "below_lower_bound":
         return question.range_min - 1.0
diff --git a/utils/the_math/measures.py b/utils/the_math/measures.py
index 50fe1f21ac..7edce08712 100644
--- a/utils/the_math/measures.py
+++ b/utils/the_math/measures.py
@@ -56,7 +56,7 @@ def weighted_percentile_2d(
 
     # replace np.nan back to None
     weighted_percentiles = np.array(weighted_percentiles)
     weighted_percentiles = np.where(
-        weighted_percentiles == np.nan, None, weighted_percentiles
+        np.isnan(weighted_percentiles.astype(float)), None, weighted_percentiles
     )
     return weighted_percentiles.tolist()
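
For readers of this patch, a minimal standalone sketch of the multiple-choice baseline scoring behavior introduced in the score_math.py hunks above. `baseline_score_mc` is a hypothetical helper written purely for illustration (it does not exist in the codebase); per the patch's own comments, it assumes that options unavailable at forecast time are stored as `None` placeholders in the pmf and that the last bucket is "Other".

    import numpy as np

    def baseline_score_mc(pmf: list[float | None], resolution_bucket: int) -> float:
        # Count only the options that existed when the forecast was made;
        # unavailable options appear in the pmf as `None` placeholders.
        options_at_time = len([p for p in pmf if p is not None])
        # If the resolved bucket was unavailable back then, score the
        # forecast's "Other" bucket (the last entry) instead.
        p = pmf[resolution_bucket] or pmf[-1]
        return 100 * np.log(p * options_at_time) / np.log(options_at_time)

    # A uniform forecast over the 3 options available at the time scores 0 ...
    assert round(baseline_score_mc([1 / 3, 1 / 3, None, 1 / 3], 0), 8) == 0
    # ... and resolving to the `None` bucket scores the same as resolving to "Other".
    assert baseline_score_mc([0.25, 0.25, None, 0.5], 2) == baseline_score_mc(
        [0.25, 0.25, None, 0.5], 3
    )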