From 16a26170575451b9191cae5fb3d91ceb61c3d06b Mon Sep 17 00:00:00 2001 From: Paul Resnick Date: Fri, 8 Mar 2024 15:01:12 -0500 Subject: [PATCH 1/2] improve comment and log for _filter_misleading_notes (comment didn't match code about which timestamp) --- sourcecode/scoring/process_data.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py index dca467fde..cb6e6226c 100644 --- a/sourcecode/scoring/process_data.py +++ b/sourcecode/scoring/process_data.py @@ -182,9 +182,13 @@ def _filter_misleading_notes( """ This function actually filters ratings (not notes), based on which notes they rate. - Filter out ratings of notes that say the Tweet isn't misleading. - Also filter out ratings of deleted notes, unless they were deleted after - c.deletedNotesTombstoneLaunchTime, and appear in noteStatusHistory. + Keep ratings of undeleted notes that either: + - say the Tweet is misleading + - OR it's after the new UI launch time, c.notMisleadingUILaunchTime. + (After that timestamp, we start assessing the helpfulness of notes that say the Tweet isn't misleading. + Before that timestamp, we did not assess the helpfulness of such notes.) + Also keep ratings of deleted notes if: + - they were scored in noteStatusHistory Args: notes (pd.DataFrame): _description_ @@ -233,6 +237,9 @@ def _filter_misleading_notes( print( f" Keeping {ratings[deletedButInNSHKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[deletedButInNSHKey],c.noteIdKey]))} deleted notes that were previously scored (in note status history)" ) + print( + f" Keeping {notDeletedNotMisleadingNewUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingNewUI,c.noteIdKey]))} non-misleading notes after the new UI launch time" + ) print( f" Removing {notDeletedNotMisleadingOldUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingOldUI, c.noteIdKey]))} older notes that aren't deleted, but are not-misleading." ) @@ -255,7 +262,7 @@ def _filter_misleading_notes( return ratings -def remove_duplicate_ratings(ratings: pd.DataFrame) -> pd.DataFrame: +def remove_duplicate_ratings(ratings: pd.DataFrame) -> pd.DataFrame: """Drop duplicate ratings, then assert that there is exactly one rating per noteId per raterId. Args: From 7542de41bb13d0da433a469b348bf094cabe580a Mon Sep 17 00:00:00 2001 From: Paul Resnick Date: Fri, 8 Mar 2024 15:24:10 -0500 Subject: [PATCH 2/2] improve log file messages --- sourcecode/scoring/process_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py index cb6e6226c..4065d57ea 100644 --- a/sourcecode/scoring/process_data.py +++ b/sourcecode/scoring/process_data.py @@ -232,16 +232,16 @@ def _filter_misleading_notes( f"Preprocess Data: Filter misleading notes, starting with {len(ratings)} ratings on {len(np.unique(ratings[c.noteIdKey]))} notes" ) print( - f" Keeping {ratings[notDeletedMisleadingKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[notDeletedMisleadingKey],c.noteIdKey]))} misleading notes" + f" Keeping {ratings[notDeletedMisleadingKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[notDeletedMisleadingKey],c.noteIdKey]))} notes that claim the tweet is misleading" ) print( f" Keeping {ratings[deletedButInNSHKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[deletedButInNSHKey],c.noteIdKey]))} deleted notes that were previously scored (in note status history)" ) print( - f" Keeping {notDeletedNotMisleadingNewUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingNewUI,c.noteIdKey]))} non-misleading notes after the new UI launch time" + f" Keeping {notDeletedNotMisleadingNewUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingNewUI,c.noteIdKey]))} notes that do not claim the tweet is misleading, but after the new UI launch time" ) print( - f" Removing {notDeletedNotMisleadingOldUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingOldUI, c.noteIdKey]))} older notes that aren't deleted, but are not-misleading." + f" Removing {notDeletedNotMisleadingOldUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingOldUI, c.noteIdKey]))} older notes that aren't deleted, but do not claim the tweet is misleading." ) print( f" Removing {deletedNotInNSH.sum()} ratings on {len(np.unique(ratings.loc[deletedNotInNSH, c.noteIdKey]))} notes that were deleted and not in note status history (e.g. old)."