From 16a26170575451b9191cae5fb3d91ceb61c3d06b Mon Sep 17 00:00:00 2001
From: Paul Resnick <presnick@umich.edu>
Date: Fri, 8 Mar 2024 15:01:12 -0500
Subject: [PATCH 1/2] improve comment and log for _filter_misleading_notes
 (comment didn't match code about which timestamp)

---
 sourcecode/scoring/process_data.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py
index dca467fde..cb6e6226c 100644
--- a/sourcecode/scoring/process_data.py
+++ b/sourcecode/scoring/process_data.py
@@ -182,9 +182,13 @@ def _filter_misleading_notes(
   """
   This function actually filters ratings (not notes), based on which notes they rate.
 
-  Filter out ratings of notes that say the Tweet isn't misleading.
-  Also filter out ratings of deleted notes, unless they were deleted after
-    c.deletedNotesTombstoneLaunchTime, and appear in noteStatusHistory.
+  Keep ratings of undeleted notes that either:
+    - say the Tweet is misleading
+    - OR it's after the new UI launch time, c.notMisleadingUILaunchTime.
+       (After that timestamp, we start assessing the helpfulness of notes that say the Tweet isn't misleading.
+        Before that timestamp, we did not assess the helpfulness of such notes.)
+  Also keep ratings of deleted notes if:
+    - they were scored in noteStatusHistory
 
   Args:
       notes (pd.DataFrame): _description_
@@ -233,6 +237,9 @@ def _filter_misleading_notes(
     print(
       f"  Keeping {ratings[deletedButInNSHKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[deletedButInNSHKey],c.noteIdKey]))} deleted notes that were previously scored (in note status history)"
     )
+    print(
+      f"  Keeping {notDeletedNotMisleadingNewUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingNewUI,c.noteIdKey]))} non-misleading notes after the new UI launch time"
+    )
     print(
       f"  Removing {notDeletedNotMisleadingOldUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingOldUI, c.noteIdKey]))} older notes that aren't deleted, but are not-misleading."
     )
@@ -255,7 +262,7 @@ def _filter_misleading_notes(
   return ratings
 
 
-def remove_duplicate_ratings(ratings: pd.DataFrame) -> pd.DataFrame:
+def remove_duplicate_ratings(ratings: pd.DataFrame) -> pd.DataFrame:  
   """Drop duplicate ratings, then assert that there is exactly one rating per noteId per raterId.
 
   Args:

From 7542de41bb13d0da433a469b348bf094cabe580a Mon Sep 17 00:00:00 2001
From: Paul Resnick <presnick@umich.edu>
Date: Fri, 8 Mar 2024 15:24:10 -0500
Subject: [PATCH 2/2] improve log file messages

---
 sourcecode/scoring/process_data.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sourcecode/scoring/process_data.py b/sourcecode/scoring/process_data.py
index cb6e6226c..4065d57ea 100644
--- a/sourcecode/scoring/process_data.py
+++ b/sourcecode/scoring/process_data.py
@@ -232,16 +232,16 @@ def _filter_misleading_notes(
       f"Preprocess Data: Filter misleading notes, starting with {len(ratings)} ratings on {len(np.unique(ratings[c.noteIdKey]))} notes"
     )
     print(
-      f"  Keeping {ratings[notDeletedMisleadingKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[notDeletedMisleadingKey],c.noteIdKey]))} misleading notes"
+      f"  Keeping {ratings[notDeletedMisleadingKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[notDeletedMisleadingKey],c.noteIdKey]))} notes that claim the tweet is misleading"
     )
     print(
       f"  Keeping {ratings[deletedButInNSHKey].sum()} ratings on {len(np.unique(ratings.loc[ratings[deletedButInNSHKey],c.noteIdKey]))} deleted notes that were previously scored (in note status history)"
     )
     print(
-      f"  Keeping {notDeletedNotMisleadingNewUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingNewUI,c.noteIdKey]))} non-misleading notes after the new UI launch time"
+      f"  Keeping {notDeletedNotMisleadingNewUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingNewUI,c.noteIdKey]))} notes that do not claim the tweet is misleading, but after the new UI launch time"
     )
     print(
-      f"  Removing {notDeletedNotMisleadingOldUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingOldUI, c.noteIdKey]))} older notes that aren't deleted, but are not-misleading."
+      f"  Removing {notDeletedNotMisleadingOldUI.sum()} ratings on {len(np.unique(ratings.loc[notDeletedNotMisleadingOldUI, c.noteIdKey]))} older notes that aren't deleted, but do not claim the tweet is misleading."
     )
     print(
       f"  Removing {deletedNotInNSH.sum()} ratings on {len(np.unique(ratings.loc[deletedNotInNSH, c.noteIdKey]))} notes that were deleted and not in note status history (e.g. old)."