@@ -70,6 +70,9 @@ def start_flow():
7070 logger .debug ("Clearing pdp_contacts to prepare for match" )
7171 reset_pdp_contacts_with_unmatched (conn )
7272
73+ logger .debug ("Removing invalid entries from pdp_contacts" )
74+ filter_invalid_pdp_data (conn )
75+
7376 logger .debug ("Computing automatic matches" )
7477 automatic_matches = get_automatic_matches (conn )
7578 logger .debug ("Computing manual matches" )
@@ -129,6 +132,71 @@ def compare_names(n1, n2):
129132 return name_to_array (n1 ).bool_op ("&&" )(name_to_array (n2 ))
130133
131134
def filter_invalid_pdp_data(conn):
    """Delete pdp_contacts rows whose names are obvious placeholders.

    A row is deleted when its lower-cased first/last name matches any of:

    * both names contain "unknown"
    * both names are exactly "?"
    * first name is "john" or "jane" and last name is "doe"
    * first name is "no" and last name is "name"
    * first name is NULL and last name is "friends"
    * either name is exactly "(red flag)"
    * both names consist solely of digits
    * both names contain "noname" / "no name" (one optional whitespace char)

    Args:
        conn: connection object exposing ``execute``; the DELETE statement
            is run on it.

    Returns:
        Whatever ``conn.execute`` returns for the DELETE statement.
    """
    pc = PdpContacts.__table__.alias()
    lower_first_name = func.lower(pc.c.first_name)
    lower_last_name = func.lower(pc.c.last_name)

    def both_names_match(pattern):
        # ``bool_op`` yields a boolean-typed expression for the custom ``~``
        # operator (regex match; presumably PostgreSQL, as in the original
        # raw SQL), so it composes with and_/or_ like any other comparison
        # and stays bound to the same aliased columns as the rest of the
        # filter.
        return and_(
            lower_first_name.bool_op("~")(pattern),
            lower_last_name.bool_op("~")(pattern),
        )

    unknown = and_(
        lower_first_name.ilike("%unknown%"),
        lower_last_name.ilike("%unknown%"),
    )

    question_mark = and_(
        lower_first_name == "?",
        lower_last_name == "?",
    )

    john_or_jane_doe = and_(
        or_(
            lower_first_name == "john",
            lower_first_name == "jane",
        ),
        lower_last_name == "doe",
    )

    no_name = and_(
        lower_first_name == "no",
        lower_last_name == "name",
    )

    none_friends = and_(
        lower_first_name.is_(None),
        lower_last_name == "friends",
    )

    # Unlike the other rules, a single offending name is enough here.
    red_flag = or_(
        lower_first_name == "(red flag)",
        lower_last_name == "(red flag)",
    )

    # Raw strings: "\d" and "\s" are regex escapes, not Python escapes.
    digits_only = both_names_match(r"^\d+$")
    # Unanchored on purpose (as in the original): matches anywhere in the name.
    no_name_no_name = both_names_match(r"no\s?name")

    composite_condition = or_(
        unknown,
        question_mark,
        john_or_jane_doe,
        no_name,
        none_friends,
        red_flag,
        digits_only,
        no_name_no_name,
    )

    delete_stmt = delete(pc).where(composite_condition)

    return conn.execute(delete_stmt)
198+
199+
132200def get_automatic_matches (conn ):
133201 pc1 = PdpContacts .__table__ .alias ()
134202 pc2 = PdpContacts .__table__ .alias ()
0 commit comments