Skip to content

Commit b858c03

Browse files
committed
added a small url check
1 parent 122c8a3 commit b858c03

3 files changed

Lines changed: 33 additions & 13 deletions

File tree

build/lib/urlcounter/urlcounter.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,8 @@ def regex_lister(the_list, key):
6060
Arguments:
6161
-df: DataFrame. Array of Strings to write as a regex String.
6262
-columns: a List of 4 column names to use from corpus:
63-
1. Name of URL column that includes a list of URLs included in post/content.
64-
2. Integer. Number of times a post was shared, such as Retweets on Twitter.
63+
0. Name of URL column that includes a list of URLs included in post/content.
64+
1. Integer. Number of times a post was shared, such as Retweets on Twitter.
6565
Returns:
6666
-A List that includes:
6767
- sorted_totals: List of Tuples that contain 2 items:
@@ -83,10 +83,19 @@ def url_counter(df, columns):
8383
# Go through list of urls
8484
for url in lu:
8585
if url != None:
86-
# Based on shared count, append that amount
87-
ranger = int(h[columns[1]])
88-
for g in range(0, ranger):
89-
cleaned_listed_data.append(url)
86+
# Check for malformed URLs
87+
# TODO: Could use some more checks here
88+
if (url[0] == '/'):
89+
print('=============')
90+
print('Cleaning needed for', h[columns[3]]+'\'s', h[columns[0]])
91+
print('Problem URL:', url)
92+
print('=============')
93+
continue
94+
else:
95+
# Based on shared count, append that amount
96+
ranger = int(h[columns[1]])
97+
for g in range(0, ranger):
98+
cleaned_listed_data.append(url)
9099

91100
# Count up domains
92101
domain_re = r"://[^.]{1,}\.[^\/]{1,}\/"
@@ -98,6 +107,7 @@ def url_counter(df, columns):
98107
if len(domain_match) == 0:
99108
s_match = [(s.start(0), s.end(0)) for s in re.finditer(simple_re, domain)]
100109
broken_check = [(s.start(0), s.end(0)) for s in re.finditer(broken_urls, domain)]
110+
101111
if len(broken_check) > 0:
102112
continue
103113
else:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
setup(
66
name = 'urlcounter',
77
packages = ['urlcounter'],
8-
version = '0.0.2',
8+
version = '0.0.3',
99
description = 'A set of functions that tally URLs within an event-based corpus. It assumes that you have data divided into a range of event-based periods with community-detected modules/hubs. It also assumes that you have unspooled and cleaned your URL data. See Deen Freelon\'s unspooler module for help: https://github.com/dfreelon/unspooler.',
1010
author = 'Chris A. Lindgren',
1111
author_email = 'chris.a.lindgren@gmail.com',

urlcounter/urlcounter.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,8 @@ def regex_lister(the_list, key):
6060
Arguments:
6161
-df: DataFrame. Array of Strings to write as a regex String.
6262
-columns: a List of 4 column names to use from corpus:
63-
1. Name of URL column that includes a list of URLs included in post/content.
64-
2. Integer. Number of times a post was shared, such as Retweets on Twitter.
63+
0. Name of URL column that includes a list of URLs included in post/content.
64+
1. Integer. Number of times a post was shared, such as Retweets on Twitter.
6565
Returns:
6666
-A List that includes:
6767
- sorted_totals: List of Tuples that contain 2 items:
@@ -83,10 +83,19 @@ def url_counter(df, columns):
8383
# Go through list of urls
8484
for url in lu:
8585
if url != None:
86-
# Based on shared count, append that amount
87-
ranger = int(h[columns[1]])
88-
for g in range(0, ranger):
89-
cleaned_listed_data.append(url)
86+
# Check for malformed URLs
87+
# TODO: Could use some more checks here
88+
if (url[0] == '/'):
89+
print('=============')
90+
print('Cleaning needed for', h[columns[3]]+'\'s', h[columns[0]])
91+
print('Problem URL:', url)
92+
print('=============')
93+
continue
94+
else:
95+
# Based on shared count, append that amount
96+
ranger = int(h[columns[1]])
97+
for g in range(0, ranger):
98+
cleaned_listed_data.append(url)
9099

91100
# Count up domains
92101
domain_re = r"://[^.]{1,}\.[^\/]{1,}\/"
@@ -98,6 +107,7 @@ def url_counter(df, columns):
98107
if len(domain_match) == 0:
99108
s_match = [(s.start(0), s.end(0)) for s in re.finditer(simple_re, domain)]
100109
broken_check = [(s.start(0), s.end(0)) for s in re.finditer(broken_urls, domain)]
110+
101111
if len(broken_check) > 0:
102112
continue
103113
else:

0 commit comments

Comments (0)