-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathread_logs.py
More file actions
99 lines (91 loc) · 3.21 KB
/
read_logs.py
File metadata and controls
99 lines (91 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import gzip
import ipaddress
import re
import socket

import pandas as pd
from IPython.core.display import clear_output
# Domains Google documents for Googlebot reverse-DNS names. The leading dot
# forces a real sub-domain match, so "evilgooglebot.com" is rejected.
_GOOGLE_CRAWLER_DOMAINS = (".googlebot.com", ".google.com")


def _is_verified_googlebot(ip):
    """Return True if *ip* passes the reverse + forward DNS check for Googlebot."""
    try:
        address = ipaddress.ip_address(str(ip).strip())
        hostname = socket.gethostbyaddr(str(address))[0].rstrip(".").lower()
        if not hostname.endswith(_GOOGLE_CRAWLER_DOMAINS):
            return False
        # Forward-confirm: a PTR record is controlled by whoever owns the IP,
        # so the claimed hostname must resolve back to the very same address.
        forward_records = socket.getaddrinfo(hostname, None)
    except (OSError, ValueError, TypeError):
        # Unparseable address or failed DNS lookup: cannot be verified.
        return False
    for record in forward_records:
        try:
            if ipaddress.ip_address(record[4][0]) == address:
                return True
        except ValueError:
            # e.g. a scoped IPv6 literal this Python version cannot parse.
            continue
    return False


def check_googlebot(ips):
    """Classify each IP as a genuine Googlebot address or not.

    Every address is checked with a reverse DNS lookup (the host name must
    belong to ``googlebot.com`` or ``google.com``) followed by a forward
    lookup of that host name, which must resolve back to the same address.
    Each distinct IP is resolved only once; repeated occurrences reuse the
    earlier answer.

    Args:
        ips: Sized iterable of IP address strings (a list or a pandas
            Series). Values are iterated in order, so the index labels of a
            Series do not matter.

    Returns:
        A list with one entry per input item, in input order: ``"verified"``
        when the address passes both DNS checks, ``"fake"`` otherwise
        (including when the lookup fails or the value is not an IP address).
    """
    verified = []
    verdict_by_ip = {}
    last_position = len(ips) - 1
    for position, ip in enumerate(ips):
        key = str(ip)
        if key not in verdict_by_ip:
            verdict_by_ip[key] = (
                "verified" if _is_verified_googlebot(key) else "fake"
            )
        verified.append(verdict_by_ip[key])
        # Single-line progress indicator for notebook use.
        print("Verifying " + str(position) + " of " + str(last_position))
        clear_output(wait=True)
    return verified
# Columns of the frame returned by read_logs, in output order.
_LOG_COLUMNS = [
    "IP",
    "date",
    "hour",
    "diff",
    "method",
    "url",
    "status_code",
    "download",
    "referrer",
    "user_agent",
]

# One access-log record in combined log format:
#   IP ident user [date:time zone] "request" status bytes "referrer" "user agent"
# Quoted fields may contain backslash-escaped quotes.
_LOG_LINE_RE = re.compile(
    r'(?P<ip>\S+)\s[^\[]*'
    r'\[(?P<date>[^:\]]+):(?P<hour>[^\s\]]+)\s+(?P<diff>[+-]\d{4})\]\s+'
    r'"(?P<request>(?:[^"\\]|\\.)*)"\s+'
    r'(?P<status_code>\S+)\s+(?P<download>\S+)\s+'
    r'"(?P<referrer>(?:[^"\\]|\\.)*)"\s+'
    r'"(?P<user_agent>(?:[^"\\]|\\.)*)"'
)


def _open_log(file, type_file):
    """Open *file* for text reading according to *type_file* ("gzip" or "log")."""
    if type_file == "gzip":
        return gzip.open(file, "rt", encoding="utf-8", errors="replace")
    if type_file == "log":
        return open(file, "r", encoding="utf-8", errors="replace")
    raise ValueError('type_file must be "gzip" or "log", got ' + repr(type_file))


def _parse_log_line(line):
    """Split one log line into a tuple ordered like _LOG_COLUMNS, or None if it does not match."""
    match = _LOG_LINE_RE.match(line)
    if match is None:
        return None
    request_parts = match.group("request").split()
    # A request such as "-" carries no method; keep None for that case.
    has_method = bool(request_parts) and request_parts[0].isalpha()
    method = request_parts[0] if has_method else None
    url = request_parts[1] if len(request_parts) > 1 else ""
    return (
        match.group("ip"),
        match.group("date"),
        match.group("hour"),
        match.group("diff"),
        method,
        url,
        match.group("status_code"),
        match.group("download"),
        match.group("referrer"),
        match.group("user_agent"),
    )


def read_logs(file, type_file, *, verify=True):
    """Read an access log and return the requests made with a Googlebot user agent.

    The file is expected to be in combined log format. It is read line by
    line as text; lines that do not match the format are skipped. Undecodable
    bytes are replaced rather than aborting the read.

    Args:
        file: Path of the log file.
        type_file: ``"gzip"`` for a gzip-compressed log, ``"log"`` for a
            plain-text one.
        verify: When true (the default), add a ``"verified"`` column produced
            by ``check_googlebot`` (DNS lookups, can be slow). Pass ``False``
            to skip verification; the column is then absent.

    Returns:
        A pandas DataFrame indexed 0..n-1 with the string columns ``IP``,
        ``date``, ``hour``, ``diff`` (timezone offset), ``method``, ``url``,
        ``status_code``, ``download`` (response size), ``referrer`` and
        ``user_agent``, restricted to rows whose user agent contains
        ``"Googlebot"``, plus ``verified`` when *verify* is true.

    Raises:
        ValueError: If *type_file* is neither ``"gzip"`` nor ``"log"``.
        OSError: If the file cannot be opened or read.
    """
    with _open_log(file, type_file) as handle:
        rows = [row for row in map(_parse_log_line, handle) if row is not None]

    frame = pd.DataFrame(rows, columns=_LOG_COLUMNS)
    is_googlebot = frame["user_agent"].str.contains(
        "Googlebot", regex=False, na=False
    )
    # Boolean selection + reset_index yields a fresh frame, so adding a
    # column below does not write into a view of `frame`.
    final_log = frame[is_googlebot].reset_index(drop=True)
    if verify:
        final_log["verified"] = check_googlebot(final_log["IP"])
    return final_log
# Parse the access log and keep the Googlebot requests.
logs = read_logs(
    file=r'PATH-TO-FILE',    # full path to the log, e.g. C:\dir\file.extension
    type_file="EXTENSION",   # "gzip" | "log"
)

# Export the result as a tab-separated file.
logs.to_csv(path_or_buf="logs.csv", sep="\t", encoding="utf-8")