-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.py
More file actions
99 lines (72 loc) · 2.86 KB
/
script.py
File metadata and controls
99 lines (72 loc) · 2.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import urllib.request
import os
import pandas as pd
import postgresql as psql
def dropbox_down(url, file_name):
u = urllib.request.urlopen(url)
data = u.read()
u.close()
with open(file_name, "wb") as f:
f.write(data)
def unzip(file_path):
os.system('gunzip ' + file_path)
title_url = "https://www.dropbox.com/s/3do9bu0awq048uh/title.basics.tsv.gz?dl=1"
name_url = "https://www.dropbox.com/s/xaidig3yw2viyym/name.basics.tsv.gz?dl=1"
dropbox_down(title_url, "title.basics.tsv.gz")
dropbox_down(name_url, "name.basics.tsv.gz")
unzip("title.basics.tsv.gz")
unzip("name.basics.tsv.gz")
def load_movie_data(file_name, row_limit):
df = pd.read_csv(file_name, sep='\t', header=0, nrows=row_limit)
df['endYear'] = df['endYear'].apply(
lambda x: x if isinstance(x, int) else 0)
df['runtimeMinutes'] = df['runtimeMinutes'].apply(
lambda x: x if isinstance(x, int) else 0)
df['originalTitle'] = df['originalTitle'].apply(
lambda x: str(x).replace('\'', ''))
df['primaryTitle'] = df['primaryTitle'].apply(
lambda x: str(x).replace('\'', ''))
return df
def load_actor_data(file_name, row_limit):
df = pd.read_csv(file_name, sep='\t', header=0, nrows=row_limit)
df['deathYear'] = df['deathYear'].apply(
lambda x: x if isinstance(x, int) else 0)
df['birthYear'] = df['birthYear'].apply(
lambda x: x if isinstance(x, int) else 0)
df['primaryName'] = df['primaryName'].apply(
lambda x: str(x).replace('\'', ''))
return df
def load_assoc_data(file_name, row_limit):
df = pd.read_csv(file_name, sep='\t', header=0, nrows=row_limit)
df = df[['nconst', 'knownForTitles']]
df = df.set_index('nconst') \
.knownForTitles.str.split(',', expand=True) \
.stack() \
.reset_index('nconst') \
.rename(columns={0: 'knownForTitles'}) \
.reset_index(drop=True)
return df
def insert_df(connection, df, table_name):
cursor = connection.cursor()
for i in range(len(df)):
row = str(tuple(df.iloc[i]))
try:
cursor.execute("INSERT INTO " + table_name + " VALUES {} ".format(row))
except:
connection.rollback()
def insert_df2(connection, df, table_name):
cursor = connection.cursor()
for i in range(len(df)):
row = str(tuple(df.iloc[i]))
try:
cursor.execute("INSERT INTO " + table_name + "(actor_id_id, movie_id_id) VALUES {} ".format(row))
connection.commit()
except:
connection.rollback()
movie_df = load_movie_data('title.basics.tsv', 10000)
actor_df = load_actor_data('name.basics.tsv', 10000)
assoc_df = load_assoc_data('name.basics.tsv', 9999999)
connection = psql.connect()
insert_df(connection, actor_df, '"IMDB_actor"')
insert_df(connection, movie_df, '"IMDB_movie"')
insert_df2(connection, assoc_df, '"IMDB_movieactorassoc"')