-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathpreparing_data.py
More file actions
126 lines (97 loc) · 4.22 KB
/
preparing_data.py
File metadata and controls
126 lines (97 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
""" Prepare the simulation data for Machine Learning procedures
"""
import json
import operator
import os
import numpy as np
import pandas as pd
# Labels applied, in this order, to the unnamed columns of each simulation's
# temp_stats.csv output once the files have been read.
cols_names = [
    'months',
    'price_index',
    'gdp_index',
    'gdp_growth',
    'unemployment',
    'average_workers',
    'families_wealth',
    'families_savings',
    'firms_wealth',
    'firms_profit',
    'gini_index',
    'average_utility',
    'inflation',
    'average_qli',
]
def read_json(p):
    """Load and return the parsed content of the JSON configuration file at `p`.

    The file is opened in a context manager so the handle is closed as soon
    as parsing finishes, instead of being left to the garbage collector.
    """
    with open(p) as conf_file:
        return json.load(conf_file)
def json_to_dict(df):
    """Turn a parsed configuration dict into a DataFrame of simulation parameters.

    `df` is the dict loaded from a conf JSON file. Each top-level key becomes
    a row; the 'RUN' row is discarded and every column left with a missing
    value is dropped. Columns that are not used as features are then removed
    and 'PROCESSING_ACPS' is reduced to its first element.

    Raises KeyError when 'RUN', one of the three mandatory columns, or
    'PROCESSING_ACPS' is absent, exactly as before.
    """
    t = pd.DataFrame.from_dict(df, orient='index').drop(labels='RUN', axis=0).dropna(axis=1)
    # These three columns are expected in every configuration.
    t = t.drop(['LIST_NEW_AGE_GROUPS', 'TAXES_STRUCTURE', 'SIMPLIFY_POP_EVOLUTION'], axis=1)
    # Optional columns: drop whichever ones exist. errors='ignore' skips only
    # the missing labels, so a present column is still removed when its
    # sibling is absent and no unrelated exception is swallowed.
    # 'HOUSE_VANCANCY' is a misspelled key found in some configuration files.
    optional = ['PROCESSING_STATES', 'HIRING_SAMPLE_SIZE', 'HOUSE_VANCANCY']
    t = t.drop(columns=optional, errors='ignore')
    # The value is a sequence; keep its first element only.
    t['PROCESSING_ACPS'] = t['PROCESSING_ACPS'].apply(lambda x: x[0])
    return t
def read_conf_files(general_path):
    """Return the paths of all files under `general_path` whose name starts with 'conf'.

    The directory tree is walked recursively; each match is returned as the
    directory path joined with the file name.
    """
    matches = []
    for root, _subdirs, names in os.walk(general_path):
        for name in names:
            if name.startswith('conf'):
                matches.append(os.path.join(root, name))
    return matches
def process_each_file(files_list, cols, y=None, x=None):
    """Collect parameters (x) and monthly results (y) of every complete simulation.

    For each configuration file in `files_list`, the results are read from
    'avg/temp_stats.csv' in the same directory. Only simulations whose
    results file has exactly 240 rows are kept; for those, the results are
    added to `y` and the parameters parsed from the configuration file are
    added to `x`.

    Parameters:
        files_list: paths of the configuration files.
        cols: names assigned to the columns of the results frame.
        y, x: optional frames of previously collected results / parameters
            to extend. None (the default) or an empty frame starts afresh.

    Returns:
        (x, y). When no simulation qualifies, x is an empty DataFrame and y
        is an empty DataFrame that already carries `cols` as its columns.
    """
    y_parts = [] if y is None or y.empty else [y]
    x_parts = [] if x is None or x.empty else [x]
    for each in files_list:
        # Results live next to the configuration file; os.path keeps the
        # path valid on any platform and for any configuration file name.
        stats_path = os.path.join(os.path.dirname(each), 'avg', 'temp_stats.csv')
        y_test = pd.read_csv(stats_path, sep=';', header=None)
        # Keep only simulations that produced all 240 monthly rows.
        if len(y_test) == 240:
            y_parts.append(y_test)
            x_parts.append(json_to_dict(read_json(each)))
    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copied the accumulated frame on every iteration.
    x = pd.concat(x_parts) if x_parts else pd.DataFrame()
    if y_parts:
        y = pd.concat(y_parts)
        # Provides names for the columns of results of simulation
        y.columns = cols
    else:
        y = pd.DataFrame(columns=cols)
    return x, y
def last_month(df, month=239):
    """Return the rows of `df` whose 'months' value equals `month`, without that column.

    Parameters:
        df: results frame containing a 'months' column.
        month: month index to keep. Defaults to 239, the value this
            function previously had hard-coded.
    """
    return df[df['months'] == month].drop('months', axis=1)
def selecting_y(df, col):
    """Return the part of `df` selected by `col` (plain `df[col]` indexing)."""
    selected = df[col]
    return selected
def customizing_target(base, percentile=65, op=operator.gt):
    """Discretize `base` into a 0/1 'target' column against one of its percentiles.

    Parameters:
        base: iterable of numeric values.
        percentile: percentile of `base` used as the threshold.
        op: two-argument comparison called as op(value, threshold), e.g.
            operator.gt or operator.lt.

    Returns:
        DataFrame with a single 'target' column holding 1 where the
        comparison is true and 0 otherwise. An empty `base` yields an empty
        'target' column.
    """
    values = list(base)
    if not values:
        # No threshold can be computed from no data; keep the empty result.
        return pd.DataFrame({'target': []})
    # The threshold is the same for every element: compute it once rather
    # than once per element.
    threshold = np.percentile(values, percentile)
    return pd.DataFrame({'target': [1 if op(v, threshold) else 0 for v in values]})
def averaging_targets(df1, df2):
    """Combine the 'target' columns of `df1` and `df2` into a single 'target' column.

    The result is 1 at the positions where both inputs are 1 and 0
    everywhere else. Values are paired positionally, stopping at the shorter
    of the two columns.
    """
    combined = []
    for left, right in zip(df1['target'], df2['target']):
        if left == 1 and right == 1:
            combined.append(1)
        else:
            combined.append(0)
    return pd.DataFrame({'target': combined})
def dummies(data):
    """Return `data` with its object-dtype columns replaced by dummy indicators.

    Columns are split by dtype: object columns are treated as categorical,
    every other column as numeric. 'PROCESSING_STATES' is removed from the
    categorical part and 'HIRING_SAMPLE_SIZE' from the numeric part when
    present. The categorical part is passed through pd.get_dummies and the
    result is the numeric columns followed by the indicator columns.
    """
    cat, num = [], []
    for name in data.columns:
        if data[name].dtype == object:
            cat.append(name)
        else:
            num.append(name)
    # errors='ignore' skips only a missing label, instead of a bare except
    # that would hide any failure.
    cat = data[cat].drop(columns=['PROCESSING_STATES'], errors='ignore')
    num = data[num].drop(columns=['HIRING_SAMPLE_SIZE'], errors='ignore')
    # Encode only when there is something to encode, so purely numeric input
    # passes through unchanged.
    if cat.shape[1] > 0:
        cat = pd.get_dummies(cat)
    return pd.concat([num, cat], axis=1)
def main(pathway, selected_col1, selected_col2):
    """Build and save the feature matrix and binary target for one directory.

    Parameters:
        pathway: directory searched recursively for simulation configuration
            files. Its last four characters prefix the output file names.
        selected_col1: result column flagged 1 when above its 65th
            percentile (the defaults of customizing_target).
        selected_col2: result column flagged 1 when below its 35th
            percentile.

    The final target is 1 only where both flags are 1. Features and target
    are written to 'pre_processed_data' as ';'-separated files ending in
    '_x.csv' and '_y.csv'.

    Returns:
        (data_x, data_y): the dummy-encoded parameters and the target frame.
    """
    file_list = read_conf_files(pathway)
    data_x, data_y = process_each_file(file_list, cols_names)

    # Getting last months' data
    data_y = last_month(data_y)

    # One binary flag per selected column, then both combined into one target
    first_col = customizing_target(selecting_y(data_y, selected_col1))
    second_col = customizing_target(selecting_y(data_y, selected_col2), 35, operator.lt)
    data_y = averaging_targets(first_col, second_col)

    data_x = dummies(data_x)

    # Build the output paths with os.path so the separator is right on every
    # platform, and make sure the output directory exists before writing.
    out_dir = 'pre_processed_data'
    os.makedirs(out_dir, exist_ok=True)
    prefix = os.path.join(out_dir, pathway[-4:] + '_' + selected_col1 + '_' + selected_col2)
    data_x.to_csv(prefix + '_x.csv', index=False, sep=';')
    data_y.to_csv(prefix + '_y.csv', index=False, sep=';')
    return data_x, data_y
if __name__ == "__main__":
    # Result columns combined into the binary target
    target1 = 'average_qli'
    target2 = 'unemployment'
    # Directory holding the simulation outputs to process
    path = r'\\storage1\carga\MODELO DINAMICO DE SIMULACAO\Exits_python\JULY'
    x, y = main(pathway=path, selected_col1=target1, selected_col2=target2)