-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathproblem.py
More file actions
161 lines (114 loc) · 5.11 KB
/
Copy pathproblem.py
File metadata and controls
161 lines (114 loc) · 5.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os
import numpy as np
import pandas as pd
import rampwf as rw
from rampwf.workflows import FeatureExtractorRegressor
from rampwf.workflows import FeatureExtractorClassifier
from rampwf.score_types.base import BaseScoreType
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import f1_score
import warnings
# Ignore every FutureWarning raised after this module has been imported.
warnings.filterwarnings("ignore", category=FutureWarning)
# Title of the challenge.
problem_title = 'Parisian associations grants prediction challenge'
# Column of the CSV files that holds the target; _read_data separates it
# from the feature columns.
_target_column_name = 'montant vote'
# Labels for the binary classification task.
_prediction_label_names = [1, 0]
# We first need a classifier: a multiclass prediction type over the labels above.
Predictions_1 = rw.prediction_types.make_multiclass(label_names=_prediction_label_names)
# Then a regressor: a regression prediction type whose only label is the target column.
Predictions_2 = rw.prediction_types.make_regression(label_names=[_target_column_name])
# The combined Predictions is initialized from the list of individual Predictions.
Predictions = rw.prediction_types.make_combined([Predictions_1, Predictions_2])
class clfreg(object):
    """Two-stage workflow: a classifier followed by a regressor.

    The classifier is trained on the first target column.  The regressor is
    trained on the second target column, using only the training rows whose
    second-column value is strictly positive.  At prediction time the
    regressor output is kept only for the rows whose highest-scoring
    classifier column is not column 0; every other row gets ``-1``.

    Parameters
    ----------
    workflow_element_names : list of str, optional
        The first two names are handed to ``FeatureExtractorClassifier`` and
        the remaining ones to ``FeatureExtractorRegressor``.  Defaults to
        ``['feature_extractor_clf', 'classifier', 'feature_extractor_reg',
        'regressor']``.
    """

    def __init__(self, workflow_element_names=None):
        # ``None`` stands for the default names, so that no list object is
        # shared between instances through a mutable default argument.
        if workflow_element_names is None:
            workflow_element_names = [
                'feature_extractor_clf', 'classifier',
                'feature_extractor_reg', 'regressor']
        self.element_names = workflow_element_names
        self.feature_extractor_classifier_workflow = \
            FeatureExtractorClassifier(self.element_names[:2])
        self.feature_extractor_regressor_workflow = \
            FeatureExtractorRegressor(self.element_names[2:])

    def train_submission(self, module_path, X_df, y_array, train_is=None):
        """Train both stages of the submission found at ``module_path``.

        Parameters
        ----------
        module_path : passed through unchanged to both sub-workflows.
        X_df : pandas.DataFrame
            Feature rows.
        y_array : numpy.ndarray
            Two-column target: column 0 feeds the classifier, column 1 the
            regressor.
        train_is : positional indexer, optional
            Rows to train on; all rows when ``None``.

        Returns
        -------
        tuple
            ``(fe_clf, clf, fe_reg, reg)`` as returned by the sub-workflows.
        """
        if train_is is None:
            train_is = slice(None, None, None)
        # Avoid setting with copy warning
        X_train_df = X_df.iloc[train_is].copy()
        y_train_array = y_array[train_is].copy()
        y_train_clf = y_train_array[:, 0].copy()
        y_train_reg = y_train_array[:, 1].copy()

        # Positions (not index labels) of the rows with a positive amount.
        idx = np.where(y_train_reg > 0)[0]
        y_train_reg = y_train_reg[idx]

        fe_clf, clf = self.feature_extractor_classifier_workflow.\
            train_submission(module_path, X_train_df, y_train_clf)
        # ``idx`` is positional, so the rows must be taken with ``iloc``:
        # after the ``iloc[train_is]`` selection above, the index labels of
        # X_train_df are no longer guaranteed to be 0..n-1.
        fe_reg, reg = self.feature_extractor_regressor_workflow.\
            train_submission(module_path, X_train_df.iloc[idx], y_train_reg)
        return fe_clf, clf, fe_reg, reg

    def test_submission(self, trained_model, X_df):
        """Predict with both stages and stack the results column-wise.

        Parameters
        ----------
        trained_model : tuple
            ``(fe_clf, clf, fe_reg, reg)`` as returned by ``train_submission``.
        X_df : pandas.DataFrame
            Feature rows to predict on.

        Returns
        -------
        numpy.ndarray
            The classifier output columns followed by one regression column.
            The regression column holds ``-1`` for every row whose
            highest-scoring classifier column is column 0.
        """
        fe_clf, clf, fe_reg, reg = trained_model
        y_pred_clf = self.feature_extractor_classifier_workflow.\
            test_submission((fe_clf, clf), X_df)
        # Avoid setting with copy warning
        X_df = X_df.copy()

        labels = np.argmax(y_pred_clf, axis=1)
        # Keep only the rows not assigned to column 0.
        # NOTE(review): presumably column 0 is the "no grant" class -
        # confirm against the classifier's class order.
        pred_idx = np.where(labels != 0)[0]

        # Float fill value: an integer array here would silently truncate
        # the regression predictions written into it below.
        y_pred_reg = np.full((y_pred_clf.shape[0],), -1.0)
        y_temp = self.feature_extractor_regressor_workflow.\
            test_submission((fe_reg, reg), X_df)
        y_pred_reg[pred_idx] = y_temp[pred_idx]
        return np.concatenate([y_pred_clf, y_pred_reg.reshape(-1, 1)], axis=1)
# Module-level workflow instance, built with the default element names.
workflow = clfreg()
class F1_score(BaseScoreType):
    """Classification score reported as ``1 - F1``, so lower is better."""

    is_lower_the_better = True
    minimum = 0.0
    maximum = 1.0

    def __init__(self, name='f1', precision=4):
        self.precision = precision
        self.name = name

    def __call__(self, y_true, y_pred):
        """Return ``1 - f1_score`` between column 0 of ``y_true`` and the
        per-row argmax of ``y_pred``."""
        predicted = np.argmax(y_pred, axis=1)
        observed = y_true[:, 0]
        f1 = f1_score(observed, predicted)
        return 1 - f1
class log_score(BaseScoreType):
    """Mean absolute difference between floored log10 values.

    Only the entries where ``y_pred`` is greater than ``-1`` are scored.
    Before comparison, values are raised to at least 1, passed through
    ``log10``, and the result is raised to at least 3.
    """

    is_lower_the_better = True
    minimum = 0.0
    maximum = float('inf')

    def __init__(self, name='fan error', precision=2):
        self.precision = precision
        self.name = name

    def __call__(self, y_true, y_pred):
        """Return the mean of ``|f(y_true) - f(y_pred)|`` over the scored
        entries, where ``f`` is the floored log10 described on the class."""

        def floored_log10(amounts):
            # log10 of max(amount, 1), itself floored at 3.
            return np.maximum(3., np.log10(np.maximum(1., amounts)))

        # Row positions where a prediction greater than -1 is present.
        kept = np.where(y_pred > -1)[0]
        if isinstance(y_true, pd.Series):
            y_true = y_true.values
        gap = floored_log10(y_true[kept]) - floored_log10(y_pred[kept])
        return np.mean(np.abs(gap))
# One instance of each score; the same two objects are reused in every
# entry of score_types below.
score_clf = F1_score()
score_reg = log_score()
score_types = [
# Combination of both scores with weights 0.6 and 0.4, listed in the same
# order as score_types (classification score first, regression score second).
rw.score_types.Combined(
name='combined', score_types=[score_clf, score_reg],
weights=[0.6, 0.4], precision=2),
# Each individual score on its own.
# NOTE(review): presumably index selects the matching entry of the combined
# Predictions (0 = classifier, 1 = regressor) - confirm against rampwf.
rw.score_types.MakeCombined(score_type=score_clf, index=0),
rw.score_types.MakeCombined(score_type=score_reg, index=1),
]
def get_cv(X, y):
    """Return the train/test index splits used for cross-validation.

    Builds 8 shuffled splits with 20% of the groups held out for testing
    and a fixed seed, grouping the rows by the ``numDoc`` column of ``X``.
    """
    splitter = GroupShuffleSplit(n_splits=8, test_size=0.20, random_state=42)
    groups = X['numDoc']
    return splitter.split(X, y, groups=groups)
def _read_data(path, f_name):
    """Read ``<path>/data/<f_name>`` and split it into features and targets.

    The file is read as a zip-compressed CSV.

    Returns
    -------
    X_df : pandas.DataFrame
        Every column except the target column.
    y_array : numpy.ndarray
        Two columns: the target with every strictly positive value replaced
        by 1 (other values left unchanged), then the raw target.
    """
    csv_path = os.path.join(path, 'data', f_name)
    data = pd.read_csv(csv_path, low_memory=False, compression='zip')

    amounts = data[_target_column_name].values
    X_df = data.drop(_target_column_name, axis=1)

    # Classification column: a copy of the amounts with positives set to 1.
    granted = amounts.copy()
    granted[granted > 0] = 1

    clf_column = granted.reshape(-1, 1)
    reg_column = amounts.copy().reshape(-1, 1)
    y_array = np.concatenate([clf_column, reg_column], axis=1)
    return X_df, y_array
def get_train_data(path='.'):
    """Load the training file from ``<path>/data`` via ``_read_data``."""
    return _read_data(path, 'subventions-accordees-et-refusees_TRAIN.csv')
def get_test_data(path='.'):
    """Load the test file from ``<path>/data`` via ``_read_data``."""
    return _read_data(path, 'subventions-accordees-et-refusees_TEST.csv')