-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathonline_forest_data.py
More file actions
99 lines (78 loc) · 3.08 KB
/
online_forest_data.py
File metadata and controls
99 lines (78 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import pandas as pd
import pickle as pkl
from tick.online import OnlineForestRegressor, OnlineForestClassifier
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, \
RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
# TODO: options for the type of feature sampling
# TODO: online construction of the feature_importances
# TODO: Python script that tries all combinations
# TODO: what happens if we feed the same dataset several times?
# TODO: show that the classifier is insensitive to the arrival order of the points
# TODO: V-fold cross-validation instead of a train/test split?
# TODO: set feature importances with a default of None
# TODO: implement a subsampling strategy: only one tree is updated with a given sample
# TODO: tree aggregation
# TODO: different "types" of trees: no aggregation, aggregation, and different temperatures
# TODO: unit tests for attributes
# TODO: unit tests for wrong n_features in fit and predict, and wrong labels in training
# TODO: try out multiple passes
# TODO: really make seed work with inline forest
# Directory holding the pickled datasets. The historical default is kept for
# backward compatibility; set the ONLINE_FOREST_DATA_PATH environment variable
# to run the benchmark on another machine without editing this file.
path = os.environ.get(
    'ONLINE_FOREST_DATA_PATH',
    '/Users/stephane.gaiffas/Dropbox/jaouad/online-forests/datasets/',
)

# Pickled dataset files, looked up inside `path`.
filenames = [
    'dna.p',
    'letter.p',
    'satimage.p',
    'usps.p',
]

# Value passed as `n_classes` to the online forests, paired by position with
# `filenames`.
# NOTE(review): confirm these are class counts and not maximum label indices.
n_classess = [3, 25, 5, 9]

# Number of trees / estimators used by every forest in the benchmark.
n_trees = 10

# Display labels, paired by position with the classifiers that the benchmark
# loop builds for each dataset.
names = [
    "OF (agg, step=1.)",
    "OF(agg, step=100.)",
    "OF(no agg.)",
    "KNN (k=5)",
    "ET",
    "BRF",
]
# Benchmark loop: for every dataset, fit each classifier on the training split
# and print its score on the test split.
for filename, n_classes in zip(filenames, n_classess):
    print(filename)

    # SECURITY: unpickling can execute arbitrary code. Only point `path` at
    # dataset files that come from a trusted source.
    with open(os.path.join(path, filename), 'rb') as f:
        data = pkl.load(f)

    X_train = data['x_train']
    X_test = data['x_test']
    y_train = data['y_train']
    y_test = data['y_test']

    # Fresh estimators for each dataset. The order must match `names`, because
    # the two lists are zipped together below.
    classifiers = [
        OnlineForestClassifier(n_trees=n_trees, seed=123, step=1.,
                               use_aggregation=True, n_classes=n_classes),
        OnlineForestClassifier(n_trees=n_trees, seed=123, step=100.,
                               n_classes=n_classes, use_aggregation=True),
        OnlineForestClassifier(n_trees=n_trees, seed=123, step=1.,
                               use_aggregation=False, n_classes=n_classes),
        KNeighborsClassifier(n_neighbors=5),
        ExtraTreesClassifier(n_estimators=n_trees),
        RandomForestClassifier(n_estimators=n_trees)
    ]

    # "triche" is French for "cheat": a batch random forest is fitted on the
    # training split only to obtain feature importances, which are then handed
    # to the classifiers that accept them.
    triche = RandomForestClassifier(n_estimators=n_trees)
    triche.fit(X_train, y_train)
    feature_importances = triche.feature_importances_ / triche.feature_importances_.sum()

    # Disabled plotting of the feature importances, kept for reference:
    #
    # plt.stem(probabilities)
    # plt.title('Features importance for ' + filename, fontsize=18)
    # plt.xlabel('Features')
    # plt.ylabel('Importance')
    # # plt.show()
    # plt.savefig(filename + '.pdf')
    # online_forest.set_probabilities(probabilities)
    # forest1 =

    for clf, name in zip(classifiers, names):
        # scikit-learn estimators provide neither clear() nor
        # set_feature_importances(), so each call is guarded individually;
        # an unguarded call would raise AttributeError on those estimators.
        if hasattr(clf, 'clear'):
            clf.clear()
        if hasattr(clf, 'set_feature_importances'):
            clf.set_feature_importances(feature_importances)
        # print('Fitting', name)
        clf.fit(X_train, y_train)
        # print('Done.')
        print('Accuracy of', name, ': ', '%.2f' % clf.score(X_test, y_test))