authorProfPAN16/cross.py at master · kbogas/authorProfPAN16 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/python

import time
from argparse import ArgumentParser
from pan import ProfilingDataset, createDocProfiles, create_target_prof_trainset
from sklearn.cross_validation import train_test_split
from tictacs import from_recipe
from json import dumps
from sklearn.grid_search import GridSearchCV
import logging
logging.basicConfig(filename='example.log',level=logging.DEBUG)

log = []


def cross_val(dataset, task, model, num_folds=4):
    """ train and cross validate a model

    :lang: the language
    :task: the task we want to classify for , ex: age

    """

    # if (task != "age") and (task !="gender"):
    #    X, y = dataset.get_data(task)
    # else:
    #    docs = createDocProfiles(dataset)
    #    X, y = create_target_prof_trainset(docs, task)
    #docs = createDocProfiles(dataset)
    #X, y = create_target_prof_trainset(docs, task)
    #X, _, y, _ = train_test_split(X, y, train_size=100000, stratify=y, random_state=100)
    #X, y = X[:20000], y[:20000]
    X, y = dataset.get_data(task)
    # y = [yy.lower() for yy in y]
    # get parameters for grid search if it exists - else pass empty dict
    params = model.grid_params if hasattr(model, 'grid_params') else dict()
    print params
    print model
    # from collections import Counter
    # import pprint
    # pprint.pprint(Counter(y))
    print '\nCreating model for %s - %s' % (dataset.lang, task)
    print 'Trainining instances: %s\n' % (len(X))
    print 'Using %s fold validation' % (num_folds)
    # get data
    log.append('\nResults for %s - %s with classifier %s' %
               (dataset.lang, task, model.__class__.__name__))
    if task in dataset.config.classifier_list:
        grid_cv = GridSearchCV(model, params, cv=num_folds, verbose=1,
                               n_jobs=-1, refit=False)
        grid_cv.fit(X, y)
        # y_pred = grid_cv.best_estimator_.predict(X)
        # pprint.pprint(y_pred)
        # pprint.pprint(y)
        # conf = confusion_matrix(y, y_pred, labels=list(set(y)))
        accuracy = grid_cv.best_score_
        # accuracy2 = accuracy_score(y, y_pred)
        log.append('best params: %s' % grid_cv.best_params_)
        log.append('Accuracy mean : %s' % accuracy)
        import pprint
        pprint.pprint(grid_cv.grid_scores_)
        with open('./comb_res/res.txt', 'a') as out:
            out.write('Results: %s - %s, params: %s ,Accuracy_Mean: %s\n' %
                      (dataset.lang, task,
                       dumps(grid_cv.best_params_), grid_cv.best_score_))
        # log.append('Best accuracy: {} '.format(accuracy2))
        # log.append('Best Confusion matrix :\n {}'.format(conf))
    else:
        # if it's not, we measure mean square root error (regression)
        raise KeyError('task %s was not found in task list!' % task)

if __name__ == '__main__':
    parser = ArgumentParser(description='Train a model with crossvalidation'
                            ' on pan dataset - used for testing purposes ')
    parser.add_argument('-i', '--input', type=str,
                        required=True, dest='infolder',
                        help='path to folder with pan dataset for a language')
    parser.add_argument('-n', '--numfolds', type=int,
                        dest='num_folds', default=4,
                        help='Number of folds to use in cross validation')

    args = parser.parse_args()
    infolder = args.infolder
    num_folds = args.num_folds
    time_start = time.time()
    print('Loading dataset...')
    dataset = ProfilingDataset(infolder)
    print('Loaded %s users...\n' % len(dataset.entries))
    config = dataset.config
    #print config
    #print type(config)
    #exit(1)
    tasks = config.tasks
    print('\n--------------- Thy time of Running ---------------')
    for task in ['gender']:
        tictac = from_recipe(config.recipes[task])
        outline = ""
        for step in tictac.steps:
            if step[0] == "features":
                # print type(step[1])
                    for tf in step[1].transformer_list:
                        # print type(tf[1])
                        # print type(tf[1].get_params())
                        outline += tf[0] + " with Params:[" + str(tf[1].get_params()) + "]+"
            else:
                # if hasattr(step[1], 'get_params'):
                    # outline += step[0] + " with Params:[" + str(step[1].get_params()) + "]+"
                # else:
                    # outline += step[0]+ "+"
                outline += step[0] + "+"
        outline = outline[:-1] + "\n"
        print('Task:{}, Pipeline:{}'.format(task, outline))
        with open('./comb_res/res.txt', 'a') as out:
            out.write('Task:{}, Pipeline:{}'.format(task, outline))
        cross_val(dataset, task, tictac, num_folds)
    # print results at end
    print('\n--------------- Thy time of Judgement ---------------')
    print ('Time: {} seconds.\n'.format(str(time.time()-time_start)))
    with open('./comb_res/res.txt', 'a') as out:
            out.write('Time: {} seconds.\n'.format(str(time.time()-time_start)))
    for message in log:
        print(message)