This repository was archived by the owner on Apr 9, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: grasp_run.py
More file actions
350 lines (280 loc) · 16.4 KB
/
grasp_run.py
File metadata and controls
350 lines (280 loc) · 16.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
# Builtin
import sys
import pickle
import copy
from os import mkdir
from os.path import isdir, exists
# 3th party
import numpy as np
import scipy.signal
from sklearn.metrics import roc_auc_score
# Local
paths = [r".\classifiers/"]
for path in paths:
sys.path.insert(0, path)
from grasp_util import get_filenames
from grasp_util import load_seeg
from grasp_util import load_pickle
from grasp_util import load_location_files
from grasp_util import is_file_to_skip
from grasp_util import save_dict
from grasp_util import remove_data
from grasp_util import clean_data
from grasp_util import apply_reference
from grasp_util import apply_filters
from grasp_util import get_power_spectrum
from grasp_util import get_fractal_component
from grasp_util import generate_complex_morlet_wavelet
from grasp_util import create_windows
from grasp_util import combine_features_with_channels
from grasp_util import split_per_trial
from grasp_util import trial_per_window
from grasp_util import perform_stat_test
from grasp_util import get_train_test
from grasp_util import combine_data_to_x_y
from grasp_plot import pretty_print
from grasp_plot import plot_all
# from grasp_plot import plot_average_trial
from grasp_plot import plot_bargrid
from grasp_plot import plot_bargrid_averaged_score, plot_bargrid_score_per_metric, plot_bargrid_score_per_ppt
# Classifiers
from lda import LDA
from rfc import RFC
def load_raw_data(filename):
    """Load a raw sEEG recording from *filename*.

    Returns the loaded data structure, or None when loading fails
    (the failure is printed, not raised, so the caller can skip the file).
    """
    try:
        return load_seeg(filename)
    except Exception as e:
        print("Failed to load: {}, due to exception: {}".format(str(filename), e))
        return None
def preprocess_data(data, frequency_bands=None, windows=None,
                    reference=None, power_spectrum=False,
                    savepath=None, save_incl_raw=False):
    """Clean, reference, filter and window a loaded recording.

    Parameters
    ----------
    data : dict
        Recording as produced by the loaders (mutated/replaced per step).
    frequency_bands : dict | None
        Mapping band name -> [low, high] Hz; filtering is skipped when empty.
        (Default changed from a shared mutable `{}` to None — bug fix.)
    windows : list | None
        [window size, frameshift]; windowing is skipped when empty.
        (Default changed from a shared mutable `[]` to None — bug fix.)
    reference : optional
        Passed to apply_reference when not None.
    power_spectrum, savepath, save_incl_raw :
        Currently unused — the save-to-disk step was commented out upstream;
        kept for interface compatibility.

    Returns
    -------
    dict
        The preprocessed data.
    """
    # Avoid the mutable-default-argument pitfall: {} / [] defaults are shared
    # across calls and can leak state between invocations.
    frequency_bands = {} if frequency_bands is None else frequency_bands
    windows = [] if windows is None else windows
    data = remove_data(data)
    data = clean_data(data)
    # Reference electrodes
    if reference is not None:  # identity check, not '!= None'
        data = apply_reference(data, reference)
    # Filter data (skip when no bands were requested)
    if frequency_bands:
        data = apply_filters(data, frequency_bands)
    # Functions to apply while the data is being windowed
    window_fns = {
        'psd': get_power_spectrum,
    }
    # Create windows
    if windows:
        data = create_windows(data, windows,
                              window_fns=window_fns)
    return data
def extract_features(data, features_to_combine=None):
    """Combine per-channel features and wrap them per subject/experiment/date.

    Note: frequency bands should arguably be treated as features instead of
    preprocessing (original author's note).

    Parameters
    ----------
    data : dict
        Preprocessed recording; must contain 'subject', 'experiment_type',
        'experiment_date', 'fs', 'window_size', 'frameshift', 'channel_names'.
    features_to_combine : list | None
        Feature keys to combine per channel.
        (Default changed from a shared mutable `[]` to None — bug fix.)

    Returns
    -------
    dict
        {subject: {experiment_type: {experiment_date: features}}}
    """
    features_to_combine = [] if features_to_combine is None else features_to_combine
    # Split and combine features per channel to use for learning
    features, feature_names = combine_features_with_channels(data, features_to_combine)
    features = trial_per_window(data, features)
    # Add extra info needed downstream (windowing parameters and channel info)
    features['fs'] = data['fs']
    features['window_size'] = data['window_size']
    features['frameshift'] = data['frameshift']
    features['feature_names'] = feature_names
    features['channel_names'] = data['channel_names']
    if 'channel_locations' in data:
        features['channel_locations'] = data['channel_locations']
    # The result dict is freshly created here, so the nested keys can be
    # built directly (the original re-checked keys on an empty dict).
    return {data['subject']: {data['experiment_type']: {data['experiment_date']: features}}}
def perform_statistics(features, labels=None, n_permutations=10000):
    """Run a label-permutation test to estimate the chance-level AUC cutoff.

    Procedure (original author's outline):
      1. Get labels.
      2. Randomly permute labels (H0 == random output from the 'model').
      3. Take the folds and calculate AUC.
      4. Repeat n_permutations times and take the 95% highest value.
      5. Compare mean AUC with AUCs of trained models; if higher, the model is
         significantly above chance (allowed because the base distribution is
         the same).

    BUG FIX: the original body referenced an undefined local ``labels`` and
    raised NameError on every call. ``labels`` is now an optional keyword
    (falling back to ``features['labels']`` when present); without labels the
    test is skipped with a warning instead of crashing the pipeline.

    Parameters
    ----------
    features : dict
        Feature container; returned unchanged.
    labels : array-like | None
        Class labels per sample; when None, ``features.get('labels')`` is
        tried — TODO confirm the upstream pipeline provides labels here.
    n_permutations : int
        Number of label permutations (default matches the original 10000).

    Returns
    -------
    dict
        ``features``, unchanged.
    """
    if labels is None and isinstance(features, dict):
        labels = features.get('labels')
    if labels is None:
        print('perform_statistics: no labels available; skipping permutation test.')
        return features
    # Map arbitrary label values onto 0..n_classes-1
    labels = np.unique(labels, return_inverse=True)[1]
    # One-hot encode
    n_classes = np.max(labels) + 1
    labels = np.eye(n_classes)[labels]
    # Fold geometry is loop-invariant: hoisted out of the permutation loop
    repetitions = 10
    test_size = 1 / repetitions
    test_samples = int(test_size * labels.shape[0])
    mean_aucs = []
    for _ in range(n_permutations):
        aucs = []
        for rep in range(repetitions):
            test_idc = np.arange(rep * test_samples, rep * test_samples + test_samples)
            permuted_idc = test_idc.copy()
            np.random.shuffle(permuted_idc)  # in place, so no need to assign
            fold = labels[test_idc, :].copy()
            permuted_fold = labels[permuted_idc, :].copy()
            try:
                auc = roc_auc_score(fold, permuted_fold, multi_class='ovo')
            except Exception:
                # If a class is not present in a fold: score only the classes
                # that actually vary within the fold
                included_classes = ~np.all(fold == fold[0], axis=0)
                auc = roc_auc_score(fold[:, included_classes],
                                    permuted_fold[:, included_classes])
            aucs.append(auc)
        mean_aucs.append(np.mean(aucs))
    # 95% cutoff == the value at the 95th percentile of the sorted means
    cutoff_value = np.sort(mean_aucs)[int(len(mean_aucs) * .95)]
    print('Cutoff value: {:.3f} [{:d} permutations]'.format(cutoff_value, n_permutations))
    return features
def train_score_evaluate(data):
    """Train, score and evaluate an LDA classifier over sequential folds.

    Runs a fixed 10-repetition train/test cycle (no shuffling, no
    stratification) and returns the classifier's evaluation of the
    collected per-fold scores.
    """
    x, y = combine_data_to_x_y(data)
    classifier = LDA()
    n_folds = 10
    fold_scores = []
    for fold in range(n_folds):
        train_x, train_y, test_x, test_y = get_train_test(
            x, y,
            shuffle=False,
            stratify=False,
            rep=fold,
            repetitions=n_folds,
            print_dist=True,
            random_state=None)
        clf, test_y_hat = classifier.train(train_x, train_y, test_x)
        fold_scores.append(classifier.score(clf, train_x, train_y,
                                            test_x, test_y,
                                            test_y_hat,
                                            protocol='1v1'))
        print('.', end='', flush=True)
    # Aggregate the per-fold scores into the final result
    return classifier.evaluate(fold_scores)
def prepare_results_matrix(results, locs, count_locs=False, savepath=None):
    """Flatten nested scores and electrode locations into one merged DataFrame.

    Parameters
    ----------
    results : dict
        {ppt: {experiment: {date: {score_name: score_value}}}}; only the four
        AUC score names listed below are kept.
    locs : dict
        {ppt: {contact_name: location_name}}.
    count_locs : bool
        False: one row per (ppt, contact, location).
        True: one row per (ppt, location) with an occurrence count.
    savepath : str | None
        When given, the merged frame is written to an .xlsx file there.

    Returns
    -------
    pandas.DataFrame
        Score rows merged with location rows on 'ppt'.
    """
    import pandas as pd

    score_names = ['AUC [Links vs Rechts]', 'AUC [Links vs Rest]',
                   'AUC [Rechts vs Rest]', 'AUC [Move vs Rest]']
    df = pd.DataFrame([pd.Series([ppt, type_, idx_dt, name, score],
                                 index=['ppt', 'exp', 'n_meas', 'score_name', 'score_value'])
                       for ppt, data_ppt in results.items()
                       for type_, data_type in data_ppt.items()
                       for idx_dt, (_, data_dt) in enumerate(data_type.items())
                       for name, score in data_dt.items()
                       if name in score_names],
                      columns=['ppt', 'exp', 'n_meas', 'score_name', 'score_value'])
    if not count_locs:
        # columns= keeps the merge key present even when locs is empty
        df_locs = pd.DataFrame([pd.Series([ppt, contact_name, location_name],
                                          index=['ppt', 'contact', 'loc'])
                                for ppt, locations in locs.items()
                                for contact_name, location_name in locations.items()],
                               columns=['ppt', 'contact', 'loc'])
    else:
        # Collect rows first and construct the frame once:
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0
        rows = []
        for ppt, locations in locs.items():
            unique, counts = np.unique(list(locations.values()), return_counts=True)
            rows.extend(pd.Series([ppt, unique[i], counts[i]],
                                  index=['ppt', 'loc', 'loc_count'])
                        for i in range(len(unique)))
        df_locs = pd.DataFrame(rows, columns=['ppt', 'loc', 'loc_count'])
    df = df.merge(df_locs, on='ppt')
    if savepath:
        # '/' separator: the original "\p" was an invalid escape sequence
        df.to_excel('{}/performance_locs_{}.xlsx'.format(
            savepath, 'counted' if count_locs else ''))
    return df
if __name__ == "__main__":
    # Each entry is one frequency-band configuration; the full pipeline is
    # rerun once per configuration.
    all_freq_bands = [
        # {"beta": [12, 30]},
        # {"gamma": [55, 90]},
        {"beta": [12, 30], "gamma": [55, 90]},
        # {"HFO": [90, 200]},
        # {'alpha': [8, 12]},
    ]
    for freq_bands in all_freq_bands:
        # --- Run configuration ------------------------------------------
        signal_reference = None           # e.g. ['CER', ]
        windows = [1, 0.1]                # [window size, frameshift]
        features_to_combine = ['frequency_bands']
        subjects_to_exclude = []          # complete
        experiment_type_to_exclude = []   # e.g. ['imagine', 'grasp']
        reload_raw = 1                    # 1: parse raw xdf; 0: load cached pkl
        load_locations = 0
        save_results = 1
        save_raw_data = 0
        power_spectrum = 0
        path_drive = r'<some_path>'       # raw data (local external hard drive)
        path_local = r'<some_path>'       # processed data (local)
        ext = 'xdf' if reload_raw else 'pkl'
        main_path = path_drive if reload_raw else path_local
        filenames = get_filenames(main_path, ext,
                                  keywords=['grasp', 'imagine'],
                                  exclude=['speech'])
        electrode_filenames = get_filenames(path_drive, 'csv',
                                            keywords=['electrode_locations']) \
            if load_locations else []
        results = {}
        channel_locs = {}
        for filename in filenames:
            if is_file_to_skip(filename, subjects_to_exclude,
                               experiment_type_to_exclude, []):
                continue
            if reload_raw:
                data = load_raw_data(filename)
                # 'is None' (identity), not '== None'
                if data is None:
                    continue
                if load_locations and \
                        not is_file_to_skip(filename, subjects_to_exclude,
                                            experiment_type_to_exclude,
                                            electrode_filenames):
                    data = load_location_files(data, electrode_filenames)
                data = preprocess_data(data,
                                       reference=signal_reference,
                                       frequency_bands=freq_bands,
                                       windows=windows,
                                       power_spectrum=power_spectrum,
                                       savepath=path_local,
                                       save_incl_raw=save_raw_data)
            else:
                data = load_pickle(filename)
                if data is None:
                    continue
            features = extract_features(data, features_to_combine)
            features = perform_statistics(features)
            # Explore, learn: one classifier run per subject/experiment/date
            for subj, exps in features.items():
                if subj in subjects_to_exclude:
                    continue
                results.setdefault(subj, {})
                for exp, dates in exps.items():
                    if exp in experiment_type_to_exclude:
                        continue
                    results[subj].setdefault(exp, {})
                    for date, data in dates.items():
                        print('\n{} {} {}'.format(subj, exp, date))
                        results[subj][exp][date] = train_score_evaluate(data)
                        channel_locs[subj] = data.get('channel_names', {})
            print('')
        # --- Persist the results of this frequency-band configuration ---
        # truthiness of the dict itself, not of .keys()
        band = '_'.join(freq_bands.keys()) if freq_bands else 'raw'
        savepath = './Figures/{:s}_1000_100'.format(band)
        # Create the output directories if needed (mkdir/isdir were imported
        # but never used; previously open() failed when the dir was missing)
        for directory in ('./Figures', savepath):
            if not isdir(directory):
                mkdir(directory)
        # '/' separators: the original "\{" in the format strings was an
        # invalid escape sequence
        with open('{}/{}_results.pkl'.format(savepath, band), 'wb') as file:
            pickle.dump(results, file)
        with open('{}/{}_channels.pkl'.format(savepath, band), 'wb') as file:
            pickle.dump(channel_locs, file)
        # pretty_print(results, savepath=savepath)
        # plot_bargrid_averaged_score(data, results, channel_locs, savepath,
        #                             plot_all=True)
        # plot_bargrid_score_per_metric(data, results, channel_locs, savepath,
        #                               plot_all=True)
        # plot_bargrid_score_per_ppt(data, results, [], savepath, plot_all=True)
    print('Done')