# Pipline.py
import math
import time

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             confusion_matrix, ConfusionMatrixDisplay,
                             f1_score, mean_absolute_error,
                             mean_squared_error, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, RepeatedKFold,
                                     RepeatedStratifiedKFold,
                                     train_test_split)


class Pipeline:

    def __init__(self, stratified: bool = False, n_repeated: int = 5, n_splits: int = 5):
        """Initialize the Pipeline.

        Args:
            stratified (bool): whether the pipeline uses stratified splits
            n_repeated (int): number of repetitions in the cross-validation
            n_splits (int): number of splits in the k-fold
        """
        # Parameters for the cross-validation.
        self.stratified = stratified
        self.n_repeated = n_repeated
        self.n_splits = n_splits

    def hyperparameter_tuning(self, model, x_train: pd.DataFrame, y_train: pd.DataFrame,
                              hyperparameters: dict, scoring: str, random_state: int = 42):
        """Use (stratified) repeated k-fold and GridSearchCV to find the best
        hyperparameters for the given model and training data.

        Args:
            model (sklearn model): the model to search the best hyperparameters for
            x_train (pd.DataFrame): training data features
            y_train (pd.DataFrame): training data targets
            hyperparameters (dict): hyperparameters to test in the evaluation
            scoring (str): the metric the hyperparameters are optimized for
                (e.g. "balanced_accuracy" or "accuracy")
            random_state (int, optional): seed for the random state. Defaults to 42.

        Returns:
            GridSearchCV: the fitted search object, including the trained model
        """
        if self.stratified:
            cv = RepeatedStratifiedKFold(n_splits=self.n_splits, n_repeats=self.n_repeated,
                                         random_state=random_state)
        else:
            cv = RepeatedKFold(n_splits=self.n_splits, n_repeats=self.n_repeated,
                               random_state=random_state)
        clf = GridSearchCV(model, hyperparameters, cv=cv, scoring=scoring, n_jobs=-1)
        clf.fit(x_train, y_train)
        return clf
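
    # A minimal usage sketch (the model, data names, and grid below are
    # illustrative, not part of this module):
    #
    #   from sklearn.svm import SVC
    #   pipe = Pipeline(stratified=True)
    #   search = pipe.hyperparameter_tuning(SVC(), X_train, y_train,
    #                                       {"C": [0.1, 1, 10]},
    #                                       "balanced_accuracy")
    #   print(search.best_params_, search.best_score_)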

    @staticmethod
    def model_evaluation_classification(trained_model, x_test: pd.DataFrame, y_test: pd.DataFrame):
        """Generate the predictions of a given model and calculate accuracy,
        balanced accuracy, ROC AUC, and F1 score.

        Args:
            trained_model (sklearn trained model): previously trained model
            x_test (pd.DataFrame): features test data
            y_test (pd.DataFrame): target test data

        Returns:
            float: accuracy score
            float: balanced accuracy score
            float: ROC AUC score
            float: F1 score
        """
        y_hat = trained_model.predict(x_test)
        asc = accuracy_score(y_test, y_hat)
        basc = balanced_accuracy_score(y_test, y_hat)
        # Note: the ROC AUC is computed from hard class predictions here; this
        # only works for binary targets and ignores predicted probabilities.
        ras = roc_auc_score(y_test, y_hat)
        f1 = f1_score(y_test, y_hat)
        return asc, basc, ras, f1

    @staticmethod
    def model_evaluation_float(trained_model, x_test, y_test):
        """Generate the predictions of a given model and calculate the mean
        squared error and the mean absolute error.

        Args:
            trained_model (sklearn trained model): previously trained model
            x_test (pd.DataFrame): features test data
            y_test (pd.DataFrame): target test data

        Returns:
            float: mean squared error
            float: mean absolute error
            array: predicted values
        """
        y_hat = trained_model.predict(x_test)
        mse = mean_squared_error(y_test, y_hat)
        mae = mean_absolute_error(y_test, y_hat)
        return mse, mae, y_hat

    @staticmethod
    def confusion_matrix(trained_model, x_test, y_test):
        """Generate a confusion matrix for the model's predictions, plot it,
        and return it as an array.

        Args:
            trained_model (sklearn model): the trained model to evaluate
            x_test (pd.DataFrame): test data features
            y_test (pd.DataFrame): test data class

        Returns:
            array: the confusion matrix
        """
        y_hat = trained_model.predict(x_test)
        # The bare name below resolves to sklearn's confusion_matrix in the
        # module scope, not to this method.
        cm = confusion_matrix(y_test, y_hat)
        ConfusionMatrixDisplay(cm).plot()
        plt.show()
        return cm
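
    # Hedged usage sketch (names are illustrative):
    #   cm = Pipeline.confusion_matrix(search.best_estimator_, X_test, y_test)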

    def run_experiments_float(self, experiments, X, Y, Scaler, scoring,
                              random_state: int = 42, shuffle: bool = True, test_size: float = 0.3):
        """Run the predefined experiments on a dataset for regression problems.

        Args:
            experiments (list of dicts): the experiment definitions (see the
                example sketch below)
            X (_type_): the dataset that contains the features
            Y (_type_): the dataset that contains the dependent variable
            Scaler (sklearn scaler): the scaler that is applied to the data
            scoring (str): the metric the models are optimized for
            random_state (int, optional): the random state used in the experiments. Defaults to 42.
            shuffle (bool, optional): whether the data is shuffled before being split. Defaults to True.
            test_size (float, optional): the fraction of the data held out for testing. Defaults to 0.3.

        Returns:
            list of dicts: the results in dictionary format
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, random_state=random_state, shuffle=shuffle, test_size=test_size)
        X_train_scaled = Scaler.fit_transform(X_train)
        X_test_scaled = Scaler.transform(X_test)
        df_results = []
        for experiment in experiments:
            start_time = time.time()
            print()
            print()
            print(experiment['name'])
            print("-----------------")
            trained_model = self.hyperparameter_tuning(
                model=experiment["model"], x_train=X_train_scaled, y_train=y_train,
                hyperparameters=experiment["parameters"], scoring=scoring,
                random_state=random_state)
            mse, mae, y_hat = Pipeline.model_evaluation_float(
                trained_model=trained_model, x_test=X_test_scaled, y_test=y_test)
            end_time = time.time() - start_time
            # print(f'Optimized for {scoring}, the {experiment["name"]} achieved the following scores:')
            print(f'RMSE: {math.sqrt(mse)}')
            print(f'MAE: {mae}')
            print(f'The best results were achieved with parameters: {trained_model.best_params_}')
            print(f'Time: {round(end_time, 1)} seconds ({round(end_time / 60, 1)} minutes)')
            df_results.append({"name": experiment["name"],
                               "trained_model": trained_model,
                               "scaler": Scaler,
                               "best_results": {
                                   "MSE": mse,
                                   "MAE": mae,
                                   "predicted_values": y_hat,
                                   "actual_values": y_test
                               }})
            # Plot predictions against actual values on a square axis range.
            lo = min(min(y_test), min(y_hat))
            hi = max(max(y_test), max(y_hat))
            ax = sns.scatterplot(x=y_test, y=y_hat)
            ax.set(xlim=(lo, hi), ylim=(lo, hi))
            plt.show()
        return df_results

    def run_experiments_classification(self, experiments, X, Y, Scaler, scoring,
                                       random_state: int = 42, shuffle: bool = True, test_size: float = 0.3):
        """Run the predefined experiments on a dataset for classification problems.

        Args:
            experiments (list of dicts): the experiment definitions (same shape
                as in run_experiments_float)
            X (_type_): the dataset that contains the features
            Y (_type_): the dataset that contains the dependent variable
            Scaler (sklearn scaler): the scaler that is applied to the data
            scoring (str): the metric the models are optimized for
            random_state (int, optional): the random state used in the experiments. Defaults to 42.
            shuffle (bool, optional): whether the data is shuffled before being split. Defaults to True.
            test_size (float, optional): the fraction of the data held out for testing. Defaults to 0.3.

        Returns:
            list of dicts: the results in dictionary format
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, random_state=random_state, shuffle=shuffle, test_size=test_size)
        X_train_scaled = Scaler.fit_transform(X_train)
        X_test_scaled = Scaler.transform(X_test)
        df_results = []
        for experiment in experiments:
            start_time = time.time()
            print()
            print()
            print(experiment['name'])
            print("-----------------")
            trained_model = self.hyperparameter_tuning(
                model=experiment["model"], x_train=X_train_scaled, y_train=y_train,
                hyperparameters=experiment["parameters"], scoring=scoring,
                random_state=random_state)
            asc, basc, ras, f1 = Pipeline.model_evaluation_classification(
                trained_model, x_test=X_test_scaled, y_test=y_test)
            end_time = time.time() - start_time
            # print(f'Optimized for {scoring}, the {experiment["name"]} achieved the following scores:')
            print(f'Accuracy: {asc}')
            print(f'Balanced Accuracy: {basc}')
            print(f'F1 Score: {f1}')
            print(f'ROC AUC: {ras}')
            print(f'The best results were achieved with parameters: {trained_model.best_params_}')
            print(f'Time: {round(end_time, 1)} seconds ({round(end_time / 60, 1)} minutes)')
            df_results.append({"name": experiment["name"],
                               "trained_model": trained_model,
                               "best_results": {
                                   "accuracy": asc,
                                   "balanced_accuracy": basc,
                                   "ROC_AUC": ras,
                                   "f1_score": f1
                               }})
        return df_results
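

# A minimal end-to-end sketch, assuming a scikit-learn environment. The
# dataset, scaler, model, and grid below are illustrative choices, not part
# of the original pipeline.
if __name__ == "__main__":
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler

    data = load_breast_cancer(as_frame=True)
    experiments = [
        {"name": "Logistic regression",
         "model": LogisticRegression(max_iter=1000),
         "parameters": {"C": [0.1, 1.0, 10.0]}},
    ]
    pipe = Pipeline(stratified=True, n_repeated=2, n_splits=5)
    results = pipe.run_experiments_classification(
        experiments, data.data, data.target, StandardScaler(),
        scoring="balanced_accuracy")
    print(results[0]["best_results"])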