# TimeSeries/IQR at main · swapyface/TimeSeries · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import openpyxl

# Synthetic stand-in for the clean dataset: a label column ('category') and a
# numeric measurement column ('numerical_column').
df = pd.DataFrame(
    {
        'category': np.random.choice(['A', 'B', 'C'], size=1000000),
        'numerical_column': np.random.normal(loc=100, scale=15, size=1000000),
    }
)

# Step 1: Prepare the data
# Fit the label encoder on the category column, then store the integer codes.
le = LabelEncoder()
le.fit(df['category'])
df['category_encoded'] = le.transform(df['category'])

# Fit the scaler on the numeric column, then store the standardized values.
# The fitted scaler stays in scope so scaled values can be mapped back with
# scaler.inverse_transform.
scaler = StandardScaler()
scaler.fit(df[['numerical_column']])
df['scaled_numerical'] = scaler.transform(df[['numerical_column']])

# Step 2: Define the autoencoder model function for KerasRegressor
def build_autoencoder(learning_rate=0.001, neurons_1=16, neurons_2=8):
    """Build and compile a dense autoencoder for a single scalar feature.

    The network is symmetric: ``neurons_1 -> neurons_2 -> neurons_1 -> 1``.
    Hidden layers use ReLU; the single output unit is linear so it can
    reproduce the (scaled) input value.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        neurons_1: Width of the first and third hidden layers.
        neurons_2: Width of the middle (bottleneck) hidden layer.

    Returns:
        A compiled ``Sequential`` model using Adam and mean-squared-error loss.
    """
    autoencoder = Sequential()

    # Encoder side
    autoencoder.add(Dense(neurons_1, activation='relu', input_shape=(1,)))
    autoencoder.add(Dense(neurons_2, activation='relu'))

    # Decoder side, ending in a linear unit that reconstructs the input
    autoencoder.add(Dense(neurons_1, activation='relu'))
    autoencoder.add(Dense(1, activation='linear'))

    autoencoder.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return autoencoder

# Empty table that will hold one row per evaluated data point
result_columns = ['category', 'original', 'reconstructed', 'status']
results = pd.DataFrame(columns=result_columns)

# Step 3: Train the model for each category separately
# Distinct category labels, in order of first appearance.
categories = pd.unique(df['category'])

# Candidate values sampled during hyperparameter tuning. The first three keys
# match build_autoencoder's parameters; 'epochs' and 'batch_size' are
# training settings.
param_dist = dict(
    neurons_1=[8, 16, 32],
    neurons_2=[4, 8, 16],
    learning_rate=[0.001, 0.01, 0.1],
    epochs=[50, 100],
    batch_size=[256, 512, 1024],
)

def _label_reconstructions(category, original_values, reconstructed_values, threshold=0.2):
    """Build the result rows for one category in a single vectorized pass.

    Args:
        category: Category label written to every row.
        original_values: Array-like of original (unscaled) values.
        reconstructed_values: Array-like of reconstructed (unscaled) values,
            aligned element-wise with ``original_values``.
        threshold: Maximum relative error (as a fraction of the original's
            magnitude) for a row to count as a good prediction.

    Returns:
        DataFrame with columns 'category', 'original', 'reconstructed' and
        'status' ('Good Prediction' or 'Bad Prediction').
    """
    original_values = np.asarray(original_values, dtype=float).ravel()
    reconstructed_values = np.asarray(reconstructed_values, dtype=float).ravel()

    abs_error = np.abs(original_values - reconstructed_values)

    # Divide by the magnitude of the original: dividing by the signed value
    # makes the ratio negative for negative originals, so they could never
    # exceed the threshold.
    magnitude = np.abs(original_values)

    # Start at +inf so rows that cannot be divided (zero or NaN original)
    # default to "bad" instead of raising or producing NaN.
    relative_error = np.full(abs_error.shape, np.inf)
    np.divide(abs_error, magnitude, out=relative_error, where=magnitude > 0)

    # A zero original that was reconstructed exactly has no error.
    relative_error[(magnitude == 0) & (abs_error == 0)] = 0.0

    # `<=` (rather than `>` for bad) makes NaN errors fall into the bad bucket.
    status = np.where(relative_error <= threshold, 'Good Prediction', 'Bad Prediction')

    return pd.DataFrame({
        'category': category,
        'original': original_values,
        'reconstructed': reconstructed_values,
        'status': status,
    })


# One result frame per category; concatenated once after the loop. Row-by-row
# DataFrame.append was removed in pandas 2.0 and is quadratic in row count.
category_frames = []

for category in categories:
    print(f"Training autoencoder for category: {category}")

    # Select the rows for the current category once and reuse them
    category_rows = df[df['category'] == category]

    # Scaled values, reshaped to (n_samples, 1) for the model
    category_data = category_rows['scaled_numerical'].values.reshape(-1, 1)

    # Untouched original values, reported as-is in the results
    original_values = category_rows['numerical_column'].values

    # Train/test split
    X_train, X_test = train_test_split(category_data, test_size=0.2, random_state=42)

    # Create the KerasRegressor wrapper
    model = KerasRegressor(build_fn=build_autoencoder, verbose=0)

    # Perform RandomizedSearchCV for hyperparameter tuning. refit=False skips
    # the automatic refit of the best estimator, because the final model is
    # rebuilt and trained explicitly below; best_params_ is still populated
    # for single-metric scoring.
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                                       n_iter=5, cv=3, verbose=1, random_state=42,
                                       refit=False)

    # Autoencoder: the input is also the target
    random_search_result = random_search.fit(X_train, X_train)

    # Get the best hyperparameters
    best_params = random_search_result.best_params_
    print(f"Best hyperparameters for category {category}: {best_params}")

    # Use the best hyperparameters to build and train the final model
    best_model = build_autoencoder(learning_rate=best_params['learning_rate'],
                                   neurons_1=best_params['neurons_1'],
                                   neurons_2=best_params['neurons_2'])

    # Train the model, monitoring reconstruction loss on the held-out split
    best_model.fit(X_train, X_train, epochs=best_params['epochs'],
                   batch_size=best_params['batch_size'], validation_data=(X_test, X_test), verbose=1)

    # Reconstruct the entire category data
    reconstruction = best_model.predict(category_data)

    # Map the reconstruction back to the original units
    reconstructed_values = scaler.inverse_transform(reconstruction)

    # Flag rows whose reconstruction differs from the original by more than 20%
    category_frames.append(
        _label_reconstructions(category, original_values, reconstructed_values)
    )

# With no categories, `results` keeps whatever was defined before the loop.
if category_frames:
    results = pd.concat(category_frames, ignore_index=True)

# Step 4: Save the results to an Excel file
results.to_excel("good_bad_predictions.xlsx", index=False)

print("Results saved to 'good_bad_predictions.xlsx'")