-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIQR
More file actions
119 lines (94 loc) · 4.65 KB
/
IQR
File metadata and controls
119 lines (94 loc) · 4.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import openpyxl
# Synthetic stand-in for the cleaned dataset: a Pandas DataFrame holding a
# 'category' label column and a 'numerical_column' value column.
df = pd.DataFrame(
    {
        'category': np.random.choice(['A', 'B', 'C'], size=1_000_000),
        'numerical_column': np.random.normal(loc=100, scale=15, size=1_000_000),
    }
)

# Step 1: Prepare the data
# Map each category label to an integer code, stored in its own column.
le = LabelEncoder().fit(df['category'])
df['category_encoded'] = le.transform(df['category'])

# Standardize the numerical column. The fitted scaler stays in `scaler` so
# scaled values can be mapped back to the original scale later on.
scaler = StandardScaler().fit(df[['numerical_column']])
df['scaled_numerical'] = scaler.transform(df[['numerical_column']])
# Step 2: Define the autoencoder model function for KerasRegressor
def build_autoencoder(learning_rate=0.001, neurons_1=16, neurons_2=8):
    """Build and compile a dense autoencoder for a single scalar feature.

    The network stacks three ReLU layers of widths ``neurons_1``,
    ``neurons_2`` and ``neurons_1`` and finishes with a one-unit linear layer
    that reconstructs the input value.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        neurons_1: Width of the first and third hidden layers.
        neurons_2: Width of the middle hidden layer.

    Returns:
        A ``Sequential`` model compiled with Adam and mean-squared-error loss.
    """
    autoencoder = Sequential()
    # Only the first layer declares the input shape: one feature per sample.
    autoencoder.add(Dense(neurons_1, activation='relu', input_shape=(1,)))
    for width in (neurons_2, neurons_1):
        autoencoder.add(Dense(width, activation='relu'))
    # Output layer to reconstruct the input
    autoencoder.add(Dense(1, activation='linear'))
    autoencoder.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return autoencoder
# Step 3: Train the model for each category separately
# Distinct category labels, in order of first appearance in the data.
categories = pd.unique(df['category'])

# Empty frame that collects one row per reconstructed value.
results = pd.DataFrame(
    columns=['category', 'original', 'reconstructed', 'status'],
)

# Candidate values sampled during hyperparameter tuning.
param_dist = {
    'neurons_1': [8, 16, 32],
    'neurons_2': [4, 8, 16],
    'learning_rate': [0.001, 0.01, 0.1],
    'epochs': [50, 100],
    'batch_size': [256, 512, 1024],
}
def _label_reconstructions(category, original, reconstructed, threshold=0.2):
    """Label every reconstruction of one category as good or bad.

    A row is 'Bad Prediction' when its absolute reconstruction error is more
    than ``threshold`` times the magnitude of the original value, otherwise
    'Good Prediction'.

    Args:
        category: Category label written to every returned row.
        original: Array-like of original values (any shape; flattened).
        reconstructed: Array-like of reconstructed values, same length.
        threshold: Largest tolerated relative error (0.2 means 20%).

    Returns:
        DataFrame with columns 'category', 'original', 'reconstructed' and
        'status', one row per value.
    """
    original = np.asarray(original, dtype=float).ravel()
    reconstructed = np.asarray(reconstructed, dtype=float).ravel()
    # Compare against threshold * |original| instead of dividing by the signed
    # original: this cannot divide by zero, and negative originals are judged
    # by magnitude rather than always passing with a negative ratio.
    is_bad = np.abs(original - reconstructed) > threshold * np.abs(original)
    return pd.DataFrame({
        'category': category,
        'original': original,
        'reconstructed': reconstructed,
        'status': np.where(is_bad, 'Bad Prediction', 'Good Prediction'),
    })


# Per-category result frames; concatenated once after the loop because
# DataFrame.append was removed in pandas 2.0 and row-wise appends are quadratic.
category_frames = []

for category in categories:
    print(f"Training autoencoder for category: {category}")

    # Filter the rows once; the scaled and raw columns stay row-aligned.
    category_rows = df[df['category'] == category]
    category_data = category_rows['scaled_numerical'].to_numpy().reshape(-1, 1)
    original_values = category_rows['numerical_column'].to_numpy()

    # Train/test split
    X_train, X_test = train_test_split(category_data, test_size=0.2, random_state=42)

    # Create the KerasRegressor wrapper
    # NOTE(review): this wrapper is imported from
    # tensorflow.keras.wrappers.scikit_learn, which newer TensorFlow releases
    # may no longer ship - confirm against the pinned TensorFlow version.
    model = KerasRegressor(build_fn=build_autoencoder, verbose=0)

    # Perform RandomizedSearchCV for hyperparameter tuning.
    # refit=False: only best_params_ is consumed below and the final model is
    # trained explicitly, so refitting the best estimator would be wasted work.
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=5,
        cv=3,
        verbose=1,
        random_state=42,
        refit=False,
    )
    # Autoencoder: the input is also the regression target.
    random_search_result = random_search.fit(X_train, X_train)

    # Get the best hyperparameters
    best_params = random_search_result.best_params_
    print(f"Best hyperparameters for category {category}: {best_params}")

    # Use the best hyperparameters to build and train the final model
    best_model = build_autoencoder(
        learning_rate=best_params['learning_rate'],
        neurons_1=best_params['neurons_1'],
        neurons_2=best_params['neurons_2'],
    )
    best_model.fit(
        X_train,
        X_train,
        epochs=best_params['epochs'],
        batch_size=best_params['batch_size'],
        validation_data=(X_test, X_test),
        verbose=1,
    )

    # Reconstruct the entire category data and map it back to original units
    reconstruction = best_model.predict(category_data)
    reconstructed_values = scaler.inverse_transform(reconstruction)

    # Flag reconstructions that differ from the original by more than 20%
    category_frames.append(
        _label_reconstructions(category, original_values, reconstructed_values)
    )

if category_frames:
    # Leave an empty seed frame out of the concat so it cannot influence the
    # resulting column dtypes; keep any rows that were already collected.
    frames = category_frames if results.empty else [results, *category_frames]
    results = pd.concat(frames, ignore_index=True)

# Step 4: Save the results to an Excel file
results.to_excel("good_bad_predictions.xlsx", index=False)
print("Results saved to 'good_bad_predictions.xlsx'")