2 changes: 2 additions & 0 deletions pyproject.toml
@@ -5,9 +5,11 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "lightgbm>=4.6.0",
     "mlflow-skinny>=3.1.4",
     "notebook>=7.4.4",
     "numpy==2.3.2",
+    "optuna>=4.5.0",
     "pandas==2.3.1",
     "pyarrow==20.0.0",
     "scikit-learn==1.7.1",
232 changes: 120 additions & 112 deletions src/binary_model/train.py
@@ -1,22 +1,50 @@
-import pandas as pd
-from sklearn.model_selection import train_test_split
-from sklearn.tree import DecisionTreeClassifier
-import mlflow
-from sklearn.model_selection import ParameterGrid
-from sklearn.metrics import (
-    accuracy_score,
-    roc_auc_score,
-    confusion_matrix,
-    classification_report,
-)
+import socket
+import urllib.parse
+import uuid
+
+import mlflow
+import mlflow.sklearn
+import optuna
+import pandas as pd
+from lightgbm import LGBMClassifier
+from mlflow.models import infer_signature
+from sklearn.model_selection import cross_validate, StratifiedKFold
 
 # --- Configuration ---
 TRACKING_SERVER_URI = "http://localhost:5000"
 EXPERIMENT_NAME = "binary_classifier"
 
-DATA_FILE = "data/2022-10-26_hiscore_data.parquet.gzip"
+DATA_FILE = "../../data/2022-10-26_hiscore_data.parquet.gzip"
 TARGET_COLUMN = "confirmed_ban"
 
+
+CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+
+
+PARAM_GUESS = {
+    "colsample_bytree": 0.9651651365751598,
+    "learning_rate": 0.05310861766637499,
+    "max_depth": 23,
+    "min_child_samples": 84,
+    "min_child_weight": 0.004787755397014158,
+    "min_split_gain": 0.022266875377142784,
+    "n_estimators": 1445,
+    "num_leaves": 255,
+    "reg_alpha": 1.7177943743459194,
+    "reg_lambda": 1.1992203482868355e-05,
+    "subsample": 0.7224340426136543,
+}
+
+SCORING = [
+    "accuracy",
+    "roc_auc",
+    "average_precision",
+    "balanced_accuracy",
+    "precision",
+    "recall",
+    "f1",
+]
 
 SKILLS = [
     "attack",
     "defence",
@@ -42,6 +70,7 @@
     "hunter",
     "construction",
 ]
+
 MINIGAMES = [
     "league",
     "bounty_hunter_hunter",
@@ -56,6 +85,7 @@
     "cs_elite",
     "cs_master",
 ]
+
 BOSSES = [
     "abyssal_sire",
     "alchemical_hydra",
@@ -112,128 +142,106 @@
 FEATURE_COLUMNS = SKILLS + MINIGAMES + BOSSES
 
 
-def load_data(file_path: str, feature_columns: list[str]):
-    print(f"Loading data from {file_path}...")
-    df = pd.read_parquet(file_path)
-    print(f"Data loaded with {len(df)} samples and {len(df.columns)} columns.")
+def server_running(uri="http://localhost:5000", timeout=1):
+    parsed = urllib.parse.urlparse(uri)
+    host = parsed.hostname or "127.0.0.1"
+    port = parsed.port or 5000
 
-    # Ensure all feature columns are present
-    missing_features = [col for col in feature_columns if col not in df.columns]
-    if missing_features:
-        raise ValueError(f"Missing feature columns: {missing_features}")
-    return df
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.settimeout(timeout)
+        return s.connect_ex((host, port)) == 0
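
Note on server_running: a raw TCP connect only proves that something is listening on the port, not that it is an MLflow tracking server. A stricter probe could request the server's /health endpoint instead; the sketch below is a hypothetical helper (not part of this change) and assumes an MLflow server, which answers /health with 200 OK.

import urllib.request


def server_healthy(uri="http://localhost:5000", timeout=1):
    # Hypothetical stricter check: ask the (assumed) MLflow /health endpoint
    # instead of merely probing whether the TCP port is open.
    try:
        with urllib.request.urlopen(f"{uri}/health", timeout=timeout) as resp:
            return resp.status == 200
    except OSError:
        return False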


-def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
+def load_data(file_path: str, feature_columns: list[str]) -> pd.DataFrame:
+    df = pd.read_parquet(file_path)
+    missing = [c for c in feature_columns if c not in df.columns]
+    if missing:
+        raise ValueError(f"Missing feature columns: {missing}")
+    return df
 
 
-def log_metrics(model: DecisionTreeClassifier, X_test, y_test):
-    # Predict and evaluate
-    y_pred = model.predict(X_test)
-    y_proba = model.predict_proba(X_test)[:, 1]
 
-    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
+def log_run(output, params):
+    mlflow.log_params(params)
+    for metric, values in output.items():
+        metric = metric.removeprefix("test_")
 
-    mlflow.log_metric("true_positives", tp)
-    mlflow.log_metric("false_positives", fp)
-    mlflow.log_metric("true_negatives", tn)
-    mlflow.log_metric("false_negatives", fn)
-    print(f"\t- TP: {tp}, FP: {fp}")
+        mlflow.log_metric(f"mean_{metric}", values.mean())
+        mlflow.log_metric(f"std_{metric}", values.std())

-    accuracy = round(float(accuracy_score(y_test, y_pred)), 4)
-    mlflow.log_metric("accuracy", accuracy)
-    print(f"\t- Accuracy: {accuracy}")
 
-    auc = round(float(roc_auc_score(y_test, y_proba)), 4)
-    mlflow.log_metric("auc", auc)
-    print(f"\t- auc: {auc}")
+def objective(trial, X, y):
 
-    report_dict = classification_report(y_test, y_pred, output_dict=True)
-    if not isinstance(report_dict, dict):
-        return
+    params = {
+        "objective": "binary",
+        "n_estimators": trial.suggest_int("n_estimators", 50, 1500),
+        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
+        "num_leaves": trial.suggest_int("num_leaves", 15, 255, log=True),
+        "max_depth": trial.suggest_int("max_depth", -1, 24),
+        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
+        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
+        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
+        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
+        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
+        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
+        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 0.5),
+        "n_jobs": -1,
+        "random_state": 42,
+        "verbose": -1,
+    }
 
-    report_dict: dict[str, dict | str]
-    for pred, cf_rep in report_dict.items():
-        if not isinstance(cf_rep, dict):
-            continue
-        for k, v in cf_rep.items():
-            mlflow.log_metric(key=f"{pred}-{k}", value=round(v, 4))
+    model = LGBMClassifier(**params)
+    out = cross_validate(
+        model, X, y, cv=CV, scoring=SCORING, n_jobs=1
+    )  # n_jobs is 1 because LGBM uses all cores
+    with mlflow.start_run(run_name=f"trial_{trial.number}"):
+        log_run(out, params=params)


-def train(X_train, y_train, X_test, y_test, model_name, params: dict, experiment_id):
-    with mlflow.start_run(nested=True, experiment_id=experiment_id) as run:
-        print(f"Run ID: {run.info.run_id} - run_name={run.info.run_name}")
-        print(f"Training with params: {params}")
-        model = DecisionTreeClassifier(random_state=42)
-        model.set_params(**params)
-        model.fit(X=X_train, y=y_train)
-        log_metrics(model=model, X_test=X_test, y_test=y_test)
-        mlflow.sklearn.log_model(sk_model=model, name=model_name)
-        mlflow.log_params(params)
+    return out["test_roc_auc"].mean()


 def main():
-    df = load_data(file_path=DATA_FILE, feature_columns=FEATURE_COLUMNS)
-    df = feature_engineering(df)
+    if server_running(TRACKING_SERVER_URI):
+        mlflow.set_tracking_uri(TRACKING_SERVER_URI)
+    else:
+        mlflow.set_tracking_uri("file:./mlruns")
 
+    experiment_id = f"{EXPERIMENT_NAME}_{uuid.uuid4().hex[:4]}"
+    mlflow.set_experiment(experiment_id)
+
+    df = load_data(DATA_FILE, FEATURE_COLUMNS)
     X, y = df[FEATURE_COLUMNS], df[TARGET_COLUMN]
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.2, random_state=42
-    )
 
-    param_grid = ParameterGrid(
-        {
-            "criterion": ["gini", "entropy"],
-            "max_depth": [10, 30, 50, 100, 200, 500],
-            "min_samples_leaf": [5, 10, 20],
-        }
+    study = optuna.create_study(
+        direction="maximize",
+        sampler=optuna.samplers.TPESampler(seed=42),
+        study_name=experiment_id,
     )
 
-    mlflow.set_tracking_uri(TRACKING_SERVER_URI)
-    experiment_id = mlflow.create_experiment(
-        f"{EXPERIMENT_NAME}-{str(uuid.uuid4())[:4]}"
+    study.enqueue_trial(PARAM_GUESS)
+
+    study.optimize(
+        lambda t: objective(t, X, y),
+        n_trials=10,
     )
-    with mlflow.start_run(experiment_id=experiment_id) as parent_run:
-        print(
-            f"Run ID: {parent_run.info.run_id} - run_name={parent_run.info.run_name} - parent"
-        )
+
+    # cross-validated metrics with the final params
+    best_params = study.best_trial.params
+    cv_model = LGBMClassifier(random_state=42, **best_params)
+    out = cross_validate(cv_model, X, y, cv=CV, scoring=SCORING, n_jobs=1)
+
+    # fit on full data for the artifact
+    refit_model = LGBMClassifier(random_state=42, **best_params)
+    refit_model.fit(X, y)
+
+    with mlflow.start_run(run_name="refit_best"):
+        log_run(out, best_params)
+        sig = infer_signature(X.head(100), refit_model.predict_proba(X.head(100)))
+        mlflow.sklearn.log_model(
+            refit_model,
+            name="refit_model",
+            signature=sig,
+            input_example=X.head(5),
+        )
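
To consume the logged artifact later, the model logged with name="refit_model" should be reachable under that path in the run URI. A minimal sketch, assuming it runs in the same process right after main() (the run-ID lookup is an assumption; a run ID copied from the MLflow UI works too, and X is the feature frame from the script):

import mlflow
import mlflow.sklearn

run_id = mlflow.last_active_run().info.run_id  # or copy a run ID from the UI
loaded = mlflow.sklearn.load_model(f"runs:/{run_id}/refit_model")
ban_scores = loaded.predict_proba(X)[:, 1]  # P(confirmed_ban) for each player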
-        for i, params in enumerate(param_grid):
-            train(
-                X_train=X_train,
-                y_train=y_train,
-                X_test=X_test,
-                y_test=y_test,
-                model_name=f"{EXPERIMENT_NAME}_{i}",
-                params=params,
-                experiment_id=experiment_id,
-            )

-    # query = f"tags.mlflow.parentRunId = '{parent_run.info.run_id}'"
-    # results = mlflow.search_runs(
-    #     experiment_ids=[experiment_id],
-    #     filter_string=query,
-    #     order_by=["metrics.`weighted avg-f1-score` DESC"],
-    #     max_results=5,
-    # )
 
-    # best_run_id = results.iloc[0]["run_id"]
 
-    # model_uri = f"runs:/{best_run_id}/model"
-    # print(model_uri)
-    # loaded_model: DecisionTreeClassifier = mlflow.sklearn.load_model(model_uri)
-    # # Get feature importances
-    # importances = loaded_model.feature_importances_
 
-    # # Select features with importance greater than a threshold
-    # threshold = 0.1  # Adjust as needed
-    # selected_features = X.columns[importances > threshold]
-    # print(selected_features)
 
-    # # Use only the selected features
-    # X_train_selected = X_train[selected_features]
-    # X_test_selected = X_test[selected_features]
-    # loaded_model.fit(X_train_selected, y_train)
 
 
 if __name__ == "__main__":
12 changes: 12 additions & 0 deletions src/multi_model/_utils.py
@@ -1,4 +1,16 @@
 import pandas as pd
+import socket
+import urllib.parse
+
+
+def server_running(uri="http://localhost:5000", timeout=1):
+    parsed = urllib.parse.urlparse(uri)
+    host = parsed.hostname or "127.0.0.1"
+    port = parsed.port or 5000
+
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.settimeout(timeout)
+        return s.connect_ex((host, port)) == 0
 
 
 def load_data(file_path: str, feature_columns: list[str]):
16 changes: 12 additions & 4 deletions src/multi_model/_wrapper.py
@@ -2,16 +2,24 @@

 import pandas as pd
 from mlflow.pyfunc.model import PythonModel, PythonModelContext
-from sklearn.tree import DecisionTreeClassifier
+from lightgbm import LGBMClassifier
 
 from _features import feature_engineering
 from _structs import InputData, OutputData
 
 
-class DecisionTreeWrapper(PythonModel):
+class LGBMWrapper(PythonModel):
     def __init__(self, params: dict):
         self.params = params
-        self.model = DecisionTreeClassifier(**params)
+        self.model = LGBMClassifier(**params)
 
+    @classmethod
+    def from_lgbm(cls, model: LGBMClassifier):
+        """Create LGBMWrapper from an existing LGBMClassifier."""
+        params = model.get_params()
+        wrapper = cls(params)
+        wrapper.model = model
+        return wrapper

     def fit(self, X, y):
         """Train the LightGBM model"""
@@ -22,7 +30,7 @@ def load_context(self, context: PythonModelContext):
         with open(context.artifacts["model"], "rb") as f:
             self.model = pickle.load(f)
 
-    def get_model(self) -> DecisionTreeClassifier:
+    def get_model(self) -> LGBMClassifier:
         return self.model
 
     def get_input_json_schema(self):
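
The new from_lgbm constructor lets a model tuned in src/binary_model/train.py be reused for pyfunc serving without retraining. A minimal usage sketch, assuming clf is an already-fitted LGBMClassifier:

wrapper = LGBMWrapper.from_lgbm(clf)
assert wrapper.get_model() is clf  # no copy: the classmethod stores the fitted instance as-is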