2 changes: 2 additions & 0 deletions pyproject.toml
@@ -5,9 +5,11 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "lightgbm>=4.6.0",
     "mlflow-skinny>=3.1.4",
     "notebook>=7.4.4",
     "numpy==2.3.2",
+    "optuna>=4.5.0",
     "pandas==2.3.1",
     "pyarrow==20.0.0",
     "scikit-learn==1.7.1",
232 changes: 120 additions & 112 deletions src/binary_model/train.py
@@ -1,22 +1,50 @@
-import pandas as pd
-from sklearn.model_selection import train_test_split
-from sklearn.tree import DecisionTreeClassifier
-import mlflow
-from sklearn.model_selection import ParameterGrid
-from sklearn.metrics import (
-    accuracy_score,
-    roc_auc_score,
-    confusion_matrix,
-    classification_report,
-)
+import socket
+import urllib.parse
+import uuid
+
+import mlflow
+import mlflow.sklearn
+import optuna
+import pandas as pd
+from lightgbm import LGBMClassifier
+from mlflow.models import infer_signature
+from sklearn.model_selection import cross_validate, StratifiedKFold
 
 # --- Configuration ---
 TRACKING_SERVER_URI = "http://localhost:5000"
 EXPERIMENT_NAME = "binary_classifier"
 
-DATA_FILE = "data/2022-10-26_hiscore_data.parquet.gzip"
+DATA_FILE = "../../data/2022-10-26_hiscore_data.parquet.gzip"
 TARGET_COLUMN = "confirmed_ban"
 
+
+CV = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+
+
+PARAM_GUESS = {
+    "colsample_bytree": 0.9651651365751598,
+    "learning_rate": 0.05310861766637499,
+    "max_depth": 23,
+    "min_child_samples": 84,
+    "min_child_weight": 0.004787755397014158,
+    "min_split_gain": 0.022266875377142784,
+    "n_estimators": 1445,
+    "num_leaves": 255,
+    "reg_alpha": 1.7177943743459194,
+    "reg_lambda": 1.1992203482868355e-05,
+    "subsample": 0.7224340426136543,
+}
+
+SCORING = [
+    "accuracy",
+    "roc_auc",
+    "average_precision",
+    "balanced_accuracy",
+    "precision",
+    "recall",
+    "f1",
+]
 
 SKILLS = [
     "attack",
     "defence",
@@ -42,6 +70,7 @@
     "hunter",
     "construction",
 ]
+
 MINIGAMES = [
     "league",
     "bounty_hunter_hunter",
@@ -56,6 +85,7 @@
     "cs_elite",
     "cs_master",
 ]
+
 BOSSES = [
     "abyssal_sire",
     "alchemical_hydra",
@@ -112,128 +142,106 @@
 FEATURE_COLUMNS = SKILLS + MINIGAMES + BOSSES
 
 
-def load_data(file_path: str, feature_columns: list[str]):
-    print(f"Loading data from {file_path}...")
-    df = pd.read_parquet(file_path)
-    print(f"Data loaded with {len(df)} samples and {len(df.columns)} columns.")
+def server_running(uri="http://localhost:5000", timeout=1):
+    parsed = urllib.parse.urlparse(uri)
+    host = parsed.hostname or "127.0.0.1"
+    port = parsed.port or 5000
 
-    # Ensure all feature columns are present
-    missing_features = [col for col in feature_columns if col not in df.columns]
-    if missing_features:
-        raise ValueError(f"Missing feature columns: {missing_features}")
-    return df
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.settimeout(timeout)
+        return s.connect_ex((host, port)) == 0
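
Note on server_running: a raw TCP connect only proves that something is listening on the port, not that it is an MLflow tracking server. A stricter probe could request the server's /health endpoint instead; the sketch below is a hypothetical helper (not part of this change) and assumes an MLflow server, which answers /health with 200 OK.

import urllib.request


def server_healthy(uri="http://localhost:5000", timeout=1):
    # Hypothetical stricter check: ask the (assumed) MLflow /health endpoint
    # instead of merely probing whether the TCP port is open.
    try:
        with urllib.request.urlopen(f"{uri}/health", timeout=timeout) as resp:
            return resp.status == 200
    except OSError:
        return False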


-def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
+def load_data(file_path: str, feature_columns: list[str]) -> pd.DataFrame:
+    df = pd.read_parquet(file_path)
+    missing = [c for c in feature_columns if c not in df.columns]
+    if missing:
+        raise ValueError(f"Missing feature columns: {missing}")
+    return df
 
 
-def log_metrics(model: DecisionTreeClassifier, X_test, y_test):
-    # Predict and evaluate
-    y_pred = model.predict(X_test)
-    y_proba = model.predict_proba(X_test)[:, 1]
 
-    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
+def log_run(output, params):
+    mlflow.log_params(params)
+    for metric, values in output.items():
+        metric = metric.removeprefix("test_")
 
-    mlflow.log_metric("true_positives", tp)
-    mlflow.log_metric("false_positives", fp)
-    mlflow.log_metric("true_negatives", tn)
-    mlflow.log_metric("false_negatives", fn)
-    print(f"\t- TP: {tp}, FP: {fp}")
+        mlflow.log_metric(f"mean_{metric}", values.mean())
+        mlflow.log_metric(f"std_{metric}", values.std())

-    accuracy = round(float(accuracy_score(y_test, y_pred)), 4)
-    mlflow.log_metric("accuracy", accuracy)
-    print(f"\t- Accuracy: {accuracy}")
 
-    auc = round(float(roc_auc_score(y_test, y_proba)), 4)
-    mlflow.log_metric("auc", auc)
-    print(f"\t- auc: {auc}")
+def objective(trial, X, y):
 
-    report_dict = classification_report(y_test, y_pred, output_dict=True)
-    if not isinstance(report_dict, dict):
-        return
+    params = {
+        "objective": "binary",
+        "n_estimators": trial.suggest_int("n_estimators", 50, 1500),
+        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
+        "num_leaves": trial.suggest_int("num_leaves", 15, 255, log=True),
+        "max_depth": trial.suggest_int("max_depth", -1, 24),
+        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
+        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
+        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
+        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
+        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
+        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
+        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 0.5),
+        "n_jobs": -1,
+        "random_state": 42,
+        "verbose": -1,
+    }
 
-    report_dict: dict[str, dict | str]
-    for pred, cf_rep in report_dict.items():
-        if not isinstance(cf_rep, dict):
-            continue
-        for k, v in cf_rep.items():
-            mlflow.log_metric(key=f"{pred}-{k}", value=round(v, 4))
+    model = LGBMClassifier(**params)
+    out = cross_validate(
+        model, X, y, cv=CV, scoring=SCORING, n_jobs=1
+    )  # n_jobs is 1 because LGBM uses all cores
+    with mlflow.start_run(run_name=f"trial_{trial.number}"):
+        log_run(out, params=params)


-def train(X_train, y_train, X_test, y_test, model_name, params: dict, experiment_id):
-    with mlflow.start_run(nested=True, experiment_id=experiment_id) as run:
-        print(f"Run ID: {run.info.run_id} - run_name={run.info.run_name}")
-        print(f"Training with params: {params}")
-        model = DecisionTreeClassifier(random_state=42)
-        model.set_params(**params)
-        model.fit(X=X_train, y=y_train)
-        log_metrics(model=model, X_test=X_test, y_test=y_test)
-        mlflow.sklearn.log_model(sk_model=model, name=model_name)
-        mlflow.log_params(params)
+    return out["test_roc_auc"].mean()


 def main():
-    df = load_data(file_path=DATA_FILE, feature_columns=FEATURE_COLUMNS)
-    df = feature_engineering(df)
+    if server_running(TRACKING_SERVER_URI):
+        mlflow.set_tracking_uri(TRACKING_SERVER_URI)
+    else:
+        mlflow.set_tracking_uri("file:./mlruns")
 
+    experiment_id = f"{EXPERIMENT_NAME}_{uuid.uuid4().hex[:4]}"
+    mlflow.set_experiment(experiment_id)
+
+    df = load_data(DATA_FILE, FEATURE_COLUMNS)
     X, y = df[FEATURE_COLUMNS], df[TARGET_COLUMN]
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.2, random_state=42
-    )
 
-    param_grid = ParameterGrid(
-        {
-            "criterion": ["gini", "entropy"],
-            "max_depth": [10, 30, 50, 100, 200, 500],
-            "min_samples_leaf": [5, 10, 20],
-        }
+    study = optuna.create_study(
+        direction="maximize",
+        sampler=optuna.samplers.TPESampler(seed=42),
+        study_name=experiment_id,
     )
 
-    mlflow.set_tracking_uri(TRACKING_SERVER_URI)
-    experiment_id = mlflow.create_experiment(
-        f"{EXPERIMENT_NAME}-{str(uuid.uuid4())[:4]}"
+    study.enqueue_trial(PARAM_GUESS)
+
+    study.optimize(
+        lambda t: objective(t, X, y),
+        n_trials=10,
     )
-    with mlflow.start_run(experiment_id=experiment_id) as parent_run:
-        print(
-            f"Run ID: {parent_run.info.run_id} - run_name={parent_run.info.run_name} - parent"
-        )
+
+    # cross-validated metrics with the final params
+    best_params = study.best_trial.params
+    cv_model = LGBMClassifier(random_state=42, **best_params)
+    out = cross_validate(cv_model, X, y, cv=CV, scoring=SCORING, n_jobs=1)
+
+    # fit on full data for the artifact
+    refit_model = LGBMClassifier(random_state=42, **best_params)
+    refit_model.fit(X, y)
+
+    with mlflow.start_run(run_name="refit_best"):
+        log_run(out, best_params)
+        sig = infer_signature(X.head(100), refit_model.predict_proba(X.head(100)))
+        mlflow.sklearn.log_model(
+            refit_model,
+            name="refit_model",
+            signature=sig,
+            input_example=X.head(5),
+        )
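
To consume the logged artifact later, the model logged with name="refit_model" should be reachable under that path in the run URI. A minimal sketch, assuming it runs in the same process right after main() (the run-ID lookup is an assumption; a run ID copied from the MLflow UI works too, and X is the feature frame from the script):

import mlflow
import mlflow.sklearn

run_id = mlflow.last_active_run().info.run_id  # or copy a run ID from the UI
loaded = mlflow.sklearn.load_model(f"runs:/{run_id}/refit_model")
ban_scores = loaded.predict_proba(X)[:, 1]  # P(confirmed_ban) for each player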
-        for i, params in enumerate(param_grid):
-            train(
-                X_train=X_train,
-                y_train=y_train,
-                X_test=X_test,
-                y_test=y_test,
-                model_name=f"{EXPERIMENT_NAME}_{i}",
-                params=params,
-                experiment_id=experiment_id,
-            )

-    # query = f"tags.mlflow.parentRunId = '{parent_run.info.run_id}'"
-    # results = mlflow.search_runs(
-    #     experiment_ids=[experiment_id],
-    #     filter_string=query,
-    #     order_by=["metrics.`weighted avg-f1-score` DESC"],
-    #     max_results=5,
-    # )
 
-    # best_run_id = results.iloc[0]["run_id"]
 
-    # model_uri = f"runs:/{best_run_id}/model"
-    # print(model_uri)
-    # loaded_model: DecisionTreeClassifier = mlflow.sklearn.load_model(model_uri)
-    # # Get feature importances
-    # importances = loaded_model.feature_importances_
 
-    # # Select features with importance greater than a threshold
-    # threshold = 0.1  # Adjust as needed
-    # selected_features = X.columns[importances > threshold]
-    # print(selected_features)
 
-    # # Use only the selected features
-    # X_train_selected = X_train[selected_features]
-    # X_test_selected = X_test[selected_features]
-    # loaded_model.fit(X_train_selected, y_train)
 
 
 if __name__ == "__main__":
12 changes: 12 additions & 0 deletions src/multi_model/_utils.py
@@ -1,4 +1,16 @@
 import pandas as pd
+import socket
+import urllib.parse
+
+
+def server_running(uri="http://localhost:5000", timeout=1):
+    parsed = urllib.parse.urlparse(uri)
+    host = parsed.hostname or "127.0.0.1"
+    port = parsed.port or 5000
+
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.settimeout(timeout)
+        return s.connect_ex((host, port)) == 0
 
 
 def load_data(file_path: str, feature_columns: list[str]):
16 changes: 12 additions & 4 deletions src/multi_model/_wrapper.py
@@ -2,16 +2,24 @@

 import pandas as pd
 from mlflow.pyfunc.model import PythonModel, PythonModelContext
-from sklearn.tree import DecisionTreeClassifier
+from lightgbm import LGBMClassifier
 
 from _features import feature_engineering
 from _structs import InputData, OutputData
 
 
-class DecisionTreeWrapper(PythonModel):
+class LGBMWrapper(PythonModel):
     def __init__(self, params: dict):
         self.params = params
-        self.model = DecisionTreeClassifier(**params)
+        self.model = LGBMClassifier(**params)
 
+    @classmethod
+    def from_lgbm(cls, model: LGBMClassifier):
+        """Create LGBMWrapper from an existing LGBMClassifier."""
+        params = model.get_params()
+        wrapper = cls(params)
+        wrapper.model = model
+        return wrapper

     def fit(self, X, y):
         """Train the LightGBM model"""
@@ -22,7 +30,7 @@ def load_context(self, context: PythonModelContext):
         with open(context.artifacts["model"], "rb") as f:
             self.model = pickle.load(f)
 
-    def get_model(self) -> DecisionTreeClassifier:
+    def get_model(self) -> LGBMClassifier:
         return self.model
 
     def get_input_json_schema(self):
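
The new from_lgbm constructor lets a model tuned in src/binary_model/train.py be reused for pyfunc serving without retraining. A minimal usage sketch, assuming clf is an already-fitted LGBMClassifier:

wrapper = LGBMWrapper.from_lgbm(clf)
assert wrapper.get_model() is clf  # no copy: the classmethod stores the fitted instance as-is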