From 848349e85f01102302e1003e06b9a2d23ee36031 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Thu, 2 Oct 2025 13:18:26 +1000
Subject: [PATCH 01/74] chore(init): scaffold project structure
Create empty modules, configs, and test shells; no implementations yet.
---
.../TimeLOB_TimeGAN_49088276/.gitignore | 19 +++++++++++++++++++
.../TimeLOB_TimeGAN_49088276/README.MD | 17 +++++++++++++++++
.../TimeLOB_TimeGAN_49088276/dataset.py | 19 +++++++++++++++++++
.../TimeLOB_TimeGAN_49088276/modules.py | 0
.../TimeLOB_TimeGAN_49088276/predict.py | 0
recognition/TimeLOB_TimeGAN_49088276/train.py | 0
6 files changed, 55 insertions(+)
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/.gitignore
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/README.MD
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/dataset.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/modules.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/predict.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/train.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/.gitignore b/recognition/TimeLOB_TimeGAN_49088276/.gitignore
new file mode 100644
index 000000000..7a6136c0e
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/.gitignore
@@ -0,0 +1,19 @@
+# editor specific files
+.idea/
+.vscode/
+
+# python cache files
+__pycache__/
+*.pyc
+
+# model specific files
+data/
+*.csv
+*.pt
+*.pkl
+outputs/
+checkpoints/
+logs/
+
+# OS generated files
+.DS_Store
\ No newline at end of file
diff --git a/recognition/TimeLOB_TimeGAN_49088276/README.MD b/recognition/TimeLOB_TimeGAN_49088276/README.MD
new file mode 100644
index 000000000..1a01b637d
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/README.MD
@@ -0,0 +1,17 @@
+# TimeLOB
+
+**COMP3710 - Pattern Recognition and Analysis**
+
+| Task 14 | Generative Model of AMZN LOBSTER Level-10 using TimeGAN |
+| ------- | -------------------------------------------------------- |
+| Author  | Radhesh Goel (49088276)                                   |
diff --git a/recognition/TimeLOB_TimeGAN_49088276/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/dataset.py
new file mode 100644
index 000000000..099190790
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/dataset.py
@@ -0,0 +1,19 @@
+"""
+A module describing how we handle the dataset.
+
+Created By:
+ID: s49088276
+
+References:
+-
+"""
+
+import os
+
+def test_dataset_exists():
+ data_dir = "data"
+ files = os.listdir(data_dir)
+ print(f"Files in '{data_dir}': {files}")
+
+if __name__ == "__main__":
+ test_dataset_exists()
\ No newline at end of file
diff --git a/recognition/TimeLOB_TimeGAN_49088276/modules.py b/recognition/TimeLOB_TimeGAN_49088276/modules.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/recognition/TimeLOB_TimeGAN_49088276/predict.py b/recognition/TimeLOB_TimeGAN_49088276/predict.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/recognition/TimeLOB_TimeGAN_49088276/train.py b/recognition/TimeLOB_TimeGAN_49088276/train.py
new file mode 100644
index 000000000..e69de29bb
From 3203bb0e1c2b915dd63bd1237098d61932a1b345 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Thu, 2 Oct 2025 13:28:53 +1000
Subject: [PATCH 02/74] docs(module): improve top-level module docstring
Clarify module purpose, responsibilities, and public API; add usage example and references. No functional changes.
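
The new modules.py docstring lists Embedder, Recovery, Generator, Supervisor, Discriminator, and a TimeGAN wrapper as planned exports. As a rough orientation only, a GRU-based skeleton consistent with that interface could look like the sketch below; hidden sizes, the GRU choice, and the sigmoid latent activation are assumptions, not part of this commit.

    # Sketch only: this commit adds docstrings, not implementations.
    import torch
    import torch.nn as nn

    class Embedder(nn.Module):
        """Map (batch, seq_len, feature_dim) to latents (batch, seq_len, hidden_dim)."""
        def __init__(self, feature_dim: int, hidden_dim: int):
            super().__init__()
            self.rnn = nn.GRU(feature_dim, hidden_dim, batch_first=True)
            self.proj = nn.Linear(hidden_dim, hidden_dim)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            h, _ = self.rnn(x)
            return torch.sigmoid(self.proj(h))

    class Recovery(nn.Module):
        """Map latents back to feature space; Generator, Supervisor, and
        Discriminator would follow the same GRU + linear-head pattern."""
        def __init__(self, hidden_dim: int, feature_dim: int):
            super().__init__()
            self.rnn = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
            self.proj = nn.Linear(hidden_dim, feature_dim)

        def forward(self, h: torch.Tensor) -> torch.Tensor:
            out, _ = self.rnn(h)
            return self.proj(out)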
---
.../TimeLOB_TimeGAN_49088276/dataset.py | 15 +++++++++---
.../TimeLOB_TimeGAN_49088276/modules.py | 24 +++++++++++++++++++
.../TimeLOB_TimeGAN_49088276/predict.py | 17 +++++++++++++
3 files changed, 53 insertions(+), 3 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/dataset.py
index 099190790..f376f946b 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/dataset.py
@@ -1,7 +1,16 @@
"""
-A module describing how we handle the dataset.
+Load and preprocess LOBSTER level-10 order book data for TimeGAN.
-Created By:
+This module provides a PyTorch Dataset and DataLoader factory that align,
+window, and scale limit order book features (e.g., top-10 bid/ask prices and
+sizes) into fixed-length sequences. Splits should be time-based to avoid
+leakage. Tensors are returned with shape ``(seq_len, feature_dim)``.
+
+Exports:
+ - LOBSTERDataset
+ - make_dataloader
+
+Created By: Radhesh Goel (Keys-I)
ID: s49088276
References:
@@ -16,4 +25,4 @@ def test_dataset_exists():
print(f"Files in '{data_dir}': {files}")
if __name__ == "__main__":
- test_dataset_exists()
\ No newline at end of file
+ test_dataset_exists()
diff --git a/recognition/TimeLOB_TimeGAN_49088276/modules.py b/recognition/TimeLOB_TimeGAN_49088276/modules.py
index e69de29bb..be69760f3 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/modules.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/modules.py
@@ -0,0 +1,24 @@
+"""
+Define the core TimeGAN components for limit order book sequences.
+
+This module declares the building blocks of the TimeGAN adapted to LOBSTER
+level-10 order book data (e.g., AMZN). It typically includes the Embedder,
+Recovery, Generator, Supervisor, and Discriminator, and a TimeGAN wrapper that
+wires them together. Inputs are sequences shaped
+``(batch_size, seq_len, feature_dim)`` and outputs mirror that shape.
+
+Exports:
+ - Embedder
+ - Recovery
+ - Generator
+ - Supervisor
+ - Discriminator
+ - TimeGAN
+
+Created By: Radhesh Goel (Keys-I)
+ID: s49088276
+
+References:
+-
+"""
+# TODO: Implement model classes and a TimeGAN wrapper here; keep public APIs compliant with PEP 8 and other best practices.
\ No newline at end of file
diff --git a/recognition/TimeLOB_TimeGAN_49088276/predict.py b/recognition/TimeLOB_TimeGAN_49088276/predict.py
index e69de29bb..3bdc4077d 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/predict.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/predict.py
@@ -0,0 +1,17 @@
+"""
+Sample synthetic sequences using a trained TimeGAN model and visualise results.
+
+This module loads a saved checkpoint, generates synthetic limit order book
+windows, prints summary statistics, and produces basic visualisations
+(e.g., feature lines and depth heatmaps) to compare real vs. synthetic data.
+
+Typical Usage:
+    python3 -m predict --data_dir <data_dir> --seq_len 100 --batch_size 64
+
+Created By: Radhesh Goel (Keys-I)
+ID: s49088276
+
+References:
+-
+"""
+# TODO: Implement checkpoint load, sampling, basic stats, and visualisations.
\ No newline at end of file
From 029462611bba0648af7e8c671418d74c7397a4ec Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Thu, 2 Oct 2025 19:45:39 +1000
Subject: [PATCH 03/74] build(env,src): add conda environment.yml (py3.13) and
adopt src/ layout
Add environment.yml pinned to python=3.13.* (conda-forge, strict priority) with numpy>=2,<3, pandas>=2.2, scipy>=1.13, scikit-learn>=1.5, matplotlib>=3.9, jupyterlab, ipykernel. Refactor code into src/ (add __init__.py), update script imports to use the package, and rename any lib-shadowing files (e.g., matplotlib.py).
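
For the feature analysis, scripts/analyze_features.py (added below) scores each engineered feature with a greedy mRMR-style rule: combined relevance 0.6*MI(next-step mid_log_return) + 0.4*MI(spread), penalised by the average absolute Spearman correlation with already-selected features. A toy worked example of one selection step; the MI and correlation values are invented purely for illustration:

    # Toy numbers only; mirrors the scoring rule in greedy_select_5().
    lambda_red = 0.5
    mi_combo = {"queue_imbalance_l1": 0.80, "depth_imbalance_l10": 0.75}  # normalised relevance
    avg_corr = {"queue_imbalance_l1": 0.10, "depth_imbalance_l10": 0.60}  # redundancy vs. selected
    scores = {f: mi_combo[f] - lambda_red * avg_corr[f] for f in mi_combo}
    print(scores)  # {'queue_imbalance_l1': 0.75, 'depth_imbalance_l10': 0.45} -> keep queue_imbalance_l1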
---
.../TimeLOB_TimeGAN_49088276/dataset.py | 28 --
.../TimeLOB_TimeGAN_49088276/environment.yml | 14 +
.../scripts/analyze_features.py | 408 ++++++++++++++++++
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 52 +++
.../{ => src}/modules.py | 0
.../{ => src}/predict.py | 0
.../{ => src}/train.py | 0
7 files changed, 474 insertions(+), 28 deletions(-)
delete mode 100644 recognition/TimeLOB_TimeGAN_49088276/dataset.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/environment.yml
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
rename recognition/TimeLOB_TimeGAN_49088276/{ => src}/modules.py (100%)
rename recognition/TimeLOB_TimeGAN_49088276/{ => src}/predict.py (100%)
rename recognition/TimeLOB_TimeGAN_49088276/{ => src}/train.py (100%)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/dataset.py
deleted file mode 100644
index f376f946b..000000000
--- a/recognition/TimeLOB_TimeGAN_49088276/dataset.py
+++ /dev/null
@@ -1,28 +0,0 @@
-"""
-Load and preprocess LOBSTER level-10 order book data for TimeGAN.
-
-This module provides a PyTorch Dataset and DataLoader factory that align,
-window, and scale limit order book features (e.g., top-10 bid/ask prices and
-sizes) into fixed-length sequences. Splits should be time-based to avoid
-leakage. Tensors are returned with shape ``(seq_len, feature_dim)``.
-
-Exports:
- - LOBSTERDataset
- - make_dataloader
-
-Created By: Radhesh Goel (Keys-I)
-ID: s49088276
-
-References:
--
-"""
-
-import os
-
-def test_dataset_exists():
- data_dir = "data"
- files = os.listdir(data_dir)
- print(f"Files in '{data_dir}': {files}")
-
-if __name__ == "__main__":
- test_dataset_exists()
diff --git a/recognition/TimeLOB_TimeGAN_49088276/environment.yml b/recognition/TimeLOB_TimeGAN_49088276/environment.yml
new file mode 100644
index 000000000..b085e0eb3
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/environment.yml
@@ -0,0 +1,14 @@
+name: proj-env
+channels:
+ - conda-forge
+dependencies:
+ - python=3.13
+ - pip
+ - numpy
+ - pandas
+ - scipy
+ - scikit-learn
+ - matplotlib
+ - jupyterlab
+ - ipykernel
+ - pip:
\ No newline at end of file
diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py b/recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py
new file mode 100644
index 000000000..3b8838ef6
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py
@@ -0,0 +1,408 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Analyze engineered LOBSTER features and justify a 5-feature subset.
+
+This script loads paired LOBSTER message/order book CSVs (Level 10), computes the 10 engineered
+features below, and generates quantitative evidence to support selecting a compact 5-feature set
+for TimeGAN training and evaluation on AMZN Level-10 data.
+
+Engineered features (10):
+ 1) mid_price = 0.5 * (ask_price_1 + bid_price_1)
+ 2) spread = ask_price_1 - bid_price_1
+ 3) rel_spread = spread / mid_price
+ 4) mid_log_return = log(mid_price_t) - log(mid_price_{t-1})
+ 5) queue_imbalance_l1 = (bid_size_1 - ask_size_1) / (bid_size_1 + ask_size_1 + eps)
+ 6) depth_imbalance_l5 = (Σ_i≤5 bid_size_i - Σ_i≤5 ask_size_i) /
+ (Σ_i≤5 bid_size_i + Σ_i≤5 ask_size_i + eps)
+ 7) depth_imbalance_l10 = (Σ_i≤10 bid_size_i - Σ_i≤10 ask_size_i) /
+ (Σ_i≤10 bid_size_i + Σ_i≤10 ask_size_i + eps)
+ 8) cum_depth_bid_10 = Σ_i≤10 bid_size_i
+ 9) cum_depth_ask_10 = Σ_i≤10 ask_size_i
+ 10) time_delta = time_t - time_{t-1} (seconds)
+
+Evidence produced:
+ • Relevance: mutual information (MI) with next-step mid_log_return (predictive dynamics) and
+ with current spread (matches your report metrics).
+ • Redundancy: Spearman correlation matrix + greedy mRMR-style selection.
+ • Coverage: PCA explained variance + feature loading contributions (top 3 PCs).
+ • Summary: Markdown report with the final top-5 and numeric justifications.
+
+Usage:
+ python analyze_features.py \
+ --message AMZN_2012-06-21_34200000_57600000_message_10.csv \
+ --orderbook AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \
+ --outdir results_amzn_lvl10
+
+Notes:
+ • LOBSTER quotes prices as ticks (price * 10_000). This script converts to dollars.
+ • Outputs include PNG plots, CSV/JSON metrics, and a summary.md rationale.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from scipy.stats import spearmanr
+from sklearn.decomposition import PCA
+from sklearn.feature_selection import mutual_info_regression
+from sklearn.preprocessing import StandardScaler
+
+
+EPS = 1e-9
+TICK_SCALE = 10_000.0 # LOBSTER price ticks: quoted as price * 10_000
+
+
+@dataclass
+class AnalysisOutputs:
+ mi_next_return: Dict[str, float]
+ mi_spread: Dict[str, float]
+ corr_matrix: pd.DataFrame
+ pca_var_ratio: np.ndarray
+ pca_loadings: pd.DataFrame
+ selected5: List[str]
+ reasons: Dict[str, Dict[str, float]]
+
+
+def _make_orderbook_columns(levels: int = 10) -> List[str]:
+ cols = []
+ for i in range(1, levels + 1):
+ cols.append(f"ask_price_{i}")
+ cols.append(f"ask_size_{i}")
+ for i in range(1, levels + 1):
+ cols.append(f"bid_price_{i}")
+ cols.append(f"bid_size_{i}")
+ return cols # 40 columns
+
+
+def load_lobster(orderbook_csv: str, message_csv: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
+ # order book: 40 columns, no header
+ ob_cols = _make_orderbook_columns(10)
+ ob = pd.read_csv(orderbook_csv, header=None, names=ob_cols)
+
+ # message: 6 columns, no header per LOBSTER docs
+ msg_cols = ["time", "event_type", "order_id", "size", "price", "direction"]
+ msg = pd.read_csv(message_csv, header=None, names=msg_cols)
+
+ n = min(len(ob), len(msg))
+ if len(ob) != len(msg):
+ print(f"[warn] Row mismatch (orderbook={len(ob)}, message={len(msg)}). Truncating to {n}.")
+ ob = ob.iloc[:n].reset_index(drop=True)
+ msg = msg.iloc[:n].reset_index(drop=True)
+
+ return ob, msg
+
+
+def compute_features(ob: pd.DataFrame, msg: pd.DataFrame) -> pd.DataFrame:
+ # Convert price ticks to dollars
+ ask1 = ob["ask_price_1"] / TICK_SCALE
+ bid1 = ob["bid_price_1"] / TICK_SCALE
+
+ mid_price = 0.5 * (ask1 + bid1)
+ spread = (ask1 - bid1) # already in dollars
+ rel_spread = spread / (mid_price + EPS)
+ mid_log_return = np.log(mid_price + EPS).diff().fillna(0.0)
+
+ ask_sizes = [f"ask_size_{i}" for i in range(1, 11)]
+ bid_sizes = [f"bid_size_{i}" for i in range(1, 11)]
+
+ queue_imbalance_l1 = (
+ (ob["bid_size_1"] - ob["ask_size_1"]) / (ob["bid_size_1"] + ob["ask_size_1"] + EPS)
+ )
+
+ cum_bid_5 = ob[[f"bid_size_{i}" for i in range(1, 6)]].sum(axis=1)
+ cum_ask_5 = ob[[f"ask_size_{i}" for i in range(1, 6)]].sum(axis=1)
+ depth_imbalance_l5 = (cum_bid_5 - cum_ask_5) / (cum_bid_5 + cum_ask_5 + EPS)
+
+ cum_bid_10 = ob[bid_sizes].sum(axis=1)
+ cum_ask_10 = ob[ask_sizes].sum(axis=1)
+ depth_imbalance_l10 = (cum_bid_10 - cum_ask_10) / (cum_bid_10 + cum_ask_10 + EPS)
+
+ cum_depth_bid_10 = cum_bid_10
+ cum_depth_ask_10 = cum_ask_10
+
+ time_delta = msg["time"].diff().fillna(0.0)
+
+ feats = pd.DataFrame(
+ {
+ "mid_price": mid_price,
+ "spread": spread,
+ "rel_spread": rel_spread,
+ "mid_log_return": mid_log_return,
+ "queue_imbalance_l1": queue_imbalance_l1,
+ "depth_imbalance_l5": depth_imbalance_l5,
+ "depth_imbalance_l10": depth_imbalance_l10,
+ "cum_depth_bid_10": cum_depth_bid_10,
+ "cum_depth_ask_10": cum_depth_ask_10,
+ "time_delta": time_delta,
+ }
+ )
+
+ # Align for next-step relationships; drop the last row to form y_{t+1}
+ feats = feats.dropna().reset_index(drop=True)
+ return feats
+
+
+def compute_mi_scores(feats: pd.DataFrame) -> Tuple[Dict[str, float], Dict[str, float]]:
+ # Targets: next-step mid_log_return (shift -1) and current spread
+ y_next_ret = feats["mid_log_return"].shift(-1).iloc[:-1].values
+ y_spread = feats["spread"].iloc[:-1].values
+ X = feats.iloc[:-1].values
+ names = feats.columns.tolist()
+
+ # Standardize features for MI numeric stability (MI itself is scale-free but helps neighbors)
+ X_std = StandardScaler(with_mean=True, with_std=True).fit_transform(X)
+
+ mi_next = mutual_info_regression(X_std, y_next_ret, random_state=0)
+ mi_spr = mutual_info_regression(X_std, y_spread, random_state=0)
+
+ mi_next_dict = {n: float(v) for n, v in zip(names, mi_next)}
+ mi_spr_dict = {n: float(v) for n, v in zip(names, mi_spr)}
+ return mi_next_dict, mi_spr_dict
+
+
+def compute_correlations(feats: pd.DataFrame) -> pd.DataFrame:
+ corr, _ = spearmanr(feats.values, axis=0)
+ corr_df = pd.DataFrame(corr, index=feats.columns, columns=feats.columns)
+ return corr_df
+
+
+def compute_pca(feats: pd.DataFrame, n_components: int = 5) -> Tuple[np.ndarray, pd.DataFrame]:
+ X_std = StandardScaler().fit_transform(feats.values)
+ pca = PCA(n_components=n_components, random_state=0)
+ X_pca = pca.fit_transform(X_std)
+ var_ratio = pca.explained_variance_ratio_
+ loadings = pd.DataFrame(
+ pca.components_.T, index=feats.columns, columns=[f"PC{i+1}" for i in range(n_components)]
+ )
+ return var_ratio, loadings
+
+
+def greedy_select_5(
+ mi_next: Dict[str, float],
+ mi_spr: Dict[str, float],
+ corr: pd.DataFrame,
+ must_include: List[str] | None = None,
+ lambda_red: float = 0.5,
+) -> Tuple[List[str], Dict[str, Dict[str, float]]]:
+ """
+ Greedy mRMR-like selection:
+ score = 0.6 * MI(next_ret) + 0.4 * MI(spread) - λ * avg_abs_corr_with_selected
+ Always include 'must_include' first (mid_price, spread) to align with report metrics.
+ """
+ if must_include is None:
+ must_include = ["mid_price", "spread"]
+
+ # Normalize MI to [0, 1] per target for fair combination
+ all_feats = list(mi_next.keys())
+ mi_next_arr = np.array([mi_next[f] for f in all_feats])
+ mi_spr_arr = np.array([mi_spr[f] for f in all_feats])
+ mi_next_norm = (mi_next_arr - mi_next_arr.min()) / (np.ptp(mi_next_arr) + EPS)
+ mi_spr_norm = (mi_spr_arr - mi_spr_arr.min()) / (np.ptp(mi_spr_arr) + EPS)
+ mi_combo = 0.6 * mi_next_norm + 0.4 * mi_spr_norm
+ mi_combo_dict = {f: float(v) for f, v in zip(all_feats, mi_combo)}
+
+ selected: List[str] = []
+ reasons: Dict[str, Dict[str, float]] = {}
+
+ for m in must_include:
+ selected.append(m)
+ reasons[m] = {
+ "mi_next_norm": mi_combo_dict[m], # combined normalized MI
+ "mi_spread_raw": mi_spr[m],
+ "mi_next_raw": mi_next[m],
+ "avg_redundancy": 0.0,
+ }
+
+ candidates = [f for f in all_feats if f not in selected]
+ while len(selected) < 5 and candidates:
+ best_feat = None
+ best_score = -np.inf
+ best_red = None
+ for f in candidates:
+ # Redundancy: average absolute Spearman corr with already selected
+ red = float(np.mean(np.abs(corr.loc[f, selected].values)))
+ score = mi_combo_dict[f] - lambda_red * red
+ if score > best_score:
+ best_score = score
+ best_feat = f
+ best_red = red
+ assert best_feat is not None
+ selected.append(best_feat)
+ reasons[best_feat] = {
+ "mi_next_norm": mi_combo_dict[best_feat],
+ "mi_spread_raw": mi_spr[best_feat],
+ "mi_next_raw": mi_next[best_feat],
+ "avg_redundancy": float(best_red),
+ }
+ candidates.remove(best_feat)
+
+ return selected, reasons
+
+
+def plot_bar(values: Dict[str, float], title: str, ylabel: str, outpath: str) -> None:
+ names = list(values.keys())
+ vals = list(values.values())
+ plt.figure(figsize=(10, 4))
+ plt.bar(range(len(names)), vals)
+ plt.xticks(range(len(names)), names, rotation=45, ha="right")
+ plt.ylabel(ylabel)
+ plt.title(title)
+ plt.tight_layout()
+ plt.savefig(outpath, dpi=160)
+ plt.close()
+
+
+def plot_corr_heatmap(corr: pd.DataFrame, title: str, outpath: str) -> None:
+ plt.figure(figsize=(7.5, 6.5))
+ im = plt.imshow(corr.values, vmin=-1, vmax=1, interpolation="nearest", aspect="auto")
+ plt.colorbar(im, fraction=0.035, pad=0.04)
+ plt.xticks(range(len(corr)), corr.columns, rotation=45, ha="right")
+ plt.yticks(range(len(corr)), corr.index)
+ plt.title(title)
+ plt.tight_layout()
+ plt.savefig(outpath, dpi=160)
+ plt.close()
+
+
+def plot_pca(var_ratio: np.ndarray, loadings: pd.DataFrame, outdir: str) -> None:
+ plt.figure(figsize=(6, 4))
+ plt.bar(range(1, len(var_ratio) + 1), var_ratio)
+ plt.xlabel("Principal component")
+ plt.ylabel("Explained variance ratio")
+ plt.title("PCA explained variance ratio (standardized features)")
+ plt.tight_layout()
+ plt.savefig(os.path.join(outdir, "pca_explained_variance.png"), dpi=160)
+ plt.close()
+
+ # Sum absolute loadings across top 3 PCs as a proxy of contribution
+ topk = min(3, loadings.shape[1])
+ contrib = loadings.iloc[:, :topk].abs().sum(axis=1)
+ contrib = contrib.sort_values(ascending=False)
+ plt.figure(figsize=(8, 4))
+ plt.bar(range(len(contrib)), contrib.values)
+ plt.xticks(range(len(contrib)), contrib.index, rotation=45, ha="right")
+ plt.ylabel("Σ|loading| over top 3 PCs")
+ plt.title("PCA loading contributions (top 3 PCs)")
+ plt.tight_layout()
+ plt.savefig(os.path.join(outdir, "pca_loading_contributions.png"), dpi=160)
+ plt.close()
+
+ contrib.to_csv(os.path.join(outdir, "pca_loading_contributions.csv"))
+
+
+def write_summary(
+ out: AnalysisOutputs,
+ outdir: str,
+ fixed_keep: List[str] | None = None,
+) -> None:
+ if fixed_keep is None:
+ fixed_keep = ["mid_price", "spread"]
+
+ md = []
+ md.append("# Feature analysis summary\n")
+ md.append("**Final selected 5 features:** " + ", ".join(out.selected5) + "\n")
+ md.append("We pin *mid_price* and *spread* as must-haves because your report metrics directly use "
+ "the mid-price return distribution and the spread; the remaining three are chosen by "
+ "a greedy mRMR-style criterion that balances relevance (MI) and redundancy.\n")
+
+ md.append("## Mutual information (relevance)\n")
+ md.append("- We compute MI with **next-step mid_log_return** (predictive dynamics) and with the "
+ "**current spread** (distributional target). Higher is better.\n")
+ md.append("\n**Top MI (next-step return)**\n\n")
+ top_mi_next = sorted(out.mi_next_return.items(), key=lambda x: x[1], reverse=True)
+ md.extend([f"- {k}: {v:.4f}" for k, v in top_mi_next[:5]])
+ md.append("\n**Top MI (spread)**\n\n")
+ top_mi_spr = sorted(out.mi_spread.items(), key=lambda x: x[1], reverse=True)
+ md.extend([f"- {k}: {v:.4f}" for k, v in top_mi_spr[:5]])
+ md.append("\n")
+
+ md.append("## Redundancy (Spearman correlation)\n")
+ md.append("The heatmap (corr_heatmap.png) shows strong collinearity between "
+ "`depth_imbalance_l5` and `depth_imbalance_l10`, and between "
+ "`cum_depth_bid_10` and `cum_depth_ask_10`. We keep only one of each redundant "
+ "family to avoid duplication.\n")
+
+ md.append("## PCA coverage\n")
+ md.append("PCA plots indicate how much variance is captured and which features contribute most "
+ "to the top components (pca_explained_variance.png, pca_loading_contributions.png).\n")
+
+ md.append("## Why these 5?\n")
+ for f in out.selected5:
+ r = out.reasons[f]
+ pinned = " (pinned)" if f in fixed_keep else ""
+ md.append(
+ f"- **{f}**{pinned}: MI(next)≈{r['mi_next_raw']:.4f}, "
+ f"MI(spread)≈{r['mi_spread_raw']:.4f}, avg redundancy≈{r['avg_redundancy']:.3f}.\n"
+ " Contributes strongly while staying non-redundant with the rest."
+ )
+
+ with open(os.path.join(outdir, "summary.md"), "w", encoding="utf-8") as f:
+ f.write("\n".join(md))
+
+
+def run_analysis(orderbook_csv: str, message_csv: str, outdir: str) -> AnalysisOutputs:
+ os.makedirs(outdir, exist_ok=True)
+
+ ob, msg = load_lobster(orderbook_csv, message_csv)
+ feats = compute_features(ob, msg)
+ feats.to_csv(os.path.join(outdir, "engineered_features.csv"), index=False)
+
+ mi_next, mi_spr = compute_mi_scores(feats)
+ corr = compute_correlations(feats)
+ var_ratio, loadings = compute_pca(feats, n_components=5)
+
+ # Plots/tables
+ plot_bar(mi_next, "MI with next-step mid_log_return", "MI", os.path.join(outdir, "mi_next.png"))
+ plot_bar(mi_spr, "MI with current spread", "MI", os.path.join(outdir, "mi_spread.png"))
+ plot_corr_heatmap(corr, "Spearman correlation (10 engineered features)",
+ os.path.join(outdir, "corr_heatmap.png"))
+ pd.DataFrame({"feature": list(mi_next.keys()),
+ "mi_next": list(mi_next.values()),
+ "mi_spread": [mi_spr[k] for k in mi_next.keys()],
+ }).to_csv(os.path.join(outdir, "mi_scores.csv"), index=False)
+ loadings.to_csv(os.path.join(outdir, "pca_loadings.csv"))
+ plot_pca(var_ratio, loadings, outdir)
+
+ # Greedy selection with mid_price, spread as must-keep
+ selected5, reasons = greedy_select_5(mi_next, mi_spr, corr, must_include=["mid_price", "spread"])
+ with open(os.path.join(outdir, "selected_features.json"), "w", encoding="utf-8") as f:
+ json.dump({"selected5": selected5, "reasons": reasons}, f, indent=2)
+
+ out = AnalysisOutputs(
+ mi_next_return=mi_next,
+ mi_spread=mi_spr,
+ corr_matrix=corr,
+ pca_var_ratio=var_ratio,
+ pca_loadings=loadings,
+ selected5=selected5,
+ reasons=reasons,
+ )
+
+ write_summary(out, outdir)
+ return out
+
+
+def parse_args() -> argparse.Namespace:
+ ap = argparse.ArgumentParser(description="Analyze LOBSTER features and justify a 5-feature set.")
+ ap.add_argument("--orderbook", required=True, help="Path to orderbook_10.csv")
+ ap.add_argument("--message", required=True, help="Path to message_10.csv")
+ ap.add_argument("--outdir", required=True, help="Output directory for plots and tables")
+ return ap.parse_args()
+
+
+def main() -> None:
+ args = parse_args()
+ run_analysis(orderbook_csv=args.orderbook, message_csv=args.message, outdir=args.outdir)
+ print(f"[done] Analysis complete. Results in: {args.outdir}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
new file mode 100644
index 000000000..22d0e1a3a
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -0,0 +1,52 @@
+"""
+Preprocesses LOBSTER Limit Order Book (Level 10) data for TimeGAN training.
+
+Loads paired LOBSTER message/order book CSVs, aligns by event index, windows into fixed-length
+sequences, and scales features. Splits are chronological to avoid leakage. Samples are returned as
+``(seq_len, num_features)``.
+
+Inputs:
+- ``message_10.csv`` and ``orderbook_10.csv`` for the same day (aligned rows; AMZN Level-10).
+
+Outputs:
+- NumPy arrays ``(train, val, test)`` with shape ``[num_seq, seq_len, num_features]``.
+
+Features:
+- Default ``feature_set="core"`` (5 engineered features):
+ 1) ``mid_price`` = 0.5 * (ask_price_1 + bid_price_1)
+ 2) ``spread`` = ask_price_1 - bid_price_1
+ 3) ``mid_log_return`` = log(mid_price_t) - log(mid_price_{t-1})
+ 4) ``queue_imbalance_l1`` = (bid_size_1 - ask_size_1) / (bid_size_1 + ask_size_1 + eps)
+ 5) ``depth_imbalance_l10`` = (Σ_i≤10 bid_size_i - Σ_i≤10 ask_size_i)
+ / (Σ_i≤10 bid_size_i + Σ_i≤10 ask_size_i + eps)
+
+- Alternative ``feature_set="raw10"`` (40 raw LOB columns):
+ ask_price_1..10, ask_size_1..10, bid_price_1..10, bid_size_1..10.
+
+Evaluation (for the accompanying report):
+- Distribution similarity: KL divergence ≤ 0.1 between generated vs. real spread and mid-price
+ return distributions on a held-out test split.
+- Visual similarity: SSIM > 0.6 between heatmaps of generated vs. real LOB depth snapshots.
+- Also include: model architecture and parameter count, training strategy (full TimeGAN vs.
+ adversarial-only or supervised-only variants), GPU type, VRAM, epochs, and total training time.
+ Provide 3–5 representative heatmaps with a short error analysis.
+
+Exports:
+- ``LOBSTERDataset`` — PyTorch Dataset yielding windowed sequences.
+- ``make_dataloader`` — Convenience factory for a configured DataLoader.
+
+Created by: Radhesh Goel (Keys-I) | ID: s49088276
+"""
+
+
+import os
+
+
+def test_dataset_exists():
+ data_dir = "data"
+ files = os.listdir(data_dir)
+ print(f"Files in '{data_dir}': {files}")
+
+
+if __name__ == "__main__":
+ test_dataset_exists()
diff --git a/recognition/TimeLOB_TimeGAN_49088276/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
similarity index 100%
rename from recognition/TimeLOB_TimeGAN_49088276/modules.py
rename to recognition/TimeLOB_TimeGAN_49088276/src/modules.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/predict.py b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
similarity index 100%
rename from recognition/TimeLOB_TimeGAN_49088276/predict.py
rename to recognition/TimeLOB_TimeGAN_49088276/src/predict.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/train.py b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
similarity index 100%
rename from recognition/TimeLOB_TimeGAN_49088276/train.py
rename to recognition/TimeLOB_TimeGAN_49088276/src/train.py
From a815817c98d142709d2c911b7b40e0944189a09e Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Thu, 2 Oct 2025 20:54:45 +1000
Subject: [PATCH 04/74] style(script): fix formatting issues
---
.../scripts/analyze_features.py | 70 ++++++++++++-------
1 file changed, 43 insertions(+), 27 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py b/recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py
index 3b8838ef6..ea487ed54 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py
@@ -55,7 +55,6 @@
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
-
EPS = 1e-9
TICK_SCALE = 10_000.0 # LOBSTER price ticks: quoted as price * 10_000
@@ -93,7 +92,8 @@ def load_lobster(orderbook_csv: str, message_csv: str) -> Tuple[pd.DataFrame, pd
n = min(len(ob), len(msg))
if len(ob) != len(msg):
- print(f"[warn] Row mismatch (orderbook={len(ob)}, message={len(msg)}). Truncating to {n}.")
+ print(
+ f"[warn] Row mismatch (orderbook={len(ob)}, message={len(msg)}). Truncating to {n}.")
ob = ob.iloc[:n].reset_index(drop=True)
msg = msg.iloc[:n].reset_index(drop=True)
@@ -114,16 +114,19 @@ def compute_features(ob: pd.DataFrame, msg: pd.DataFrame) -> pd.DataFrame:
bid_sizes = [f"bid_size_{i}" for i in range(1, 11)]
queue_imbalance_l1 = (
- (ob["bid_size_1"] - ob["ask_size_1"]) / (ob["bid_size_1"] + ob["ask_size_1"] + EPS)
+ (ob["bid_size_1"] - ob["ask_size_1"]) /
+ (ob["bid_size_1"] + ob["ask_size_1"] + EPS)
)
cum_bid_5 = ob[[f"bid_size_{i}" for i in range(1, 6)]].sum(axis=1)
cum_ask_5 = ob[[f"ask_size_{i}" for i in range(1, 6)]].sum(axis=1)
- depth_imbalance_l5 = (cum_bid_5 - cum_ask_5) / (cum_bid_5 + cum_ask_5 + EPS)
+ depth_imbalance_l5 = (cum_bid_5 - cum_ask_5) / \
+ (cum_bid_5 + cum_ask_5 + EPS)
cum_bid_10 = ob[bid_sizes].sum(axis=1)
cum_ask_10 = ob[ask_sizes].sum(axis=1)
- depth_imbalance_l10 = (cum_bid_10 - cum_ask_10) / (cum_bid_10 + cum_ask_10 + EPS)
+ depth_imbalance_l10 = (cum_bid_10 - cum_ask_10) / \
+ (cum_bid_10 + cum_ask_10 + EPS)
cum_depth_bid_10 = cum_bid_10
cum_depth_ask_10 = cum_ask_10
@@ -180,17 +183,18 @@ def compute_pca(feats: pd.DataFrame, n_components: int = 5) -> Tuple[np.ndarray,
X_pca = pca.fit_transform(X_std)
var_ratio = pca.explained_variance_ratio_
loadings = pd.DataFrame(
- pca.components_.T, index=feats.columns, columns=[f"PC{i+1}" for i in range(n_components)]
+ pca.components_.T, index=feats.columns, columns=[
+ f"PC{i + 1}" for i in range(n_components)]
)
return var_ratio, loadings
def greedy_select_5(
- mi_next: Dict[str, float],
- mi_spr: Dict[str, float],
- corr: pd.DataFrame,
- must_include: List[str] | None = None,
- lambda_red: float = 0.5,
+ mi_next: Dict[str, float],
+ mi_spr: Dict[str, float],
+ corr: pd.DataFrame,
+ must_include: List[str] | None = None,
+ lambda_red: float = 0.5,
) -> Tuple[List[str], Dict[str, Dict[str, float]]]:
"""
Greedy mRMR-like selection:
@@ -204,7 +208,8 @@ def greedy_select_5(
all_feats = list(mi_next.keys())
mi_next_arr = np.array([mi_next[f] for f in all_feats])
mi_spr_arr = np.array([mi_spr[f] for f in all_feats])
- mi_next_norm = (mi_next_arr - mi_next_arr.min()) / (np.ptp(mi_next_arr) + EPS)
+ mi_next_norm = (mi_next_arr - mi_next_arr.min()) / \
+ (np.ptp(mi_next_arr) + EPS)
mi_spr_norm = (mi_spr_arr - mi_spr_arr.min()) / (np.ptp(mi_spr_arr) + EPS)
mi_combo = 0.6 * mi_next_norm + 0.4 * mi_spr_norm
mi_combo_dict = {f: float(v) for f, v in zip(all_feats, mi_combo)}
@@ -262,7 +267,8 @@ def plot_bar(values: Dict[str, float], title: str, ylabel: str, outpath: str) ->
def plot_corr_heatmap(corr: pd.DataFrame, title: str, outpath: str) -> None:
plt.figure(figsize=(7.5, 6.5))
- im = plt.imshow(corr.values, vmin=-1, vmax=1, interpolation="nearest", aspect="auto")
+ im = plt.imshow(corr.values, vmin=-1, vmax=1,
+ interpolation="nearest", aspect="auto")
plt.colorbar(im, fraction=0.035, pad=0.04)
plt.xticks(range(len(corr)), corr.columns, rotation=45, ha="right")
plt.yticks(range(len(corr)), corr.index)
@@ -299,16 +305,17 @@ def plot_pca(var_ratio: np.ndarray, loadings: pd.DataFrame, outdir: str) -> None
def write_summary(
- out: AnalysisOutputs,
- outdir: str,
- fixed_keep: List[str] | None = None,
+ out: AnalysisOutputs,
+ outdir: str,
+ fixed_keep: List[str] | None = None,
) -> None:
if fixed_keep is None:
fixed_keep = ["mid_price", "spread"]
md = []
md.append("# Feature analysis summary\n")
- md.append("**Final selected 5 features:** " + ", ".join(out.selected5) + "\n")
+ md.append("**Final selected 5 features:** " +
+ ", ".join(out.selected5) + "\n")
md.append("We pin *mid_price* and *spread* as must-haves because your report metrics directly use "
"the mid-price return distribution and the spread; the remaining three are chosen by "
"a greedy mRMR-style criterion that balances relevance (MI) and redundancy.\n")
@@ -317,10 +324,12 @@ def write_summary(
md.append("- We compute MI with **next-step mid_log_return** (predictive dynamics) and with the "
"**current spread** (distributional target). Higher is better.\n")
md.append("\n**Top MI (next-step return)**\n\n")
- top_mi_next = sorted(out.mi_next_return.items(), key=lambda x: x[1], reverse=True)
+ top_mi_next = sorted(out.mi_next_return.items(),
+ key=lambda x: x[1], reverse=True)
md.extend([f"- {k}: {v:.4f}" for k, v in top_mi_next[:5]])
md.append("\n**Top MI (spread)**\n\n")
- top_mi_spr = sorted(out.mi_spread.items(), key=lambda x: x[1], reverse=True)
+ top_mi_spr = sorted(out.mi_spread.items(),
+ key=lambda x: x[1], reverse=True)
md.extend([f"- {k}: {v:.4f}" for k, v in top_mi_spr[:5]])
md.append("\n")
@@ -360,19 +369,22 @@ def run_analysis(orderbook_csv: str, message_csv: str, outdir: str) -> AnalysisO
var_ratio, loadings = compute_pca(feats, n_components=5)
# Plots/tables
- plot_bar(mi_next, "MI with next-step mid_log_return", "MI", os.path.join(outdir, "mi_next.png"))
- plot_bar(mi_spr, "MI with current spread", "MI", os.path.join(outdir, "mi_spread.png"))
+ plot_bar(mi_next, "MI with next-step mid_log_return",
+ "MI", os.path.join(outdir, "mi_next.png"))
+ plot_bar(mi_spr, "MI with current spread", "MI",
+ os.path.join(outdir, "mi_spread.png"))
plot_corr_heatmap(corr, "Spearman correlation (10 engineered features)",
os.path.join(outdir, "corr_heatmap.png"))
pd.DataFrame({"feature": list(mi_next.keys()),
"mi_next": list(mi_next.values()),
"mi_spread": [mi_spr[k] for k in mi_next.keys()],
- }).to_csv(os.path.join(outdir, "mi_scores.csv"), index=False)
+ }).to_csv(os.path.join(outdir, "mi_scores.csv"), index=False)
loadings.to_csv(os.path.join(outdir, "pca_loadings.csv"))
plot_pca(var_ratio, loadings, outdir)
# Greedy selection with mid_price, spread as must-keep
- selected5, reasons = greedy_select_5(mi_next, mi_spr, corr, must_include=["mid_price", "spread"])
+ selected5, reasons = greedy_select_5(
+ mi_next, mi_spr, corr, must_include=["mid_price", "spread"])
with open(os.path.join(outdir, "selected_features.json"), "w", encoding="utf-8") as f:
json.dump({"selected5": selected5, "reasons": reasons}, f, indent=2)
@@ -391,16 +403,20 @@ def run_analysis(orderbook_csv: str, message_csv: str, outdir: str) -> AnalysisO
def parse_args() -> argparse.Namespace:
- ap = argparse.ArgumentParser(description="Analyze LOBSTER features and justify a 5-feature set.")
- ap.add_argument("--orderbook", required=True, help="Path to orderbook_10.csv")
+ ap = argparse.ArgumentParser(
+ description="Analyze LOBSTER features and justify a 5-feature set.")
+ ap.add_argument("--orderbook", required=True,
+ help="Path to orderbook_10.csv")
ap.add_argument("--message", required=True, help="Path to message_10.csv")
- ap.add_argument("--outdir", required=True, help="Output directory for plots and tables")
+ ap.add_argument("--outdir", required=True,
+ help="Output directory for plots and tables")
return ap.parse_args()
def main() -> None:
args = parse_args()
- run_analysis(orderbook_csv=args.orderbook, message_csv=args.message, outdir=args.outdir)
+ run_analysis(orderbook_csv=args.orderbook,
+ message_csv=args.message, outdir=args.outdir)
print(f"[done] Analysis complete. Results in: {args.outdir}")
From 14b75d1e79cf01efcb831ceb5b4223000057642f Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 3 Oct 2025 04:54:34 +1000
Subject: [PATCH 05/74] feat(data): add LOBSTERData with headerless support
Adds CLI smoke test, core/raw10 features, chronological split, train-only scaling, and windowing.
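
A usage sketch of the loader this commit introduces; the data directory and CSV names are placeholders, and the src.dataset import path assumes the src/ layout from the previous commit.

    # Illustrative only; no data files are shipped with the repo.
    from src.dataset import LOBSTERData

    loader = LOBSTERData(
        data_dir="data",
        message_file="message_10.csv",
        orderbook_file="orderbook_10.csv",
        feature_set="core",        # 5 engineered features
        seq_len=64,
        stride=16,
        scaler="standard",         # fitted on the train split only
        headerless_message=True,
        headerless_orderbook=True,
    )
    W_train, W_val, W_test = loader.load_arrays()
    print(W_train.shape)           # (num_windows, 64, 5)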
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 279 +++++++++++++++++-
1 file changed, 273 insertions(+), 6 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 22d0e1a3a..462b06fa1 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -37,16 +37,283 @@
Created by: Radhesh Goel (Keys-I) | ID: s49088276
"""
-
+from __future__ import annotations
import os
+import argparse
+from typing import Tuple, List, Literal, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+
+
+class LOBSTERData:
+ """
+ Minimal loader -> features -> windows -> splits for LOBSTER L10 data.
+ """
+ def __init__(
+ self,
+ data_dir: str,
+ message_file: str = "message_10.csv",
+ orderbook_file: str = "orderbook_10.csv",
+ feature_set: Literal["core", "raw10"] = "core",
+ seq_len: int = 64,
+ stride: Optional[int] = None,
+ splits: Tuple[float, float, float] = (0.7, 0.15, 0.15),
+ scaler: Literal["standard", "minmax", "none"] = "standard",
+ eps: float = 1e-8,
+ headerless_message: bool = False,
+ headerless_orderbook: bool = False,
+ ):
+ self.data_dir = data_dir
+ self.message_path = os.path.join(data_dir, message_file)
+ self.orderbook_path = os.path.join(data_dir, orderbook_file)
+ self.feature_set = feature_set
+ self.seq_len = int(seq_len)
+ self.stride = int(stride) if stride is not None else self.seq_len
+ self.splits = splits
+ self.scaler_kind = scaler
+ self.eps = eps
+ self.headerless_message = headerless_message
+ self.headerless_orderbook = headerless_orderbook
+
+ assert abs(sum(splits) - 1.0) < 1e-9, "splits must sum to 1.0"
+ assert self.seq_len > 0 and self.stride > 0, "seq_len and stride must be positive"
+
+ self._scaler = None # fitted on train only
+ self._feature_names: List[str] = []
+
+ # ------------------- public API -------------------
+
+ def load_arrays(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ """
+ Returns train, val, test arrays shaped (num_seq, seq_len, num_features).
+ """
+ msg_df, ob_df = self._load_csvs()
+ self._check_alignment(msg_df, ob_df)
+ feats = self._build_features(ob_df)
+
+ train, val, test = self._split_chronologically(feats)
+ train_s, val_s, test_s = self._scale_train_only(train, val, test)
+ W_train = self._windowize(train_s)
+ W_val = self._windowize(val_s)
+ W_test = self._windowize(test_s)
+ return W_train, W_val, W_test
+
+ def get_feature_names(self) -> List[str]:
+ return list(self._feature_names)
+
+ def get_scaler(self):
+ return self._scaler
+
+ # ------------------- internals --------------------
+
+ def _load_csvs(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
+ if not os.path.isfile(self.orderbook_path):
+ raise FileNotFoundError(f"Missing {self.orderbook_path}")
+ if not os.path.isfile(self.message_path):
+ raise FileNotFoundError(f"Missing {self.message_path}")
+
+ # Message (6 columns)
+ msg_cols = ["time", "type", "order_id", "size", "price", "direction"]
+ if self.headerless_message:
+ msg_df = pd.read_csv(self.message_path, header=None, names=msg_cols)
+ else:
+ msg_df = pd.read_csv(self.message_path)
+ msg_df.columns = [str(c).strip().lower().replace(" ", "_") for c in msg_df.columns]
+ if len(msg_df.columns) == 6 and set(msg_df.columns) != set(msg_cols):
+ msg_df.columns = msg_cols
+
+ # Orderbook (40 columns)
+ ob_cols = (
+ [f"ask_price_{i}" for i in range(1, 11)] +
+ [f"ask_size_{i}" for i in range(1, 11)] +
+ [f"bid_price_{i}" for i in range(1, 11)] +
+ [f"bid_size_{i}" for i in range(1, 11)]
+ )
+ if self.headerless_orderbook:
+ ob_df = pd.read_csv(self.orderbook_path, header=None, names=ob_cols)
+ else:
+ ob_df = pd.read_csv(self.orderbook_path)
+ ob_df = self._normalize_orderbook_headers(ob_df, ob_cols)
+
+ return msg_df, ob_df
+
+ def _normalize_orderbook_headers(self, df: pd.DataFrame, target_cols: List[str]) -> pd.DataFrame:
+ # Map common LOBSTER styles to snake_case:
+ # e.g., AskPrice1 -> ask_price_1, BidSize10 -> bid_size_10
+ new_cols = []
+ for c in df.columns:
+ s = str(c)
+ s = s.replace(" ", "").replace("-", "").replace(".", "")
+ s = s.replace("AskPrice", "ask_price_").replace("AskSize", "ask_size_") \
+ .replace("BidPrice", "bid_price_").replace("BidSize", "bid_size_")
+ s = s.lower()
+ s = s.replace("ask_price", "ask_price_").replace("ask_size", "ask_size_") \
+ .replace("bid_price", "bid_price_").replace("bid_size", "bid_size_")
+ s = s.replace("__", "_")
+ new_cols.append(s)
+ df.columns = new_cols
+
+ if set(df.columns) != set(target_cols) and len(df.columns) == len(target_cols):
+ df.columns = target_cols
+ return df
+
+ def _check_alignment(self, msg_df: pd.DataFrame, ob_df: pd.DataFrame) -> None:
+ if len(msg_df) != len(ob_df):
+ raise ValueError(f"Message/Orderbook row count mismatch: {len(msg_df)} vs {len(ob_df)}")
+ # LOBSTER rows are synchronized by event index; we trust row order.
+
+ def _build_features(self, ob_df: pd.DataFrame) -> np.ndarray:
+ # Ensure standard L10 columns exist
+ for prefix in ("ask_price_", "ask_size_", "bid_price_", "bid_size_"):
+ for L in range(1, 11):
+ col = f"{prefix}{L}"
+ if col not in ob_df.columns:
+ raise ValueError(f"Expected column missing: {col}")
+ if self.feature_set == "raw10":
+ cols = (
+ [f"ask_price_{i}" for i in range(1, 11)]
+ + [f"ask_size_{i}" for i in range(1, 11)]
+ + [f"bid_price_{i}" for i in range(1, 11)]
+ + [f"bid_size_{i}" for i in range(1, 11)]
+ )
+ X = ob_df[cols].to_numpy(dtype=np.float64)
+ self._feature_names = cols
+ return X
-def test_dataset_exists():
- data_dir = "data"
- files = os.listdir(data_dir)
- print(f"Files in '{data_dir}': {files}")
+ if self.feature_set == "core":
+ ap1 = ob_df["ask_price_1"].to_numpy(dtype=np.float64)
+ bp1 = ob_df["bid_price_1"].to_numpy(dtype=np.float64)
+ as1 = ob_df["ask_size_1"].to_numpy(dtype=np.float64)
+ bs1 = ob_df["bid_size_1"].to_numpy(dtype=np.float64)
+
+ # 1) mid_price
+ mid_price = 0.5 * (ap1 + bp1)
+
+ # 2) spread
+ spread = ap1 - bp1
+
+ # 3) mid_log_return
+ mid_log = np.log(np.clip(mid_price, 1e-12, None))
+ mid_log_return = np.concatenate([[0.0], np.diff(mid_log)])
+
+ # 4) queue_imbalance_l1
+ qi_l1 = (bs1 - as1) / (bs1 + as1 + self.eps)
+
+ # 5) depth_imbalance_l10
+ bid_depth = sum(ob_df[f"bid_size_{i}"].to_numpy(dtype=np.float64) for i in range(1, 11))
+ ask_depth = sum(ob_df[f"ask_size_{i}"].to_numpy(dtype=np.float64) for i in range(1, 11))
+ di_l10 = (bid_depth - ask_depth) / (bid_depth + ask_depth + self.eps)
+
+ X = np.vstack([mid_price, spread, mid_log_return, qi_l1, di_l10]).T
+ self._feature_names = [
+ "mid_price",
+ "spread",
+ "mid_log_return",
+ "queue_imbalance_l1",
+ "depth_imbalance_l10",
+ ]
+ return X
+
+ raise ValueError("feature_set must be 'core' or 'raw10'")
+
+ def _split_chronologically(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ n = len(X)
+ n_train = int(n * self.splits[0])
+ n_val = int(n * self.splits[1])
+ n_test = n - n_train - n_val
+ train = X[:n_train]
+ val = X[n_train : n_train + n_val]
+ test = X[n_train + n_val :]
+ return train, val, test
+
+ def _scale_train_only(
+ self, train: np.ndarray, val: np.ndarray, test: np.ndarray
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ if self.scaler_kind == "none":
+ return train, val, test
+
+ if self.scaler_kind == "standard":
+ scaler = StandardScaler()
+ elif self.scaler_kind == "minmax":
+ scaler = MinMaxScaler()
+ else:
+ raise ValueError("scaler must be 'standard', 'minmax', or 'none'")
+
+ scaler.fit(train)
+ self._scaler = scaler
+ return scaler.transform(train), scaler.transform(val), scaler.transform(test)
+
+ def _windowize(self, X: np.ndarray) -> np.ndarray:
+ """
+ Returns windows shaped (num_seq, seq_len, num_features).
+ """
+ n, d = X.shape
+ if n < self.seq_len:
+ return np.empty((0, self.seq_len, d), dtype=np.float64)
+
+ starts = np.arange(0, n - self.seq_len + 1, self.stride, dtype=int)
+ W = np.empty((len(starts), self.seq_len, d), dtype=np.float64)
+ for i, s in enumerate(starts):
+ W[i] = X[s : s + self.seq_len]
+ return W
+
+
+# -------------------------- CLI smoke test ------------------------------------
+
+def _basic_test_cli():
+ """
+ Run a smoke test ONLY when file names are provided by the user.
+
+ Example:
+ python lobster_data.py --data-dir data/AMZN/2014-01-02 \
+ --message AMZN_2014-01-02_34200000_57600000_message_10.csv \
+ --orderbook AMZN_2014-01-02_34200000_57600000_orderbook_10.csv \
+ --headerless-message --headerless-orderbook
+ """
+ parser = argparse.ArgumentParser(description="LOBSTERData smoke test (filenames required).")
+ parser.add_argument("--data-dir", default="data", help="Folder containing the CSVs")
+ parser.add_argument("--message", required=True, help="Message CSV file name (e.g., message_10.csv)")
+ parser.add_argument("--orderbook", required=True, help="Orderbook CSV file name (e.g., orderbook_10.csv)")
+ parser.add_argument("--feature-set", choices=["core", "raw10"], default="core")
+ parser.add_argument("--seq-len", type=int, default=64)
+ parser.add_argument("--stride", type=int, default=16)
+ parser.add_argument("--scaler", choices=["standard", "minmax", "none"], default="standard")
+ parser.add_argument("--headerless-message", action="store_true", help="Treat message CSV as headerless")
+ parser.add_argument("--headerless-orderbook", action="store_true", help="Treat orderbook CSV as headerless")
+ args = parser.parse_args()
+
+ data_dir = args.data_dir
+ print(f"Files in '{data_dir}': {sorted(os.listdir(data_dir)) if os.path.isdir(data_dir) else 'MISSING'}")
+
+ try:
+ loader = LOBSTERData(
+ data_dir=data_dir,
+ message_file=args.message,
+ orderbook_file=args.orderbook,
+ feature_set=args.feature_set,
+ seq_len=args.seq_len,
+ stride=args.stride,
+ splits=(0.7, 0.15, 0.15),
+ scaler=args.scaler,
+ headerless_message=args.headerless_message,
+ headerless_orderbook=args.headerless_orderbook,
+ )
+ W_train, W_val, W_test = loader.load_arrays()
+ print("Feature names:", loader.get_feature_names())
+ print("Train windows:", W_train.shape)
+ print("Val windows: ", W_val.shape)
+ print("Test windows: ", W_test.shape)
+ if W_train.size:
+ print("Example window[0] stats -> mean:", float(W_train[0].mean()),
+ "std:", float(W_train[0].std()))
+ except Exception as e:
+ print("Basic test error:", e)
if __name__ == "__main__":
- test_dataset_exists()
+ _basic_test_cli()
+
From 227d74e06d851bbbda32c53ce1f69deb6b98cc24 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 3 Oct 2025 06:29:19 +1000
Subject: [PATCH 06/74] feat(dataset): support headerless LOBSTER CSVs + CLI
smoke test
Add --headerless-message/--headerless-orderbook flags, robust header normalization, train-only scaling, NaN/inf filtering, dtype control, meta accessors, and optional NPZ export. Includes improved errors and windowing checks.
---
recognition/TimeLOB_TimeGAN_49088276/src/dataset.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 462b06fa1..7e8eca5ac 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -315,5 +315,4 @@ def _basic_test_cli():
if __name__ == "__main__":
- _basic_test_cli()
-
+ _basic_test_cli()
\ No newline at end of file
From d286307232c3fb46363980f87b260501f21ce4f3 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 3 Oct 2025 07:45:34 +1000
Subject: [PATCH 07/74] feat(dataset): add CSV summaries and CLI flags
Introduce summarize() and --summary/--peek to inspect message/orderbook tables. Keep headerless support with robust normalization; chronological splits; train-only scaling; NaN/inf cleaning; dtype control; NPZ export; inverse_transform; and metadata accessors.
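
A sketch of the new inspection helpers, assuming the same placeholder data directory as earlier commits:

    # Illustrative only; "data" is a placeholder directory.
    from src.dataset import LOBSTERData

    loader = LOBSTERData(data_dir="data",
                         headerless_message=True, headerless_orderbook=True)
    print(loader.summarize(peek=3))              # shapes, dtypes, NaN counts, head/tail per CSV

    W_train, W_val, W_test = loader.load_arrays()
    print(loader.get_meta()["row_counts"])       # rows per split after NaN/inf cleaning
    restored = loader.inverse_transform(W_test)  # back to unscaled feature units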
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 255 ++++++++++++------
1 file changed, 179 insertions(+), 76 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 7e8eca5ac..5e4d99330 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -1,57 +1,68 @@
"""
-Preprocesses LOBSTER Limit Order Book (Level 10) data for TimeGAN training.
-
-Loads paired LOBSTER message/order book CSVs, aligns by event index, windows into fixed-length
-sequences, and scales features. Splits are chronological to avoid leakage. Samples are returned as
-``(seq_len, num_features)``.
-
-Inputs:
-- ``message_10.csv`` and ``orderbook_10.csv`` for the same day (aligned rows; AMZN Level-10).
-
-Outputs:
-- NumPy arrays ``(train, val, test)`` with shape ``[num_seq, seq_len, num_features]``.
-
-Features:
-- Default ``feature_set="core"`` (5 engineered features):
- 1) ``mid_price`` = 0.5 * (ask_price_1 + bid_price_1)
- 2) ``spread`` = ask_price_1 - bid_price_1
- 3) ``mid_log_return`` = log(mid_price_t) - log(mid_price_{t-1})
- 4) ``queue_imbalance_l1`` = (bid_size_1 - ask_size_1) / (bid_size_1 + ask_size_1 + eps)
- 5) ``depth_imbalance_l10`` = (Σ_i≤10 bid_size_i - Σ_i≤10 ask_size_i)
- / (Σ_i≤10 bid_size_i + Σ_i≤10 ask_size_i + eps)
-
-- Alternative ``feature_set="raw10"`` (40 raw LOB columns):
- ask_price_1..10, ask_size_1..10, bid_price_1..10, bid_size_1..10.
-
-Evaluation (for the accompanying report):
-- Distribution similarity: KL divergence ≤ 0.1 between generated vs. real spread and mid-price
- return distributions on a held-out test split.
-- Visual similarity: SSIM > 0.6 between heatmaps of generated vs. real LOB depth snapshots.
-- Also include: model architecture and parameter count, training strategy (full TimeGAN vs.
- adversarial-only or supervised-only variants), GPU type, VRAM, epochs, and total training time.
- Provide 3–5 representative heatmaps with a short error analysis.
-
-Exports:
-- ``LOBSTERDataset`` — PyTorch Dataset yielding windowed sequences.
-- ``make_dataloader`` — Convenience factory for a configured DataLoader.
+LOBSTERData: load, featurize, window, split (TimeGAN-ready) + CSV summaries.
+
+- Works with headerless LOBSTER CSVs (message_10.csv, orderbook_10.csv).
+- Engineered 5-feature "core" set or raw level-10 (40 columns).
+- Chronological train/val/test split; scaler fit on train only.
+- Windows shape: (num_seq, seq_len, num_features).
+- Extras: NaN/inf cleaning, dtype control, meta, inverse_transform, NPZ export.
+- NEW: summarize() and --summary CLI to inspect both message & orderbook tables.
Created by: Radhesh Goel (Keys-I) | ID: s49088276
"""
+
from __future__ import annotations
import os
import argparse
-from typing import Tuple, List, Literal, Optional
+from typing import Tuple, List, Literal, Optional, Dict
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
+# ------------------------------ utilities ------------------------------------ #
+
+def _summarize_df(df: pd.DataFrame, name: str, peek: int = 5) -> str:
+ lines = []
+ lines.append(f"=== {name} ===")
+ lines.append(f"shape: {df.shape[0]} rows × {df.shape[1]} cols")
+ lines.append(f"columns: {list(df.columns)}")
+ dtypes = df.dtypes.astype(str).to_dict()
+ lines.append(f"dtypes: {dtypes}")
+ na_counts = df.isna().sum().to_dict()
+ lines.append(f"na_counts: {na_counts}")
+ # time range if a 'time' column exists
+ if "time" in df.columns:
+ try:
+ t = pd.to_datetime(df["time"], errors="coerce", unit=None)
+ lines.append(f"time: min={t.min()} max={t.max()}")
+ except Exception:
+ pass
+ # numeric quick stats
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+ if num_cols:
+ desc = df[num_cols].describe().to_dict()
+ # ensure json-like floats, not numpy types
+ desc = {k: {m: float(v) for m, v in stats.items()} for k, stats in desc.items()}
+ lines.append("numeric.describe():")
+ lines.append(str(desc))
+ # head/tail
+ lines.append("head:")
+ lines.append(df.head(peek).to_string(index=False))
+ lines.append("tail:")
+ lines.append(df.tail(peek).to_string(index=False))
+ return "\n".join(lines)
+
+
+# ------------------------------- core class ---------------------------------- #
+
class LOBSTERData:
"""
- Minimal loader -> features -> windows -> splits for LOBSTER L10 data.
+ Loader -> features -> windows -> splits for LOBSTER L10 data.
"""
+
def __init__(
self,
data_dir: str,
@@ -62,9 +73,12 @@ def __init__(
stride: Optional[int] = None,
splits: Tuple[float, float, float] = (0.7, 0.15, 0.15),
scaler: Literal["standard", "minmax", "none"] = "standard",
+ feature_range: Tuple[float, float] = (0.0, 1.0), # for minmax
eps: float = 1e-8,
headerless_message: bool = False,
headerless_orderbook: bool = False,
+ dropna: bool = True,
+ output_dtype: Literal["float32", "float64"] = "float32",
):
self.data_dir = data_dir
self.message_path = os.path.join(data_dir, message_file)
@@ -74,15 +88,20 @@ def __init__(
self.stride = int(stride) if stride is not None else self.seq_len
self.splits = splits
self.scaler_kind = scaler
+ self.feature_range = feature_range
self.eps = eps
self.headerless_message = headerless_message
self.headerless_orderbook = headerless_orderbook
+ self.dropna = dropna
+ self.output_dtype = np.float32 if output_dtype == "float32" else np.float64
- assert abs(sum(splits) - 1.0) < 1e-9, "splits must sum to 1.0"
- assert self.seq_len > 0 and self.stride > 0, "seq_len and stride must be positive"
+ self._validate_splits()
+ if not (self.seq_len > 0 and self.stride > 0):
+ raise ValueError("seq_len and stride must be positive")
self._scaler = None # fitted on train only
self._feature_names: List[str] = []
+ self._row_counts: Dict[str, int] = {}
# ------------------- public API -------------------
@@ -94,21 +113,78 @@ def load_arrays(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
self._check_alignment(msg_df, ob_df)
feats = self._build_features(ob_df)
+ # hygiene
+ if self.dropna:
+ feats = feats[~np.isnan(feats).any(axis=1)]
+ feats = feats[np.isfinite(feats).all(axis=1)]
+ self._row_counts["post_clean"] = int(feats.shape[0])
+
train, val, test = self._split_chronologically(feats)
+ self._row_counts.update(train=len(train), val=len(val), test=len(test))
+
train_s, val_s, test_s = self._scale_train_only(train, val, test)
W_train = self._windowize(train_s)
W_val = self._windowize(val_s)
W_test = self._windowize(test_s)
+
+ # final dtype cast
+ W_train = W_train.astype(self.output_dtype, copy=False)
+ W_val = W_val.astype(self.output_dtype, copy=False)
+ W_test = W_test.astype(self.output_dtype, copy=False)
return W_train, W_val, W_test
+ def summarize(self, peek: int = 5) -> str:
+ """Human-readable summary of both message and orderbook CSVs."""
+ msg_df, ob_df = self._load_csvs()
+ # ensure normalized headers for orderbook are visible
+ _ = self._normalize_orderbook_headers(
+ ob_df,
+ [f"ask_price_{i}" for i in range(1, 11)]
+ + [f"ask_size_{i}" for i in range(1, 11)]
+ + [f"bid_price_{i}" for i in range(1, 11)]
+ + [f"bid_size_{i}" for i in range(1, 11)]
+ )
+ parts = [
+ _summarize_df(msg_df, "message_10.csv", peek=peek),
+ _summarize_df(ob_df, "orderbook_10.csv", peek=peek),
+ ]
+ return "\n\n".join(parts)
+
def get_feature_names(self) -> List[str]:
return list(self._feature_names)
def get_scaler(self):
return self._scaler
+ def inverse_transform(self, arr: np.ndarray) -> np.ndarray:
+ """Inverse-transform features (per time-step) using the fitted scaler."""
+ if self._scaler is None:
+ raise RuntimeError("Scaler not fitted; call load_arrays() first or use scaler='none'.")
+ orig_shape = arr.shape
+ flat = arr.reshape(-1, arr.shape[-1])
+ inv = self._scaler.inverse_transform(flat)
+ return inv.reshape(orig_shape)
+
+ def get_meta(self) -> Dict[str, object]:
+ return {
+ "feature_set": self.feature_set,
+ "feature_names": self.get_feature_names(),
+ "seq_len": self.seq_len,
+ "stride": self.stride,
+ "splits": self.splits,
+ "scaler": type(self._scaler).__name__ if self._scaler is not None else "None",
+ "row_counts": self._row_counts,
+ }
+
# ------------------- internals --------------------
+ def _validate_splits(self) -> None:
+ s = sum(self.splits)
+ if not (abs(s - 1.0) < 1e-12):
+ raise ValueError(f"splits must sum to 1.0, got {self.splits} (sum={s})")
+ if any(x < 0 for x in self.splits):
+ raise ValueError("splits cannot be negative")
+
def _load_csvs(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
if not os.path.isfile(self.orderbook_path):
raise FileNotFoundError(f"Missing {self.orderbook_path}")
@@ -122,6 +198,7 @@ def _load_csvs(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
else:
msg_df = pd.read_csv(self.message_path)
msg_df.columns = [str(c).strip().lower().replace(" ", "_") for c in msg_df.columns]
+ # if columns are 6 but non-standard, coerce to canonical names
if len(msg_df.columns) == 6 and set(msg_df.columns) != set(msg_cols):
msg_df.columns = msg_cols
@@ -156,6 +233,7 @@ def _normalize_orderbook_headers(self, df: pd.DataFrame, target_cols: List[str])
new_cols.append(s)
df.columns = new_cols
+ # If still mismatched but counts align, force target order.
if set(df.columns) != set(target_cols) and len(df.columns) == len(target_cols):
df.columns = target_cols
return df
@@ -196,7 +274,7 @@ def _build_features(self, ob_df: pd.DataFrame) -> np.ndarray:
# 2) spread
spread = ap1 - bp1
- # 3) mid_log_return
+ # 3) mid_log_return (first element 0.0 to preserve length)
mid_log = np.log(np.clip(mid_price, 1e-12, None))
mid_log_return = np.concatenate([[0.0], np.diff(mid_log)])
@@ -222,6 +300,11 @@ def _build_features(self, ob_df: pd.DataFrame) -> np.ndarray:
def _split_chronologically(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
n = len(X)
+ if n < self.seq_len:
+ raise ValueError(
+ f"Not enough rows ({n}) for seq_len={self.seq_len}. "
+ "Consider reducing seq_len or collecting more data."
+ )
n_train = int(n * self.splits[0])
n_val = int(n * self.splits[1])
n_test = n - n_train - n_val
@@ -239,7 +322,7 @@ def _scale_train_only(
if self.scaler_kind == "standard":
scaler = StandardScaler()
elif self.scaler_kind == "minmax":
- scaler = MinMaxScaler()
+ scaler = MinMaxScaler(feature_range=self.feature_range)
else:
raise ValueError("scaler must be 'standard', 'minmax', or 'none'")
@@ -256,63 +339,83 @@ def _windowize(self, X: np.ndarray) -> np.ndarray:
return np.empty((0, self.seq_len, d), dtype=np.float64)
starts = np.arange(0, n - self.seq_len + 1, self.stride, dtype=int)
+ if starts.size == 0:
+ return np.empty((0, self.seq_len, d), dtype=np.float64)
+
W = np.empty((len(starts), self.seq_len, d), dtype=np.float64)
for i, s in enumerate(starts):
W[i] = X[s : s + self.seq_len]
return W
-# -------------------------- CLI smoke test ------------------------------------
-
-def _basic_test_cli():
- """
- Run a smoke test ONLY when file names are provided by the user.
+# -------------------------- CLI: smoke test & summary ------------------------- #
- Example:
- python lobster_data.py --data-dir data/AMZN/2014-01-02 \
- --message AMZN_2014-01-02_34200000_57600000_message_10.csv \
- --orderbook AMZN_2014-01-02_34200000_57600000_orderbook_10.csv \
- --headerless-message --headerless-orderbook
- """
- parser = argparse.ArgumentParser(description="LOBSTERData smoke test (filenames required).")
+def _main_cli():
+ parser = argparse.ArgumentParser(description="LOBSTERData (preprocess + summarize).")
parser.add_argument("--data-dir", default="data", help="Folder containing the CSVs")
parser.add_argument("--message", required=True, help="Message CSV file name (e.g., message_10.csv)")
parser.add_argument("--orderbook", required=True, help="Orderbook CSV file name (e.g., orderbook_10.csv)")
parser.add_argument("--feature-set", choices=["core", "raw10"], default="core")
parser.add_argument("--seq-len", type=int, default=64)
parser.add_argument("--stride", type=int, default=16)
+ parser.add_argument("--splits", type=float, nargs=3, metavar=("TRAIN", "VAL", "TEST"),
+ default=(0.7, 0.15, 0.15), help="Fractions that must sum to 1.0")
parser.add_argument("--scaler", choices=["standard", "minmax", "none"], default="standard")
+ parser.add_argument("--feature-range", type=float, nargs=2, metavar=("MIN", "MAX"), default=(0.0, 1.0))
parser.add_argument("--headerless-message", action="store_true", help="Treat message CSV as headerless")
parser.add_argument("--headerless-orderbook", action="store_true", help="Treat orderbook CSV as headerless")
+ parser.add_argument("--no-dropna", action="store_true", help="Disable row drop for NaN")
+ parser.add_argument("--dtype", choices=["float32", "float64"], default="float32")
+ parser.add_argument("--save-npz", type=str, default=None, help="If set, save windows to this .npz path")
+ parser.add_argument("--summary", action="store_true", help="Print a summary of both CSVs and exit")
+ parser.add_argument("--peek", type=int, default=5, help="Rows to show in head/tail for summary")
args = parser.parse_args()
data_dir = args.data_dir
print(f"Files in '{data_dir}': {sorted(os.listdir(data_dir)) if os.path.isdir(data_dir) else 'MISSING'}")
- try:
- loader = LOBSTERData(
- data_dir=data_dir,
- message_file=args.message,
- orderbook_file=args.orderbook,
- feature_set=args.feature_set,
- seq_len=args.seq_len,
- stride=args.stride,
- splits=(0.7, 0.15, 0.15),
- scaler=args.scaler,
- headerless_message=args.headerless_message,
- headerless_orderbook=args.headerless_orderbook,
+ loader = LOBSTERData(
+ data_dir=data_dir,
+ message_file=args.message,
+ orderbook_file=args.orderbook,
+ feature_set=args.feature_set,
+ seq_len=args.seq_len,
+ stride=args.stride,
+ splits=tuple(args.splits),
+ scaler=args.scaler,
+ feature_range=tuple(args.feature_range),
+ headerless_message=args.headerless_message,
+ headerless_orderbook=args.headerless_orderbook,
+ dropna=not args.no_dropna,
+ output_dtype=args.dtype,
+ )
+
+ if args.summary:
+ print(loader.summarize(peek=args.peek))
+ return
+
+ # Build windows
+ W_train, W_val, W_test = loader.load_arrays()
+ meta = loader.get_meta()
+
+ print("Feature names:", loader.get_feature_names())
+ print("Meta:", meta)
+ print("Train windows:", W_train.shape)
+ print("Val windows: ", W_val.shape)
+ print("Test windows: ", W_test.shape)
+ if W_train.size:
+ print("Example window[0] stats -> mean:", float(W_train[0].mean()),
+ "std:", float(W_train[0].std()))
+
+ if args.save_npz:
+ np.savez_compressed(
+ args.save_npz,
+ train=W_train, val=W_val, test=W_test,
+ feature_names=np.array(loader.get_feature_names(), dtype=object),
+ meta=np.array([str(meta)], dtype=object),
)
- W_train, W_val, W_test = loader.load_arrays()
- print("Feature names:", loader.get_feature_names())
- print("Train windows:", W_train.shape)
- print("Val windows: ", W_val.shape)
- print("Test windows: ", W_test.shape)
- if W_train.size:
- print("Example window[0] stats -> mean:", float(W_train[0].mean()),
- "std:", float(W_train[0].std()))
- except Exception as e:
- print("Basic test error:", e)
+ print(f"Saved windows to: {args.save_npz}")
if __name__ == "__main__":
- _basic_test_cli()
\ No newline at end of file
+ _main_cli()
From 0ac2cac2fe709c3b81b7b0e9868dee3afa085b43 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 3 Oct 2025 08:37:56 +1000
Subject: [PATCH 08/74] style(dataset): prettier dataset previews (aligned
columns, rounded floats, split summaries)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Render the directory listing, CSV summaries, preprocessing report, and sample-window stats as tidy console panels
- Show split shapes (num_seq × seq_len × num_features) and a small head/tail sample via --peek
- Limit describe() to a sample of numeric columns; add value counts and a time-monotonicity check to the summaries
- New CLI knobs: --sort-by-time, --every N (decimation), --clip-quantiles QMIN QMAX, --no-color
- Display changes do not alter saved arrays or training; the new preprocessing knobs apply only when passed (example invocation below)
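Example invocation (a sketch only: it reuses the AMZN sample filenames from the earlier
smoke-test docstring and assumes the command is run from the folder containing src/):

    python src/dataset.py --data-dir data/AMZN/2014-01-02 \
        --message AMZN_2014-01-02_34200000_57600000_message_10.csv \
        --orderbook AMZN_2014-01-02_34200000_57600000_orderbook_10.csv \
        --headerless-message --headerless-orderbook \
        --summary --peek 5 --no-color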
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 364 +++++++++++++-----
1 file changed, 268 insertions(+), 96 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 5e4d99330..420337299 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -1,20 +1,42 @@
"""
-LOBSTERData: load, featurize, window, split (TimeGAN-ready) + CSV summaries.
+LOBSTER (Level-10) preprocessing for TimeGAN.
-- Works with headerless LOBSTER CSVs (message_10.csv, orderbook_10.csv).
-- Engineered 5-feature "core" set or raw level-10 (40 columns).
-- Chronological train/val/test split; scaler fit on train only.
-- Windows shape: (num_seq, seq_len, num_features).
-- Extras: NaN/inf cleaning, dtype control, meta, inverse_transform, NPZ export.
-- NEW: summarize() and --summary CLI to inspect both message & orderbook tables.
+- Loads paired LOBSTER CSVs (message_10.csv, orderbook_10.csv), aligned by event index.
+- Builds either a compact engineered 5-feature set ("core") or raw level-10 depth ("raw10").
+- Chronological train/val/test split (prevents leakage), train-only scaling.
+- Sliding-window sequences shaped (num_seq, seq_len, num_features).
+
+Inputs (per trading session):
+ message_10.csv, orderbook_10.csv
+ - If headers are missing, pass --headerless-message / --headerless-orderbook (CLI).
+
+Outputs:
+ train, val, test — NumPy arrays with shape [num_seq, seq_len, num_features]
+
+Feature sets:
+ feature_set="core" (5 engineered features)
+ 1) mid_price = 0.5 * (ask_price_1 + bid_price_1)
+ 2) spread = ask_price_1 - bid_price_1
+ 3) mid_log_return = log(mid_price_t) - log(mid_price_{t-1})
+ 4) queue_imbalance_l1 = (bid_size_1 - ask_size_1) / (bid_size_1 + ask_size_1 + eps)
+ 5) depth_imbalance_l10 = (Σ_i≤10 bid_size_i - Σ_i≤10 ask_size_i) /
+ (Σ_i≤10 bid_size_i + Σ_i≤10 ask_size_i + eps)
+
+ feature_set="raw10" (40 raw columns)
+ ask_price_1..10, ask_size_1..10, bid_price_1..10, bid_size_1..10
+
+Notes:
+- Scaling is fit on TRAIN only (Standard/MinMax/None).
+- Windows default to non-overlapping (stride=seq_len); set stride < seq_len for overlapping windows.
+"""
+# ============================== Pretty printing ===============================
+
+def _supports_color(no_color_flag: bool) -> bool:
+ if no_color_flag:
+ return False
+ try:
+ return os.isatty(1)
+ except Exception:
+ return False
+
+class _C:
+ def __init__(self, enabled: bool):
+ n = "" if enabled else ""
+ self.RESET = n
+ self.DIM = "\033[2m" if enabled else ""
+ self.BOLD = "\033[1m" if enabled else ""
+ self.CYAN = "\033[36m" if enabled else ""
+ self.YELLOW = "\033[33m" if enabled else ""
+ self.GREEN = "\033[32m" if enabled else ""
+ self.MAGENTA = "\033[35m" if enabled else ""
+ self.BLUE = "\033[34m" if enabled else ""
+
+def _term_width(default: int = 100) -> int:
+ try:
+ return shutil.get_terminal_size((default, 20)).columns
+ except Exception:
+ return default
+
+def _hr(width: int, c: _C) -> str:
+ return f"{c.DIM}{'─'*width}{c.RESET}"
+
+def _box(title: str, body_lines: List[str], c: _C, width: int | None = None) -> str:
+ width = width or _term_width()
+ border = "─" * (width - 2)
+ out = [f"{c.CYAN}┌{border}┐{c.RESET}"]
+ title_line = f" {title} "
+ pad = max(0, width - 2 - len(title_line))
+ out.append(f"{c.CYAN}│{c.RESET}{c.BOLD}{title_line}{c.RESET}{' '*pad}{c.CYAN}│{c.RESET}")
+ out.append(f"{c.CYAN}├{border}┤{c.RESET}")
+ for ln in body_lines:
+ for sub in _wrap(ln, width - 4):
+ pad = max(0, width - 4 - len(sub))
+ out.append(f"{c.CYAN}│{c.RESET} {sub}{' '*pad} {c.CYAN}│{c.RESET}")
+ out.append(f"{c.CYAN}└{border}┘{c.RESET}")
+ return "\n".join(out)
+
+def _wrap(s: str, width: int) -> List[str]:
+ if len(s) <= width:
+ return [s]
+ out, cur = [], ""
+ for tok in s.split(" "):
+ if not cur:
+ cur = tok
+ elif len(cur) + 1 + len(tok) <= width:
+ cur += " " + tok
+ else:
+ out.append(cur)
+ cur = tok
+ if cur:
+ out.append(cur)
+ return out
+
+def _fmt_shape(arr: tuple | list | np.ndarray) -> str:
+ if isinstance(arr, np.ndarray):
+ return "×".join(map(str, arr.shape))
+ if isinstance(arr, (tuple, list)):
+ return "×".join(map(str, arr))
+ return str(arr)
+
+def _kv_lines(d: Dict[str, object]) -> List[str]:
lines = []
- lines.append(f"=== {name} ===")
+ for k, v in d.items():
+ if isinstance(v, dict):
+ lines.append(f"{k}:")
+ for sk, sv in v.items():
+ lines.append(f" {sk}: {sv}")
+ else:
+ lines.append(f"{k}: {v}")
+ return lines
+
+
+# ================================ Summaries ===================================
+
+def _summarize_df(df: pd.DataFrame, name: str, peek: int = 5) -> List[str]:
+ lines: List[str] = []
+ lines.append(f"{name}")
lines.append(f"shape: {df.shape[0]} rows × {df.shape[1]} cols")
- lines.append(f"columns: {list(df.columns)}")
+ # columns (trim if very long)
+ cols = list(df.columns)
+ col_str = ", ".join(cols)
+ lines.append("columns: " + col_str if len(col_str) < 160 else "columns: " + ", ".join(cols[:12]) + ", ...")
+ # dtypes / NA counts (only non-zero NA counts shown)
dtypes = df.dtypes.astype(str).to_dict()
- lines.append(f"dtypes: {dtypes}")
- na_counts = df.isna().sum().to_dict()
- lines.append(f"na_counts: {na_counts}")
- # time range if a 'time' column exists
+ na_counts = {k: int(v) for k, v in df.isna().sum().items() if int(v) > 0}
+ lines.append("dtypes: " + ", ".join([f"{k}:{v}" for k, v in dtypes.items()]))
+ lines.append("na_counts: " + (str(na_counts) if na_counts else "{}"))
+ # value counts of common message fields
+ for col in ("type", "direction"):
+ if col in df.columns:
+ try:
+ vc = df[col].value_counts(dropna=False).to_dict()
+ lines.append(f"value_counts[{col}]: {vc}")
+ except Exception:
+ pass
+ # time range + monotonic check
if "time" in df.columns:
try:
t = pd.to_datetime(df["time"], errors="coerce", unit=None)
lines.append(f"time: min={t.min()} max={t.max()}")
+ if t.notna().all():
+ is_mono = bool((t.diff().dropna() >= pd.Timedelta(0)).all())
+ lines.append(f"time monotonic nondecreasing: {is_mono}")
except Exception:
pass
- # numeric quick stats
+ # numeric quick stats (only a few cols to keep output tidy)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
- desc = df[num_cols].describe().to_dict()
- # ensure json-like floats, not numpy types
+ sample_cols = num_cols[:6]
+ desc = df[sample_cols].describe().to_dict()
desc = {k: {m: float(v) for m, v in stats.items()} for k, stats in desc.items()}
- lines.append("numeric.describe():")
- lines.append(str(desc))
- # head/tail
- lines.append("head:")
- lines.append(df.head(peek).to_string(index=False))
- lines.append("tail:")
- lines.append(df.tail(peek).to_string(index=False))
- return "\n".join(lines)
+ lines.append("describe(sample of numeric cols):")
+ for k, stats in desc.items():
+ stats_str = ", ".join([f"{m}={val:.4g}" for m, val in stats.items()])
+ lines.append(f" {k}: {stats_str}")
+ # head / tail
+ if peek > 0:
+ lines.append("head:")
+ lines.append(df.head(peek).to_string(index=False))
+ lines.append("tail:")
+ lines.append(df.tail(peek).to_string(index=False))
+ return lines
-# ------------------------------- core class ---------------------------------- #
+# =============================== Core class ===================================
class LOBSTERData:
"""
Loader -> features -> windows -> splits for LOBSTER L10 data.
"""
-
def __init__(
self,
data_dir: str,
@@ -73,12 +193,15 @@ def __init__(
stride: Optional[int] = None,
splits: Tuple[float, float, float] = (0.7, 0.15, 0.15),
scaler: Literal["standard", "minmax", "none"] = "standard",
- feature_range: Tuple[float, float] = (0.0, 1.0), # for minmax
+ feature_range: Tuple[float, float] = (0.0, 1.0),
eps: float = 1e-8,
headerless_message: bool = False,
headerless_orderbook: bool = False,
dropna: bool = True,
output_dtype: Literal["float32", "float64"] = "float32",
+ sort_by_time: bool = False,
+ every: int = 1,
+ clip_quantiles: Optional[Tuple[float, float]] = None,
):
self.data_dir = data_dir
self.message_path = os.path.join(data_dir, message_file)
@@ -95,25 +218,36 @@ def __init__(
self.dropna = dropna
self.output_dtype = np.float32 if output_dtype == "float32" else np.float64
+ self.sort_by_time = bool(sort_by_time)
+ self.every = max(1, int(every))
+ self.clip_quantiles = clip_quantiles
+
self._validate_splits()
if not (self.seq_len > 0 and self.stride > 0):
raise ValueError("seq_len and stride must be positive")
- self._scaler = None # fitted on train only
+ self._scaler = None
self._feature_names: List[str] = []
self._row_counts: Dict[str, int] = {}
+ self._clip_bounds: Optional[Tuple[np.ndarray, np.ndarray]] = None # (lo, hi)
# ------------------- public API -------------------
def load_arrays(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
- """
- Returns train, val, test arrays shaped (num_seq, seq_len, num_features).
- """
msg_df, ob_df = self._load_csvs()
+
+ if self.sort_by_time and "time" in msg_df.columns:
+ order = msg_df["time"].reset_index(drop=True).sort_values().index
+ msg_df = msg_df.iloc[order].reset_index(drop=True)
+ ob_df = ob_df.iloc[order].reset_index(drop=True)
+
self._check_alignment(msg_df, ob_df)
feats = self._build_features(ob_df)
- # hygiene
+ if self.every > 1:
+ feats = feats[::self.every]
+ self._row_counts["decimated_every"] = self.every
+
if self.dropna:
feats = feats[~np.isnan(feats).any(axis=1)]
feats = feats[np.isfinite(feats).all(axis=1)]
@@ -122,21 +256,29 @@ def load_arrays(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
train, val, test = self._split_chronologically(feats)
self._row_counts.update(train=len(train), val=len(val), test=len(test))
+ if self.clip_quantiles is not None:
+ qmin, qmax = self.clip_quantiles
+ if not (0.0 <= qmin < qmax <= 1.0):
+ raise ValueError("clip_quantiles must satisfy 0 <= qmin < qmax <= 1")
+ lo = np.quantile(train, qmin, axis=0)
+ hi = np.quantile(train, qmax, axis=0)
+ self._clip_bounds = (lo, hi)
+ train = np.clip(train, lo, hi)
+ val = np.clip(val, lo, hi)
+ test = np.clip(test, lo, hi)
+
train_s, val_s, test_s = self._scale_train_only(train, val, test)
W_train = self._windowize(train_s)
W_val = self._windowize(val_s)
W_test = self._windowize(test_s)
- # final dtype cast
W_train = W_train.astype(self.output_dtype, copy=False)
W_val = W_val.astype(self.output_dtype, copy=False)
W_test = W_test.astype(self.output_dtype, copy=False)
return W_train, W_val, W_test
- def summarize(self, peek: int = 5) -> str:
- """Human-readable summary of both message and orderbook CSVs."""
+ def summarize(self, peek: int = 5) -> List[str]:
msg_df, ob_df = self._load_csvs()
- # ensure normalized headers for orderbook are visible
_ = self._normalize_orderbook_headers(
ob_df,
[f"ask_price_{i}" for i in range(1, 11)]
@@ -144,11 +286,11 @@ def summarize(self, peek: int = 5) -> str:
+ [f"bid_price_{i}" for i in range(1, 11)]
+ [f"bid_size_{i}" for i in range(1, 11)]
)
- parts = [
- _summarize_df(msg_df, "message_10.csv", peek=peek),
- _summarize_df(ob_df, "orderbook_10.csv", peek=peek),
- ]
- return "\n\n".join(parts)
+ lines = []
+ lines += _summarize_df(msg_df, "message_10.csv", peek=peek)
+ lines.append("") # spacer
+ lines += _summarize_df(ob_df, "orderbook_10.csv", peek=peek)
+ return lines
def get_feature_names(self) -> List[str]:
return list(self._feature_names)
@@ -157,7 +299,6 @@ def get_scaler(self):
return self._scaler
def inverse_transform(self, arr: np.ndarray) -> np.ndarray:
- """Inverse-transform features (per time-step) using the fitted scaler."""
if self._scaler is None:
raise RuntimeError("Scaler not fitted; call load_arrays() first or use scaler='none'.")
orig_shape = arr.shape
@@ -174,6 +315,12 @@ def get_meta(self) -> Dict[str, object]:
"splits": self.splits,
"scaler": type(self._scaler).__name__ if self._scaler is not None else "None",
"row_counts": self._row_counts,
+ "clip_bounds": None if self._clip_bounds is None else {
+ "lo": self._clip_bounds[0].tolist(),
+ "hi": self._clip_bounds[1].tolist(),
+ },
+ "every": self.every,
+ "sorted_by_time": self.sort_by_time,
}
# ------------------- internals --------------------
@@ -198,7 +345,6 @@ def _load_csvs(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
else:
msg_df = pd.read_csv(self.message_path)
msg_df.columns = [str(c).strip().lower().replace(" ", "_") for c in msg_df.columns]
- # if columns are 6 but non-standard, coerce to canonical names
if len(msg_df.columns) == 6 and set(msg_df.columns) != set(msg_cols):
msg_df.columns = msg_cols
@@ -218,8 +364,6 @@ def _load_csvs(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
return msg_df, ob_df
def _normalize_orderbook_headers(self, df: pd.DataFrame, target_cols: List[str]) -> pd.DataFrame:
- # Map common LOBSTER styles to snake_case:
- # e.g., AskPrice1 -> ask_price_1, BidSize10 -> bid_size_10
new_cols = []
for c in df.columns:
s = str(c)
@@ -232,8 +376,6 @@ def _normalize_orderbook_headers(self, df: pd.DataFrame, target_cols: List[str])
s = s.replace("__", "_")
new_cols.append(s)
df.columns = new_cols
-
- # If still mismatched but counts align, force target order.
if set(df.columns) != set(target_cols) and len(df.columns) == len(target_cols):
df.columns = target_cols
return df
@@ -241,10 +383,8 @@ def _normalize_orderbook_headers(self, df: pd.DataFrame, target_cols: List[str])
def _check_alignment(self, msg_df: pd.DataFrame, ob_df: pd.DataFrame) -> None:
if len(msg_df) != len(ob_df):
raise ValueError(f"Message/Orderbook row count mismatch: {len(msg_df)} vs {len(ob_df)}")
- # LOBSTER rows are synchronized by event index; we trust row order.
def _build_features(self, ob_df: pd.DataFrame) -> np.ndarray:
- # Ensure standard L10 columns exist
for prefix in ("ask_price_", "ask_size_", "bid_price_", "bid_size_"):
for L in range(1, 11):
col = f"{prefix}{L}"
@@ -268,20 +408,11 @@ def _build_features(self, ob_df: pd.DataFrame) -> np.ndarray:
as1 = ob_df["ask_size_1"].to_numpy(dtype=np.float64)
bs1 = ob_df["bid_size_1"].to_numpy(dtype=np.float64)
- # 1) mid_price
mid_price = 0.5 * (ap1 + bp1)
-
- # 2) spread
spread = ap1 - bp1
-
- # 3) mid_log_return (first element 0.0 to preserve length)
mid_log = np.log(np.clip(mid_price, 1e-12, None))
mid_log_return = np.concatenate([[0.0], np.diff(mid_log)])
-
- # 4) queue_imbalance_l1
qi_l1 = (bs1 - as1) / (bs1 + as1 + self.eps)
-
- # 5) depth_imbalance_l10
bid_depth = sum(ob_df[f"bid_size_{i}"].to_numpy(dtype=np.float64) for i in range(1, 11))
ask_depth = sum(ob_df[f"ask_size_{i}"].to_numpy(dtype=np.float64) for i in range(1, 11))
di_l10 = (bid_depth - ask_depth) / (bid_depth + ask_depth + self.eps)
@@ -303,11 +434,13 @@ def _split_chronologically(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray,
if n < self.seq_len:
raise ValueError(
f"Not enough rows ({n}) for seq_len={self.seq_len}. "
- "Consider reducing seq_len or collecting more data."
+ "Reduce seq_len or use a longer session."
)
n_train = int(n * self.splits[0])
n_val = int(n * self.splits[1])
n_test = n - n_train - n_val
+ if n_train < self.seq_len:
+ raise ValueError(f"Train split too small ({n_train} rows) for seq_len={self.seq_len}")
train = X[:n_train]
val = X[n_train : n_train + n_val]
test = X[n_train + n_val :]
@@ -325,57 +458,101 @@ def _scale_train_only(
scaler = MinMaxScaler(feature_range=self.feature_range)
else:
raise ValueError("scaler must be 'standard', 'minmax', or 'none'")
-
scaler.fit(train)
self._scaler = scaler
return scaler.transform(train), scaler.transform(val), scaler.transform(test)
def _windowize(self, X: np.ndarray) -> np.ndarray:
- """
- Returns windows shaped (num_seq, seq_len, num_features).
- """
n, d = X.shape
if n < self.seq_len:
return np.empty((0, self.seq_len, d), dtype=np.float64)
-
starts = np.arange(0, n - self.seq_len + 1, self.stride, dtype=int)
if starts.size == 0:
return np.empty((0, self.seq_len, d), dtype=np.float64)
-
W = np.empty((len(starts), self.seq_len, d), dtype=np.float64)
for i, s in enumerate(starts):
W[i] = X[s : s + self.seq_len]
return W
-# -------------------------- CLI: smoke test & summary ------------------------- #
+# ============================ CLI and nice output =============================
+
+def _print_dir_listing(path: str, c: _C) -> None:
+ if os.path.isdir(path):
+ files = sorted(os.listdir(path))
+ lines = [f"path: {path}", f"files: {len(files)}"]
+ lines += [f" - {f}" for f in files[:12]]
+ if len(files) > 12:
+ lines.append(f" ... (+{len(files)-12} more)")
+ else:
+ lines = [f"path: {path}", "files: (missing)"]
+ print(_box("Data directory", lines, c))
+
+def _print_summary(lines: List[str], c: _C) -> None:
+ print(_box("CSV Summary", lines, c))
+
+def _print_report(W_train, W_val, W_test, meta: Dict[str, object], c: _C) -> None:
+ shapes = {
+ "train windows": _fmt_shape(W_train.shape),
+ "val windows": _fmt_shape(W_val.shape),
+ "test windows": _fmt_shape(W_test.shape),
+ "seq_len": meta.get("seq_len"),
+ "stride": meta.get("stride"),
+ "feature_set": meta.get("feature_set"),
+ "features": len(meta.get("feature_names", [])),
+ "scaler": meta.get("scaler"),
+ "sorted_by_time": meta.get("sorted_by_time"),
+ "every": meta.get("every"),
+ }
+ lines = _kv_lines(shapes)
+ rc = meta.get("row_counts", {})
+ if rc:
+ lines.append("")
+ lines.append("row_counts:")
+ for k, v in rc.items():
+ lines.append(f" {k}: {v}")
+ print(_box("Preprocessing Report", lines, c))
+
+ # quick sample stats on first window (if exists)
+ if getattr(W_train, "size", 0):
+ win = W_train[0]
+ stats = {
+ "window[0] mean": f"{float(win.mean()):.5f}",
+ "window[0] std": f"{float(win.std()):.5f}",
+ "feature_names (first 8)": ", ".join(meta.get("feature_names", [])[:8]) + ("..." if len(meta.get("feature_names", [])) > 8 else "")
+ }
+ print(_box("Sample Window Stats", _kv_lines(stats), c))
def _main_cli():
parser = argparse.ArgumentParser(description="LOBSTERData (preprocess + summarize).")
- parser.add_argument("--data-dir", default="data", help="Folder containing the CSVs")
- parser.add_argument("--message", required=True, help="Message CSV file name (e.g., message_10.csv)")
- parser.add_argument("--orderbook", required=True, help="Orderbook CSV file name (e.g., orderbook_10.csv)")
+ parser.add_argument("--data-dir", default="data")
+ parser.add_argument("--message", required=True)
+ parser.add_argument("--orderbook", required=True)
parser.add_argument("--feature-set", choices=["core", "raw10"], default="core")
parser.add_argument("--seq-len", type=int, default=64)
parser.add_argument("--stride", type=int, default=16)
parser.add_argument("--splits", type=float, nargs=3, metavar=("TRAIN", "VAL", "TEST"),
- default=(0.7, 0.15, 0.15), help="Fractions that must sum to 1.0")
+ default=(0.7, 0.15, 0.15))
parser.add_argument("--scaler", choices=["standard", "minmax", "none"], default="standard")
parser.add_argument("--feature-range", type=float, nargs=2, metavar=("MIN", "MAX"), default=(0.0, 1.0))
- parser.add_argument("--headerless-message", action="store_true", help="Treat message CSV as headerless")
- parser.add_argument("--headerless-orderbook", action="store_true", help="Treat orderbook CSV as headerless")
- parser.add_argument("--no-dropna", action="store_true", help="Disable row drop for NaN")
+ parser.add_argument("--headerless-message", action="store_true")
+ parser.add_argument("--headerless-orderbook", action="store_true")
+ parser.add_argument("--no-dropna", action="store_true")
parser.add_argument("--dtype", choices=["float32", "float64"], default="float32")
- parser.add_argument("--save-npz", type=str, default=None, help="If set, save windows to this .npz path")
- parser.add_argument("--summary", action="store_true", help="Print a summary of both CSVs and exit")
- parser.add_argument("--peek", type=int, default=5, help="Rows to show in head/tail for summary")
+ parser.add_argument("--save-npz", type=str, default=None)
+ parser.add_argument("--summary", action="store_true")
+ parser.add_argument("--peek", type=int, default=5)
+ parser.add_argument("--sort-by-time", action="store_true")
+ parser.add_argument("--every", type=int, default=1)
+ parser.add_argument("--clip-quantiles", type=float, nargs=2, metavar=("QMIN", "QMAX"), default=None)
+ parser.add_argument("--no-color", action="store_true", help="Disable ANSI colors in output")
args = parser.parse_args()
- data_dir = args.data_dir
- print(f"Files in '{data_dir}': {sorted(os.listdir(data_dir)) if os.path.isdir(data_dir) else 'MISSING'}")
+ c = _C(_supports_color(args.no_color))
+ _print_dir_listing(args.data_dir, c)
loader = LOBSTERData(
- data_dir=data_dir,
+ data_dir=args.data_dir,
message_file=args.message,
orderbook_file=args.orderbook,
feature_set=args.feature_set,
@@ -388,24 +565,19 @@ def _main_cli():
headerless_orderbook=args.headerless_orderbook,
dropna=not args.no_dropna,
output_dtype=args.dtype,
+ sort_by_time=args.sort_by_time,
+ every=args.every,
+ clip_quantiles=tuple(args.clip_quantiles) if args.clip_quantiles else None,
)
if args.summary:
- print(loader.summarize(peek=args.peek))
+ lines = loader.summarize(peek=args.peek)
+ _print_summary(lines, c)
return
- # Build windows
W_train, W_val, W_test = loader.load_arrays()
meta = loader.get_meta()
-
- print("Feature names:", loader.get_feature_names())
- print("Meta:", meta)
- print("Train windows:", W_train.shape)
- print("Val windows: ", W_val.shape)
- print("Test windows: ", W_test.shape)
- if W_train.size:
- print("Example window[0] stats -> mean:", float(W_train[0].mean()),
- "std:", float(W_train[0].std()))
+ _print_report(W_train, W_val, W_test, meta, c)
if args.save_npz:
np.savez_compressed(
@@ -414,8 +586,8 @@ def _main_cli():
feature_names=np.array(loader.get_feature_names(), dtype=object),
meta=np.array([str(meta)], dtype=object),
)
- print(f"Saved windows to: {args.save_npz}")
+ print(_box("Saved", [f"path: {args.save_npz}"], c))
if __name__ == "__main__":
- _main_cli()
+ _main_cli()
\ No newline at end of file
From 0b70e9990f4fcccea5bfa1da09df9c9199a088fe Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Thu, 2 Oct 2025 10:45:49 +1000
Subject: [PATCH 09/74] feat(dataset): chat-style CLI output with bubbles and
KV tables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add --style chat|box and --no-color; render directory, CSV summaries, preprocessing report, and sample window as message-like bubbles with aligned key–value tables. Keep headerless support, time-sort, decimation, quantile clipping, chronological splits, and train-only scaling unchanged.
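Usage sketch (same AMZN sample session as the earlier patches; --style chat is the
default bubble output, --style box keeps the panel renderer):

    python src/dataset.py --data-dir data/AMZN/2014-01-02 \
        --message AMZN_2014-01-02_34200000_57600000_message_10.csv \
        --orderbook AMZN_2014-01-02_34200000_57600000_orderbook_10.csv \
        --headerless-message --headerless-orderbook \
        --summary --style box --no-color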
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 232 ++++++++++--------
1 file changed, 136 insertions(+), 96 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 420337299..b88661041 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -37,6 +37,7 @@
import os
import argparse
import shutil
+from datetime import datetime
from typing import Tuple, List, Literal, Optional, Dict
import numpy as np
@@ -56,41 +57,23 @@ def _supports_color(no_color_flag: bool) -> bool:
class _C:
def __init__(self, enabled: bool):
- n = "" if enabled else ""
- self.RESET = n
- self.DIM = "\033[2m" if enabled else ""
- self.BOLD = "\033[1m" if enabled else ""
- self.CYAN = "\033[36m" if enabled else ""
+ self.enabled = enabled
+ self.RESET = "\033[0m" if enabled else ""
+ self.DIM = "\033[2m" if enabled else ""
+ self.BOLD = "\033[1m" if enabled else ""
+ self.CYAN = "\033[36m" if enabled else ""
self.YELLOW = "\033[33m" if enabled else ""
- self.GREEN = "\033[32m" if enabled else ""
- self.MAGENTA = "\033[35m" if enabled else ""
- self.BLUE = "\033[34m" if enabled else ""
+ self.GREEN = "\033[32m" if enabled else ""
+ self.MAGENTA= "\033[35m" if enabled else ""
+ self.BLUE = "\033[34m" if enabled else ""
-def _term_width(default: int = 100) -> int:
+def _term_width(default: int = 96) -> int:
try:
return shutil.get_terminal_size((default, 20)).columns
except Exception:
return default
-def _hr(width: int, c: _C) -> str:
- return f"{c.DIM}{'─'*width}{c.RESET}"
-
-def _box(title: str, body_lines: List[str], c: _C, width: int | None = None) -> str:
- width = width or _term_width()
- border = "─" * (width - 2)
- out = [f"{c.CYAN}┌{border}┐{c.RESET}"]
- title_line = f" {title} "
- pad = max(0, width - 2 - len(title_line))
- out.append(f"{c.CYAN}│{c.RESET}{c.BOLD}{title_line}{c.RESET}{' '*pad}{c.CYAN}│{c.RESET}")
- out.append(f"{c.CYAN}├{border}┤{c.RESET}")
- for ln in body_lines:
- for sub in _wrap(ln, width - 4):
- pad = max(0, width - 4 - len(sub))
- out.append(f"{c.CYAN}│{c.RESET} {sub}{' '*pad} {c.CYAN}│{c.RESET}")
- out.append(f"{c.CYAN}└{border}┘{c.RESET}")
- return "\n".join(out)
-
-def _wrap(s: str, width: int) -> List[str]:
+def _wrap(s: str, width: int) -> list[str]:
if len(s) <= width:
return [s]
out, cur = [], ""
@@ -106,24 +89,67 @@ def _wrap(s: str, width: int) -> List[str]:
out.append(cur)
return out
-def _fmt_shape(arr: tuple | list | np.ndarray) -> str:
- if isinstance(arr, np.ndarray):
- return "×".join(map(str, arr.shape))
- if isinstance(arr, (tuple, list)):
- return "×".join(map(str, arr))
- return str(arr)
-
-def _kv_lines(d: Dict[str, object]) -> List[str]:
- lines = []
- for k, v in d.items():
- if isinstance(v, dict):
- lines.append(f"{k}:")
- for sk, sv in v.items():
- lines.append(f" {sk}: {sv}")
- else:
- lines.append(f"{k}: {v}")
+def _kv_table(rows: list[tuple[str, str]], width: int, pad: int = 2) -> list[str]:
+ """Render aligned key: value lines as a compact message table."""
+ if not rows:
+ return []
+ key_w = min(max(len(k) for k,_ in rows), max(12, int(0.35*width)))
+ val_w = max(8, width - key_w - pad)
+ lines: list[str] = []
+ for k, v in rows:
+ k = (k[:key_w-1] + "…") if len(k) > key_w else k
+ wrapped = _wrap(v, val_w)
+ lines.append(f"{k.ljust(key_w)}: {wrapped[0]}")
+ for cont in wrapped[1:]:
+ lines.append(f"{' '*key_w} {cont}")
return lines
+def _bubble(title: str, body_lines: list[str], c: _C, align: str = "left", width: int | None = None) -> str:
+ """
+ Render a chat-style message bubble.
+ align: 'left' (incoming) or 'right' (outgoing)
+ """
+ width = min(_term_width(), width or _term_width())
+ max_inner = max(24, width - 10) # inner text width
+ indent = 2 if align == "left" else max(2, width - (max_inner + 8))
+ pad = " " * indent
+
+ ts = datetime.now().strftime("%H:%M")
+ head = f"{c.BOLD}{title}{c.RESET} {c.DIM}{ts}{c.RESET}"
+ head_lines = _wrap(head, max_inner)
+ lines = [pad + " " + head_lines[0]]
+ for hl in head_lines[1:]:
+ lines.append(pad + " " + hl)
+
+ # bubble
+ lines.append(pad + " " + ("╭" + "─" * (max_inner + 2) + "╮"))
+ for ln in body_lines:
+ for wln in _wrap(ln, max_inner):
+ lines.append(pad + " " + "│ " + wln.ljust(max_inner) + " │")
+ tail_left = pad + " " + "╰" + "─" * (max_inner + 2) + "╯" + "⟋"
+ tail_right = pad + " " + "⟍" + "╰" + "─" * (max_inner + 2) + "╯"
+ lines.append(tail_left if align == "left" else tail_right)
+ return "\n".join(lines)
+
+def _panel(title: str, body_lines: list[str], c: _C, width: int | None = None) -> str:
+ """Box panel fallback (non-chat style)."""
+ width = width or _term_width()
+ border = "─" * (width - 2)
+ out = [f"{c.CYAN}┌{border}┐{c.RESET}"]
+ title_line = f" {title} "
+ pad = max(0, width - 2 - len(title_line))
+ out.append(f"{c.CYAN}│{c.RESET}{c.BOLD}{title_line}{c.RESET}{' '*pad}{c.CYAN}│{c.RESET}")
+ out.append(f"{c.CYAN}├{border}┤{c.RESET}")
+ for ln in body_lines:
+ for sub in _wrap(ln, width - 4):
+ pad = max(0, width - 4 - len(sub))
+ out.append(f"{c.CYAN}│{c.RESET} {sub}{' '*pad} {c.CYAN}│{c.RESET}")
+ out.append(f"{c.CYAN}└{border}┘{c.RESET}")
+ return "\n".join(out)
+
+def _render_card(title: str, body_lines: list[str], c: _C, style: str = "chat", align: str = "left") -> str:
+ return _bubble(title, body_lines, c, align=align) if style == "chat" else _panel(title, body_lines, c)
+
# ================================ Summaries ===================================
@@ -131,16 +157,13 @@ def _summarize_df(df: pd.DataFrame, name: str, peek: int = 5) -> List[str]:
lines: List[str] = []
lines.append(f"{name}")
lines.append(f"shape: {df.shape[0]} rows × {df.shape[1]} cols")
- # columns (trim if very long)
cols = list(df.columns)
col_str = ", ".join(cols)
- lines.append("columns: " + col_str if len(col_str) < 160 else "columns: " + ", ".join(cols[:12]) + ", ...")
- # dtypes / NA counts (only non-zero NA counts shown)
+ lines.append("columns: " + col_str if len(col_str) < 160 else "columns: " + ", ".join(cols[:12]) + ", …")
dtypes = df.dtypes.astype(str).to_dict()
na_counts = {k: int(v) for k, v in df.isna().sum().items() if int(v) > 0}
lines.append("dtypes: " + ", ".join([f"{k}:{v}" for k, v in dtypes.items()]))
lines.append("na_counts: " + (str(na_counts) if na_counts else "{}"))
- # value counts of common message fields
for col in ("type", "direction"):
if col in df.columns:
try:
@@ -148,7 +171,6 @@ def _summarize_df(df: pd.DataFrame, name: str, peek: int = 5) -> List[str]:
lines.append(f"value_counts[{col}]: {vc}")
except Exception:
pass
- # time range + monotonic check
if "time" in df.columns:
try:
t = pd.to_datetime(df["time"], errors="coerce", unit=None)
@@ -158,17 +180,15 @@ def _summarize_df(df: pd.DataFrame, name: str, peek: int = 5) -> List[str]:
lines.append(f"time monotonic nondecreasing: {is_mono}")
except Exception:
pass
- # numeric quick stats (only a few cols to keep output tidy)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
sample_cols = num_cols[:6]
desc = df[sample_cols].describe().to_dict()
desc = {k: {m: float(v) for m, v in stats.items()} for k, stats in desc.items()}
- lines.append("describe(sample of numeric cols):")
+ lines.append("describe(sample numeric cols):")
for k, stats in desc.items():
stats_str = ", ".join([f"{m}={val:.4g}" for m, val in stats.items()])
lines.append(f" {k}: {stats_str}")
- # head / tail
if peek > 0:
lines.append("head:")
lines.append(df.head(peek).to_string(index=False))
@@ -288,7 +308,7 @@ def summarize(self, peek: int = 5) -> List[str]:
)
lines = []
lines += _summarize_df(msg_df, "message_10.csv", peek=peek)
- lines.append("") # spacer
+ lines.append("") # spacer between the two tables
lines += _summarize_df(ob_df, "orderbook_10.csv", peek=peek)
return lines
@@ -475,53 +495,72 @@ def _windowize(self, X: np.ndarray) -> np.ndarray:
return W
-# ============================ CLI and nice output =============================
+# ============================ CLI and message output ==========================
-def _print_dir_listing(path: str, c: _C) -> None:
+def _print_dir_listing(path: str, c: _C, style: str) -> None:
if os.path.isdir(path):
files = sorted(os.listdir(path))
- lines = [f"path: {path}", f"files: {len(files)}"]
- lines += [f" - {f}" for f in files[:12]]
- if len(files) > 12:
- lines.append(f" ... (+{len(files)-12} more)")
+ body = [f"path: {path}", f"files: {len(files)}"]
+ body += [f"• {f}" for f in files[:10]]
+ if len(files) > 10:
+ body.append(f"• (+{len(files)-10} more)")
+ else:
+ body = [f"path: {path}", "files: (missing)"]
+ print(_render_card("Data directory", body, c, style=style, align="left"))
+
+def _print_summary(lines: list[str], c: _C, style: str) -> None:
+ # split into two bubbles by blank line
+ if "" in lines:
+ idx = lines.index("")
+ msg_part = lines[:idx]
+ ob_part = lines[idx+1:]
else:
- lines = [f"path: {path}", "files: (missing)"]
- print(_box("Data directory", lines, c))
-
-def _print_summary(lines: List[str], c: _C) -> None:
- print(_box("CSV Summary", lines, c))
-
-def _print_report(W_train, W_val, W_test, meta: Dict[str, object], c: _C) -> None:
- shapes = {
- "train windows": _fmt_shape(W_train.shape),
- "val windows": _fmt_shape(W_val.shape),
- "test windows": _fmt_shape(W_test.shape),
- "seq_len": meta.get("seq_len"),
- "stride": meta.get("stride"),
- "feature_set": meta.get("feature_set"),
- "features": len(meta.get("feature_names", [])),
- "scaler": meta.get("scaler"),
- "sorted_by_time": meta.get("sorted_by_time"),
- "every": meta.get("every"),
- }
- lines = _kv_lines(shapes)
+ msg_part, ob_part = lines, []
+
+ def split_title(block: list[str]) -> tuple[str, list[str]]:
+ if not block:
+ return ("", [])
+ title, body = block[0], block[1:]
+ return (title, body)
+
+ t1, b1 = split_title(msg_part)
+ if t1:
+ print(_render_card(f"🟣 {t1}", b1, c, style=style, align="left"))
+ t2, b2 = split_title(ob_part)
+ if t2:
+ print(_render_card(f"🟢 {t2}", b2, c, style=style, align="left"))
+
+def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str) -> None:
+ block1 = [
+ ("train windows", "×".join(map(str, W_train.shape))),
+ ("val windows", "×".join(map(str, W_val.shape))),
+ ("test windows", "×".join(map(str, W_test.shape))),
+ ("seq_len", str(meta.get("seq_len"))),
+ ("stride", str(meta.get("stride"))),
+ ("feature_set", str(meta.get("feature_set"))),
+ ("#features", str(len(meta.get("feature_names", [])))),
+ ("scaler", str(meta.get("scaler"))),
+ ("sorted_by_time",str(meta.get("sorted_by_time"))),
+ ("every", str(meta.get("every"))),
+ ]
+ lines1 = _kv_table(block1, width=min(_term_width(), 84))
+ print(_render_card("Preprocessing report", lines1, c, style=style, align="right"))
+
rc = meta.get("row_counts", {})
if rc:
- lines.append("")
- lines.append("row_counts:")
- for k, v in rc.items():
- lines.append(f" {k}: {v}")
- print(_box("Preprocessing Report", lines, c))
+ block2 = [(k, str(v)) for k, v in rc.items()]
+ lines2 = _kv_table(block2, width=min(_term_width(), 84))
+ print(_render_card("Row counts", lines2, c, style=style, align="right"))
- # quick sample stats on first window (if exists)
if getattr(W_train, "size", 0):
win = W_train[0]
- stats = {
- "window[0] mean": f"{float(win.mean()):.5f}",
- "window[0] std": f"{float(win.std()):.5f}",
- "feature_names (first 8)": ", ".join(meta.get("feature_names", [])[:8]) + ("..." if len(meta.get("feature_names", [])) > 8 else "")
- }
- print(_box("Sample Window Stats", _kv_lines(stats), c))
+ block3 = [
+ ("window[0] mean", f"{float(win.mean()):.6f}"),
+ ("window[0] std", f"{float(win.std()):.6f}"),
+ ("features", ", ".join(meta.get("feature_names", [])[:8]) + ("…" if len(meta.get("feature_names", []))>8 else "")),
+ ]
+ lines3 = _kv_table(block3, width=min(_term_width(), 84))
+ print(_render_card("Sample window", lines3, c, style=style, align="right"))
def _main_cli():
parser = argparse.ArgumentParser(description="LOBSTERData (preprocess + summarize).")
@@ -545,11 +584,12 @@ def _main_cli():
parser.add_argument("--sort-by-time", action="store_true")
parser.add_argument("--every", type=int, default=1)
parser.add_argument("--clip-quantiles", type=float, nargs=2, metavar=("QMIN", "QMAX"), default=None)
+ parser.add_argument("--style", choices=["chat", "box"], default="chat", help="Output style")
parser.add_argument("--no-color", action="store_true", help="Disable ANSI colors in output")
args = parser.parse_args()
c = _C(_supports_color(args.no_color))
- _print_dir_listing(args.data_dir, c)
+ _print_dir_listing(args.data_dir, c, style=args.style)
loader = LOBSTERData(
data_dir=args.data_dir,
@@ -572,12 +612,12 @@ def _main_cli():
if args.summary:
lines = loader.summarize(peek=args.peek)
- _print_summary(lines, c)
+ _print_summary(lines, c, style=args.style)
return
W_train, W_val, W_test = loader.load_arrays()
meta = loader.get_meta()
- _print_report(W_train, W_val, W_test, meta, c)
+ _print_report(W_train, W_val, W_test, meta, c, style=args.style)
if args.save_npz:
np.savez_compressed(
@@ -586,8 +626,8 @@ def _main_cli():
feature_names=np.array(loader.get_feature_names(), dtype=object),
meta=np.array([str(meta)], dtype=object),
)
- print(_box("Saved", [f"path: {args.save_npz}"], c))
+ print(_render_card("💾 Saved", [f"path: {args.save_npz}"], c, style=args.style, align="right"))
if __name__ == "__main__":
- _main_cli()
\ No newline at end of file
+ _main_cli()
From a1a8fb589e43cdc015eff8e7fe60a4a04f21a027 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 3 Oct 2025 12:39:25 +1000
Subject: [PATCH 10/74] feat(dataset): verbose chat-style CLI with diagnostics
Add --verbose and --meta-json; report memory footprint, time coverage, scaler parameters, clip bounds preview, and windowing math. Keep chat/box styles, headerless support, time-sort, decimation, quantile clipping, chronological splits, and train-only scaling.
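Usage sketch (assumes the same AMZN sample session as earlier patches; the output paths
meta.json and windows.npz are illustrative):

    python src/dataset.py --data-dir data/AMZN/2014-01-02 \
        --message AMZN_2014-01-02_34200000_57600000_message_10.csv \
        --orderbook AMZN_2014-01-02_34200000_57600000_orderbook_10.csv \
        --headerless-message --headerless-orderbook \
        --seq-len 64 --stride 16 --scaler standard \
        --verbose --meta-json meta.json --save-npz windows.npz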
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 136 +++++++++++++++++-
1 file changed, 133 insertions(+), 3 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index b88661041..717c98bbd 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -151,6 +151,27 @@ def _render_card(title: str, body_lines: list[str], c: _C, style: str = "chat",
return _bubble(title, body_lines, c, align=align) if style == "chat" else _panel(title, body_lines, c)
+# ============================== Verbose helpers ===============================
+
+def _fmt_bytes(n: int) -> str:
+ units = ["B", "KB", "MB", "GB", "TB"]
+ i = 0
+ f = float(n)
+ while f >= 1024 and i < len(units) - 1:
+ f /= 1024.0
+ i += 1
+ return f"{f:.2f} {units[i]}"
+
+def _first_last_time(msg_df: pd.DataFrame) -> tuple[str, str]:
+ if "time" not in msg_df.columns:
+ return ("", "")
+ try:
+ t = pd.to_datetime(msg_df["time"], errors="coerce", unit=None)
+ return (str(t.min()), str(t.max()))
+ except Exception:
+ return ("", "")
+
+
# ================================ Summaries ===================================
def _summarize_df(df: pd.DataFrame, name: str, peek: int = 5) -> List[str]:
@@ -530,7 +551,12 @@ def split_title(block: list[str]) -> tuple[str, list[str]]:
if t2:
print(_render_card(f"🟢 {t2}", b2, c, style=style, align="left"))
-def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str) -> None:
+def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str, *,
+ verbose: bool = False,
+ scaler_obj = None,
+ clip_bounds = None,
+ time_coverage: tuple[str, str] = ("","")) -> None:
+ # Basic block
block1 = [
("train windows", "×".join(map(str, W_train.shape))),
("val windows", "×".join(map(str, W_val.shape))),
@@ -546,12 +572,14 @@ def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str) -> None
lines1 = _kv_table(block1, width=min(_term_width(), 84))
print(_render_card("Preprocessing report", lines1, c, style=style, align="right"))
+ # Row counts
rc = meta.get("row_counts", {})
if rc:
block2 = [(k, str(v)) for k, v in rc.items()]
lines2 = _kv_table(block2, width=min(_term_width(), 84))
print(_render_card("Row counts", lines2, c, style=style, align="right"))
+ # Sample window stats
if getattr(W_train, "size", 0):
win = W_train[0]
block3 = [
@@ -562,6 +590,78 @@ def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str) -> None
lines3 = _kv_table(block3, width=min(_term_width(), 84))
print(_render_card("Sample window", lines3, c, style=style, align="right"))
+ if not verbose:
+ return
+
+ # Verbose extras
+ vlines: list[str] = []
+ # Memory footprint
+ total_bytes = (W_train.nbytes if hasattr(W_train, "nbytes") else 0) + \
+ (W_val.nbytes if hasattr(W_val, "nbytes") else 0) + \
+ (W_test.nbytes if hasattr(W_test, "nbytes") else 0)
+ vlines.append(f"memory total: {_fmt_bytes(total_bytes)}")
+ vlines.append(f"train bytes: {_fmt_bytes(getattr(W_train, 'nbytes', 0))}")
+ vlines.append(f"val bytes: {_fmt_bytes(getattr(W_val, 'nbytes', 0))}")
+ vlines.append(f"test bytes: {_fmt_bytes(getattr(W_test, 'nbytes', 0))}")
+
+ # Time coverage if available
+ tmin, tmax = time_coverage
+ if tmin or tmax:
+ vlines.append(f"time coverage: {tmin} → {tmax}")
+
+ print(_render_card("Resources & coverage", vlines, c, style=style, align="right"))
+
+ # Scaler params
+ if scaler_obj is not None:
+ s_lines = []
+ if hasattr(scaler_obj, "mean_") and hasattr(scaler_obj, "scale_"):
+ # StandardScaler
+ means = scaler_obj.mean_
+ scales = scaler_obj.scale_
+ s_lines += _kv_table([
+ ("type", "StandardScaler"),
+ ("mean[0:8]", np.array2string(means[:8], precision=4, separator=", ")),
+ ("scale[0:8]", np.array2string(scales[:8], precision=4, separator=", ")),
+ ], width=min(_term_width(), 84))
+ elif hasattr(scaler_obj, "data_min_") and hasattr(scaler_obj, "data_max_"):
+ # MinMaxScaler
+ s_lines += _kv_table([
+ ("type", "MinMaxScaler"),
+ ("data_min[0:8]", np.array2string(scaler_obj.data_min_[:8], precision=4, separator=", ")),
+ ("data_max[0:8]", np.array2string(scaler_obj.data_max_[:8], precision=4, separator=", ")),
+ ("feature_range", str(getattr(scaler_obj, "feature_range", None))),
+ ], width=min(_term_width(), 84))
+ if s_lines:
+ print(_render_card("Scaler parameters", s_lines, c, style=style, align="right"))
+
+ # Clip bounds preview
+ if clip_bounds is not None:
+ lo, hi = clip_bounds
+ cb_lines = _kv_table([
+ ("q-lo[0:8]", np.array2string(lo[:8], precision=4, separator=", ")),
+ ("q-hi[0:8]", np.array2string(hi[:8], precision=4, separator=", ")),
+ ], width=min(_term_width(), 84))
+ print(_render_card("Clip bounds (preview)", cb_lines, c, style=style, align="right"))
+
+ # Per-split window counts and overlap ratio
+ def _count_windows(n_rows: int, seq_len: int, stride: int) -> int:
+ if n_rows < seq_len:
+ return 0
+ return 1 + (n_rows - seq_len) // stride
+
+ rc_train = rc.get("train", 0)
+ rc_val = rc.get("val", 0)
+ rc_test = rc.get("test", 0)
+ overlap = 1.0 - (meta.get("stride", 1) / max(1, meta.get("seq_len", 1)))
+ perf = _kv_table([
+ ("expected train windows", str(_count_windows(rc_train, meta.get("seq_len", 0), meta.get("stride", 1)))),
+ ("expected val windows", str(_count_windows(rc_val, meta.get("seq_len", 0), meta.get("stride", 1)))),
+ ("expected test windows", str(_count_windows(rc_test, meta.get("seq_len", 0), meta.get("stride", 1)))),
+ ("overlap ratio", f"{overlap:.3f}"),
+ ], width=min(_term_width(), 84))
+ print(_render_card("Windowing details", perf, c, style=style, align="right"))
+
+
def _main_cli():
parser = argparse.ArgumentParser(description="LOBSTERData (preprocess + summarize).")
parser.add_argument("--data-dir", default="data")
@@ -586,6 +686,8 @@ def _main_cli():
parser.add_argument("--clip-quantiles", type=float, nargs=2, metavar=("QMIN", "QMAX"), default=None)
parser.add_argument("--style", choices=["chat", "box"], default="chat", help="Output style")
parser.add_argument("--no-color", action="store_true", help="Disable ANSI colors in output")
+ parser.add_argument("--verbose", action="store_true", help="Print extra diagnostics (memory, scaler, clip bounds)")
+ parser.add_argument("--meta-json", type=str, default=None, help="Optional path to dump meta JSON")
args = parser.parse_args()
c = _C(_supports_color(args.no_color))
@@ -617,8 +719,36 @@ def _main_cli():
W_train, W_val, W_test = loader.load_arrays()
meta = loader.get_meta()
- _print_report(W_train, W_val, W_test, meta, c, style=args.style)
+ # verbose context
+ scaler_obj = loader.get_scaler()
+ clip_bounds = None
+ if meta.get("clip_bounds"):
+ lo = np.array(meta["clip_bounds"]["lo"], dtype=float)
+ hi = np.array(meta["clip_bounds"]["hi"], dtype=float)
+ clip_bounds = (lo, hi)
+
+ # best-effort message time coverage
+ try:
+ msg_df, _ = loader._load_csvs()
+ tmin, tmax = _first_last_time(msg_df)
+ except Exception:
+ tmin = tmax = ""
+
+ _print_report(
+ W_train, W_val, W_test, meta, c, style=args.style,
+ verbose=args.verbose, scaler_obj=scaler_obj,
+ clip_bounds=clip_bounds, time_coverage=(tmin, tmax)
+ )
+
+ # optional meta dump
+ if args.meta_json:
+ import json
+ with open(args.meta_json, "w", encoding="utf-8") as f:
+ json.dump(meta, f, indent=2)
+ print(_render_card("Saved", [f"meta: {args.meta_json}"], c, style=args.style, align="right"))
+
+ # optional arrays NPZ
if args.save_npz:
np.savez_compressed(
args.save_npz,
@@ -626,7 +756,7 @@ def _main_cli():
feature_names=np.array(loader.get_feature_names(), dtype=object),
meta=np.array([str(meta)], dtype=object),
)
- print(_render_card("💾 Saved", [f"path: {args.save_npz}"], c, style=args.style, align="right"))
+ print(_render_card("Saved", [f"windows: {args.save_npz}"], c, style=args.style, align="right"))
if __name__ == "__main__":
From 07860dd41436cc33ab324601958e5a6e6d29542c Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 3 Oct 2025 15:20:34 +1000
Subject: [PATCH 11/74] feat(dataset): pretty tables via tabulate; render
cleanly in chat/box
Integrate tabulate for head/tail/describe and 2-col KV sections. Preserve table lines inside bubbles/boxes (no wrapping) and auto-fit inner width to widest table row. Retains headerless support, time sort, decimation, quantile clipping, chronological splits, train-only scaling, and verbose diagnostics.
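A minimal sketch of the key-value rendering this patch switches to (tabulate is the
dependency added to environment.yml; the "github" format emits '|'-delimited rows that
_is_table_line() recognises and leaves unwrapped inside bubbles/panels; the row values
below are illustrative):

    from tabulate import tabulate

    rows = [("train windows", "123×64×5"), ("scaler", "StandardScaler")]
    # Two-column GitHub-style table, as produced by _kv_table() for each card
    print(tabulate(rows, headers=["key", "value"], tablefmt="github", stralign="left"))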
---
.../TimeLOB_TimeGAN_49088276/environment.yml | 1 +
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 216 +++++++++++++++---
2 files changed, 185 insertions(+), 32 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/environment.yml b/recognition/TimeLOB_TimeGAN_49088276/environment.yml
index b085e0eb3..a329baaf8 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/environment.yml
+++ b/recognition/TimeLOB_TimeGAN_49088276/environment.yml
@@ -4,6 +4,7 @@ channels:
dependencies:
- python=3.13
- pip
+ - tabulate
- numpy
- pandas
- scipy
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 717c98bbd..1a41b3578 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -35,6 +35,7 @@
from __future__ import annotations
import os
+import re
import argparse
import shutil
from datetime import datetime
@@ -43,6 +44,7 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from tabulate import tabulate
# ============================== Pretty printing ===============================
@@ -89,31 +91,66 @@ def _wrap(s: str, width: int) -> list[str]:
out.append(cur)
return out
+# ---- detect and preserve tabulate tables inside panels/bubbles ----
+
+_ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
+
+def _visible_len(s: str) -> int:
+ """Visible length without ANSI codes (so width calc matches terminal)."""
+ return len(_ANSI_RE.sub("", s))
+
+def _is_table_line(s: str) -> bool:
+ """
+ Heuristic for tabulate-like lines we should not wrap:
+ - GitHub style: lines starting with '|' and having columns separated by '|'
+ - Grid style: rule lines with '+' borders
+ - Simple header/rule lines made of '-:|+ '
+ """
+ t = s.strip()
+ if not t:
+ return False
+ if t.startswith("|") and "|" in t[1:]:
+ return True
+ if t.startswith("+") and t.endswith("+"):
+ return True
+ if set(t) <= set("-:|+ "):
+ return True
+ return False
+
def _kv_table(rows: list[tuple[str, str]], width: int, pad: int = 2) -> list[str]:
- """Render aligned key: value lines as a compact message table."""
+ """
+ Render key–value rows as a compact 2-col table using tabulate.
+ Returns a list of lines to embed inside bubbles/boxes.
+ """
if not rows:
return []
- key_w = min(max(len(k) for k,_ in rows), max(12, int(0.35*width)))
- val_w = max(8, width - key_w - pad)
- lines: list[str] = []
- for k, v in rows:
- k = (k[:key_w-1] + "…") if len(k) > key_w else k
- wrapped = _wrap(v, val_w)
- lines.append(f"{k.ljust(key_w)}: {wrapped[0]}")
- for cont in wrapped[1:]:
- lines.append(f"{' '*key_w} {cont}")
- return lines
+ table = tabulate(rows, headers=["key", "value"], tablefmt="github", stralign="left")
+ return table.splitlines()
def _bubble(title: str, body_lines: list[str], c: _C, align: str = "left", width: int | None = None) -> str:
"""
Render a chat-style message bubble.
- align: 'left' (incoming) or 'right' (outgoing)
+ - Does NOT wrap lines that look like preformatted tables.
+ - Auto-fits inner width to the widest table line (within terminal limit).
"""
- width = min(_term_width(), width or _term_width())
- max_inner = max(24, width - 10) # inner text width
+ termw = _term_width()
+ width = min(termw, width or termw)
+
+ # Baseline inner width
+ base_inner = max(24, width - 10)
+
+ # If there are preformatted table lines, fit to the widest visible line
+ widest_tbl = 0
+ for ln in body_lines:
+ if _is_table_line(ln):
+ widest_tbl = max(widest_tbl, _visible_len(ln))
+ max_inner = min(max(base_inner, widest_tbl), width - 10)
+
+ # Left/right alignment
indent = 2 if align == "left" else max(2, width - (max_inner + 8))
pad = " " * indent
+ # Header
ts = datetime.now().strftime("%H:%M")
head = f"{c.BOLD}{title}{c.RESET} {c.DIM}{ts}{c.RESET}"
head_lines = _wrap(head, max_inner)
@@ -121,29 +158,64 @@ def _bubble(title: str, body_lines: list[str], c: _C, align: str = "left", width
for hl in head_lines[1:]:
lines.append(pad + " " + hl)
- # bubble
+ # Bubble top border
lines.append(pad + " " + ("╭" + "─" * (max_inner + 2) + "╮"))
+
+ # Body: keep table lines intact; wrap normal text
for ln in body_lines:
- for wln in _wrap(ln, max_inner):
- lines.append(pad + " " + "│ " + wln.ljust(max_inner) + " │")
+ if _is_table_line(ln):
+ vis = _visible_len(ln)
+ if vis <= max_inner:
+ out = ln + " " * (max_inner - vis)
+ else:
+ out = ln[:max_inner]
+ lines.append(pad + " " + "│ " + out + " │")
+ else:
+ for wln in _wrap(ln, max_inner):
+ lines.append(pad + " " + "│ " + wln.ljust(max_inner) + " │")
+
+ # Bubble bottom + tail
tail_left = pad + " " + "╰" + "─" * (max_inner + 2) + "╯" + "⟋"
tail_right = pad + " " + "⟍" + "╰" + "─" * (max_inner + 2) + "╯"
lines.append(tail_left if align == "left" else tail_right)
return "\n".join(lines)
def _panel(title: str, body_lines: list[str], c: _C, width: int | None = None) -> str:
- """Box panel fallback (non-chat style)."""
- width = width or _term_width()
+ """Box panel; does not wrap tabulated lines; auto-fits to widest table row."""
+ termw = _term_width()
+ width = width or termw
+ inner = width - 4 # borders + spaces
+
+ # Fit inner width to widest table line if present (within terminal width)
+ widest_tbl = 0
+ for ln in body_lines:
+ if _is_table_line(ln):
+ widest_tbl = max(widest_tbl, _visible_len(ln))
+ inner = min(max(inner, widest_tbl), termw - 4)
+ width = inner + 4
+
border = "─" * (width - 2)
out = [f"{c.CYAN}┌{border}┐{c.RESET}"]
title_line = f" {title} "
pad = max(0, width - 2 - len(title_line))
out.append(f"{c.CYAN}│{c.RESET}{c.BOLD}{title_line}{c.RESET}{' '*pad}{c.CYAN}│{c.RESET}")
out.append(f"{c.CYAN}├{border}┤{c.RESET}")
+
for ln in body_lines:
- for sub in _wrap(ln, width - 4):
- pad = max(0, width - 4 - len(sub))
- out.append(f"{c.CYAN}│{c.RESET} {sub}{' '*pad} {c.CYAN}│{c.RESET}")
+ if _is_table_line(ln):
+ vis = _visible_len(ln)
+ # inner-2 for side spaces inside the box content
+ width_ok = inner - 2
+ if vis <= width_ok:
+ body = ln + " " * (width_ok - vis)
+ else:
+ body = ln[:width_ok]
+ out.append(f"{c.CYAN}│{c.RESET} {body} {c.CYAN}│{c.RESET}")
+ else:
+ for sub in _wrap(ln, inner - 2):
+ padlen = max(0, (inner - 2) - len(sub))
+ out.append(f"{c.CYAN}│{c.RESET} {sub}{' '*padlen} {c.CYAN}│{c.RESET}")
+
out.append(f"{c.CYAN}└{border}┘{c.RESET}")
return "\n".join(out)
@@ -201,20 +273,24 @@ def _summarize_df(df: pd.DataFrame, name: str, peek: int = 5) -> List[str]:
lines.append(f"time monotonic nondecreasing: {is_mono}")
except Exception:
pass
+
+ # numeric quick stats (pretty table)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if num_cols:
- sample_cols = num_cols[:6]
- desc = df[sample_cols].describe().to_dict()
- desc = {k: {m: float(v) for m, v in stats.items()} for k, stats in desc.items()}
+ sample_cols = num_cols[: min(8, len(num_cols))]
+ desc_df = df[sample_cols].describe().round(6)
lines.append("describe(sample numeric cols):")
- for k, stats in desc.items():
- stats_str = ", ".join([f"{m}={val:.4g}" for m, val in stats.items()])
- lines.append(f" {k}: {stats_str}")
+ lines.extend(tabulate(desc_df, headers="keys", tablefmt="github").splitlines())
+
+ # head / tail (pretty tables)
if peek > 0:
lines.append("head:")
- lines.append(df.head(peek).to_string(index=False))
+ head_tbl = tabulate(df.head(peek), headers="keys", tablefmt="github", showindex=False)
+ lines.extend(head_tbl.splitlines())
lines.append("tail:")
- lines.append(df.tail(peek).to_string(index=False))
+ tail_tbl = tabulate(df.tail(peek), headers="keys", tablefmt="github", showindex=False)
+ lines.extend(tail_tbl.splitlines())
+
return lines
@@ -546,10 +622,10 @@ def split_title(block: list[str]) -> tuple[str, list[str]]:
t1, b1 = split_title(msg_part)
if t1:
- print(_render_card(f"🟣 {t1}", b1, c, style=style, align="left"))
+ print(_render_card(f"{t1}", b1, c, style=style, align="left"))
t2, b2 = split_title(ob_part)
if t2:
- print(_render_card(f"🟢 {t2}", b2, c, style=style, align="left"))
+ print(_render_card(f"{t2}", b2, c, style=style, align="left"))
def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str, *,
verbose: bool = False,
@@ -662,6 +738,81 @@ def _count_windows(n_rows: int, seq_len: int, stride: int) -> int:
print(_render_card("Windowing details", perf, c, style=style, align="right"))
+# ========================== Dataset info (report card) ========================
+
+def _print_dataset_info(loader: "LOBSTERData", c: _C, style: str, peek: int = 5) -> None:
+ """Print detailed information about the dataset and feature set."""
+ meta = loader.get_meta()
+ feature_set = meta.get("feature_set")
+ feats = meta.get("feature_names") or []
+
+ # Fallback feature names if meta is empty
+ if not feats:
+ if feature_set == "core":
+ feats = [
+ "mid_price",
+ "spread",
+ "mid_log_return",
+ "queue_imbalance_l1",
+ "depth_imbalance_l10",
+ ]
+ elif feature_set == "raw10":
+ feats = (
+ [f"ask_price_{i}" for i in range(1, 11)] +
+ [f"ask_size_{i}" for i in range(1, 11)] +
+ [f"bid_price_{i}" for i in range(1, 11)] +
+ [f"bid_size_{i}" for i in range(1, 11)]
+ )
+
+ lines: List[str] = [
+ f"Feature set: {feature_set}",
+ f"Total features: {len(feats)}",
+ ""
+ ]
+
+ # aggregated statistics across splits (pretty tables)
+ try:
+ W_train, W_val, W_test = loader.load_arrays()
+ if W_train.size + W_val.size + W_test.size == 0:
+ raise ValueError("No windows produced; consider lowering seq_len or stride.")
+ blocks = []
+ for W in (W_train, W_val, W_test):
+ if getattr(W, "size", 0):
+ blocks.append(W.reshape(-1, W.shape[-1]))
+ all_data = np.concatenate(blocks, axis=0)
+ df = pd.DataFrame(all_data, columns=feats)
+
+ # describe()
+ lines.append("Statistical summary (aggregated across splits):")
+ desc_df = df.describe().round(6)
+ lines.extend(tabulate(desc_df, headers="keys", tablefmt="github").splitlines())
+ lines.append("")
+
+ # peaks: means and stds tables
+ means = df.mean().sort_values(ascending=False).head(5)
+ stds = df.std().sort_values(ascending=False).head(5)
+
+ lines.append("Highest-mean features:")
+ lines.extend(tabulate(list(means.items()), headers=["feature", "mean"], tablefmt="github").splitlines())
+ lines.append("")
+
+ lines.append("Most-variable features (by std):")
+ lines.extend(tabulate(list(stds.items()), headers=["feature", "std"], tablefmt="github").splitlines())
+ lines.append("")
+
+ # example rows
+ lines.append("Example rows (first few timesteps):")
+ ex_tbl = tabulate(df.head(peek).round(6), headers="keys", tablefmt="github", showindex=True)
+ lines.extend(ex_tbl.splitlines())
+
+ except Exception as e:
+ lines.append(f"(Could not compute stats: {e})")
+
+ print(_render_card("Dataset summary", lines, c, style=style, align="left"))
+
+
+# ================================== CLI ======================================
+
def _main_cli():
parser = argparse.ArgumentParser(description="LOBSTERData (preprocess + summarize).")
parser.add_argument("--data-dir", default="data")
@@ -715,6 +866,7 @@ def _main_cli():
if args.summary:
lines = loader.summarize(peek=args.peek)
_print_summary(lines, c, style=args.style)
+ _print_dataset_info(loader, c, style=args.style, peek=args.peek)
return
W_train, W_val, W_test = loader.load_arrays()
From 790de5dea465cfd3d6904d1fa47dc2f44c6f7148 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 3 Oct 2025 16:37:49 +1000
Subject: [PATCH 12/74] feat(dataset): colored, polished CLI with
 tabulate-rendered tables
Add ANSI color themes, chat/box message panels, and --table-style (github|grid|simple). Preserve tabulate tables inside panels without re-wrapping them and auto-fit panel widths to the widest table row. Keep headerless support, time sorting, decimation, quantile clipping, chronological splits, train-only scaling, verbose diagnostics, and the dataset summary report.
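
As a quick illustration (a standalone sketch, not part of this patch), the three
--table-style options render the same key-value rows as shown below; the panel
code only pads these preformatted lines and never re-wraps them:

    from tabulate import tabulate

    rows = [("seq_len", 64), ("stride", 64), ("feature_set", "core")]
    for fmt in ("github", "grid", "simple"):
        print(f"[{fmt}]")
        print(tabulate(rows, headers=["key", "value"], tablefmt=fmt))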
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 201 ++++++++----------
1 file changed, 93 insertions(+), 108 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 1a41b3578..3dc56b753 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -68,8 +68,9 @@ def __init__(self, enabled: bool):
self.GREEN = "\033[32m" if enabled else ""
self.MAGENTA= "\033[35m" if enabled else ""
self.BLUE = "\033[34m" if enabled else ""
+ self.RED = "\033[31m" if enabled else ""
-def _term_width(default: int = 96) -> int:
+def _term_width(default: int = 100) -> int:
try:
return shutil.get_terminal_size((default, 20)).columns
except Exception:
@@ -85,8 +86,7 @@ def _wrap(s: str, width: int) -> list[str]:
elif len(cur) + 1 + len(tok) <= width:
cur += " " + tok
else:
- out.append(cur)
- cur = tok
+ out.append(cur); cur = tok
if cur:
out.append(cur)
return out
@@ -117,14 +117,21 @@ def _is_table_line(s: str) -> bool:
return True
return False
-def _kv_table(rows: list[tuple[str, str]], width: int, pad: int = 2) -> list[str]:
+# Global table format (overridable via CLI)
+TABLE_FMT = "github"
+
+def _kv_table(rows: list[tuple[str, str]], width: int, c: _C, headers: tuple[str,str]=("key","value")) -> list[str]:
"""
Render key–value rows as a compact 2-col table using tabulate.
Returns a list of lines to embed inside bubbles/boxes.
"""
if not rows:
return []
- table = tabulate(rows, headers=["key", "value"], tablefmt="github", stralign="left")
+ h_key = f"{c.BOLD}{c.MAGENTA}{headers[0]}{c.RESET}" if c.enabled else headers[0]
+ h_val = f"{c.BOLD}{c.MAGENTA}{headers[1]}{c.RESET}" if c.enabled else headers[1]
+ # tint keys
+ tinted = [(f"{c.CYAN}{k}{c.RESET}" if c.enabled else k, v) for k, v in rows]
+ table = tabulate(tinted, headers=[h_key, h_val], tablefmt=TABLE_FMT, stralign="left")
return table.splitlines()
def _bubble(title: str, body_lines: list[str], c: _C, align: str = "left", width: int | None = None) -> str:
@@ -139,7 +146,7 @@ def _bubble(title: str, body_lines: list[str], c: _C, align: str = "left", width
# Baseline inner width
base_inner = max(24, width - 10)
- # If there are preformatted table lines, fit to the widest visible line
+ # Expand to widest table row if present
widest_tbl = 0
for ln in body_lines:
if _is_table_line(ln):
@@ -152,7 +159,8 @@ def _bubble(title: str, body_lines: list[str], c: _C, align: str = "left", width
# Header
ts = datetime.now().strftime("%H:%M")
- head = f"{c.BOLD}{title}{c.RESET} {c.DIM}{ts}{c.RESET}"
+ title_colored = f"{c.BOLD}{c.BLUE}{title}{c.RESET}" if c.enabled else title
+ head = f"{title_colored} {c.DIM}{ts}{c.RESET}"
head_lines = _wrap(head, max_inner)
lines = [pad + " " + head_lines[0]]
for hl in head_lines[1:]:
@@ -165,11 +173,8 @@ def _bubble(title: str, body_lines: list[str], c: _C, align: str = "left", width
for ln in body_lines:
if _is_table_line(ln):
vis = _visible_len(ln)
- if vis <= max_inner:
- out = ln + " " * (max_inner - vis)
- else:
- out = ln[:max_inner]
- lines.append(pad + " " + "│ " + out + " │")
+ out = ln + " " * max(0, max_inner - vis)
+ lines.append(pad + " " + "│ " + out[:max_inner] + " │")
else:
for wln in _wrap(ln, max_inner):
lines.append(pad + " " + "│ " + wln.ljust(max_inner) + " │")
@@ -195,22 +200,19 @@ def _panel(title: str, body_lines: list[str], c: _C, width: int | None = None) -
width = inner + 4
border = "─" * (width - 2)
+ title_colored = f"{c.BOLD}{c.BLUE}{title}{c.RESET}" if c.enabled else title
out = [f"{c.CYAN}┌{border}┐{c.RESET}"]
- title_line = f" {title} "
- pad = max(0, width - 2 - len(title_line))
- out.append(f"{c.CYAN}│{c.RESET}{c.BOLD}{title_line}{c.RESET}{' '*pad}{c.CYAN}│{c.RESET}")
+ title_line = f" {title_colored} "
+ pad = max(0, width - 2 - _visible_len(title_line))
+ out.append(f"{c.CYAN}│{c.RESET}{title_line}{' '*pad}{c.CYAN}│{c.RESET}")
out.append(f"{c.CYAN}├{border}┤{c.RESET}")
for ln in body_lines:
if _is_table_line(ln):
vis = _visible_len(ln)
- # inner-2 for side spaces inside the box content
width_ok = inner - 2
- if vis <= width_ok:
- body = ln + " " * (width_ok - vis)
- else:
- body = ln[:width_ok]
- out.append(f"{c.CYAN}│{c.RESET} {body} {c.CYAN}│{c.RESET}")
+ body = ln + " " * max(0, width_ok - vis)
+ out.append(f"{c.CYAN}│{c.RESET} {body[:width_ok]} {c.CYAN}│{c.RESET}")
else:
for sub in _wrap(ln, inner - 2):
padlen = max(0, (inner - 2) - len(sub))
@@ -227,11 +229,9 @@ def _render_card(title: str, body_lines: list[str], c: _C, style: str = "chat",
def _fmt_bytes(n: int) -> str:
units = ["B", "KB", "MB", "GB", "TB"]
- i = 0
- f = float(n)
+ i = 0; f = float(n)
while f >= 1024 and i < len(units) - 1:
- f /= 1024.0
- i += 1
+ f /= 1024.0; i += 1
return f"{f:.2f} {units[i]}"
def _first_last_time(msg_df: pd.DataFrame) -> tuple[str, str]:
@@ -246,9 +246,10 @@ def _first_last_time(msg_df: pd.DataFrame) -> tuple[str, str]:
# ================================ Summaries ===================================
-def _summarize_df(df: pd.DataFrame, name: str, peek: int = 5) -> List[str]:
+def _summarize_df(df: pd.DataFrame, name: str, peek: int, c: _C) -> List[str]:
lines: List[str] = []
- lines.append(f"{name}")
+ title = f"{c.BOLD}{name}{c.RESET}" if c.enabled else name
+ lines.append(title)
lines.append(f"shape: {df.shape[0]} rows × {df.shape[1]} cols")
cols = list(df.columns)
col_str = ", ".join(cols)
@@ -279,16 +280,16 @@ def _summarize_df(df: pd.DataFrame, name: str, peek: int = 5) -> List[str]:
if num_cols:
sample_cols = num_cols[: min(8, len(num_cols))]
desc_df = df[sample_cols].describe().round(6)
- lines.append("describe(sample numeric cols):")
- lines.extend(tabulate(desc_df, headers="keys", tablefmt="github").splitlines())
+ lines.append(f"{c.BOLD}describe(sample numeric cols):{c.RESET}" if c.enabled else "describe(sample numeric cols):")
+ lines.extend(tabulate(desc_df, headers="keys", tablefmt=TABLE_FMT).splitlines())
# head / tail (pretty tables)
if peek > 0:
- lines.append("head:")
- head_tbl = tabulate(df.head(peek), headers="keys", tablefmt="github", showindex=False)
+ lines.append(f"{c.BOLD}head:{c.RESET}" if c.enabled else "head:")
+ head_tbl = tabulate(df.head(peek), headers="keys", tablefmt=TABLE_FMT, showindex=False)
lines.extend(head_tbl.splitlines())
- lines.append("tail:")
- tail_tbl = tabulate(df.tail(peek), headers="keys", tablefmt="github", showindex=False)
+ lines.append(f"{c.BOLD}tail:{c.RESET}" if c.enabled else "tail:")
+ tail_tbl = tabulate(df.tail(peek), headers="keys", tablefmt=TABLE_FMT, showindex=False)
lines.extend(tail_tbl.splitlines())
return lines
@@ -394,7 +395,7 @@ def load_arrays(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
W_test = W_test.astype(self.output_dtype, copy=False)
return W_train, W_val, W_test
- def summarize(self, peek: int = 5) -> List[str]:
+ def summarize(self, peek: int, c: _C) -> List[str]:
msg_df, ob_df = self._load_csvs()
_ = self._normalize_orderbook_headers(
ob_df,
@@ -404,9 +405,9 @@ def summarize(self, peek: int = 5) -> List[str]:
+ [f"bid_size_{i}" for i in range(1, 11)]
)
lines = []
- lines += _summarize_df(msg_df, "message_10.csv", peek=peek)
+ lines += _summarize_df(msg_df, "message_10.csv", peek=peek, c=c)
lines.append("") # spacer between the two tables
- lines += _summarize_df(ob_df, "orderbook_10.csv", peek=peek)
+ lines += _summarize_df(ob_df, "orderbook_10.csv", peek=peek, c=c)
return lines
def get_feature_names(self) -> List[str]:
@@ -602,7 +603,7 @@ def _print_dir_listing(path: str, c: _C, style: str) -> None:
if len(files) > 10:
body.append(f"• (+{len(files)-10} more)")
else:
- body = [f"path: {path}", "files: (missing)"]
+ body = [f"path: {path}", f"{c.RED}files: (missing){c.RESET}" if c.enabled else "files: (missing)"]
print(_render_card("Data directory", body, c, style=style, align="left"))
def _print_summary(lines: list[str], c: _C, style: str) -> None:
@@ -622,10 +623,10 @@ def split_title(block: list[str]) -> tuple[str, list[str]]:
t1, b1 = split_title(msg_part)
if t1:
- print(_render_card(f"{t1}", b1, c, style=style, align="left"))
+ print(_render_card(t1, b1, c, style=style, align="left"))
t2, b2 = split_title(ob_part)
if t2:
- print(_render_card(f"{t2}", b2, c, style=style, align="left"))
+ print(_render_card(t2, b2, c, style=style, align="left"))
def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str, *,
verbose: bool = False,
@@ -645,14 +646,14 @@ def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str, *,
("sorted_by_time",str(meta.get("sorted_by_time"))),
("every", str(meta.get("every"))),
]
- lines1 = _kv_table(block1, width=min(_term_width(), 84))
+ lines1 = _kv_table(block1, width=min(_term_width(), 84), c=c)
print(_render_card("Preprocessing report", lines1, c, style=style, align="right"))
# Row counts
rc = meta.get("row_counts", {})
if rc:
block2 = [(k, str(v)) for k, v in rc.items()]
- lines2 = _kv_table(block2, width=min(_term_width(), 84))
+ lines2 = _kv_table(block2, width=min(_term_width(), 84), c=c)
print(_render_card("Row counts", lines2, c, style=style, align="right"))
# Sample window stats
@@ -663,7 +664,7 @@ def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str, *,
("window[0] std", f"{float(win.std()):.6f}"),
("features", ", ".join(meta.get("feature_names", [])[:8]) + ("…" if len(meta.get("feature_names", []))>8 else "")),
]
- lines3 = _kv_table(block3, width=min(_term_width(), 84))
+ lines3 = _kv_table(block3, width=min(_term_width(), 84), c=c)
print(_render_card("Sample window", lines3, c, style=style, align="right"))
if not verbose:
@@ -671,16 +672,14 @@ def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str, *,
# Verbose extras
vlines: list[str] = []
- # Memory footprint
- total_bytes = (W_train.nbytes if hasattr(W_train, "nbytes") else 0) + \
- (W_val.nbytes if hasattr(W_val, "nbytes") else 0) + \
- (W_test.nbytes if hasattr(W_test, "nbytes") else 0)
+ total_bytes = (getattr(W_train, "nbytes", 0) +
+ getattr(W_val, "nbytes", 0) +
+ getattr(W_test, "nbytes", 0))
vlines.append(f"memory total: {_fmt_bytes(total_bytes)}")
vlines.append(f"train bytes: {_fmt_bytes(getattr(W_train, 'nbytes', 0))}")
vlines.append(f"val bytes: {_fmt_bytes(getattr(W_val, 'nbytes', 0))}")
vlines.append(f"test bytes: {_fmt_bytes(getattr(W_test, 'nbytes', 0))}")
- # Time coverage if available
tmin, tmax = time_coverage
if tmin or tmax:
vlines.append(f"time coverage: {tmin} → {tmax}")
@@ -689,53 +688,47 @@ def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str, *,
# Scaler params
if scaler_obj is not None:
- s_lines = []
+ s_rows = []
if hasattr(scaler_obj, "mean_") and hasattr(scaler_obj, "scale_"):
- # StandardScaler
- means = scaler_obj.mean_
- scales = scaler_obj.scale_
- s_lines += _kv_table([
+ s_rows = [
("type", "StandardScaler"),
- ("mean[0:8]", np.array2string(means[:8], precision=4, separator=", ")),
- ("scale[0:8]", np.array2string(scales[:8], precision=4, separator=", ")),
- ], width=min(_term_width(), 84))
+ ("mean[0:8]", np.array2string(scaler_obj.mean_[:8], precision=4, separator=", ")),
+ ("scale[0:8]", np.array2string(scaler_obj.scale_[:8], precision=4, separator=", ")),
+ ]
elif hasattr(scaler_obj, "data_min_") and hasattr(scaler_obj, "data_max_"):
- # MinMaxScaler
- s_lines += _kv_table([
+ s_rows = [
("type", "MinMaxScaler"),
("data_min[0:8]", np.array2string(scaler_obj.data_min_[:8], precision=4, separator=", ")),
("data_max[0:8]", np.array2string(scaler_obj.data_max_[:8], precision=4, separator=", ")),
("feature_range", str(getattr(scaler_obj, "feature_range", None))),
- ], width=min(_term_width(), 84))
- if s_lines:
- print(_render_card("Scaler parameters", s_lines, c, style=style, align="right"))
+ ]
+ if s_rows:
+ print(_render_card("Scaler parameters", _kv_table(s_rows, min(_term_width(),84), c=c), c, style=style, align="right"))
# Clip bounds preview
if clip_bounds is not None:
lo, hi = clip_bounds
- cb_lines = _kv_table([
+ cb_rows = [
("q-lo[0:8]", np.array2string(lo[:8], precision=4, separator=", ")),
("q-hi[0:8]", np.array2string(hi[:8], precision=4, separator=", ")),
- ], width=min(_term_width(), 84))
- print(_render_card("Clip bounds (preview)", cb_lines, c, style=style, align="right"))
+ ]
+ print(_render_card("Clip bounds (preview)", _kv_table(cb_rows, min(_term_width(),84), c=c), c, style=style, align="right"))
- # Per-split window counts and overlap ratio
+ # Windowing math
def _count_windows(n_rows: int, seq_len: int, stride: int) -> int:
if n_rows < seq_len:
return 0
return 1 + (n_rows - seq_len) // stride
- rc_train = rc.get("train", 0)
- rc_val = rc.get("val", 0)
- rc_test = rc.get("test", 0)
+ rc_train = rc.get("train", 0); rc_val = rc.get("val", 0); rc_test = rc.get("test", 0)
overlap = 1.0 - (meta.get("stride", 1) / max(1, meta.get("seq_len", 1)))
- perf = _kv_table([
+ perf_rows = [
("expected train windows", str(_count_windows(rc_train, meta.get("seq_len", 0), meta.get("stride", 1)))),
("expected val windows", str(_count_windows(rc_val, meta.get("seq_len", 0), meta.get("stride", 1)))),
("expected test windows", str(_count_windows(rc_test, meta.get("seq_len", 0), meta.get("stride", 1)))),
("overlap ratio", f"{overlap:.3f}"),
- ], width=min(_term_width(), 84))
- print(_render_card("Windowing details", perf, c, style=style, align="right"))
+ ]
+ print(_render_card("Windowing details", _kv_table(perf_rows, min(_term_width(),84), c=c), c, style=style, align="right"))
# ========================== Dataset info (report card) ========================
@@ -746,69 +739,56 @@ def _print_dataset_info(loader: "LOBSTERData", c: _C, style: str, peek: int = 5)
feature_set = meta.get("feature_set")
feats = meta.get("feature_names") or []
- # Fallback feature names if meta is empty
+ # Fallback feature names if meta not populated
if not feats:
if feature_set == "core":
- feats = [
- "mid_price",
- "spread",
- "mid_log_return",
- "queue_imbalance_l1",
- "depth_imbalance_l10",
- ]
+ feats = ["mid_price","spread","mid_log_return","queue_imbalance_l1","depth_imbalance_l10"]
elif feature_set == "raw10":
- feats = (
- [f"ask_price_{i}" for i in range(1, 11)] +
- [f"ask_size_{i}" for i in range(1, 11)] +
- [f"bid_price_{i}" for i in range(1, 11)] +
- [f"bid_size_{i}" for i in range(1, 11)]
- )
+ feats = ([f"ask_price_{i}" for i in range(1,11)] +
+ [f"ask_size_{i}" for i in range(1,11)] +
+ [f"bid_price_{i}" for i in range(1,11)] +
+ [f"bid_size_{i}" for i in range(1,11)])
- lines: List[str] = [
- f"Feature set: {feature_set}",
+ intro = [
+ f"Feature set: {c.BOLD}{feature_set}{c.RESET}" if c.enabled else f"Feature set: {feature_set}",
f"Total features: {len(feats)}",
""
]
- # aggregated statistics across splits (pretty tables)
try:
W_train, W_val, W_test = loader.load_arrays()
if W_train.size + W_val.size + W_test.size == 0:
- raise ValueError("No windows produced; consider lowering seq_len or stride.")
- blocks = []
- for W in (W_train, W_val, W_test):
- if getattr(W, "size", 0):
- blocks.append(W.reshape(-1, W.shape[-1]))
+ raise ValueError("No windows produced; lower seq_len or stride.")
+ blocks = [W.reshape(-1, W.shape[-1]) for W in (W_train, W_val, W_test) if getattr(W,"size",0)]
all_data = np.concatenate(blocks, axis=0)
df = pd.DataFrame(all_data, columns=feats)
# describe()
- lines.append("Statistical summary (aggregated across splits):")
+ intro.append(f"{c.BOLD}Statistical summary (aggregated across splits):{c.RESET}" if c.enabled else "Statistical summary (aggregated across splits):")
desc_df = df.describe().round(6)
- lines.extend(tabulate(desc_df, headers="keys", tablefmt="github").splitlines())
- lines.append("")
+ intro.extend(tabulate(desc_df, headers="keys", tablefmt=TABLE_FMT).splitlines())
+ intro.append("")
# peaks: means and stds tables
means = df.mean().sort_values(ascending=False).head(5)
stds = df.std().sort_values(ascending=False).head(5)
- lines.append("Highest-mean features:")
- lines.extend(tabulate(list(means.items()), headers=["feature", "mean"], tablefmt="github").splitlines())
- lines.append("")
+ intro.append(f"{c.BOLD}Highest-mean features:{c.RESET}" if c.enabled else "Highest-mean features:")
+ intro.extend(tabulate(list(means.items()), headers=[f"{c.MAGENTA}feature{c.RESET}" if c.enabled else "feature", "mean"], tablefmt=TABLE_FMT).splitlines())
+ intro.append("")
- lines.append("Most-variable features (by std):")
- lines.extend(tabulate(list(stds.items()), headers=["feature", "std"], tablefmt="github").splitlines())
- lines.append("")
+ intro.append(f"{c.BOLD}Most-variable features (by std):{c.RESET}" if c.enabled else "Most-variable features (by std):")
+ intro.extend(tabulate(list(stds.items()), headers=[f"{c.MAGENTA}feature{c.RESET}" if c.enabled else "feature", "std"], tablefmt=TABLE_FMT).splitlines())
+ intro.append("")
- # example rows
- lines.append("Example rows (first few timesteps):")
- ex_tbl = tabulate(df.head(peek).round(6), headers="keys", tablefmt="github", showindex=True)
- lines.extend(ex_tbl.splitlines())
+ intro.append(f"{c.BOLD}Example rows (first few timesteps):{c.RESET}" if c.enabled else "Example rows (first few timesteps):")
+ ex_tbl = tabulate(df.head(peek).round(6), headers="keys", tablefmt=TABLE_FMT, showindex=True)
+ intro.extend(ex_tbl.splitlines())
except Exception as e:
- lines.append(f"(Could not compute stats: {e})")
+ intro.append(f"{c.RED}(Could not compute stats: {e}){c.RESET}" if c.enabled else f"(Could not compute stats: {e})")
- print(_render_card("Dataset summary", lines, c, style=style, align="left"))
+ print(_render_card("Dataset summary", intro, c, style=style, align="left"))
# ================================== CLI ======================================
@@ -836,11 +816,16 @@ def _main_cli():
parser.add_argument("--every", type=int, default=1)
parser.add_argument("--clip-quantiles", type=float, nargs=2, metavar=("QMIN", "QMAX"), default=None)
parser.add_argument("--style", choices=["chat", "box"], default="chat", help="Output style")
+ parser.add_argument("--table-style", choices=["github","grid","simple"], default="github", help="Tabulate table style")
parser.add_argument("--no-color", action="store_true", help="Disable ANSI colors in output")
parser.add_argument("--verbose", action="store_true", help="Print extra diagnostics (memory, scaler, clip bounds)")
parser.add_argument("--meta-json", type=str, default=None, help="Optional path to dump meta JSON")
args = parser.parse_args()
+ # set global table format
+ global TABLE_FMT
+ TABLE_FMT = args.table_style
+
c = _C(_supports_color(args.no_color))
_print_dir_listing(args.data_dir, c, style=args.style)
@@ -864,7 +849,7 @@ def _main_cli():
)
if args.summary:
- lines = loader.summarize(peek=args.peek)
+ lines = loader.summarize(peek=args.peek, c=c)
_print_summary(lines, c, style=args.style)
_print_dataset_info(loader, c, style=args.style, peek=args.peek)
return
From a5e907be852fa2a69f22cbb524c73623ea858016 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 3 Oct 2025 17:19:58 +1000
Subject: [PATCH 13/74] docs(report): rewrite project overview for TimeGAN LOB
generation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Train a generative time series model on LOBSTER AMZN Level 10 data to
produce realistic limit order book sequences. Targets: KL divergence ≤0.1
for spread and midprice returns, and SSIM >0.6 for depth heatmaps. The
report records architecture and parameter count, training variants
(full, adversarial only, supervised only), GPU and VRAM, epochs, and
total training time. Includes 3–5 paired heatmaps with a short error
analysis.
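
A minimal sketch (not part of the patch) of how the two targets could be
checked, assuming scipy and scikit-image are available; real_spread,
fake_spread, and the heatmap arrays are placeholders:

    import numpy as np
    from scipy.stats import entropy                      # KL divergence between histograms
    from skimage.metrics import structural_similarity    # SSIM for depth heatmaps

    def kl_histogram(real, fake, bins=100, eps=1e-12):
        # KL(real || fake) over shared histogram bins (illustrative helper)
        lo, hi = min(real.min(), fake.min()), max(real.max(), fake.max())
        p, _ = np.histogram(real, bins=bins, range=(lo, hi), density=True)
        q, _ = np.histogram(fake, bins=bins, range=(lo, hi), density=True)
        return float(entropy(p + eps, q + eps))

    # target: kl_histogram(real_spread, fake_spread) <= 0.1 (same for mid-price returns)
    # target: structural_similarity(real_hm, fake_hm,
    #                               data_range=real_hm.max() - real_hm.min()) > 0.6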
---
recognition/TimeLOB_TimeGAN_49088276/README.MD | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/README.MD b/recognition/TimeLOB_TimeGAN_49088276/README.MD
index 1a01b637d..b155235ea 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/README.MD
+++ b/recognition/TimeLOB_TimeGAN_49088276/README.MD
@@ -1,4 +1,4 @@
-# TimeLOB
+# TimeGAN for Synthetic Limit Order Books (AMZN, LOBSTER Level-10)
**COMP3710 - Pattern Recognition and Analysis**
@@ -15,3 +15,11 @@
+## Project Overview
+This project trains a generative time series model to produce realistic sequences of limit order book events using the LOBSTER dataset, focusing on AMZN Level 10 data. The aim is to create high-quality synthetic LOB sequences that can expand training sets for market microstructure research, where balanced, fine-grained data is expensive and difficult to collect. By learning the dynamics of spreads, midprice movements, and depth across ten levels, the model seeks to capture both short-term fluctuations and broader order flow patterns.
+
+Quality is assessed on a held-out test split using objective targets:
+- Distribution similarity: KL divergence at or below 0.1 for spread and midprice return distributions between generated and real data.
+- Visual similarity: SSIM above 0.6 between heatmaps of generated and real order book depth snapshots.
+
+The report will document the model architecture and total parameter count, and compare training strategies such as the full TimeGAN, adversarial-only, and supervised-only variants. It will record the hardware used, including GPU model, available VRAM, number of epochs, and total training time. To aid interpretation, the report will include three to five representative heatmaps that pair generated and real order books, along with a short error analysis that explains where the synthetic sequences align with reality and where they fall short. The goal is a practical, well-evidenced benchmark for synthetic LOB generation on AMZN Level 10.
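+
+A paired depth heatmap could be produced roughly as follows (illustrative sketch only; the random arrays stand in for `(seq_len, levels)` slices of real and generated book sizes):
+
+```python
+import numpy as np
+import matplotlib.pyplot as plt
+
+# stand-in data: replace with real and generated size slices
+real_sizes = np.random.rand(64, 20)
+fake_sizes = np.random.rand(64, 20)
+
+def depth_heatmap(ax, window, title):
+    ax.imshow(window.T, aspect="auto", origin="lower", cmap="viridis")
+    ax.set_xlabel("time step"); ax.set_ylabel("book level"); ax.set_title(title)
+
+fig, (ax_real, ax_fake) = plt.subplots(1, 2, figsize=(10, 4))
+depth_heatmap(ax_real, real_sizes, "real")
+depth_heatmap(ax_fake, fake_sizes, "generated")
+fig.tight_layout()
+fig.savefig("depth_pair.png")
+```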
From b08467b7e39df094fa9157f55d131024b820f0c3 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 3 Oct 2025 23:58:39 +1000
Subject: [PATCH 14/74] refactor(dataset): split monolithic dataset.py into
helpers pkg
Break the text-UI and summary helpers out of the monolithic dataset.py into src/helpers/ (textui.py, summaries.py), and extend the loader with extra feature engineering, robust/quantile/power scalers, optional PCA/ZCA whitening, and train-only sequence augmentations. Keep the public loader logic in dataset.py and re-export via __init__.py for backward compatibility (from dataset import LOBSTERData still works). Update imports, add basic tests/placeholders, and keep defaults/paths unchanged.
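
For orientation, a rough usage sketch of the expanded loader after this
refactor (keyword names come from the new constructor below; the CSV file
names and directory layout are assumptions):

    from dataset import LOBSTERData   # public class still importable from the package root

    loader = LOBSTERData(
        data_dir="data",
        message_file="message_10.csv",      # assumed names; point at your LOBSTER dump
        orderbook_file="orderbook_10.csv",
        feature_set="core",                 # or "raw10"
        seq_len=64,
        scaler="robust",                    # standard | minmax | robust | quantile | power | none
        whiten="pca", pca_var=0.99,         # optional whitening
        aug_prob=0.5,                       # train-only jitter / scaling / time-warp
        save_dir="outputs/windows",         # persists windows.npz, meta.json, scaler.pkl
    )
    W_train, W_val, W_test = loader.load_arrays()
    print(W_train.shape, loader.get_feature_names()[:8])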
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 984 ++++++++----------
.../src/helpers/__init__.py | 0
.../src/helpers/summaries.py | 260 +++++
.../src/helpers/textui.py | 303 ++++++
.../TimeLOB_TimeGAN_49088276/src/train.py | 19 +
5 files changed, 997 insertions(+), 569 deletions(-)
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/__init__.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/summaries.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/textui.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 3dc56b753..208845cab 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -26,7 +26,9 @@
ask_price_1..10, ask_size_1..10, bid_price_1..10, bid_size_1..10
Notes:
-- Scaling is fit on TRAIN only (Standard/MinMax/None).
+- Scaling is fit on TRAIN only (Standard/MinMax/None). Advanced scalers: Robust, Quantile, Power.
+- Optional whitening: PCA (with variance threshold) or ZCA.
+- Optional train-only sequence augmentations (jitter, scaling, time-warp) for GANs.
 - Windows default to non-overlapping (stride=seq_len); set stride < seq_len to overlap.
-def _supports_color(no_color_flag: bool) -> bool:
- if no_color_flag:
- return False
- try:
- return os.isatty(1)
- except Exception:
- return False
-
-class _C:
- def __init__(self, enabled: bool):
- self.enabled = enabled
- self.RESET = "\033[0m" if enabled else ""
- self.DIM = "\033[2m" if enabled else ""
- self.BOLD = "\033[1m" if enabled else ""
- self.CYAN = "\033[36m" if enabled else ""
- self.YELLOW = "\033[33m" if enabled else ""
- self.GREEN = "\033[32m" if enabled else ""
- self.MAGENTA= "\033[35m" if enabled else ""
- self.BLUE = "\033[34m" if enabled else ""
- self.RED = "\033[31m" if enabled else ""
-
-def _term_width(default: int = 100) -> int:
- try:
- return shutil.get_terminal_size((default, 20)).columns
- except Exception:
- return default
-
-def _wrap(s: str, width: int) -> list[str]:
- if len(s) <= width:
- return [s]
- out, cur = [], ""
- for tok in s.split(" "):
- if not cur:
- cur = tok
- elif len(cur) + 1 + len(tok) <= width:
- cur += " " + tok
- else:
- out.append(cur); cur = tok
- if cur:
- out.append(cur)
- return out
-
-# ---- detect and preserve tabulate tables inside panels/bubbles ----
-
-_ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
+from sklearn.preprocessing import RobustScaler, QuantileTransformer, PowerTransformer
+from sklearn.decomposition import PCA
+import json
+try:
+ import joblib # optional persistence
+except Exception:
+ joblib = None
-def _visible_len(s: str) -> int:
- """Visible length without ANSI codes (so width calc matches terminal)."""
- return len(_ANSI_RE.sub("", s))
-
-def _is_table_line(s: str) -> bool:
- """
- Heuristic for tabulate-like lines we should not wrap:
- - GitHub style: lines starting with '|' and having columns separated by '|'
- - Grid style: rule lines with '+' borders
- - Simple header/rule lines made of '-:|+ '
- """
- t = s.strip()
- if not t:
- return False
- if t.startswith("|") and "|" in t[1:]:
- return True
- if t.startswith("+") and t.endswith("+"):
- return True
- if set(t) <= set("-:|+ "):
- return True
- return False
-
-# Global table format (overridable via CLI)
-TABLE_FMT = "github"
-
-def _kv_table(rows: list[tuple[str, str]], width: int, c: _C, headers: tuple[str,str]=("key","value")) -> list[str]:
- """
- Render key–value rows as a compact 2-col table using tabulate.
- Returns a list of lines to embed inside bubbles/boxes.
- """
- if not rows:
- return []
- h_key = f"{c.BOLD}{c.MAGENTA}{headers[0]}{c.RESET}" if c.enabled else headers[0]
- h_val = f"{c.BOLD}{c.MAGENTA}{headers[1]}{c.RESET}" if c.enabled else headers[1]
- # tint keys
- tinted = [(f"{c.CYAN}{k}{c.RESET}" if c.enabled else k, v) for k, v in rows]
- table = tabulate(tinted, headers=[h_key, h_val], tablefmt=TABLE_FMT, stralign="left")
- return table.splitlines()
-
-def _bubble(title: str, body_lines: list[str], c: _C, align: str = "left", width: int | None = None) -> str:
- """
- Render a chat-style message bubble.
- - Does NOT wrap lines that look like preformatted tables.
- - Auto-fits inner width to the widest table line (within terminal limit).
- """
- termw = _term_width()
- width = min(termw, width or termw)
-
- # Baseline inner width
- base_inner = max(24, width - 10)
-
- # Expand to widest table row if present
- widest_tbl = 0
- for ln in body_lines:
- if _is_table_line(ln):
- widest_tbl = max(widest_tbl, _visible_len(ln))
- max_inner = min(max(base_inner, widest_tbl), width - 10)
-
- # Left/right alignment
- indent = 2 if align == "left" else max(2, width - (max_inner + 8))
- pad = " " * indent
-
- # Header
- ts = datetime.now().strftime("%H:%M")
- title_colored = f"{c.BOLD}{c.BLUE}{title}{c.RESET}" if c.enabled else title
- head = f"{title_colored} {c.DIM}{ts}{c.RESET}"
- head_lines = _wrap(head, max_inner)
- lines = [pad + " " + head_lines[0]]
- for hl in head_lines[1:]:
- lines.append(pad + " " + hl)
-
- # Bubble top border
- lines.append(pad + " " + ("╭" + "─" * (max_inner + 2) + "╮"))
-
- # Body: keep table lines intact; wrap normal text
- for ln in body_lines:
- if _is_table_line(ln):
- vis = _visible_len(ln)
- out = ln + " " * max(0, max_inner - vis)
- lines.append(pad + " " + "│ " + out[:max_inner] + " │")
- else:
- for wln in _wrap(ln, max_inner):
- lines.append(pad + " " + "│ " + wln.ljust(max_inner) + " │")
-
- # Bubble bottom + tail
- tail_left = pad + " " + "╰" + "─" * (max_inner + 2) + "╯" + "⟋"
- tail_right = pad + " " + "⟍" + "╰" + "─" * (max_inner + 2) + "╯"
- lines.append(tail_left if align == "left" else tail_right)
- return "\n".join(lines)
-
-def _panel(title: str, body_lines: list[str], c: _C, width: int | None = None) -> str:
- """Box panel; does not wrap tabulated lines; auto-fits to widest table row."""
- termw = _term_width()
- width = width or termw
- inner = width - 4 # borders + spaces
-
- # Fit inner width to widest table line if present (within terminal width)
- widest_tbl = 0
- for ln in body_lines:
- if _is_table_line(ln):
- widest_tbl = max(widest_tbl, _visible_len(ln))
- inner = min(max(inner, widest_tbl), termw - 4)
- width = inner + 4
-
- border = "─" * (width - 2)
- title_colored = f"{c.BOLD}{c.BLUE}{title}{c.RESET}" if c.enabled else title
- out = [f"{c.CYAN}┌{border}┐{c.RESET}"]
- title_line = f" {title_colored} "
- pad = max(0, width - 2 - _visible_len(title_line))
- out.append(f"{c.CYAN}│{c.RESET}{title_line}{' '*pad}{c.CYAN}│{c.RESET}")
- out.append(f"{c.CYAN}├{border}┤{c.RESET}")
-
- for ln in body_lines:
- if _is_table_line(ln):
- vis = _visible_len(ln)
- width_ok = inner - 2
- body = ln + " " * max(0, width_ok - vis)
- out.append(f"{c.CYAN}│{c.RESET} {body[:width_ok]} {c.CYAN}│{c.RESET}")
- else:
- for sub in _wrap(ln, inner - 2):
- padlen = max(0, (inner - 2) - len(sub))
- out.append(f"{c.CYAN}│{c.RESET} {sub}{' '*padlen} {c.CYAN}│{c.RESET}")
-
- out.append(f"{c.CYAN}└{border}┘{c.RESET}")
- return "\n".join(out)
-
-def _render_card(title: str, body_lines: list[str], c: _C, style: str = "chat", align: str = "left") -> str:
- return _bubble(title, body_lines, c, align=align) if style == "chat" else _panel(title, body_lines, c)
-
-
-# ============================== Verbose helpers ===============================
-
-def _fmt_bytes(n: int) -> str:
- units = ["B", "KB", "MB", "GB", "TB"]
- i = 0; f = float(n)
- while f >= 1024 and i < len(units) - 1:
- f /= 1024.0; i += 1
- return f"{f:.2f} {units[i]}"
-
-def _first_last_time(msg_df: pd.DataFrame) -> tuple[str, str]:
- if "time" not in msg_df.columns:
- return ("", "")
- try:
- t = pd.to_datetime(msg_df["time"], errors="coerce", unit=None)
- return (str(t.min()), str(t.max()))
- except Exception:
- return ("", "")
-
-
-# ================================ Summaries ===================================
-
-def _summarize_df(df: pd.DataFrame, name: str, peek: int, c: _C) -> List[str]:
- lines: List[str] = []
- title = f"{c.BOLD}{name}{c.RESET}" if c.enabled else name
- lines.append(title)
- lines.append(f"shape: {df.shape[0]} rows × {df.shape[1]} cols")
- cols = list(df.columns)
- col_str = ", ".join(cols)
- lines.append("columns: " + col_str if len(col_str) < 160 else "columns: " + ", ".join(cols[:12]) + ", …")
- dtypes = df.dtypes.astype(str).to_dict()
- na_counts = {k: int(v) for k, v in df.isna().sum().items() if int(v) > 0}
- lines.append("dtypes: " + ", ".join([f"{k}:{v}" for k, v in dtypes.items()]))
- lines.append("na_counts: " + (str(na_counts) if na_counts else "{}"))
- for col in ("type", "direction"):
- if col in df.columns:
- try:
- vc = df[col].value_counts(dropna=False).to_dict()
- lines.append(f"value_counts[{col}]: {vc}")
- except Exception:
- pass
- if "time" in df.columns:
- try:
- t = pd.to_datetime(df["time"], errors="coerce", unit=None)
- lines.append(f"time: min={t.min()} max={t.max()}")
- if t.notna().all():
- is_mono = bool((t.diff().dropna() >= pd.Timedelta(0)).all())
- lines.append(f"time monotonic nondecreasing: {is_mono}")
- except Exception:
- pass
-
- # numeric quick stats (pretty table)
- num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
- if num_cols:
- sample_cols = num_cols[: min(8, len(num_cols))]
- desc_df = df[sample_cols].describe().round(6)
- lines.append(f"{c.BOLD}describe(sample numeric cols):{c.RESET}" if c.enabled else "describe(sample numeric cols):")
- lines.extend(tabulate(desc_df, headers="keys", tablefmt=TABLE_FMT).splitlines())
-
- # head / tail (pretty tables)
- if peek > 0:
- lines.append(f"{c.BOLD}head:{c.RESET}" if c.enabled else "head:")
- head_tbl = tabulate(df.head(peek), headers="keys", tablefmt=TABLE_FMT, showindex=False)
- lines.extend(head_tbl.splitlines())
- lines.append(f"{c.BOLD}tail:{c.RESET}" if c.enabled else "tail:")
- tail_tbl = tabulate(df.tail(peek), headers="keys", tablefmt=TABLE_FMT, showindex=False)
- lines.extend(tail_tbl.splitlines())
-
- return lines
-
-
-# =============================== Core class ===================================
class LOBSTERData:
"""
- Loader -> features -> windows -> splits for LOBSTER L10 data.
+ Loader → features → windows → splits for LOBSTER Level-10 data.
+
+ Feature sets:
+ - "core": engineered 5-feature set (+ optional extras)
+ - "raw10": 40 raw columns (ask/bid price/size × levels 1..10) (+ optional extras)
"""
+
def __init__(
self,
data_dir: str,
@@ -310,7 +69,7 @@ def __init__(
seq_len: int = 64,
stride: Optional[int] = None,
splits: Tuple[float, float, float] = (0.7, 0.15, 0.15),
- scaler: Literal["standard", "minmax", "none"] = "standard",
+ scaler: Literal["standard", "minmax", "robust", "quantile", "power", "none"] = "standard",
feature_range: Tuple[float, float] = (0.0, 1.0),
eps: float = 1e-8,
headerless_message: bool = False,
@@ -320,6 +79,28 @@ def __init__(
sort_by_time: bool = False,
every: int = 1,
clip_quantiles: Optional[Tuple[float, float]] = None,
+
+ # --- extra feature engineering knobs ---
+ add_rel_spread: bool = True,
+ add_microprice: bool = True,
+ add_imbalance_l5: bool = True,
+ add_roll_stats: bool = True,
+ roll_window: int = 64,
+ add_diff1: bool = True,
+ add_pct_change: bool = False,
+
+ # --- whitening / dimensionality reduction ---
+ whiten: Optional[Literal["pca", "zca"]] = None,
+ pca_var: float = 0.99,
+
+ # --- train-only augmentation for GANs ---
+ aug_prob: float = 0.0,
+ aug_jitter_std: float = 0.01,
+ aug_scaling_std: float = 0.05,
+ aug_timewarp_max: float = 0.1,
+
+ # --- persistence ---
+ save_dir: Optional[str] = None,
):
self.data_dir = data_dir
self.message_path = os.path.join(data_dir, message_file)
@@ -335,11 +116,34 @@ def __init__(
self.headerless_orderbook = headerless_orderbook
self.dropna = dropna
self.output_dtype = np.float32 if output_dtype == "float32" else np.float64
-
self.sort_by_time = bool(sort_by_time)
self.every = max(1, int(every))
self.clip_quantiles = clip_quantiles
+ # feature knobs
+ self.add_rel_spread = add_rel_spread
+ self.add_microprice = add_microprice
+ self.add_imbalance_l5 = add_imbalance_l5
+ self.add_roll_stats = add_roll_stats
+ self.roll_window = int(roll_window)
+ self.add_diff1 = add_diff1
+ self.add_pct_change = add_pct_change
+
+ # whitening/DR
+ self.whiten = whiten
+ self.pca_var = float(pca_var)
+ self._pca = None # set later
+ self._zca_cov = None # (mean, whitening_mat)
+
+ # augmentation
+ self.aug_prob = float(aug_prob)
+ self.aug_jitter_std = float(aug_jitter_std)
+ self.aug_scaling_std = float(aug_scaling_std)
+ self.aug_timewarp_max = float(aug_timewarp_max)
+
+ # save
+ self.save_dir = save_dir
+
self._validate_splits()
if not (self.seq_len > 0 and self.stride > 0):
raise ValueError("seq_len and stride must be positive")
@@ -390,25 +194,38 @@ def load_arrays(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
W_val = self._windowize(val_s)
W_test = self._windowize(test_s)
+ # train-only augmentations for GANs
+ W_train = self._augment_windows(W_train)
+
W_train = W_train.astype(self.output_dtype, copy=False)
W_val = W_val.astype(self.output_dtype, copy=False)
W_test = W_test.astype(self.output_dtype, copy=False)
- return W_train, W_val, W_test
- def summarize(self, peek: int, c: _C) -> List[str]:
- msg_df, ob_df = self._load_csvs()
- _ = self._normalize_orderbook_headers(
- ob_df,
- [f"ask_price_{i}" for i in range(1, 11)]
- + [f"ask_size_{i}" for i in range(1, 11)]
- + [f"bid_price_{i}" for i in range(1, 11)]
- + [f"bid_size_{i}" for i in range(1, 11)]
- )
- lines = []
- lines += _summarize_df(msg_df, "message_10.csv", peek=peek, c=c)
- lines.append("") # spacer between the two tables
- lines += _summarize_df(ob_df, "orderbook_10.csv", peek=peek, c=c)
- return lines
+ # optional persistence
+ if self.save_dir:
+ os.makedirs(self.save_dir, exist_ok=True)
+ np.savez_compressed(
+ os.path.join(self.save_dir, "windows.npz"),
+ train=W_train, val=W_val, test=W_test
+ )
+ meta = self.get_meta()
+ meta["whiten"] = self.whiten
+ meta["pca_var"] = self.pca_var
+ meta["aug"] = {
+ "prob": self.aug_prob, "jitter_std": self.aug_jitter_std,
+ "scaling_std": self.aug_scaling_std, "timewarp_max": self.aug_timewarp_max
+ }
+ with open(os.path.join(self.save_dir, "meta.json"), "w", encoding="utf-8") as f:
+ json.dump(meta, f, indent=2)
+
+ if joblib is not None and self._scaler is not None:
+ joblib.dump(self._scaler, os.path.join(self.save_dir, "scaler.pkl"))
+ if joblib is not None and self._pca is not None:
+ joblib.dump(self._pca, os.path.join(self.save_dir, "pca.pkl"))
+ if joblib is not None and self._zca_cov is not None:
+ joblib.dump(self._zca_cov, os.path.join(self.save_dir, "zca.pkl"))
+
+ return W_train, W_val, W_test
def get_feature_names(self) -> List[str]:
return list(self._feature_names)
@@ -431,7 +248,7 @@ def get_meta(self) -> Dict[str, object]:
"seq_len": self.seq_len,
"stride": self.stride,
"splits": self.splits,
- "scaler": type(self._scaler).__name__ if self._scaler is not None else "None",
+ "scaler": (type(self._scaler).__name__ if self._scaler is not None else "None"),
"row_counts": self._row_counts,
"clip_bounds": None if self._clip_bounds is None else {
"lo": self._clip_bounds[0].tolist(),
@@ -439,6 +256,8 @@ def get_meta(self) -> Dict[str, object]:
},
"every": self.every,
"sorted_by_time": self.sort_by_time,
+ "whiten": self.whiten,
+ "pca_var": self.pca_var,
}
# ------------------- internals --------------------
@@ -502,6 +321,54 @@ def _check_alignment(self, msg_df: pd.DataFrame, ob_df: pd.DataFrame) -> None:
if len(msg_df) != len(ob_df):
raise ValueError(f"Message/Orderbook row count mismatch: {len(msg_df)} vs {len(ob_df)}")
+ # ------ extra engineering helpers ------
+ def _engineer_extra(self, ob_df: pd.DataFrame, base: np.ndarray) -> np.ndarray:
+ """Append engineered features onto base matrix (N x d)."""
+ feats = [base]
+
+ ap1 = ob_df["ask_price_1"].to_numpy(np.float64)
+ bp1 = ob_df["bid_price_1"].to_numpy(np.float64)
+ as1 = ob_df["ask_size_1"].to_numpy(np.float64)
+ bs1 = ob_df["bid_size_1"].to_numpy(np.float64)
+
+ mid_price = 0.5 * (ap1 + bp1)
+ spread = ap1 - bp1
+
+ if self.add_rel_spread:
+ rel_spread = spread / (mid_price + self.eps)
+ feats.append(rel_spread[:, None])
+
+ if self.add_microprice:
+            # microprice from L1 sizes: weight the ask price by the bid queue and the
+            # bid price by the ask queue, so the price leans toward the heavier side
+            w_bid = bs1 / (bs1 + as1 + self.eps)
+            w_ask = 1.0 - w_bid
+            micro = w_bid * ap1 + w_ask * bp1
+ feats.append(micro[:, None])
+
+ if self.add_imbalance_l5:
+ bid5 = np.sum([ob_df[f"bid_size_{i}"].to_numpy(np.float64) for i in range(1, 6)], axis=0)
+ ask5 = np.sum([ob_df[f"ask_size_{i}"].to_numpy(np.float64) for i in range(1, 6)], axis=0)
+ im5 = (bid5 - ask5) / (bid5 + ask5 + self.eps)
+ feats.append(im5[:, None])
+
+ if self.add_diff1:
+ diff = np.vstack([np.zeros((1, base.shape[1])), np.diff(base, axis=0)])
+ feats.append(diff)
+
+ if self.add_pct_change:
+ pct = np.zeros_like(base)
+ pct[1:] = (base[1:] - base[:-1]) / (np.abs(base[:-1]) + self.eps)
+ feats.append(pct)
+
+ if self.add_roll_stats:
+ W = max(2, int(self.roll_window))
+ roll_mean = pd.Series(mid_price).rolling(W, min_periods=1).mean().to_numpy()
+ roll_std = pd.Series(mid_price).rolling(W, min_periods=1).std(ddof=0).fillna(0.0).to_numpy()
+            logp = np.log(np.clip(mid_price, 1e-12, None))
+            # prepend the first log-price so the first squared "return" is 0, not log(p0)**2
+            vol = pd.Series(np.diff(logp, prepend=logp[0]) ** 2).rolling(W, min_periods=1).mean().to_numpy()
+ feats += [roll_mean[:, None], roll_std[:, None], vol[:, None]]
+
+ return np.concatenate(feats, axis=1)
+
def _build_features(self, ob_df: pd.DataFrame) -> np.ndarray:
for prefix in ("ask_price_", "ask_size_", "bid_price_", "bid_size_"):
for L in range(1, 11):
@@ -518,6 +385,15 @@ def _build_features(self, ob_df: pd.DataFrame) -> np.ndarray:
)
X = ob_df[cols].to_numpy(dtype=np.float64)
self._feature_names = cols
+ X = self._engineer_extra(ob_df, X)
+ extras = []
+ if self.add_rel_spread: extras.append("rel_spread")
+ if self.add_microprice: extras.append("microprice")
+ if self.add_imbalance_l5: extras.append("depth_imbalance_l5")
+ if self.add_diff1: extras += [f"diff1_{n}" for n in self._feature_names]
+ if self.add_pct_change: extras += [f"pct_{n}" for n in self._feature_names]
+ if self.add_roll_stats: extras += ["roll_mid_mean","roll_mid_std","roll_vol"]
+ self._feature_names = self._feature_names + extras
return X
if self.feature_set == "core":
@@ -535,14 +411,25 @@ def _build_features(self, ob_df: pd.DataFrame) -> np.ndarray:
ask_depth = sum(ob_df[f"ask_size_{i}"].to_numpy(dtype=np.float64) for i in range(1, 11))
di_l10 = (bid_depth - ask_depth) / (bid_depth + ask_depth + self.eps)
- X = np.vstack([mid_price, spread, mid_log_return, qi_l1, di_l10]).T
- self._feature_names = [
+ X_base = np.vstack([mid_price, spread, mid_log_return, qi_l1, di_l10]).T
+ base_names = [
"mid_price",
"spread",
"mid_log_return",
"queue_imbalance_l1",
"depth_imbalance_l10",
]
+ X = self._engineer_extra(ob_df, X_base)
+
+ extra_names = []
+ if self.add_rel_spread: extra_names.append("rel_spread")
+ if self.add_microprice: extra_names.append("microprice")
+ if self.add_imbalance_l5: extra_names.append("depth_imbalance_l5")
+ if self.add_diff1: extra_names += [f"diff1_{n}" for n in base_names]
+ if self.add_pct_change: extra_names += [f"pct_{n}" for n in base_names]
+ if self.add_roll_stats: extra_names += ["roll_mid_mean","roll_mid_std","roll_vol"]
+
+ self._feature_names = base_names + extra_names
return X
raise ValueError("feature_set must be 'core' or 'raw10'")
@@ -551,8 +438,7 @@ def _split_chronologically(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray,
n = len(X)
if n < self.seq_len:
raise ValueError(
- f"Not enough rows ({n}) for seq_len={self.seq_len}. "
- "Reduce seq_len or use a longer session."
+ f"Not enough rows ({n}) for seq_len={self.seq_len}. Reduce seq_len or use a longer session."
)
n_train = int(n * self.splits[0])
n_val = int(n * self.splits[1])
@@ -567,18 +453,53 @@ def _split_chronologically(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray,
def _scale_train_only(
self, train: np.ndarray, val: np.ndarray, test: np.ndarray
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
- if self.scaler_kind == "none":
- return train, val, test
-
- if self.scaler_kind == "standard":
- scaler = StandardScaler()
- elif self.scaler_kind == "minmax":
- scaler = MinMaxScaler(feature_range=self.feature_range)
+ kind = self.scaler_kind
+ if kind == "none":
+ scaler = None
+ Xt, Xv, Xs = train, val, test
else:
- raise ValueError("scaler must be 'standard', 'minmax', or 'none'")
- scaler.fit(train)
+ if kind == "standard":
+ scaler = StandardScaler()
+ elif kind == "minmax":
+ scaler = MinMaxScaler(feature_range=self.feature_range)
+ elif kind == "robust":
+ scaler = RobustScaler()
+ elif kind == "quantile":
+ scaler = QuantileTransformer(output_distribution="normal", subsample=100000, random_state=42)
+ elif kind == "power":
+ scaler = PowerTransformer(method="yeo-johnson", standardize=True)
+ else:
+ raise ValueError("scaler must be 'standard','minmax','robust','quantile','power', or 'none'")
+ scaler.fit(train)
+ Xt, Xv, Xs = scaler.transform(train), scaler.transform(val), scaler.transform(test)
+
self._scaler = scaler
- return scaler.transform(train), scaler.transform(val), scaler.transform(test)
+
+ # optional whitening
+ if self.whiten is None:
+ return Xt, Xv, Xs
+
+ if self.whiten == "pca":
+ p = PCA(n_components=self.pca_var, svd_solver="full", whiten=True, random_state=42)
+ p.fit(Xt)
+ self._pca = p
+ return p.transform(Xt), p.transform(Xv), p.transform(Xs)
+
+ if self.whiten == "zca":
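+            # ZCA whitening: W = U @ diag(1/sqrt(S)) @ U.T from the train covariance,
+            # which decorrelates features while keeping them close to the original axes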
+ mu = Xt.mean(axis=0, keepdims=True)
+ Xc = Xt - mu
+ cov = (Xc.T @ Xc) / max(1, Xc.shape[0]-1)
+ U, S, _ = np.linalg.svd(cov + 1e-6*np.eye(cov.shape[0]), full_matrices=False)
+ S_inv_sqrt = np.diag(1.0 / np.sqrt(S + 1e-6))
+ W = U @ S_inv_sqrt @ U.T
+ self._zca_cov = (mu, W)
+
+ def apply_zca(A: np.ndarray) -> np.ndarray:
+ return (A - mu) @ W
+
+ return apply_zca(Xt), apply_zca(Xv), apply_zca(Xs)
+
+ raise ValueError("whiten must be None, 'pca', or 'zca'")
def _windowize(self, X: np.ndarray) -> np.ndarray:
n, d = X.shape
@@ -592,244 +513,90 @@ def _windowize(self, X: np.ndarray) -> np.ndarray:
W[i] = X[s : s + self.seq_len]
return W
+ # ------ augmentations (sequence-level, applied after windowing to TRAIN only) ------
+ def _augment_windows(self, W: np.ndarray) -> np.ndarray:
+ if self.aug_prob <= 0.0:
+ return W
+ out = W.copy()
+ rng = np.random.default_rng(42)
+ for i in range(out.shape[0]):
+ if rng.random() < self.aug_prob:
+ seq = out[i]
+ # jitter (add Gaussian noise)
+ seq = seq + rng.normal(0.0, self.aug_jitter_std, size=seq.shape)
+ # scaling (per-feature)
+ scale = rng.normal(1.0, self.aug_scaling_std, size=(1, seq.shape[-1]))
+ seq = seq * scale
+ # simple time warp (resample along time axis by a small factor)
+ max_alpha = self.aug_timewarp_max
+ alpha = float(np.clip(rng.normal(1.0, max_alpha/3), 1.0-max_alpha, 1.0+max_alpha))
+ T, D = seq.shape
+ new_idx = np.linspace(0, T-1, num=T) ** alpha
+ new_idx = (new_idx / new_idx.max()) * (T-1)
+ left = np.floor(new_idx).astype(int)
+ right = np.clip(left+1, 0, T-1)
+ w = (new_idx - left)[:, None]
+ seq = (1-w) * seq[left, :] + w * seq[right, :]
+ out[i] = seq
+ return out
-# ============================ CLI and message output ==========================
-
-def _print_dir_listing(path: str, c: _C, style: str) -> None:
- if os.path.isdir(path):
- files = sorted(os.listdir(path))
- body = [f"path: {path}", f"files: {len(files)}"]
- body += [f"• {f}" for f in files[:10]]
- if len(files) > 10:
- body.append(f"• (+{len(files)-10} more)")
- else:
- body = [f"path: {path}", f"{c.RED}files: (missing){c.RESET}" if c.enabled else "files: (missing)"]
- print(_render_card("Data directory", body, c, style=style, align="left"))
-
-def _print_summary(lines: list[str], c: _C, style: str) -> None:
- # split into two bubbles by blank line
- if "" in lines:
- idx = lines.index("")
- msg_part = lines[:idx]
- ob_part = lines[idx+1:]
- else:
- msg_part, ob_part = lines, []
-
- def split_title(block: list[str]) -> tuple[str, list[str]]:
- if not block:
- return ("", [])
- title, body = block[0], block[1:]
- return (title, body)
-
- t1, b1 = split_title(msg_part)
- if t1:
- print(_render_card(t1, b1, c, style=style, align="left"))
- t2, b2 = split_title(ob_part)
- if t2:
- print(_render_card(t2, b2, c, style=style, align="left"))
-
-def _print_report(W_train, W_val, W_test, meta: dict, c: _C, style: str, *,
- verbose: bool = False,
- scaler_obj = None,
- clip_bounds = None,
- time_coverage: tuple[str, str] = ("","")) -> None:
- # Basic block
- block1 = [
- ("train windows", "×".join(map(str, W_train.shape))),
- ("val windows", "×".join(map(str, W_val.shape))),
- ("test windows", "×".join(map(str, W_test.shape))),
- ("seq_len", str(meta.get("seq_len"))),
- ("stride", str(meta.get("stride"))),
- ("feature_set", str(meta.get("feature_set"))),
- ("#features", str(len(meta.get("feature_names", [])))),
- ("scaler", str(meta.get("scaler"))),
- ("sorted_by_time",str(meta.get("sorted_by_time"))),
- ("every", str(meta.get("every"))),
- ]
- lines1 = _kv_table(block1, width=min(_term_width(), 84), c=c)
- print(_render_card("Preprocessing report", lines1, c, style=style, align="right"))
-
- # Row counts
- rc = meta.get("row_counts", {})
- if rc:
- block2 = [(k, str(v)) for k, v in rc.items()]
- lines2 = _kv_table(block2, width=min(_term_width(), 84), c=c)
- print(_render_card("Row counts", lines2, c, style=style, align="right"))
-
- # Sample window stats
- if getattr(W_train, "size", 0):
- win = W_train[0]
- block3 = [
- ("window[0] mean", f"{float(win.mean()):.6f}"),
- ("window[0] std", f"{float(win.std()):.6f}"),
- ("features", ", ".join(meta.get("feature_names", [])[:8]) + ("…" if len(meta.get("feature_names", []))>8 else "")),
- ]
- lines3 = _kv_table(block3, width=min(_term_width(), 84), c=c)
- print(_render_card("Sample window", lines3, c, style=style, align="right"))
-
- if not verbose:
- return
-
- # Verbose extras
- vlines: list[str] = []
- total_bytes = (getattr(W_train, "nbytes", 0) +
- getattr(W_val, "nbytes", 0) +
- getattr(W_test, "nbytes", 0))
- vlines.append(f"memory total: {_fmt_bytes(total_bytes)}")
- vlines.append(f"train bytes: {_fmt_bytes(getattr(W_train, 'nbytes', 0))}")
- vlines.append(f"val bytes: {_fmt_bytes(getattr(W_val, 'nbytes', 0))}")
- vlines.append(f"test bytes: {_fmt_bytes(getattr(W_test, 'nbytes', 0))}")
-
- tmin, tmax = time_coverage
- if tmin or tmax:
- vlines.append(f"time coverage: {tmin} → {tmax}")
-
- print(_render_card("Resources & coverage", vlines, c, style=style, align="right"))
-
- # Scaler params
- if scaler_obj is not None:
- s_rows = []
- if hasattr(scaler_obj, "mean_") and hasattr(scaler_obj, "scale_"):
- s_rows = [
- ("type", "StandardScaler"),
- ("mean[0:8]", np.array2string(scaler_obj.mean_[:8], precision=4, separator=", ")),
- ("scale[0:8]", np.array2string(scaler_obj.scale_[:8], precision=4, separator=", ")),
- ]
- elif hasattr(scaler_obj, "data_min_") and hasattr(scaler_obj, "data_max_"):
- s_rows = [
- ("type", "MinMaxScaler"),
- ("data_min[0:8]", np.array2string(scaler_obj.data_min_[:8], precision=4, separator=", ")),
- ("data_max[0:8]", np.array2string(scaler_obj.data_max_[:8], precision=4, separator=", ")),
- ("feature_range", str(getattr(scaler_obj, "feature_range", None))),
- ]
- if s_rows:
- print(_render_card("Scaler parameters", _kv_table(s_rows, min(_term_width(),84), c=c), c, style=style, align="right"))
-
- # Clip bounds preview
- if clip_bounds is not None:
- lo, hi = clip_bounds
- cb_rows = [
- ("q-lo[0:8]", np.array2string(lo[:8], precision=4, separator=", ")),
- ("q-hi[0:8]", np.array2string(hi[:8], precision=4, separator=", ")),
- ]
- print(_render_card("Clip bounds (preview)", _kv_table(cb_rows, min(_term_width(),84), c=c), c, style=style, align="right"))
-
- # Windowing math
- def _count_windows(n_rows: int, seq_len: int, stride: int) -> int:
- if n_rows < seq_len:
- return 0
- return 1 + (n_rows - seq_len) // stride
-
- rc_train = rc.get("train", 0); rc_val = rc.get("val", 0); rc_test = rc.get("test", 0)
- overlap = 1.0 - (meta.get("stride", 1) / max(1, meta.get("seq_len", 1)))
- perf_rows = [
- ("expected train windows", str(_count_windows(rc_train, meta.get("seq_len", 0), meta.get("stride", 1)))),
- ("expected val windows", str(_count_windows(rc_val, meta.get("seq_len", 0), meta.get("stride", 1)))),
- ("expected test windows", str(_count_windows(rc_test, meta.get("seq_len", 0), meta.get("stride", 1)))),
- ("overlap ratio", f"{overlap:.3f}"),
- ]
- print(_render_card("Windowing details", _kv_table(perf_rows, min(_term_width(),84), c=c), c, style=style, align="right"))
-
-
-# ========================== Dataset info (report card) ========================
-
-def _print_dataset_info(loader: "LOBSTERData", c: _C, style: str, peek: int = 5) -> None:
- """Print detailed information about the dataset and feature set."""
- meta = loader.get_meta()
- feature_set = meta.get("feature_set")
- feats = meta.get("feature_names") or []
-
- # Fallback feature names if meta not populated
- if not feats:
- if feature_set == "core":
- feats = ["mid_price","spread","mid_log_return","queue_imbalance_l1","depth_imbalance_l10"]
- elif feature_set == "raw10":
- feats = ([f"ask_price_{i}" for i in range(1,11)] +
- [f"ask_size_{i}" for i in range(1,11)] +
- [f"bid_price_{i}" for i in range(1,11)] +
- [f"bid_size_{i}" for i in range(1,11)])
-
- intro = [
- f"Feature set: {c.BOLD}{feature_set}{c.RESET}" if c.enabled else f"Feature set: {feature_set}",
- f"Total features: {len(feats)}",
- ""
- ]
-
- try:
- W_train, W_val, W_test = loader.load_arrays()
- if W_train.size + W_val.size + W_test.size == 0:
- raise ValueError("No windows produced; lower seq_len or stride.")
- blocks = [W.reshape(-1, W.shape[-1]) for W in (W_train, W_val, W_test) if getattr(W,"size",0)]
- all_data = np.concatenate(blocks, axis=0)
- df = pd.DataFrame(all_data, columns=feats)
-
- # describe()
- intro.append(f"{c.BOLD}Statistical summary (aggregated across splits):{c.RESET}" if c.enabled else "Statistical summary (aggregated across splits):")
- desc_df = df.describe().round(6)
- intro.extend(tabulate(desc_df, headers="keys", tablefmt=TABLE_FMT).splitlines())
- intro.append("")
-
- # peaks: means and stds tables
- means = df.mean().sort_values(ascending=False).head(5)
- stds = df.std().sort_values(ascending=False).head(5)
-
- intro.append(f"{c.BOLD}Highest-mean features:{c.RESET}" if c.enabled else "Highest-mean features:")
- intro.extend(tabulate(list(means.items()), headers=[f"{c.MAGENTA}feature{c.RESET}" if c.enabled else "feature", "mean"], tablefmt=TABLE_FMT).splitlines())
- intro.append("")
-
- intro.append(f"{c.BOLD}Most-variable features (by std):{c.RESET}" if c.enabled else "Most-variable features (by std):")
- intro.extend(tabulate(list(stds.items()), headers=[f"{c.MAGENTA}feature{c.RESET}" if c.enabled else "feature", "std"], tablefmt=TABLE_FMT).splitlines())
- intro.append("")
-
- intro.append(f"{c.BOLD}Example rows (first few timesteps):{c.RESET}" if c.enabled else "Example rows (first few timesteps):")
- ex_tbl = tabulate(df.head(peek).round(6), headers="keys", tablefmt=TABLE_FMT, showindex=True)
- intro.extend(ex_tbl.splitlines())
-
- except Exception as e:
- intro.append(f"{c.RED}(Could not compute stats: {e}){c.RESET}" if c.enabled else f"(Could not compute stats: {e})")
-
- print(_render_card("Dataset summary", intro, c, style=style, align="left"))
+if __name__ == "__main__":
+ # Demo / summary with styled box panels by default
+ import argparse
-# ================================== CLI ======================================
+ from helpers.textui import (
+ C, supports_color, set_table_style,
+ render_kv_panel, render_card, table, DEFAULT_STYLE
+ )
-def _main_cli():
- parser = argparse.ArgumentParser(description="LOBSTERData (preprocess + summarize).")
- parser.add_argument("--data-dir", default="data")
- parser.add_argument("--message", required=True)
- parser.add_argument("--orderbook", required=True)
+ parser = argparse.ArgumentParser(description="Run dataset preprocessing demo or print a quick summary.")
+ parser.add_argument("--data-dir", required=True)
+ parser.add_argument("--message", default="message_10.csv")
+ parser.add_argument("--orderbook", default="orderbook_10.csv")
parser.add_argument("--feature-set", choices=["core", "raw10"], default="core")
parser.add_argument("--seq-len", type=int, default=64)
- parser.add_argument("--stride", type=int, default=16)
- parser.add_argument("--splits", type=float, nargs=3, metavar=("TRAIN", "VAL", "TEST"),
- default=(0.7, 0.15, 0.15))
- parser.add_argument("--scaler", choices=["standard", "minmax", "none"], default="standard")
- parser.add_argument("--feature-range", type=float, nargs=2, metavar=("MIN", "MAX"), default=(0.0, 1.0))
+ parser.add_argument("--stride", type=int, default=64)
+ parser.add_argument("--scaler", choices=["standard", "minmax", "robust", "quantile", "power", "none"], default="standard")
+ parser.add_argument("--splits", type=float, nargs=3, metavar=("TRAIN", "VAL", "TEST"), default=(0.7, 0.15, 0.15))
parser.add_argument("--headerless-message", action="store_true")
parser.add_argument("--headerless-orderbook", action="store_true")
- parser.add_argument("--no-dropna", action="store_true")
- parser.add_argument("--dtype", choices=["float32", "float64"], default="float32")
- parser.add_argument("--save-npz", type=str, default=None)
- parser.add_argument("--summary", action="store_true")
- parser.add_argument("--peek", type=int, default=5)
- parser.add_argument("--sort-by-time", action="store_true")
- parser.add_argument("--every", type=int, default=1)
- parser.add_argument("--clip-quantiles", type=float, nargs=2, metavar=("QMIN", "QMAX"), default=None)
- parser.add_argument("--style", choices=["chat", "box"], default="chat", help="Output style")
- parser.add_argument("--table-style", choices=["github","grid","simple"], default="github", help="Tabulate table style")
- parser.add_argument("--no-color", action="store_true", help="Disable ANSI colors in output")
- parser.add_argument("--verbose", action="store_true", help="Print extra diagnostics (memory, scaler, clip bounds)")
- parser.add_argument("--meta-json", type=str, default=None, help="Optional path to dump meta JSON")
- args = parser.parse_args()
- # set global table format
- global TABLE_FMT
- TABLE_FMT = args.table_style
+ # style & summary controls
+ parser.add_argument("--summary", action="store_true", help="Print a concise dataset summary (heads/dtypes/stats).")
+ parser.add_argument("--peek", type=int, default=5, help="Rows to show for head/tail in --summary mode.")
+ parser.add_argument("--style", choices=["box", "chat"], default=DEFAULT_STYLE, help="Output card style (default: box).")
+ parser.add_argument("--table-style", choices=["github", "grid", "simple"], default="github", help="Tabulate table style.")
+ parser.add_argument("--no-color", action="store_true", help="Disable ANSI colors.")
+
+ # extra feature engineering
+ parser.add_argument("--no-rel-spread", dest="add_rel_spread", action="store_false")
+ parser.add_argument("--no-microprice", dest="add_microprice", action="store_false")
+ parser.add_argument("--no-imbalance-l5", dest="add_imbalance_l5", action="store_false")
+ parser.add_argument("--no-roll-stats", dest="add_roll_stats", action="store_false")
+ parser.add_argument("--roll-window", type=int, default=64)
+ parser.add_argument("--no-diff1", dest="add_diff1", action="store_false")
+ parser.add_argument("--pct-change", action="store_true")
+
+ # whitening / DR
+ parser.add_argument("--whiten", choices=["pca", "zca"], default=None)
+ parser.add_argument("--pca-var", type=float, default=0.99)
+
+ # augmentation
+ parser.add_argument("--aug-prob", type=float, default=0.0)
+ parser.add_argument("--aug-jitter-std", type=float, default=0.01)
+ parser.add_argument("--aug-scaling-std", type=float, default=0.05)
+ parser.add_argument("--aug-timewarp-max", type=float, default=0.1)
+
+ # persistence
+ parser.add_argument("--save-dir", type=str, default=None)
+
+ args = parser.parse_args()
- c = _C(_supports_color(args.no_color))
- _print_dir_listing(args.data_dir, c, style=args.style)
+ set_table_style(args.table_style)
+ c = C(enabled=supports_color(args.no_color))
- loader = LOBSTERData(
+ ds = LOBSTERData(
data_dir=args.data_dir,
message_file=args.message,
orderbook_file=args.orderbook,
@@ -838,63 +605,142 @@ def _main_cli():
stride=args.stride,
splits=tuple(args.splits),
scaler=args.scaler,
- feature_range=tuple(args.feature_range),
headerless_message=args.headerless_message,
headerless_orderbook=args.headerless_orderbook,
- dropna=not args.no_dropna,
- output_dtype=args.dtype,
- sort_by_time=args.sort_by_time,
- every=args.every,
- clip_quantiles=tuple(args.clip_quantiles) if args.clip_quantiles else None,
- )
- if args.summary:
- lines = loader.summarize(peek=args.peek, c=c)
- _print_summary(lines, c, style=args.style)
- _print_dataset_info(loader, c, style=args.style, peek=args.peek)
- return
-
- W_train, W_val, W_test = loader.load_arrays()
- meta = loader.get_meta()
-
- # verbose context
- scaler_obj = loader.get_scaler()
- clip_bounds = None
- if meta.get("clip_bounds"):
- lo = np.array(meta["clip_bounds"]["lo"], dtype=float)
- hi = np.array(meta["clip_bounds"]["hi"], dtype=float)
- clip_bounds = (lo, hi)
-
- # best-effort message time coverage
- try:
- msg_df, _ = loader._load_csvs()
- tmin, tmax = _first_last_time(msg_df)
- except Exception:
- tmin = tmax = ""
-
- _print_report(
- W_train, W_val, W_test, meta, c, style=args.style,
- verbose=args.verbose, scaler_obj=scaler_obj,
- clip_bounds=clip_bounds, time_coverage=(tmin, tmax)
+ add_rel_spread=getattr(args, "add_rel_spread", True),
+ add_microprice=getattr(args, "add_microprice", True),
+ add_imbalance_l5=getattr(args, "add_imbalance_l5", True),
+ add_roll_stats=getattr(args, "add_roll_stats", True),
+ roll_window=args.roll_window,
+ add_diff1=getattr(args, "add_diff1", True),
+ add_pct_change=args.pct_change,
+
+ whiten=args.whiten,
+ pca_var=args.pca_var,
+
+ aug_prob=args.aug_prob,
+ aug_jitter_std=args.aug_jitter_std,
+ aug_scaling_std=args.aug_scaling_std,
+ aug_timewarp_max=args.aug_timewarp_max,
+
+ save_dir=args.save_dir,
)
- # optional meta dump
- if args.meta_json:
- import json
- with open(args.meta_json, "w", encoding="utf-8") as f:
- json.dump(meta, f, indent=2)
- print(_render_card("Saved", [f"meta: {args.meta_json}"], c, style=args.style, align="right"))
-
- # optional arrays NPZ
- if args.save_npz:
- np.savez_compressed(
- args.save_npz,
- train=W_train, val=W_val, test=W_test,
- feature_names=np.array(loader.get_feature_names(), dtype=object),
- meta=np.array([str(meta)], dtype=object),
- )
- print(_render_card("Saved", [f"windows: {args.save_npz}"], c, style=args.style, align="right"))
+ # Always show a small preprocessing report card (even without --summary)
+ base_rows = [
+ ("data_dir", args.data_dir),
+ ("message", args.message),
+ ("orderbook", args.orderbook),
+ ("feature_set", args.feature_set),
+ ("seq_len", str(args.seq_len)),
+ ("stride", str(args.stride)),
+ ("scaler", args.scaler),
+ ("whiten", str(args.whiten)),
+ ("aug_prob", str(args.aug_prob)),
+ ("save_dir", str(args.save_dir)),
+ ]
+ print(render_kv_panel("Preprocessing config", base_rows, c, style=args.style, align="right"))
+ if args.summary:
+ # ---------- helpers that render subpanels with textui and nest them ----------
+ from helpers.textui import table as tx_table # alias for clarity
+
+ def _rows_from_df(df: pd.DataFrame, limit_rows: int, limit_cols: int) -> tuple[list[str], list[list[str]]]:
+ cols_all = list(map(str, df.columns))
+ cols = cols_all[:limit_cols]
+ rows_df = df.iloc[:limit_rows, :limit_cols].astype(object).astype(str)
+ headers = cols + (["…"] if len(cols_all) > limit_cols else [])
+ rows = rows_df.values.tolist()
+ if len(cols_all) > limit_cols:
+ rows = [r + ["…"] for r in rows]
+ return headers, rows
+
+ def _subpanel_lines(title: str, body_lines: list[str]) -> list[str]:
+ # Render a mini panel and return its lines to embed inside the big panel
+ return render_card(title, body_lines, c, style=args.style, align="left").splitlines()
+
+ def _panel_df(title: str, df: pd.DataFrame, peek: int) -> list[str]:
+ headers, rows = _rows_from_df(df, limit_rows=peek, limit_cols=12)
+ return _subpanel_lines(title, tx_table(rows, headers, c))
+
+ def _panel_dtypes(df: pd.DataFrame) -> list[str]:
+ headers = ["column", "dtype"]
+ dtypes_rows = [[str(k), str(v)] for k, v in df.dtypes.items()]
+ note = f"total: {len(df.columns)} columns" + (" (showing first 24)" if len(dtypes_rows) > 24 else "")
+ dtypes_rows = dtypes_rows[:24]
+ body = [note] + tx_table(dtypes_rows, headers, c)
+ return _subpanel_lines("dtypes", body)
+
+ def _panel_describe(df: pd.DataFrame) -> list[str]:
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+ if not num_cols:
+ return _subpanel_lines("describe (numeric subset)", ["no numeric columns"])
+ sample = num_cols[: min(8, len(num_cols))]
+ desc = df[sample].describe().round(6).reset_index(names="stat")
+ headers = list(map(str, desc.columns))
+ rows = desc.astype(object).astype(str).values.tolist()
+ return _subpanel_lines("describe (numeric subset)", tx_table(rows, headers, c))
+
+ def _big_panel(title: str, subpanels: list[list[str]]) -> str:
+ # Flatten the subpanel line blocks with a blank spacer between them
+ body_lines: list[str] = []
+ for i, block in enumerate(subpanels):
+ if i > 0:
+ body_lines.append("") # spacer line
+ body_lines.extend(block)
+ return render_card(title, body_lines, c, style=args.style, align="left")
+
+ # ---------- load CSVs ----------
+ msg_df, ob_df = ds._load_csvs()
+
+ # high-level config card (already styled)
+ print(render_kv_panel("CSV summary config", [
+ ("message file", args.message),
+ ("orderbook file", args.orderbook),
+ ("rows (message, orderbook)", f"{len(msg_df)}, {len(ob_df)}"),
+ ("columns (message, orderbook)", f"{msg_df.shape[1]}, {ob_df.shape[1]}"),
+ ], c, style=args.style, align="right"))
+
+ # ---------- message big panel ----------
+ msg_subs = []
+ msg_subs.append(_subpanel_lines("shape", [f"{msg_df.shape[0]} rows × {msg_df.shape[1]} cols"]))
+ msg_subs.append(_panel_dtypes(msg_df))
+ msg_subs.append(_panel_describe(msg_df))
+ msg_subs.append(_panel_df("head", msg_df.head(args.peek), args.peek))
+ msg_subs.append(_panel_df("tail", msg_df.tail(args.peek), args.peek))
+ print(_big_panel("message_10.csv", msg_subs))
+
+ # ---------- orderbook big panel ----------
+ ob_subs = []
+ ob_subs.append(_subpanel_lines("shape", [f"{ob_df.shape[0]} rows × {ob_df.shape[1]} cols"]))
+ ob_subs.append(_panel_dtypes(ob_df))
+ ob_subs.append(_panel_describe(ob_df))
+ ob_subs.append(_panel_df("head", ob_df.head(args.peek), args.peek))
+ ob_subs.append(_panel_df("tail", ob_df.tail(args.peek), args.peek))
+ print(_big_panel("orderbook_10.csv", ob_subs))
+
+ # ---------- windowed output card (after preprocessing) ----------
+ W_train, W_val, W_test = ds.load_arrays()
+ rows = [
+ ("train windows", "×".join(map(str, W_train.shape))),
+ ("val windows", "×".join(map(str, W_val.shape))),
+ ("test windows", "×".join(map(str, W_test.shape))),
+ ("#features", str(len(ds.get_feature_names()))),
+ ]
+ print(render_kv_panel("Windows & features", rows, c, style=args.style, align="right"))
+ print(render_card(
+ "Feature names (first 12)",
+ [", ".join(ds.get_feature_names()[:12]) + (" …" if len(ds.get_feature_names())>12 else "")],
+ c, style=args.style, align="left"
+ ))
-if __name__ == "__main__":
- _main_cli()
+ else:
+ W_train, W_val, W_test = ds.load_arrays()
+ rows = [
+ ("train", "×".join(map(str, W_train.shape))),
+ ("val", "×".join(map(str, W_val.shape))),
+ ("test", "×".join(map(str, W_test.shape))),
+ ("features", ", ".join(ds.get_feature_names()[:12]) + (" …" if len(ds.get_feature_names())>12 else "")),
+ ]
+ print(render_kv_panel("Output shapes", rows, c, style=args.style, align="right"))
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/__init__.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/summaries.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/summaries.py
new file mode 100644
index 000000000..d803303e7
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/summaries.py
@@ -0,0 +1,260 @@
+from __future__ import annotations
+
+from typing import List, Tuple
+import numpy as np
+import pandas as pd
+from tabulate import tabulate
+
+from .textui import C, render_card, kv_table, set_table_style, term_width, bold_white_borders, TABLE_FMT
+
+
+def first_last_time(msg_df: pd.DataFrame) -> tuple[str, str]:
+ if "time" not in msg_df.columns:
+ return ("", "")
+ try:
+ t = pd.to_datetime(msg_df["time"], errors="coerce", unit=None)
+ return (str(t.min()), str(t.max()))
+ except Exception:
+ return ("", "")
+
+
+def summarize_df(df: pd.DataFrame, name: str, peek: int, c: C) -> List[str]:
+ lines: List[str] = []
+ title = f"{c.BOLD}{name}{c.RESET}" if c.enabled else name
+ lines.append(title)
+ lines.append(f"shape: {df.shape[0]} rows × {df.shape[1]} cols")
+ cols = list(df.columns)
+ col_str = ", ".join(cols)
+ lines.append("columns: " + col_str if len(col_str) < 160 else "columns: " + ", ".join(cols[:12]) + ", …")
+ dtypes = df.dtypes.astype(str).to_dict()
+ na_counts = {k: int(v) for k, v in df.isna().sum().items() if int(v) > 0}
+ lines.append("dtypes: " + ", ".join([f"{k}:{v}" for k, v in dtypes.items()]))
+ lines.append("na_counts: " + (str(na_counts) if na_counts else "{}"))
+ for col in ("type", "direction"):
+ if col in df.columns:
+ try:
+ vc = df[col].value_counts(dropna=False).to_dict()
+ lines.append(f"value_counts[{col}]: {vc}")
+ except Exception:
+ pass
+ if "time" in df.columns:
+ try:
+ t = pd.to_datetime(df["time"], errors="coerce", unit=None)
+ lines.append(f"time: min={t.min()} max={t.max()}")
+ if t.notna().all():
+ is_mono = bool((t.diff().dropna() >= pd.Timedelta(0)).all())
+ lines.append(f"time monotonic nondecreasing: {is_mono}")
+ except Exception:
+ pass
+
+ num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+ if num_cols:
+ sample_cols = num_cols[: min(8, len(num_cols))]
+ desc_df = df[sample_cols].describe().round(6)
+ lines.append(f"{c.BOLD}describe(sample numeric cols):{c.RESET}" if c.enabled else "describe(sample numeric cols):")
+ lines.extend(tabulate(desc_df, headers="keys", tablefmt=TABLE_FMT).splitlines())
+
+ if peek > 0:
+ lines.append(f"{c.BOLD}head:{c.RESET}" if c.enabled else "head:")
+ head_tbl = tabulate(df.head(peek), headers="keys", tablefmt=TABLE_FMT, showindex=False)
+ lines.extend(head_tbl.splitlines())
+ lines.append(f"{c.BOLD}tail:{c.RESET}" if c.enabled else "tail:")
+ tail_tbl = tabulate(df.tail(peek), headers="keys", tablefmt=TABLE_FMT, showindex=False)
+ lines.extend(tail_tbl.splitlines())
+
+ return lines
+
+
+def print_dir_listing(path: str, c: C, style: str) -> str:
+ import os
+ if os.path.isdir(path):
+ files = sorted(os.listdir(path))
+ body = [f"path: {path}", f"files: {len(files)}"]
+ body += [f"• {f}" for f in files[:10]]
+ if len(files) > 10:
+ body.append(f"• (+{len(files)-10} more)")
+ else:
+ body = [f"path: {path}", f"{'files: (missing)'}"]
+ return render_card("Data directory", body, c, style=style, align="left")
+
+
+def print_summary(lines: list[str], c: C, style: str) -> str:
+ if "" in lines:
+ idx = lines.index("")
+ msg_part = lines[:idx]
+ ob_part = lines[idx+1:]
+ else:
+ msg_part, ob_part = lines, []
+
+ def split_title(block: list[str]) -> tuple[str, list[str]]:
+ if not block:
+ return ("", [])
+ title, body = block[0], block[1:]
+ return (title, body)
+
+ out = []
+ t1, b1 = split_title(msg_part)
+ if t1:
+ out.append(render_card(t1, b1, c, style=style, align="left"))
+ t2, b2 = split_title(ob_part)
+ if t2:
+ out.append(render_card(t2, b2, c, style=style, align="left"))
+ return "\n".join(out)
+
+
+def _fmt_bytes(n: int) -> str:
+ units = ["B", "KB", "MB", "GB", "TB"]
+ i = 0; f = float(n)
+ while f >= 1024 and i < len(units) - 1:
+ f /= 1024.0; i += 1
+ return f"{f:.2f} {units[i]}"
+
+
+def print_report(W_train, W_val, W_test, meta: dict, c: C, style: str, *,
+ verbose: bool = False,
+ scaler_obj = None,
+ clip_bounds = None,
+ time_coverage: tuple[str, str] = ("","")) -> str:
+ block1 = [
+ ("train windows", "×".join(map(str, W_train.shape))),
+ ("val windows", "×".join(map(str, W_val.shape))),
+ ("test windows", "×".join(map(str, W_test.shape))),
+ ("seq_len", str(meta.get("seq_len"))),
+ ("stride", str(meta.get("stride"))),
+ ("feature_set", str(meta.get("feature_set"))),
+ ("#features", str(len(meta.get("feature_names", [])))),
+ ("scaler", str(meta.get("scaler"))),
+ ("sorted_by_time",str(meta.get("sorted_by_time"))),
+ ("every", str(meta.get("every"))),
+ ]
+ lines1 = kv_table(block1, c)
+ out = [render_card("Preprocessing report", lines1, c, style=style, align="right")]
+
+ rc = meta.get("row_counts", {})
+ if rc:
+ block2 = [(k, str(v)) for k, v in rc.items()]
+ lines2 = kv_table(block2, c)
+ out.append(render_card("Row counts", lines2, c, style=style, align="right"))
+
+ if getattr(W_train, "size", 0):
+ win = W_train[0]
+ block3 = [
+ ("window[0] mean", f"{float(win.mean()):.6f}"),
+ ("window[0] std", f"{float(win.std()):.6f}"),
+ ("features", ", ".join(meta.get("feature_names", [])[:8]) + ("…" if len(meta.get("feature_names", []))>8 else "")),
+ ]
+ lines3 = kv_table(block3, c)
+ out.append(render_card("Sample window", lines3, c, style=style, align="right"))
+
+ if not verbose:
+ return "\n".join(out)
+
+ vlines: list[str] = []
+ total_bytes = (getattr(W_train, "nbytes", 0) + getattr(W_val, "nbytes", 0) + getattr(W_test, "nbytes", 0))
+ vlines.append(f"memory total: {_fmt_bytes(total_bytes)}")
+ vlines.append(f"train bytes: {_fmt_bytes(getattr(W_train, 'nbytes', 0))}")
+ vlines.append(f"val bytes: {_fmt_bytes(getattr(W_val, 'nbytes', 0))}")
+ vlines.append(f"test bytes: {_fmt_bytes(getattr(W_test, 'nbytes', 0))}")
+
+ tmin, tmax = time_coverage
+ if tmin or tmax:
+ vlines.append(f"time coverage: {tmin} → {tmax}")
+
+ out.append(render_card("Resources & coverage", vlines, c, style=style, align="right"))
+
+ if scaler_obj is not None:
+ s_rows = []
+ if hasattr(scaler_obj, "mean_") and hasattr(scaler_obj, "scale_"):
+ s_rows = [
+ ("type", "StandardScaler"),
+ ("mean[0:8]", np.array2string(scaler_obj.mean_[:8], precision=4, separator=", ")),
+ ("scale[0:8]", np.array2string(scaler_obj.scale_[:8], precision=4, separator=", ")),
+ ]
+ elif hasattr(scaler_obj, "data_min_") and hasattr(scaler_obj, "data_max_"):
+ s_rows = [
+ ("type", "MinMaxScaler"),
+ ("data_min[0:8]", np.array2string(scaler_obj.data_min_[:8], precision=4, separator=", ")),
+ ("data_max[0:8]", np.array2string(scaler_obj.data_max_[:8], precision=4, separator=", ")),
+ ("feature_range", str(getattr(scaler_obj, "feature_range", None))),
+ ]
+ if s_rows:
+ out.append(render_card("Scaler parameters", kv_table(s_rows, c), c, style=style, align="right"))
+
+ if clip_bounds is not None:
+ lo, hi = clip_bounds
+ cb_rows = [
+ ("q-lo[0:8]", np.array2string(lo[:8], precision=4, separator=", ")),
+ ("q-hi[0:8]", np.array2string(hi[:8], precision=4, separator=", ")),
+ ]
+ out.append(render_card("Clip bounds (preview)", kv_table(cb_rows, c), c, style=style, align="right"))
+
+ def _count_windows(n_rows: int, seq_len: int, stride: int) -> int:
+ if n_rows < seq_len:
+ return 0
+ return 1 + (n_rows - seq_len) // stride
+
+ rc_train = rc.get("train", 0); rc_val = rc.get("val", 0); rc_test = rc.get("test", 0)
+ overlap = 1.0 - (meta.get("stride", 1) / max(1, meta.get("seq_len", 1)))
+ perf_rows = [
+ ("expected train windows", str(_count_windows(rc_train, meta.get("seq_len", 0), meta.get("stride", 1)))),
+ ("expected val windows", str(_count_windows(rc_val, meta.get("seq_len", 0), meta.get("stride", 1)))),
+ ("expected test windows", str(_count_windows(rc_test, meta.get("seq_len", 0), meta.get("stride", 1)))),
+ ("overlap ratio", f"{overlap:.3f}"),
+ ]
+ out.append(render_card("Windowing details", kv_table(perf_rows, c), c, style=style, align="right"))
+
+ return "\n".join(out)
+
+
+def print_dataset_info(loader, c: C, style: str, peek: int = 5) -> str:
+ meta = loader.get_meta()
+ feature_set = meta.get("feature_set")
+ feats = meta.get("feature_names") or []
+
+ if not feats:
+ if feature_set == "core":
+ feats = ["mid_price","spread","mid_log_return","queue_imbalance_l1","depth_imbalance_l10"]
+ elif feature_set == "raw10":
+ feats = ([f"ask_price_{i}" for i in range(1,11)] +
+ [f"ask_size_{i}" for i in range(1,11)] +
+ [f"bid_price_{i}" for i in range(1,11)] +
+ [f"bid_size_{i}" for i in range(1,11)])
+
+ intro = [
+ f"Feature set: {c.BOLD}{feature_set}{c.RESET}" if c.enabled else f"Feature set: {feature_set}",
+ f"Total features: {len(feats)}",
+ ""
+ ]
+
+ try:
+ W_train, W_val, W_test = loader.load_arrays()
+ if W_train.size + W_val.size + W_test.size == 0:
+ raise ValueError("No windows produced; lower seq_len or stride.")
+ blocks = [W.reshape(-1, W.shape[-1]) for W in (W_train, W_val, W_test) if getattr(W,"size",0)]
+ all_data = np.concatenate(blocks, axis=0)
+ df = pd.DataFrame(all_data, columns=feats)
+
+ intro.append(f"{c.BOLD}Statistical summary (aggregated across splits):{c.RESET}" if c.enabled else "Statistical summary (aggregated across splits):")
+ desc_df = df.describe().round(6)
+ intro.extend(tabulate(desc_df, headers="keys", tablefmt=TABLE_FMT).splitlines())
+ intro.append("")
+
+ means = df.mean().sort_values(ascending=False).head(5)
+ stds = df.std().sort_values(ascending=False).head(5)
+
+ intro.append(f"{c.BOLD}Highest-mean features:{c.RESET}" if c.enabled else "Highest-mean features:")
+ intro.extend(tabulate(list(means.items()), headers=[f"{c.MAGENTA}feature{c.RESET}" if c.enabled else "feature", "mean"], tablefmt=TABLE_FMT).splitlines())
+ intro.append("")
+
+ intro.append(f"{c.BOLD}Most-variable features (by std):{c.RESET}" if c.enabled else "Most-variable features (by std):")
+ intro.extend(tabulate(list(stds.items()), headers=[f"{c.MAGENTA}feature{c.RESET}" if c.enabled else "feature", "std"], tablefmt=TABLE_FMT).splitlines())
+ intro.append("")
+
+ intro.append(f"{c.BOLD}Example rows (first few timesteps):{c.RESET}" if c.enabled else "Example rows (first few timesteps):")
+ ex_tbl = tabulate(df.head(peek).round(6), headers="keys", tablefmt=TABLE_FMT, showindex=True)
+ intro.extend(ex_tbl.splitlines())
+
+ except Exception as e:
+ intro.append(f"{c.RED}(Could not compute stats: {e}){c.RESET}" if c.enabled else f"(Could not compute stats: {e})")
+
+ return render_card("Dataset summary", intro, c, style=style, align="left")
\ No newline at end of file
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/textui.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/textui.py
new file mode 100644
index 000000000..f530edcaf
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/textui.py
@@ -0,0 +1,303 @@
+import os
+import re
+import shutil
+from datetime import datetime
+from typing import List, Tuple, Sequence
+from tabulate import tabulate
+
+# Try Colorama on Windows (optional)
+try:
+ import colorama # type: ignore
+ colorama.just_fix_windows_console()
+except Exception:
+ pass
+
+# ---------------- defaults ----------------
+DEFAULT_STYLE = "box" # default to box panels
+TABLE_FMT = "github" # tabulate format; switch with set_table_style()
+
+# ------------- terminal capabilities & colors -------------
+def supports_color(no_color_flag: bool) -> bool:
+ if no_color_flag or os.environ.get("NO_COLOR"):
+ return False
+ try:
+ # If stdout is a TTY, assume color; terminals and most IDE consoles support it.
+ return os.isatty(1)
+ except Exception:
+ return False
+
+class C:
+ def __init__(self, enabled: bool):
+ self.enabled = enabled
+ self.RESET = "\033[0m" if enabled else ""
+ self.DIM = "\033[2m" if enabled else ""
+ self.BOLD = "\033[1m" if enabled else ""
+ self.CYAN = "\033[36m" if enabled else ""
+ self.YELLOW = "\033[33m" if enabled else ""
+ self.GREEN = "\033[32m" if enabled else ""
+ self.MAGENTA = "\033[35m" if enabled else ""
+ self.BLUE = "\033[34m" if enabled else ""
+ self.RED = "\033[31m" if enabled else ""
+ self.WHITE = "\033[37m" if enabled else ""
+
+# ------------- ANSI helpers -------------
+_ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
+
+def visible_len(s: str) -> int:
+ """Printable width (strip ANSI first)."""
+ return len(_ANSI_RE.sub("", s))
+
+def strip_ansi(s: str) -> str:
+ return _ANSI_RE.sub("", s)
+
+def truncate_visible(s: str, max_cols: int) -> str:
+ """
+ Truncate to max_cols printable columns without breaking ANSI sequences.
+ """
+ if max_cols <= 0:
+ return ""
+ out, cols = [], 0
+ i, n = 0, len(s)
+ while i < n and cols < max_cols:
+ m = _ANSI_RE.match(s, i)
+ if m:
+ out.append(m.group(0))
+ i = m.end()
+ continue
+ ch = s[i]
+ out.append(ch)
+ cols += 1
+ i += 1
+ # ensure we don't end inside an ANSI state (we don't maintain state machine,
+ # but common sequences are self-contained; still append reset for safety)
+ if cols >= max_cols:
+ out.append("\033[0m")
+ return "".join(out)
+
+def ljust_visible(s: str, width: int) -> str:
+ pad = max(0, width - visible_len(s))
+ return s + (" " * pad)
+
+# ------------- layout helpers -------------
+def set_table_style(name: str) -> None:
+    """Set the tabulate tablefmt; any format string tabulate accepts is allowed."""
+    global TABLE_FMT
+    # Common formats: github, grid, fancy_grid, heavy_grid, simple, outline,
+    # rounded_grid, double_grid, pipe, orgtbl, jira, psql. Custom strings pass
+    # through unchanged; tabulate raises if the format is invalid.
+    TABLE_FMT = name
+
+def term_width(default: int = 100) -> int:
+ try:
+ return shutil.get_terminal_size((default, 20)).columns
+ except Exception:
+ return default
+
+def wrap_text(s: str, width: int) -> List[str]:
+ """
+ ANSI-aware word wrap by visible width.
+ """
+ if visible_len(s) <= width:
+ return [s]
+ parts = s.split(" ")
+ out, cur = [], ""
+ for tok in parts:
+ if not cur:
+ cur = tok
+ elif visible_len(cur) + 1 + visible_len(tok) <= width:
+ cur += " " + tok
+ else:
+ out.append(cur)
+ cur = tok
+ if cur:
+ out.append(cur)
+ return out
+
+def is_table_line(s: str) -> bool:
+ """
+ Heuristic: lines that look like tables (markdown pipes or box-drawing).
+ """
+ t = strip_ansi(s).strip()
+ if not t:
+ return False
+ if t.startswith("|") and "|" in t[1:]:
+ return True
+ if t.startswith("+") and t.endswith("+"):
+ return True
+ # box drawing / markdown borders
+ if set(t) <= set("-:|+ ─═│║┼┬┴├┤┌┐└┘╭╮╯╰╪╫╠╬╣╦╩╔╗╚╝"):
+ return True
+ return False
+
+# ------------- table/border styling -------------
+def bold_white_borders(table: str, c: C) -> str:
+ """
+ Paint table border glyphs in bold white without touching cell content.
+ Works for markdown pipes and Unicode box drawing.
+ """
+ if not getattr(c, "enabled", False):
+ return table
+
+ bold, white, reset = c.BOLD, c.WHITE, c.RESET
+ border_chars = set("│║|┼┬┴├┤┌┐└┘─═╭╮╯╰╪╫╠╬╣╦╩╔╗╚╝+-:")
+ horiz_set = set("─═-")
+ vert_set = set("│║|:")
+
+ def paint(ch: str) -> str:
+ return f"{bold}{white}{ch}{reset}"
+
+ painted_lines = []
+ for raw in table.splitlines():
+ line = raw
+ # operate on non-ANSI plane but keep indexes by iterating char-by-char
+ out_chars = []
+ for ch in line:
+ if ch in border_chars:
+ out_chars.append(paint(ch))
+ else:
+ out_chars.append(ch)
+ painted_lines.append("".join(out_chars))
+ return "\n".join(painted_lines)
+
+def kv_table(
+ rows: List[Tuple[str, str]],
+ c: C,
+ headers: Tuple[str, str] = ("key", "value"),
+) -> List[str]:
+ if not rows:
+ return []
+
+ if c.enabled:
+ h_key = f"{c.BOLD}{c.MAGENTA}{headers[0]}{c.RESET}"
+ h_val = f"{c.BOLD}{c.MAGENTA}{headers[1]}{c.RESET}"
+ tinted = [(f"{c.CYAN}{k}{c.RESET}", v) for k, v in rows]
+ else:
+ h_key, h_val = headers
+ tinted = rows
+
+ table_txt = tabulate(
+ tinted,
+ headers=[h_key, h_val],
+ tablefmt=TABLE_FMT,
+ stralign="left",
+ disable_numparse=True,
+ )
+ table_txt = bold_white_borders(table_txt, c)
+ return table_txt.splitlines()
+
+# -------------------- NEW: generic table renderer --------------------
+def table(
+ rows: Sequence[Sequence[str]],
+ headers: Sequence[str],
+ c: C,
+ *,
+ tint_header: bool = True,
+ tint_first_col: bool = True,
+) -> List[str]:
+ """
+ Render a 2D table (rows + headers) with optional header-row tint
+ and first-column tint, plus bold white borders.
+ """
+ rows_list = [list(map(str, r)) for r in rows]
+ if c.enabled and tint_first_col and rows_list:
+ for i, r in enumerate(rows_list):
+ if r:
+ r[0] = f"{c.YELLOW}{r[0]}{c.RESET}"
+
+ if c.enabled and tint_header:
+ hdr = [f"{c.BOLD}{c.MAGENTA}{h}{c.RESET}" for h in headers]
+ else:
+ hdr = list(map(str, headers))
+
+ tbl = tabulate(
+ rows_list,
+ headers=hdr,
+ tablefmt=TABLE_FMT,
+ stralign="left",
+ disable_numparse=True,
+ showindex=False,
+ )
+ tbl = bold_white_borders(tbl, c)
+ return tbl.splitlines()
+
+# ------------- message bubbles & panels -------------
+def _bubble(title: str, body_lines: List[str], c: C, align: str = "left", width: int | None = None) -> str:
+ termw = term_width()
+ width = min(termw, width or termw)
+ base_inner = max(24, width - 10)
+
+ widest_tbl = 0
+ for ln in body_lines:
+ if is_table_line(ln):
+ widest_tbl = max(widest_tbl, visible_len(ln))
+
+ max_inner = min(max(base_inner, widest_tbl), width - 10)
+ indent = 2 if align == "left" else max(2, width - (max_inner + 8))
+ pad = " " * indent
+
+ ts = datetime.now().strftime("%H:%M")
+ title_colored = f"{c.BOLD}{c.BLUE}{title}{c.RESET}" if c.enabled else title
+ head = f"{title_colored} {c.DIM}{ts}{c.RESET}"
+ head_lines = wrap_text(head, max_inner)
+
+ lines = [pad + " " + head_lines[0]]
+ for hl in head_lines[1:]:
+ lines.append(pad + " " + hl)
+
+ lines.append(pad + " " + ("╭" + "─" * (max_inner + 2) + "╮"))
+
+ for ln in body_lines:
+ if is_table_line(ln):
+ width_ok = max_inner
+ body = ljust_visible(ln, width_ok)
+ body = truncate_visible(body, width_ok)
+ lines.append(pad + " " + "│ " + body + " │")
+ else:
+ for wln in wrap_text(ln, max_inner):
+ lines.append(pad + " " + "│ " + ljust_visible(wln, max_inner) + " │")
+
+ tail_left = pad + " " + "╰" + "─" * (max_inner + 2) + "╯" + "⟋"
+ tail_right = pad + " " + "⟍" + "╰" + "─" * (max_inner + 2) + "╯"
+ lines.append(tail_left if align == "left" else tail_right)
+ return "\n".join(lines)
+
+def _panel(title: str, body_lines: List[str], c: C, width: int | None = None) -> str:
+ termw = term_width()
+ width = width or termw
+ inner = width - 4
+
+ widest_tbl = 0
+ for ln in body_lines:
+ if is_table_line(ln):
+ widest_tbl = max(widest_tbl, visible_len(ln))
+ inner = min(max(inner, widest_tbl + 2), termw - 4)
+ width = inner + 4
+
+ border = "─" * (width - 2)
+ title_colored = f"{c.BOLD}{c.BLUE}{title}{c.RESET}" if c.enabled else title
+ out = [f"{c.CYAN}┌{border}┐{c.RESET}"]
+ title_line = f" {title_colored} "
+ pad_space = max(0, width - 2 - visible_len(title_line))
+ out.append(f"{c.CYAN}│{c.RESET}{title_line}{' '*pad_space}{c.CYAN}│{c.RESET}")
+ out.append(f"{c.CYAN}├{border}┤{c.RESET}")
+
+ content_width = inner - 2
+ for ln in body_lines:
+ if is_table_line(ln):
+ body = ljust_visible(ln, content_width)
+ body = truncate_visible(body, content_width)
+ out.append(f"{c.CYAN}│{c.RESET} {body} {c.CYAN}│{c.RESET}")
+ else:
+ for sub in wrap_text(ln, content_width):
+ out.append(f"{c.CYAN}│{c.RESET} {ljust_visible(sub, content_width)} {c.CYAN}│{c.RESET}")
+
+ out.append(f"{c.CYAN}└{border}┘{c.RESET}")
+ return "\n".join(out)
+
+def render_card(title: str, body_lines: List[str], c: C, style: str = DEFAULT_STYLE, align: str = "left") -> str:
+ return _bubble(title, body_lines, c, align=align) if style == "chat" else _panel(title, body_lines, c)
+
+# Convenience sugar for quick key→value panels
+def render_kv_panel(title: str, rows: List[Tuple[str, str]], c: C, style: str = DEFAULT_STYLE, align: str = "right") -> str:
+ return render_card(title, kv_table(rows, c), c, style=style, align=align)
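+
+
+if __name__ == "__main__":
+    # Minimal smoke demo (illustrative only; not used by the pipeline).
+    _c = C(enabled=supports_color(False))
+    print(render_kv_panel("textui demo", [("style", DEFAULT_STYLE), ("tablefmt", TABLE_FMT)], _c))
+    print(render_card("chat bubble", ["hello from textui"], _c, style="chat", align="left"))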
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/train.py b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
index e69de29bb..13eeac4ef 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/train.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
@@ -0,0 +1,19 @@
+"""
+Train, validate, and test the TimeLOB TimeGAN on LOBSTER sequences.
+
+This module orchestrates the three-phase TimeGAN schedule (autoencoder
+pretrain, supervisor pretrain, joint adversarial training), logs losses,
+computes validation metrics (e.g., KL on spread/returns; SSIM on heatmaps),
+and saves model checkpoints and plots. The model is imported from ``modules.py``
+and data loaders from ``dataset.py``.
+
+Typical Usage:
+ python3 -m predict --ckpt checkpoints/best.pt --n 8 --seq_len 120 --out outputs/predictions
+
+Created By: Radhesh Goel (Keys-I)
+ID: s49088276
+
+References:
+-
+"""
+# TODO: Wire training loops, metrics/plots, checkpointing, and CLI argument parsing.
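+#
+# Rough shape of the intended pipeline (illustrative sketch; LOBSTERData, TimeGAN and the
+# timegan_*_step helpers are assumed from dataset.py / modules.py and may change):
+#   ds = LOBSTERData(data_dir="data", feature_set="core", seq_len=64, stride=64)
+#   W_train, W_val, W_test = ds.load_arrays()
+#   model = TimeGAN(x_dim=W_train.shape[-1], z_dim=W_train.shape[-1], h_dim=64)
+#   phase 1: pretrain embedder+recovery on reconstruction loss
+#   phase 2: pretrain supervisor on one-step latent prediction
+#   phase 3: joint adversarial training (generator/supervisor vs discriminator)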
From f9e98b6e9b2060b7dbc3131c84a89c4ae75f3a50 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 3 Oct 2025 23:58:39 +1000
Subject: [PATCH 15/74] mm(preprocess): persist scaler/PCA/ZCA as .pkl for
reproducible pipelines & inverse-transform
---
recognition/TimeLOB_TimeGAN_49088276/.gitignore | 1 +
1 file changed, 1 insertion(+)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/.gitignore b/recognition/TimeLOB_TimeGAN_49088276/.gitignore
index 7a6136c0e..7f99e0853 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/.gitignore
+++ b/recognition/TimeLOB_TimeGAN_49088276/.gitignore
@@ -8,6 +8,7 @@
# model specific files
data/
+preproc_final_core/
*.csv
*.pt
*.pkl
From 0e437aaabac7f6a013007d1b7b48167549d1b181 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Sat, 4 Oct 2025 06:34:58 +1000
Subject: [PATCH 16/74] feat(dataset): auto-detect headers + nested summary
panels; GAN-ready preprocessing
Add header auto-detect (no flags needed), enforce canonical column order, and coerce dtypes.
Render one big panel per CSV with subpanels (shape/dtypes/describe/head/tail) via textui.
Expand preprocessing for GANs: advanced scalers (robust/quantile/power), optional PCA/ZCA whitening,
train-only window augmentations (jitter/scaling/time-warp), engineered features (rel_spread, microprice,
L5 imbalance, rolling stats, diffs/pct), chronological split with train-only scaling, and NPZ+meta saving.
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 101 +++++++++++++++---
1 file changed, 84 insertions(+), 17 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 208845cab..70a8fc771 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -8,7 +8,8 @@
Inputs (per trading session):
message_10.csv, orderbook_10.csv
- - If headers are missing, pass --headerless-message / --headerless-orderbook (CLI).
+    - Headers are auto-detected; when missing, canonical column names are assigned.
+      Pass --headerless-message / --headerless-orderbook (CLI) to force headerless parsing.
Outputs:
train, val, test — NumPy arrays with shape [num_seq, seq_len, num_features]
@@ -27,10 +28,11 @@
Notes:
- Scaling is fit on TRAIN only (Standard/MinMax/None). Advanced scalers: Robust, Quantile, Power.
-- Optional whitening: PCA (with variance threshold) or ZCA.
+- Optional whitening: PCA (variance threshold) or ZCA.
- Optional train-only sequence augmentations (jitter, scaling, time-warp) for GANs.
- Windows default to non-overlapping (stride=seq_len); set stride < seq_len for overlapping windows.
@@ ... @@ Tuple[np.ndarray, np.ndarray, np.ndarray]:
ob_df = ob_df.iloc[order].reset_index(drop=True)
self._check_alignment(msg_df, ob_df)
+
+ # enforce numeric types early (prevents string pollution)
+ for col in ("time", "order_id", "size", "price"):
+ if col in msg_df.columns:
+ msg_df[col] = pd.to_numeric(msg_df[col], errors="coerce")
+ ob_df[ob_df.columns] = ob_df[ob_df.columns].apply(pd.to_numeric, errors="coerce")
+
feats = self._build_features(ob_df)
if self.every > 1:
@@ -269,6 +278,52 @@ def _validate_splits(self) -> None:
if any(x < 0 for x in self.splits):
raise ValueError("splits cannot be negative")
+ # ---- header detection helpers ----
+ def _looks_headerless(self, path: str, expected_cols: int, min_numeric: int) -> bool:
+ """
+ Peek the first row with header=None. If the row is mostly numeric and the
+ column count matches what we expect, assume there's NO header.
+ """
+ try:
+ df0 = pd.read_csv(path, header=None, nrows=1)
+ except Exception:
+ return False
+ if df0.shape[1] != expected_cols:
+ return False
+ num_ok = pd.to_numeric(df0.iloc[0], errors="coerce").notna().sum()
+ return num_ok >= min_numeric
+
+ def _read_with_possible_headerless(self, path: str, default_names: list[str],
+ force_headerless: bool,
+ normalize_fn=None) -> pd.DataFrame:
+ """
+ Read CSV, auto-detect headerlessness if not forced.
+ - If forced: header=None, names=default_names
+ - Else: if first row looks numeric & count matches, treat as headerless.
+ otherwise try header=0 and optionally normalize columns.
+ """
+ expected_cols = len(default_names)
+ if force_headerless:
+ return pd.read_csv(path, header=None, names=default_names)
+
+ # Auto-detect headerless
+ if self._looks_headerless(path, expected_cols=expected_cols,
+ min_numeric=max(4, int(0.6 * expected_cols))): # threshold 60%
+ return pd.read_csv(path, header=None, names=default_names)
+
+ # Try with header row, then normalize if asked
+ df = pd.read_csv(path)
+ if normalize_fn is not None:
+ df = normalize_fn(df, default_names)
+
+ # If counts match but names/order differ, force canonical order & names
+ if df.shape[1] == expected_cols and list(df.columns) != default_names:
+ df = df.iloc[:, :expected_cols] # ensure width
+ df.columns = [str(c) for c in df.columns]
+ # If normalize_fn was provided, it likely already tried to normalize.
+ df.columns = default_names
+ return df
+
def _load_csvs(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
if not os.path.isfile(self.orderbook_path):
raise FileNotFoundError(f"Missing {self.orderbook_path}")
@@ -277,13 +332,21 @@ def _load_csvs(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
# Message (6 columns)
msg_cols = ["time", "type", "order_id", "size", "price", "direction"]
- if self.headerless_message:
- msg_df = pd.read_csv(self.message_path, header=None, names=msg_cols)
- else:
- msg_df = pd.read_csv(self.message_path)
- msg_df.columns = [str(c).strip().lower().replace(" ", "_") for c in msg_df.columns]
- if len(msg_df.columns) == 6 and set(msg_df.columns) != set(msg_cols):
- msg_df.columns = msg_cols
+ msg_df = self._read_with_possible_headerless(
+ self.message_path,
+ default_names=msg_cols,
+ force_headerless=self.headerless_message,
+ normalize_fn=lambda df, _: (
+                df.rename(columns=lambda c: str(c).strip().lower().replace(" ", "_"))
+ )
+ )
+ # Enforce exact column order when shape matches but order differs
+ if msg_df.shape[1] == 6 and list(msg_df.columns) != msg_cols:
+ # Try reorder if all present; else force names in canonical order
+ present = set(msg_df.columns)
+ if set(msg_cols).issubset(present):
+ msg_df = msg_df[msg_cols]
+ msg_df.columns = msg_cols
# Orderbook (40 columns)
ob_cols = (
@@ -292,11 +355,17 @@ def _load_csvs(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
[f"bid_price_{i}" for i in range(1, 11)] +
[f"bid_size_{i}" for i in range(1, 11)]
)
- if self.headerless_orderbook:
- ob_df = pd.read_csv(self.orderbook_path, header=None, names=ob_cols)
- else:
- ob_df = pd.read_csv(self.orderbook_path)
- ob_df = self._normalize_orderbook_headers(ob_df, ob_cols)
+ ob_df = self._read_with_possible_headerless(
+ self.orderbook_path,
+ default_names=ob_cols,
+ force_headerless=self.headerless_orderbook,
+ normalize_fn=lambda df, target: self._normalize_orderbook_headers(df, target)
+ )
+ # Enforce exact column order when counts match but order differs
+ if ob_df.shape[1] == len(ob_cols) and list(ob_df.columns) != ob_cols:
+ if set(ob_cols).issubset(set(ob_df.columns)):
+ ob_df = ob_df[ob_cols]
+ ob_df.columns = ob_cols
return msg_df, ob_df
@@ -657,7 +726,6 @@ def _rows_from_df(df: pd.DataFrame, limit_rows: int, limit_cols: int) -> tuple[l
return headers, rows
def _subpanel_lines(title: str, body_lines: list[str]) -> list[str]:
- # Render a mini panel and return its lines to embed inside the big panel
return render_card(title, body_lines, c, style=args.style, align="left").splitlines()
def _panel_df(title: str, df: pd.DataFrame, peek: int) -> list[str]:
@@ -683,7 +751,6 @@ def _panel_describe(df: pd.DataFrame) -> list[str]:
return _subpanel_lines("describe (numeric subset)", tx_table(rows, headers, c))
def _big_panel(title: str, subpanels: list[list[str]]) -> str:
- # Flatten the subpanel line blocks with a blank spacer between them
body_lines: list[str] = []
for i, block in enumerate(subpanels):
if i > 0:
From 788efe15e7ca447f3af3adcf006888860a1edb58 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Sat, 4 Oct 2025 16:29:56 +1000
Subject: [PATCH 17/74] feat(timegan): add basic TimeGAN components
(Embedder/Recovery/Generator/Supervisor/Discriminator)
Implements minimal TimeGAN in PyTorch:
- GRU/LSTM-based Embedder/Recovery, Generator, Supervisor, Discriminator
- Canonical losses: recon, supervised, GAN (gen/disc), moment + latent feature matching
- Utilities: noise sampling, weight init, optim factory
- Pretrain steps (AE, SUP) and joint training helpers
---
.../TimeLOB_TimeGAN_49088276/src/modules.py | 510 +++++++++++++++++-
1 file changed, 509 insertions(+), 1 deletion(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
index be69760f3..6dcc46015 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
@@ -21,4 +21,512 @@
References:
-
"""
-# TODO: Implement model classes and a TimeGAN wrapper here; keep public APIs compliant with PEP 8 and other best practices.
\ No newline at end of file
+# modules.py
+# Basic TimeGAN components implemented in PyTorch
+# ------------------------------------------------
+# Components:
+# - Embedder (encoder) : X -> H
+# - Recovery (decoder) : H -> X_hat
+# - Generator : Z -> E_tilde (latent)
+# - Supervisor : H -> H_hat (one-step future)
+# - Discriminator : {H, H_tilde} -> real/fake logit
+# Wrapper:
+# - TimeGAN : convenience forward helpers
+# Losses:
+# - reconstruction_loss, supervised_loss, generator_adv_loss,
+# discriminator_loss, moment_loss, generator_feature_matching_loss
+# Utils:
+# - sample_noise, init_weights, make_optim
+
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Tuple, Optional, Dict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+# -------------------------
+# Small building blocks
+# -------------------------
+
+class RNNSeq(nn.Module):
+ """
+ Multi-layer GRU/LSTM that returns sequence outputs [B, T, H].
+ """
+ def __init__(
+ self,
+ input_dim: int,
+ hidden_dim: int,
+ num_layers: int = 2,
+ rnn_type: str = "gru",
+ dropout: float = 0.0,
+ bidirectional: bool = False,
+ ):
+ super().__init__()
+ assert rnn_type in {"gru", "lstm"}
+ self.rnn_type = rnn_type
+ rnn_cls = nn.GRU if rnn_type == "gru" else nn.LSTM
+ self.rnn = rnn_cls(
+ input_dim,
+ hidden_dim,
+ num_layers=num_layers,
+ dropout=dropout if num_layers > 1 else 0.0,
+ batch_first=True,
+ bidirectional=bidirectional,
+ )
+ self.out_dim = hidden_dim * (2 if bidirectional else 1)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # x: [B, T, D]
+ y, _ = self.rnn(x)
+ return y # [B, T, H']
+
+
+def _linear_head(in_dim: int, out_dim: int) -> nn.Module:
+ return nn.Sequential(
+ nn.Linear(in_dim, out_dim),
+ )
+
+
+def init_weights(m: nn.Module, gain: float = 1.0) -> None:
+ """
+    Kaiming (He) init for Linear; Xavier for RNN input weights, orthogonal for recurrent weights; zeros for biases.
+ """
+ if isinstance(m, nn.Linear):
+ nn.init.kaiming_uniform_(m.weight, a=0.0, nonlinearity="linear")
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
+ if isinstance(m, (nn.GRU, nn.LSTM)):
+ for name, param in m.named_parameters():
+ if "weight_ih" in name:
+ nn.init.xavier_uniform_(param, gain=gain)
+ elif "weight_hh" in name:
+ nn.init.orthogonal_(param, gain=gain)
+ elif "bias" in name:
+ nn.init.zeros_(param)
+
+
+def make_optim(params, lr: float = 1e-3, betas=(0.9, 0.999), weight_decay: float = 0.0):
+ return torch.optim.Adam(params, lr=lr, betas=betas, weight_decay=weight_decay)
+
+
+# -------------------------
+# TimeGAN components
+# -------------------------
+
+class Embedder(nn.Module):
+ """X -> H (latent)"""
+ def __init__(
+ self,
+ x_dim: int,
+ h_dim: int,
+ num_layers: int = 2,
+ rnn_type: str = "gru",
+ dropout: float = 0.1,
+ bidirectional: bool = False,
+ ):
+ super().__init__()
+ self.rnn = RNNSeq(x_dim, h_dim, num_layers, rnn_type, dropout, bidirectional)
+ self.proj = _linear_head(self.rnn.out_dim, h_dim)
+ self.apply(init_weights)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # x: [B, T, x_dim]
+ h_seq = self.rnn(x)
+ h = self.proj(h_seq)
+ return h # [B, T, h_dim]
+
+
+class Recovery(nn.Module):
+ """H -> X_hat (reconstruct data space)"""
+ def __init__(
+ self,
+ h_dim: int,
+ x_dim: int,
+ num_layers: int = 2,
+ rnn_type: str = "gru",
+ dropout: float = 0.1,
+ bidirectional: bool = False,
+ ):
+ super().__init__()
+ self.rnn = RNNSeq(h_dim, h_dim, num_layers, rnn_type, dropout, bidirectional)
+ self.proj = _linear_head(self.rnn.out_dim, x_dim)
+ self.apply(init_weights)
+
+ def forward(self, h: torch.Tensor) -> torch.Tensor:
+ z = self.rnn(h)
+ x_hat = self.proj(z)
+ return x_hat # [B, T, x_dim]
+
+
+class Generator(nn.Module):
+ """Z -> E_tilde (latent space fake)"""
+ def __init__(
+ self,
+ z_dim: int,
+ h_dim: int,
+ num_layers: int = 2,
+ rnn_type: str = "gru",
+ dropout: float = 0.1,
+ bidirectional: bool = False,
+ ):
+ super().__init__()
+ self.rnn = RNNSeq(z_dim, h_dim, num_layers, rnn_type, dropout, bidirectional)
+ self.proj = _linear_head(self.rnn.out_dim, h_dim)
+ self.apply(init_weights)
+
+ def forward(self, z: torch.Tensor) -> torch.Tensor:
+ g = self.rnn(z)
+ e_tilde = self.proj(g)
+ return e_tilde # [B, T, h_dim]
+
+
+class Supervisor(nn.Module):
+ """H -> H_hat (one-step ahead in latent)"""
+ def __init__(
+ self,
+ h_dim: int,
+ num_layers: int = 1,
+ rnn_type: str = "gru",
+ dropout: float = 0.0,
+ bidirectional: bool = False,
+ ):
+ super().__init__()
+ self.rnn = RNNSeq(h_dim, h_dim, num_layers, rnn_type, dropout, bidirectional)
+ self.proj = _linear_head(self.rnn.out_dim, h_dim)
+ self.apply(init_weights)
+
+ def forward(self, h: torch.Tensor) -> torch.Tensor:
+ s = self.rnn(h)
+ h_hat = self.proj(s)
+ return h_hat # [B, T, h_dim], meant to approximate next-step H
+
+
+class Discriminator(nn.Module):
+ """
+ Sequence-level discriminator: encodes sequence and outputs a single real/fake logit per sequence.
+ """
+ def __init__(
+ self,
+ h_dim: int,
+ hidden_dim: int = 128,
+ num_layers: int = 1,
+ rnn_type: str = "gru",
+ dropout: float = 0.1,
+ bidirectional: bool = False,
+ ):
+ super().__init__()
+ self.rnn = RNNSeq(h_dim, hidden_dim, num_layers, rnn_type, dropout, bidirectional)
+ rnn_out = self.rnn.out_dim
+ self.head = nn.Sequential(
+ nn.Linear(rnn_out, rnn_out),
+ nn.ReLU(inplace=True),
+ nn.Linear(rnn_out, 1),
+ )
+ self.apply(init_weights)
+
+ def forward(self, h_like: torch.Tensor) -> torch.Tensor:
+ # h_like: [B, T, h_dim] (real H or fake H_tilde)
+ z = self.rnn(h_like) # [B, T, H]
+ pooled = z.mean(dim=1) # [B, H] simple temporal pooling
+ logit = self.head(pooled) # [B, 1]
+ return logit
+
+
+# -------------------------
+# TimeGAN wrapper
+# -------------------------
+
+@dataclass
+class TimeGANOutputs:
+ H: torch.Tensor # real latent from embedder
+ X_tilde: torch.Tensor # recovered from H_tilde (generator path)
+ X_hat: torch.Tensor # reconstruction of X (autoencoder path)
+ H_hat_supervise: torch.Tensor # supervisor(H)
+ H_tilde: torch.Tensor # supervisor(generator(Z))
+ D_real: torch.Tensor # discriminator(H)
+ D_fake: torch.Tensor # discriminator(H_tilde)
+
+
+class TimeGAN(nn.Module):
+ """
+ Convenience wrapper that holds all components and exposes common forward passes.
+ """
+ def __init__(
+ self,
+ x_dim: int,
+ z_dim: int,
+ h_dim: int,
+ rnn_type: str = "gru",
+ enc_layers: int = 2,
+ dec_layers: int = 2,
+ gen_layers: int = 2,
+ sup_layers: int = 1,
+ dis_layers: int = 1,
+ dropout: float = 0.1,
+ ):
+ super().__init__()
+ self.embedder = Embedder(x_dim, h_dim, enc_layers, rnn_type, dropout)
+ self.recovery = Recovery(h_dim, x_dim, dec_layers, rnn_type, dropout)
+ self.generator = Generator(z_dim, h_dim, gen_layers, rnn_type, dropout)
+ self.supervisor = Supervisor(h_dim, sup_layers, rnn_type, dropout)
+ self.discriminator = Discriminator(h_dim, hidden_dim=max(64, h_dim), num_layers=dis_layers, rnn_type=rnn_type, dropout=dropout)
+
+ @torch.no_grad()
+ def embed(self, x: torch.Tensor) -> torch.Tensor:
+ return self.embedder(x)
+
+ @torch.no_grad()
+ def recover(self, h: torch.Tensor) -> torch.Tensor:
+ return self.recovery(h)
+
+ def forward_all(self, x: torch.Tensor, z: torch.Tensor) -> TimeGANOutputs:
+ """
+ Full graph for joint training steps.
+ """
+ H = self.embedder(x) # real latent
+ X_hat = self.recovery(H) # reconstruction
+
+ E_tilde = self.generator(z) # generator latent
+ H_hat_supervise = self.supervisor(H) # supervisor on real latent
+ H_tilde = self.supervisor(E_tilde) # supervised generator path
+
+ X_tilde = self.recovery(H_tilde) # map fake latent back to data space
+
+ D_real = self.discriminator(H.detach()) # detach to avoid leaking gradients to embedder in D update
+ D_fake = self.discriminator(H_tilde.detach())
+
+ return TimeGANOutputs(
+ H=H, X_hat=X_hat, X_tilde=X_tilde,
+ H_hat_supervise=H_hat_supervise,
+ H_tilde=H_tilde,
+ D_real=D_real, D_fake=D_fake
+ )
+
+ # convenience for generator forward (no detach on fake for Gen loss)
+ def forward_gen_paths(self, x: torch.Tensor, z: torch.Tensor) -> Dict[str, torch.Tensor]:
+ H = self.embedder(x)
+ H_hat_supervise = self.supervisor(H)
+ E_tilde = self.generator(z)
+ H_tilde = self.supervisor(E_tilde)
+ X_tilde = self.recovery(H_tilde)
+ D_fake_for_gen = self.discriminator(H_tilde) # no detach: grad goes to G/S
+ return dict(H=H, H_hat_supervise=H_hat_supervise, H_tilde=H_tilde, X_tilde=X_tilde, D_fake=D_fake_for_gen)
+
+ # convenience for autoencoder pretrain
+ def forward_autoencoder(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ H = self.embedder(x)
+ X_hat = self.recovery(H)
+ return H, X_hat
+
+
+# -------------------------
+# Losses (canonical TimeGAN style)
+# -------------------------
+
+def reconstruction_loss(x: torch.Tensor, x_hat: torch.Tensor) -> torch.Tensor:
+ # MSE across batch, time, features
+ return F.mse_loss(x_hat, x)
+
+def supervised_loss(h: torch.Tensor, h_hat: torch.Tensor) -> torch.Tensor:
+ """
+ One-step ahead prediction in latent space:
+ compare h[:, 1:, :] with h_hat[:, :-1, :].
+ """
+ return F.mse_loss(h_hat[:, :-1, :], h[:, 1:, :])
+
+def discriminator_loss(d_real: torch.Tensor, d_fake: torch.Tensor, label_smooth: float = 0.1) -> torch.Tensor:
+ """
+    Standard GAN BCE loss for the discriminator, with one-sided label smoothing on real targets.
+ """
+ # real labels in [1 - label_smooth, 1]
+ real_tgt = torch.ones_like(d_real) * (1.0 - label_smooth)
+ fake_tgt = torch.zeros_like(d_fake)
+ loss_real = F.binary_cross_entropy_with_logits(d_real, real_tgt)
+ loss_fake = F.binary_cross_entropy_with_logits(d_fake, fake_tgt)
+ return loss_real + loss_fake
+
+def generator_adv_loss(d_fake: torch.Tensor) -> torch.Tensor:
+ """
+ Non-saturating generator loss (wants discriminator to output 1 for fake).
+ """
+ tgt = torch.ones_like(d_fake)
+ return F.binary_cross_entropy_with_logits(d_fake, tgt)
+
+def moment_loss(x: torch.Tensor, x_tilde: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+ """
+ Feature-wise mean/variance matching across time+batch dims.
+ """
+ # collapse batch/time for per-feature moments
+ dim = (0, 1)
+ mu_real = x.mean(dim=dim)
+ mu_fake = x_tilde.mean(dim=dim)
+ var_real = x.var(dim=dim, unbiased=False) + eps
+ var_fake = x_tilde.var(dim=dim, unbiased=False) + eps
+ return F.l1_loss(mu_fake, mu_real) + F.l1_loss(torch.sqrt(var_fake), torch.sqrt(var_real))
+
+def generator_feature_matching_loss(h: torch.Tensor, h_tilde: torch.Tensor) -> torch.Tensor:
+ """
+ Optional latent-level matching (helps stability).
+ """
+ return F.mse_loss(h_tilde.mean(dim=(0, 1)), h.mean(dim=(0, 1)))
+
+
+# -------------------------
+# Noise utility
+# -------------------------
+
+def sample_noise(batch_size: int, seq_len: int, z_dim: int, device: Optional[torch.device] = None) -> torch.Tensor:
+ """
+ Standard normal noise sequence for the generator.
+ """
+ z = torch.randn(batch_size, seq_len, z_dim)
+ return z.to(device) if device is not None else z
+
+
+# -------------------------
+# Minimal training scaffolds (optional)
+# -------------------------
+
+@dataclass
+class LossWeights:
+ lambda_embed: float = 10.0 # autoencoder recon weight during embedder pretrain
+ lambda_sup: float = 1.0 # supervisor loss weight
+ lambda_gen: float = 1.0 # adversarial generator weight
+ lambda_moment: float = 10.0 # moment matching weight
+ lambda_fm: float = 1.0 # feature/latent matching weight
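+
+# A minimal sketch (assumed values, not tuned): override individual weights when
+# constructing LossWeights, e.g. to emphasise distributional matching:
+#     weights = LossWeights(lambda_gen=0.5, lambda_moment=20.0)
+#     timegan_joint_step(model, x, z, opt_gs, opt_d, weights)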
+
+
+def timegan_autoencoder_step(
+ model: TimeGAN,
+ x: torch.Tensor,
+ opt: torch.optim.Optimizer,
+) -> Dict[str, float]:
+ """
+ Pretrain the embedder+recovery (autoencoder) with reconstruction loss.
+ """
+ model.train()
+ opt.zero_grad(set_to_none=True)
+ _, x_hat = model.forward_autoencoder(x)
+ loss_recon = reconstruction_loss(x, x_hat)
+ loss_recon.backward()
+ opt.step()
+ return {"recon": float(loss_recon.detach().cpu())}
+
+
+def timegan_supervisor_step(
+ model: TimeGAN,
+ x: torch.Tensor,
+ opt: torch.optim.Optimizer,
+) -> Dict[str, float]:
+ """
+ Pretrain the supervisor to predict next-step in latent space.
+ """
+ model.train()
+ opt.zero_grad(set_to_none=True)
+ h, _ = model.forward_autoencoder(x)
+ h_hat = model.supervisor(h)
+ loss_sup = supervised_loss(h, h_hat)
+ loss_sup.backward()
+ opt.step()
+ return {"sup": float(loss_sup.detach().cpu())}
+
+
+def timegan_joint_step(
+ model: TimeGAN,
+ x: torch.Tensor,
+ z: torch.Tensor,
+ opt_gs: torch.optim.Optimizer,
+ opt_d: torch.optim.Optimizer,
+ weights: LossWeights = LossWeights(),
+) -> Dict[str, float]:
+ """
+ Joint adversarial training step:
+ 1) Update Discriminator
+ 2) Update Generator + Supervisor (+ Embedder via recon & consistency)
+ """
+ model.train()
+
+ # ---- 1) Discriminator update
+ with torch.no_grad():
+ H_real = model.embedder(x)
+ E_tilde = model.generator(z)
+ H_tilde = model.supervisor(E_tilde)
+ D_real = model.discriminator(H_real)
+ D_fake = model.discriminator(H_tilde)
+
+ loss_d = discriminator_loss(D_real, D_fake)
+ opt_d.zero_grad(set_to_none=True)
+ loss_d.backward()
+ opt_d.step()
+
+ # ---- 2) Generator/Supervisor/Embedder update
+ paths = model.forward_gen_paths(x, z) # keeps gradient through G/S
+ H, H_hat, H_tilde, X_tilde, D_fake_for_gen = (
+ paths["H"], paths["H_hat_supervise"], paths["H_tilde"], paths["X_tilde"], paths["D_fake"]
+ )
+
+ # adversarial
+ loss_g_adv = generator_adv_loss(D_fake_for_gen)
+ # supervised (latent next-step)
+ loss_g_sup = supervised_loss(H, H_hat)
+    # moment matching in data space (X_tilde = recovery(H_tilde), computed in forward_gen_paths)
+ loss_g_mom = moment_loss(x, X_tilde)
+ # latent feature matching
+ loss_g_fm = generator_feature_matching_loss(H, H_tilde)
+
+ # total generator loss
+ loss_g_total = (
+ weights.lambda_gen * loss_g_adv
+ + weights.lambda_sup * loss_g_sup
+ + weights.lambda_moment * loss_g_mom
+ + weights.lambda_fm * loss_g_fm
+ )
+
+ # optional small reconstruction on embedder to preserve representation
+ H_e, X_hat = model.forward_autoencoder(x) # reuse embedder/recovery path
+ loss_recon = reconstruction_loss(x, X_hat)
+    # keep the supervised generator latents (H_tilde) close to the real embeddings (consistency)
+ loss_consistency = F.mse_loss(H_tilde, H_e).mul(0.1) # small weight
+
+ total = loss_g_total + loss_recon + loss_consistency
+
+ opt_gs.zero_grad(set_to_none=True)
+ total.backward()
+ opt_gs.step()
+
+ return {
+ "d": float(loss_d.detach().cpu()),
+ "g_adv": float(loss_g_adv.detach().cpu()),
+ "g_sup": float(loss_g_sup.detach().cpu()),
+ "g_mom": float(loss_g_mom.detach().cpu()),
+ "g_fm": float(loss_g_fm.detach().cpu()),
+ "recon": float(loss_recon.detach().cpu()),
+ "cons": float(loss_consistency.detach().cpu()),
+ "g_total": float(loss_g_total.detach().cpu()),
+ }
+
+
+# -------------------------
+# Example (for reference)
+# -------------------------
+# if __name__ == "__main__":
+# B, T, x_dim, z_dim, h_dim = 16, 24, 8, 16, 24
+# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# model = TimeGAN(x_dim, z_dim, h_dim).to(device)
+# opt_gs = make_optim(list(model.embedder.parameters()) +
+# list(model.recovery.parameters()) +
+# list(model.generator.parameters()) +
+# list(model.supervisor.parameters()), lr=1e-3)
+# opt_d = make_optim(model.discriminator.parameters(), lr=1e-3)
+# x = torch.randn(B, T, x_dim, device=device)
+# z = sample_noise(B, T, z_dim, device=device)
+# # Pretrain autoencoder
+# print(timegan_autoencoder_step(model, x, opt_gs))
+# # Pretrain supervisor
+# print(timegan_supervisor_step(model, x, opt_gs))
+# # Joint step
+# print(timegan_joint_step(model, x, z, opt_gs, opt_d))
From 53ee1ea87a92ca9370151b0c6825884e450f9018 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Sat, 4 Oct 2025 18:56:23 +1000
Subject: [PATCH 18/74] feat(train): add end-to-end TimeGAN trainer for LOBSTER
windows
Supports windows.npz or on-the-fly preprocessing via LOBSTERData.
Includes 3-phase schedule (AE -> SUP -> Joint), AMP toggle, grad clipping,
basic checkpoints, and moment-loss validation.
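
A typical invocation (illustrative paths; flags correspond to the argparse options added below):

    python train.py --npz ./preproc/windows.npz --batch-size 64 \
        --ae-epochs 10 --sup-epochs 10 --joint-epochs 50 --amp --ckpt-dir ./ckpts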
---
.../TimeLOB_TimeGAN_49088276/src/train.py | 281 +++++++++++++++++-
1 file changed, 280 insertions(+), 1 deletion(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/train.py b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
index 13eeac4ef..aa34e99c1 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/train.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
@@ -16,4 +16,283 @@
References:
-
"""
-# TODO: Wire training loops, metrics/plots, checkpointing, and CLI Argument parsing.
+from __future__ import annotations
+import os, json, math, time, argparse, random
+from dataclasses import asdict
+from typing import Tuple, Optional
+
+import numpy as np
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+
+# local imports
+from dataset import LOBSTERData
+from modules import (
+ TimeGAN, sample_noise, make_optim,
+ timegan_autoencoder_step, timegan_supervisor_step, timegan_joint_step,
+ LossWeights
+)
+
+# -------------------------
+# utils
+# -------------------------
+def set_seed(seed: int = 1337):
+ random.seed(seed); np.random.seed(seed)
+ torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
+
+def shape_from_npz(npz_path: str) -> Tuple[int,int,int]:
+ d = np.load(npz_path)
+ w = d["train"]
+ return tuple(w.shape) # num_seq, seq_len, x_dim
+
+def build_loaders_from_npz(npz_path: str, batch_size: int) -> Tuple[DataLoader, DataLoader, DataLoader, int, int]:
+ d = np.load(npz_path)
+ W_train = torch.from_numpy(d["train"]).float()
+ W_val = torch.from_numpy(d["val"]).float()
+ W_test = torch.from_numpy(d["test"]).float()
+ T = W_train.size(1); D = W_train.size(2)
+ train_dl = DataLoader(TensorDataset(W_train), batch_size=batch_size, shuffle=True, drop_last=True)
+ val_dl = DataLoader(TensorDataset(W_val), batch_size=batch_size, shuffle=False)
+ test_dl = DataLoader(TensorDataset(W_test), batch_size=batch_size, shuffle=False)
+ return train_dl, val_dl, test_dl, T, D
+
+def build_loaders_from_csv(args, batch_size: int) -> Tuple[DataLoader, DataLoader, DataLoader, int, int]:
+ ds = LOBSTERData(
+ data_dir=args.data_dir,
+ message_file=args.message,
+ orderbook_file=args.orderbook,
+ feature_set=args.feature_set,
+ seq_len=args.seq_len,
+ stride=args.stride,
+ splits=tuple(args.splits),
+ scaler=args.scaler,
+ headerless_message=args.headerless_message,
+ headerless_orderbook=args.headerless_orderbook,
+ # optional whitening & aug flags if you want them in training too:
+ whiten=args.whiten, pca_var=args.pca_var,
+ aug_prob=args.aug_prob, aug_jitter_std=args.aug_jitter_std,
+ aug_scaling_std=args.aug_scaling_std, aug_timewarp_max=args.aug_timewarp_max,
+ save_dir=args.save_dir,
+ )
+ W_train, W_val, W_test = ds.load_arrays()
+ T = W_train.shape[1]; D = W_train.shape[2]
+ train_dl = DataLoader(TensorDataset(torch.from_numpy(W_train).float()), batch_size=batch_size, shuffle=True, drop_last=True)
+ val_dl = DataLoader(TensorDataset(torch.from_numpy(W_val).float()), batch_size=batch_size, shuffle=False)
+ test_dl = DataLoader(TensorDataset(torch.from_numpy(W_test).float()), batch_size=batch_size, shuffle=False)
+ # Persist meta if saving:
+ if args.save_dir:
+ meta = ds.get_meta()
+ with open(os.path.join(args.save_dir, "meta.train.json"), "w") as f:
+ json.dump(meta, f, indent=2)
+ return train_dl, val_dl, test_dl, T, D
+
+def save_ckpt(path: str, model: TimeGAN, opt_gs, opt_d, step: int, args, extra=None):
+ os.makedirs(os.path.dirname(path), exist_ok=True)
+ payload = {
+ "step": step,
+ "args": vars(args),
+ "embedder": model.embedder.state_dict(),
+ "recovery": model.recovery.state_dict(),
+ "generator": model.generator.state_dict(),
+ "supervisor": model.supervisor.state_dict(),
+ "discriminator": model.discriminator.state_dict(),
+ "opt_gs": opt_gs.state_dict(),
+ "opt_d": opt_d.state_dict(),
+ "extra": extra or {},
+ }
+ torch.save(payload, path)
+
+# -------------------------
+# train loops
+# -------------------------
+def run_autoencoder_phase(model, train_dl, device, opt_gs, epochs: int, amp: bool, clip: Optional[float]):
+ scaler = torch.amp.GradScaler('cuda', enabled=amp)
+ for ep in range(1, epochs+1):
+ t0 = time.time()
+ logs = []
+ for (xb,) in train_dl:
+ xb = xb.to(device, non_blocking=True)
+ opt_gs.zero_grad(set_to_none=True)
+ if amp:
+ with torch.amp.autocast('cuda'):
+ out = timegan_autoencoder_step(model, xb, opt_gs)
+ else:
+ out = timegan_autoencoder_step(model, xb, opt_gs)
+            # timegan_autoencoder_step already runs zero_grad/backward/step internally,
+            # so this clip does not affect the update above (the GradScaler is likewise unused)
+ if clip is not None:
+ torch.nn.utils.clip_grad_norm_(model.embedder.parameters(), clip)
+ torch.nn.utils.clip_grad_norm_(model.recovery.parameters(), clip)
+ logs.append(out["recon"])
+ dt = time.time()-t0
+ print(f"[AE] epoch {ep}/{epochs} recon={np.mean(logs):.6f} ({dt:.1f}s)")
+
+def run_supervisor_phase(model, train_dl, device, opt_gs, epochs: int, amp: bool, clip: Optional[float]):
+ for ep in range(1, epochs+1):
+ t0 = time.time()
+ logs = []
+ for (xb,) in train_dl:
+ xb = xb.to(device, non_blocking=True)
+ out = timegan_supervisor_step(model, xb, opt_gs)
+ if clip is not None:
+ torch.nn.utils.clip_grad_norm_(model.supervisor.parameters(), clip)
+ logs.append(out["sup"])
+ dt = time.time()-t0
+ print(f"[SUP] epoch {ep}/{epochs} sup={np.mean(logs):.6f} ({dt:.1f}s)")
+
+def evaluate_moment(model, loader, device, z_dim: int) -> float:
+    # rough eval: moment loss between real batches and generated samples (lower is better)
+ from modules import moment_loss
+ model.eval()
+ vals = []
+ with torch.no_grad():
+ for (xb,) in loader:
+ xb = xb.to(device)
+ z = sample_noise(xb.size(0), xb.size(1), z_dim, device)
+ # generate one batch
+ paths = model.forward_gen_paths(xb, z)
+ x_tilde = paths["X_tilde"]
+ vals.append(float(moment_loss(xb, x_tilde).cpu()))
+ return float(np.mean(vals)) if vals else math.inf
+
+def run_joint_phase(model, train_dl, val_dl, device, opt_gs, opt_d,
+ z_dim: int, epochs: int, amp: bool, clip: Optional[float],
+ loss_weights: LossWeights, ckpt_dir: Optional[str], args=None):
+ best_val = math.inf
+ step = 0
+ for ep in range(1, epochs+1):
+ t0 = time.time()
+ logs = {"d": [], "g_adv": [], "g_sup": [], "g_mom": [], "g_fm": [], "recon": [], "cons": [], "g_total": []}
+ for (xb,) in train_dl:
+ xb = xb.to(device, non_blocking=True)
+ z = sample_noise(xb.size(0), xb.size(1), z_dim, device)
+ out = timegan_joint_step(model, xb, z, opt_gs, opt_d, loss_weights)
+ if clip is not None:
+ torch.nn.utils.clip_grad_norm_(list(model.embedder.parameters())+
+ list(model.recovery.parameters())+
+ list(model.generator.parameters())+
+ list(model.supervisor.parameters()), clip)
+ torch.nn.utils.clip_grad_norm_(model.discriminator.parameters(), clip)
+ for k, v in out.items(): logs[k].append(v)
+ step += 1
+
+ # validation (moment)
+ val_m = evaluate_moment(model, val_dl, device, z_dim)
+ dt = time.time()-t0
+ log_line = " ".join([f"{k}={np.mean(v):.4f}" for k,v in logs.items()])
+ print(f"[JOINT] epoch {ep}/{epochs} {log_line} | val_moment={val_m:.4f} ({dt:.1f}s)")
+
+ # save best
+ if ckpt_dir:
+ if val_m < best_val:
+ best_val = val_m
+ save_ckpt(os.path.join(ckpt_dir, "best.pt"), model, opt_gs, opt_d, step, args=args,
+ extra={"val_moment": val_m})
+ save_ckpt(os.path.join(ckpt_dir, f"step_{step}.pt"), model, opt_gs, opt_d, step, args=args,
+ extra={"val_moment": val_m})
+
+# -------------------------
+# main
+# -------------------------
+if __name__ == "__main__":
+ p = argparse.ArgumentParser(description="Train TimeGAN on LOBSTERData.")
+ # data sources
+ p.add_argument("--npz", type=str, help="Path to windows.npz (train/val/test). If set, ignores --data-dir.")
+ p.add_argument("--data-dir", type=str, help="Folder with message_10.csv and orderbook_10.csv")
+ p.add_argument("--message", default="message_10.csv")
+ p.add_argument("--orderbook", default="orderbook_10.csv")
+ p.add_argument("--feature-set", choices=["core","raw10"], default="core")
+ p.add_argument("--seq-len", type=int, default=128)
+ p.add_argument("--stride", type=int, default=32)
+ p.add_argument("--splits", type=float, nargs=3, default=(0.7,0.15,0.15))
+ p.add_argument("--scaler", choices=["standard","minmax","robust","quantile","power","none"], default="robust")
+ p.add_argument("--whiten", choices=["pca","zca",None], default="pca")
+ p.add_argument("--pca-var", type=float, default=0.999)
+ p.add_argument("--headerless-message", action="store_true")
+ p.add_argument("--headerless-orderbook", action="store_true")
+ p.add_argument("--save-dir", type=str, default=None, help="If set during CSV mode, saves NPZ/meta here.")
+
+ # model
+ p.add_argument("--x-dim", type=str, default="auto", help="'auto' infers from data; else int")
+ p.add_argument("--z-dim", type=int, default=24)
+ p.add_argument("--h-dim", type=int, default=64)
+ p.add_argument("--rnn-type", choices=["gru","lstm"], default="gru")
+ p.add_argument("--enc-layers", type=int, default=2)
+ p.add_argument("--dec-layers", type=int, default=2)
+ p.add_argument("--gen-layers", type=int, default=2)
+ p.add_argument("--sup-layers", type=int, default=1)
+ p.add_argument("--dis-layers", type=int, default=1)
+ p.add_argument("--dropout", type=float, default=0.1)
+
+ # training
+ p.add_argument("--batch-size", type=int, default=64)
+ p.add_argument("--seed", type=int, default=1337)
+ p.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
+ p.add_argument("--amp", action="store_true", help="Enable mixed precision.")
+ p.add_argument("--clip", type=float, default=1.0, help="Grad clip norm; set <=0 to disable.")
+ p.add_argument("--ae-epochs", type=int, default=10)
+ p.add_argument("--sup-epochs", type=int, default=10)
+ p.add_argument("--joint-epochs", type=int, default=50)
+ p.add_argument("--lr", type=float, default=1e-3)
+ p.add_argument("--ckpt-dir", type=str, default="./ckpts")
+
+ # augmentation passthrough when using CSV mode
+ p.add_argument("--aug-prob", type=float, default=0.0)
+ p.add_argument("--aug-jitter-std", type=float, default=0.01)
+ p.add_argument("--aug-scaling-std", type=float, default=0.05)
+ p.add_argument("--aug-timewarp-max", type=float, default=0.1)
+
+ args = p.parse_args()
+ set_seed(args.seed)
+ device = torch.device(args.device)
+ os.makedirs(args.ckpt_dir, exist_ok=True)
+ run_dir = os.path.join(args.ckpt_dir, f"timegan_{time.strftime('%Y%m%d-%H%M%S')}")
+ os.makedirs(run_dir, exist_ok=True)
+
+ # Data
+ if args.npz:
+ train_dl, val_dl, test_dl, T, D = build_loaders_from_npz(args.npz, args.batch_size)
+ elif args.data_dir:
+ train_dl, val_dl, test_dl, T, D = build_loaders_from_csv(args, args.batch_size)
+ else:
+ raise SystemExit("Provide either --npz or --data-dir")
+
+ x_dim = D if args.x_dim == "auto" else int(args.x_dim)
+
+ # Model & optims
+ model = TimeGAN(
+ x_dim=x_dim, z_dim=args.z_dim, h_dim=args.h_dim,
+ rnn_type=args.rnn_type, enc_layers=args.enc_layers, dec_layers=args.dec_layers,
+ gen_layers=args.gen_layers, sup_layers=args.sup_layers, dis_layers=args.dis_layers,
+ dropout=args.dropout
+ ).to(device)
+
+ opt_gs = make_optim(list(model.embedder.parameters()) +
+ list(model.recovery.parameters()) +
+ list(model.generator.parameters()) +
+ list(model.supervisor.parameters()), lr=args.lr)
+ opt_d = make_optim(model.discriminator.parameters(), lr=args.lr)
+
+ # Phase 1: autoencoder pretrain
+ if args.ae_epochs > 0:
+ run_autoencoder_phase(model, train_dl, device, opt_gs, args.ae_epochs, amp=args.amp, clip=args.clip if args.clip>0 else None)
+ save_ckpt(os.path.join(run_dir, "after_autoencoder.pt"), model, opt_gs, opt_d, step=0, args=args)
+
+ # Phase 2: supervisor pretrain
+ if args.sup_epochs > 0:
+ run_supervisor_phase(model, train_dl, device, opt_gs, args.sup_epochs, amp=args.amp, clip=args.clip if args.clip>0 else None)
+ save_ckpt(os.path.join(run_dir, "after_supervisor.pt"), model, opt_gs, opt_d, step=0, args=args)
+
+ # Phase 3: joint training
+ if args.joint_epochs > 0:
+ run_joint_phase(
+ model, train_dl, val_dl, device, opt_gs, opt_d,
+ z_dim=args.z_dim, epochs=args.joint_epochs, amp=args.amp,
+ clip=args.clip if args.clip>0 else None,
+ loss_weights=LossWeights(), ckpt_dir=run_dir, args=args
+ )
+
+
+ # Final test moment score
+ test_m = evaluate_moment(model, test_dl, device, args.z_dim)
+ print(f"[DONE] test moment loss: {test_m:.6f}")
+
From e2f1b74119a6ea182a802c17741c2aa1355fc895 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Sat, 4 Oct 2025 21:32:45 +1000
Subject: [PATCH 19/74] feat(predict): add TimeGAN sampling & visualisation
script (lines + heatmaps + stats)
Loads windows from NPZ or CSV via LOBSTERData, restores trained checkpoint, samples synthetic sequences,
prints per-feature mean/std and quick KL, and saves feature-line plots + depth heatmaps to --outdir.
---
.../TimeLOB_TimeGAN_49088276/src/predict.py | 259 +++++++++++++++++-
1 file changed, 254 insertions(+), 5 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
index 3bdc4077d..6e9654b53 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
"""
Sample synthetic sequences using a trained TimeGAN model and visualise results.
@@ -6,12 +7,260 @@
(e.g., feature lines and depth heatmaps) to compare real vs. synthetic data.
Typical Usage:
- python3 -m train --data_dir --seq_len 100 --batch_size 64 --epochs 20
+ # Using preprocessed windows
+    python predict.py --npz ./preproc_final/windows.npz \
+ --ckpt ./ckpts/timegan_run/best.pt --z-dim 24 --h-dim 64
+
+ # Preprocess on-the-fly (same flags as dataset.py)
+    python predict.py --data-dir /PATH/TO/SESSION --feature-set core \
+ --seq-len 128 --stride 32 --scaler robust --whiten pca --pca-var 0.999 \
+ --ckpt ./ckpts/timegan_run/best.pt --z-dim 24 --h-dim 64
Created By: Radhesh Goel (Keys-I)
ID: s49088276
-
-References:
--
"""
-# TODO: Implement checkpoint load, sampling, basic stats, and visualisations.
\ No newline at end of file
+from __future__ import annotations
+import os
+import argparse
+import numpy as np
+import matplotlib.pyplot as plt
+from typing import Tuple
+
+import torch
+
+# local modules
+from modules import TimeGAN, sample_noise
+from dataset import LOBSTERData
+
+
+# ---------------------------
+# Data loading helpers
+# ---------------------------
+
+def load_windows_npz(npz_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ d = np.load(npz_path)
+ return d["train"], d["val"], d["test"]
+
+def load_windows_csv(args) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ ds = LOBSTERData(
+ data_dir=args.data_dir,
+ message_file=args.message,
+ orderbook_file=args.orderbook,
+ feature_set=args.feature_set,
+ seq_len=args.seq_len,
+ stride=args.stride,
+ splits=tuple(args.splits),
+ scaler=args.scaler,
+ headerless_message=args.headerless_message,
+ headerless_orderbook=args.headerless_orderbook,
+ whiten=args.whiten, pca_var=args.pca_var,
+ aug_prob=0.0, # no aug for visualisation builds
+ save_dir=None,
+ )
+ return ds.load_arrays()
+
+
+# ---------------------------
+# Model restore + sampling
+# ---------------------------
+
+def build_model_from_ckpt(ckpt_path: str, x_dim: int, z_dim: int, h_dim: int, device: torch.device) -> TimeGAN:
+ ckpt = torch.load(ckpt_path, map_location=device)
+ args_in_ckpt = ckpt.get("args", {}) or {}
+ rnn_type = args_in_ckpt.get("rnn_type", "gru")
+ enc_layers = int(args_in_ckpt.get("enc_layers", 2))
+ dec_layers = int(args_in_ckpt.get("dec_layers", 2))
+ gen_layers = int(args_in_ckpt.get("gen_layers", 2))
+ sup_layers = int(args_in_ckpt.get("sup_layers", 1))
+ dis_layers = int(args_in_ckpt.get("dis_layers", 1))
+ dropout = float(args_in_ckpt.get("dropout", 0.1))
+
+ model = TimeGAN(
+ x_dim=x_dim, z_dim=z_dim, h_dim=h_dim,
+ rnn_type=rnn_type, enc_layers=enc_layers, dec_layers=dec_layers,
+ gen_layers=gen_layers, sup_layers=sup_layers, dis_layers=dis_layers,
+ dropout=dropout
+ ).to(device)
+
+ model.embedder.load_state_dict(ckpt["embedder"])
+ model.recovery.load_state_dict(ckpt["recovery"])
+ model.generator.load_state_dict(ckpt["generator"])
+ model.supervisor.load_state_dict(ckpt["supervisor"])
+ model.discriminator.load_state_dict(ckpt["discriminator"])
+ model.eval()
+ return model
+
+@torch.no_grad()
+def sample_synthetic(model: TimeGAN, n_seq: int, seq_len: int, z_dim: int, device: torch.device) -> np.ndarray:
+ z = sample_noise(n_seq, seq_len, z_dim, device)
+ e_tilde = model.generator(z)
+ h_tilde = model.supervisor(e_tilde)
+ x_tilde = model.recovery(h_tilde)
+ return x_tilde.detach().cpu().numpy()
+
+
+# ---------------------------
+# Stats + simple similarity
+# ---------------------------
+
+def summarize(name: str, W: np.ndarray) -> dict:
+ # mean/std over batch+time, per-feature
+ mu = W.mean(axis=(0, 1))
+ sd = W.std(axis=(0, 1))
+ return {"name": name, "mean": mu, "std": sd}
+
+def kl_hist_avg(real: np.ndarray, synth: np.ndarray, bins: int = 64, eps: float = 1e-9) -> float:
+ """
+ Quick histogram-based KL(real || synth) averaged over features.
+ """
+ from scipy.special import rel_entr
+ F = real.shape[2]
+ vals = []
+ R = real.reshape(-1, F)
+ S = synth.reshape(-1, F)
+ for f in range(F):
+ r = R[:, f]; s = S[:, f]
+ lo = np.nanpercentile(np.concatenate([r, s]), 0.5)
+ hi = np.nanpercentile(np.concatenate([r, s]), 99.5)
+ if not np.isfinite(lo) or not np.isfinite(hi) or hi <= lo:
+ continue
+ pr, _ = np.histogram(r, bins=bins, range=(lo, hi), density=True)
+ ps, _ = np.histogram(s, bins=bins, range=(lo, hi), density=True)
+ pr = pr + eps; ps = ps + eps
+ pr = pr / pr.sum(); ps = ps / ps.sum()
+ vals.append(np.sum(rel_entr(pr, ps)))
+ return float(np.mean(vals)) if vals else float("nan")
+
+
+# ---------------------------
+# Visualisations
+# ---------------------------
+
+def plot_feature_lines(real: np.ndarray, synth: np.ndarray, outdir: str, max_feats: int = 4, idx: int = 0):
+ """
+ Plot a few feature time-series (same sequence index) real vs synthetic.
+ """
+ os.makedirs(outdir, exist_ok=True)
+ T, F = real.shape[1], real.shape[2]
+ feats = min(F, max_feats)
+
+ fig, axes = plt.subplots(feats, 1, figsize=(10, 2.2 * feats), sharex=True)
+ if feats == 1:
+ axes = [axes]
+ for i in range(feats):
+ axes[i].plot(real[idx, :, i], label="real", linewidth=1.2)
+ axes[i].plot(synth[idx, :, i], label="synthetic", linewidth=1.2, linestyle="--")
+ axes[i].set_ylabel(f"feat {i}")
+ axes[-1].set_xlabel("time")
+ axes[0].legend(loc="upper right")
+ fig.suptitle("Feature lines: real vs synthetic")
+ fig.tight_layout()
+ fig.savefig(os.path.join(outdir, "feature_lines.png"), dpi=150)
+ plt.close(fig)
+
+def plot_heatmaps(real: np.ndarray, synth: np.ndarray, outdir: str, idx: int = 0):
+ """
+ Plot depth heatmaps (time x features) for a single sequence.
+ """
+ os.makedirs(outdir, exist_ok=True)
+ a = real[idx]; b = synth[idx]
+ # normalize each to [0,1] for visibility
+ def norm01(x):
+ lo, hi = np.percentile(x, 1), np.percentile(x, 99)
+ return np.clip((x - lo) / (hi - lo + 1e-9), 0, 1)
+
+ a = norm01(a); b = norm01(b)
+
+ fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+ im0 = axes[0].imshow(a, aspect="auto", origin="lower")
+ axes[0].set_title("Real (heatmap)")
+ axes[0].set_xlabel("feature"); axes[0].set_ylabel("time")
+ fig.colorbar(im0, ax=axes[0], fraction=0.046, pad=0.04)
+
+ im1 = axes[1].imshow(b, aspect="auto", origin="lower")
+ axes[1].set_title("Synthetic (heatmap)")
+ axes[1].set_xlabel("feature"); axes[1].set_ylabel("time")
+ fig.colorbar(im1, ax=axes[1], fraction=0.046, pad=0.04)
+
+ fig.tight_layout()
+ fig.savefig(os.path.join(outdir, "heatmaps.png"), dpi=150)
+ plt.close(fig)
+
+
+# ---------------------------
+# Main
+# ---------------------------
+
+if __name__ == "__main__":
+ ap = argparse.ArgumentParser(description="Sample & visualise TimeGAN outputs vs real.")
+ # data
+ ap.add_argument("--npz", type=str, help="Path to windows.npz (train/val/test). If set, ignores --data-dir.")
+ ap.add_argument("--data-dir", type=str, help="Folder with message_10.csv and orderbook_10.csv")
+ ap.add_argument("--message", default="message_10.csv")
+ ap.add_argument("--orderbook", default="orderbook_10.csv")
+ ap.add_argument("--feature-set", choices=["core","raw10"], default="core")
+ ap.add_argument("--seq-len", type=int, default=128)
+ ap.add_argument("--stride", type=int, default=32)
+ ap.add_argument("--splits", type=float, nargs=3, default=(0.7,0.15,0.15))
+ ap.add_argument("--scaler", choices=["standard","minmax","robust","quantile","power","none"], default="robust")
+ ap.add_argument("--whiten", choices=["pca","zca",None], default="pca")
+ ap.add_argument("--pca-var", type=float, default=0.999)
+ ap.add_argument("--headerless-message", action="store_true")
+ ap.add_argument("--headerless-orderbook", action="store_true")
+
+ # model restore
+ ap.add_argument("--ckpt", type=str, required=True)
+ ap.add_argument("--z-dim", type=int, required=True)
+ ap.add_argument("--h-dim", type=int, required=True)
+ ap.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
+
+ # viz
+ ap.add_argument("--n-synth", type=int, default=128, help="How many synthetic windows to sample.")
+ ap.add_argument("--seq-index", type=int, default=0, help="Which sequence index to plot.")
+ ap.add_argument("--max-feats", type=int, default=4, help="Max features to show in line plot.")
+ ap.add_argument("--outdir", type=str, default="./viz_out")
+
+ args = ap.parse_args()
+ os.makedirs(args.outdir, exist_ok=True)
+ device = torch.device(args.device)
+
+ # Load real windows
+ if args.npz:
+ Wtr, Wval, Wte = load_windows_npz(args.npz)
+ elif args.data_dir:
+ Wtr, Wval, Wte = load_windows_csv(args)
+ else:
+ raise SystemExit("Provide either --npz or --data-dir")
+
+ # Pick a real reference set (test split)
+ real = Wte
+ _, T, D = real.shape
+
+ # Build model & restore
+ model = build_model_from_ckpt(args.ckpt, x_dim=D, z_dim=args.z_dim, h_dim=args.h_dim, device=device)
+ model.eval()
+
+ # Sample synthetic
+ n_synth = min(args.n_synth, len(real))
+ synth = sample_synthetic(model, n_synth, T, args.z_dim, device)
+
+ # Basic stats
+ s_real = summarize("real(test)", real)
+ s_synth = summarize("synthetic", synth)
+ print("=== Summary (per-feature mean/std) ===")
+ print(f"{s_real['name']}: mean[0:5]={s_real['mean'][:5]}, std[0:5]={s_real['std'][:5]}")
+ print(f"{s_synth['name']}: mean[0:5]={s_synth['mean'][:5]}, std[0:5]={s_synth['std'][:5]}")
+
+ # Quick KL(hist) similarity
+ try:
+ kl = kl_hist_avg(real[:n_synth], synth)
+ print(f"KL(real || synth) ~ {kl:.4f} (lower is better)")
+ except Exception as e:
+ print(f"KL computation skipped: {e}")
+
+ # Visualisations
+ idx = max(0, min(args.seq_index, n_synth - 1))
+ plot_feature_lines(real, synth, args.outdir, max_feats=args.max_feats, idx=idx)
+ plot_heatmaps(real, synth, args.outdir, idx=idx)
+
+ print(f"Saved plots to: {args.outdir}")
From 8be97b60d920ea4d819249f78770cbb28c3f2bdf Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Sun, 5 Oct 2025 13:21:56 +1000
Subject: [PATCH 20/74] feat(dataset): simplify pipeline and switch to
continuous MinMax scaler
Streamlined dataset.py by folding helpers inline and removing the unused CLI and docs. Normalization now uses a continuous MinMax scaler fitted on the training split for stable ranges; I/O paths and outputs are simplified, with no extra flags.
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 912 ++++--------------
1 file changed, 166 insertions(+), 746 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 70a8fc771..099c4d53c 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -1,813 +1,233 @@
"""
-LOBSTER (Level-10) preprocessing for TimeGAN.
+Lightweight LOBSTER preprocessing with continuous Min-Max scaling.
-- Loads paired LOBSTER CSVs (message_10.csv, orderbook_10.csv), aligned by event index.
-- Builds either a compact engineered 5-feature set ("core") or raw level-10 depth ("raw10").
-- Chronological train/val/test split (prevents leakage), train-only scaling.
-- Sliding-window sequences shaped (num_seq, seq_len, num_features).
+This module removes the configuration bloat from the original pipeline and
+focuses on the essentials:
+ 1. Load the raw order book snapshot file (level-10).
+ 2. Build either the 5-feature "core" representation or the raw 40 columns.
+ 3. Split chronologically into train/val/test.
+ 4. Fit a streaming-friendly min-max scaler on the training split only.
+ 5. Produce sliding windows ready for TimeGAN.
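+
+A minimal usage sketch (the data path is illustrative):
+
+    ds = LOBSTERData(data_dir="data/AMZN_2012-06-21", feature_set="core", seq_len=128)
+    W_train, W_val, W_test = ds.load_arrays()   # each shaped (num_seq, seq_len, num_features)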
-Inputs (per trading session):
- message_10.csv, orderbook_10.csv
- - If headers are missing, pass --headerless-message / --headerless-orderbook (CLI),
- but auto-detection now assigns canonical headers when omitted.
-
-Outputs:
- train, val, test — NumPy arrays with shape [num_seq, seq_len, num_features]
-
-Feature sets:
- feature_set="core" (5 engineered features)
- 1) mid_price = 0.5 * (ask_price_1 + bid_price_1)
- 2) spread = ask_price_1 - bid_price_1
- 3) mid_log_return = log(mid_price_t) - log(mid_price_{t-1})
- 4) queue_imbalance_l1 = (bid_size_1 - ask_size_1) / (bid_size_1 + ask_size_1 + eps)
- 5) depth_imbalance_l10 = (Σ_i≤10 bid_size_i - Σ_i≤10 ask_size_i) /
- (Σ_i≤10 bid_size_i + Σ_i≤10 ask_size_i + eps)
-
- feature_set="raw10" (40 raw columns)
- ask_price_1..10, ask_size_1..10, bid_price_1..10, bid_size_1..10
-
-Notes:
-- Scaling is fit on TRAIN only (Standard/MinMax/None). Advanced scalers: Robust, Quantile, Power.
-- Optional whitening: PCA (variance threshold) or ZCA.
-- Optional train-only sequence augmentations (jitter, scaling, time-warp) for GANs.
-- Windows default to non-overlapping (stride=seq_len); set stride < seq_len for overlapping windows.
+
+class ContinuousMinMaxScaler:
+    """Per-feature min-max scaler fitted on the training split only."""
+
+    def __init__(self, feature_range: Tuple[float, float] = (0.0, 1.0), eps: float = 1e-8):
+        self.feature_range = feature_range
+        self.eps = eps
+        self.data_min_: Optional[np.ndarray] = None
+        self.data_max_: Optional[np.ndarray] = None
+
+    def fit(self, data: np.ndarray) -> "ContinuousMinMaxScaler":
+ arr = np.asarray(data, dtype=np.float64)
+ self.data_min_ = arr.min(axis=0)
+ self.data_max_ = arr.max(axis=0)
+ return self
+
+ def transform(self, data: np.ndarray) -> np.ndarray:
+ if self.data_min_ is None or self.data_max_ is None:
+ raise RuntimeError("Scaler not fitted.")
+ arr = np.asarray(data, dtype=np.float64)
+ denom = np.maximum(self.data_max_ - self.data_min_, self.eps)
+ scaled = (arr - self.data_min_) / denom
+ lo, hi = self.feature_range
+ return (scaled * (hi - lo) + lo).astype(arr.dtype, copy=False)
+
+ def fit_transform(self, data: np.ndarray) -> np.ndarray:
+ return self.fit(data).transform(data)
+
+ def inverse_transform(self, data: np.ndarray) -> np.ndarray:
+ if self.data_min_ is None or self.data_max_ is None:
+ raise RuntimeError("Scaler not fitted.")
+ lo, hi = self.feature_range
+ arr = np.asarray(data, dtype=np.float64)
+ base = (arr - lo) / (hi - lo + self.eps)
+ return base * (self.data_max_ - self.data_min_) + self.data_min_
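+
+# Round-trip sketch (toy values): fit on train-like data, then transform and invert.
+#     s = ContinuousMinMaxScaler()
+#     x = np.array([[1.0, 10.0], [3.0, 30.0]])
+#     y = s.fit_transform(x)                    # per-feature values in [0, 1]
+#     np.allclose(s.inverse_transform(y), x)    # ~True (up to the eps guard)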
- Feature sets:
- - "core": engineered 5-feature set (+ optional extras)
- - "raw10": 40 raw columns (ask/bid price/size × levels 1..10) (+ optional extras)
+
+class LOBSTERData:
+ """
+ Minimal LOBSTER loader (orderbook only) with continuous min-max scaling.
+
+ Parameters
+ ----------
+ data_dir : str
+ Folder containing orderbook_10.csv (and optionally message_10.csv).
+ feature_set : {"core", "raw10"}
+ Representation to build.
+ seq_len : int
+ Window length fed to TimeGAN.
+ stride : int, optional
+ Step between consecutive windows (defaults to seq_len for non-overlap).
+ splits : tuple
+ Train/val/test fractions; must sum to 1.0.
"""
def __init__(
self,
data_dir: str,
- message_file: str = "message_10.csv",
+ message_file: str = "message_10.csv", # kept for compatibility; unused
orderbook_file: str = "orderbook_10.csv",
feature_set: Literal["core", "raw10"] = "core",
- seq_len: int = 64,
+ seq_len: int = 128,
stride: Optional[int] = None,
splits: Tuple[float, float, float] = (0.7, 0.15, 0.15),
- scaler: Literal["standard", "minmax", "robust", "quantile", "power", "none"] = "standard",
feature_range: Tuple[float, float] = (0.0, 1.0),
- eps: float = 1e-8,
- headerless_message: bool = False,
- headerless_orderbook: bool = False,
- dropna: bool = True,
- output_dtype: Literal["float32", "float64"] = "float32",
- sort_by_time: bool = False,
- every: int = 1,
- clip_quantiles: Optional[Tuple[float, float]] = None,
-
- # --- extra feature engineering knobs ---
- add_rel_spread: bool = True,
- add_microprice: bool = True,
- add_imbalance_l5: bool = True,
- add_roll_stats: bool = True,
- roll_window: int = 64,
- add_diff1: bool = True,
- add_pct_change: bool = False,
-
- # --- whitening / dimensionality reduction ---
- whiten: Optional[Literal["pca", "zca"]] = None,
- pca_var: float = 0.99,
-
- # --- train-only augmentation for GANs ---
- aug_prob: float = 0.0,
- aug_jitter_std: float = 0.01,
- aug_scaling_std: float = 0.05,
- aug_timewarp_max: float = 0.1,
-
- # --- persistence ---
+ dtype: Literal["float32", "float64"] = "float32",
save_dir: Optional[str] = None,
):
self.data_dir = data_dir
- self.message_path = os.path.join(data_dir, message_file)
+ self.message_file = message_file # placeholder for potential alignment checks
self.orderbook_path = os.path.join(data_dir, orderbook_file)
self.feature_set = feature_set
self.seq_len = int(seq_len)
self.stride = int(stride) if stride is not None else self.seq_len
self.splits = splits
- self.scaler_kind = scaler
- self.feature_range = feature_range
- self.eps = eps
- self.headerless_message = headerless_message
- self.headerless_orderbook = headerless_orderbook
- self.dropna = dropna
- self.output_dtype = np.float32 if output_dtype == "float32" else np.float64
- self.sort_by_time = bool(sort_by_time)
- self.every = max(1, int(every))
- self.clip_quantiles = clip_quantiles
-
- # feature knobs
- self.add_rel_spread = add_rel_spread
- self.add_microprice = add_microprice
- self.add_imbalance_l5 = add_imbalance_l5
- self.add_roll_stats = add_roll_stats
- self.roll_window = int(roll_window)
- self.add_diff1 = add_diff1
- self.add_pct_change = add_pct_change
-
- # whitening/DR
- self.whiten = whiten
- self.pca_var = float(pca_var)
- self._pca = None # set later
- self._zca_cov = None # (mean, whitening_mat)
-
- # augmentation
- self.aug_prob = float(aug_prob)
- self.aug_jitter_std = float(aug_jitter_std)
- self.aug_scaling_std = float(aug_scaling_std)
- self.aug_timewarp_max = float(aug_timewarp_max)
-
- # save
+ self.scaler = ContinuousMinMaxScaler(feature_range=feature_range)
+ self._dtype_name = dtype
+ self.dtype = np.float32 if dtype == "float32" else np.float64
self.save_dir = save_dir
-
- self._validate_splits()
- if not (self.seq_len > 0 and self.stride > 0):
- raise ValueError("seq_len and stride must be positive")
-
- self._scaler = None
- self._feature_names: List[str] = []
- self._row_counts: Dict[str, int] = {}
- self._clip_bounds: Optional[Tuple[np.ndarray, np.ndarray]] = None # (lo, hi)
+ self.eps = 1e-8
+ self._validate_inputs()
# ------------------- public API -------------------
def load_arrays(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
- msg_df, ob_df = self._load_csvs()
-
- if self.sort_by_time and "time" in msg_df.columns:
- order = msg_df["time"].reset_index(drop=True).sort_values().index
- msg_df = msg_df.iloc[order].reset_index(drop=True)
- ob_df = ob_df.iloc[order].reset_index(drop=True)
-
- self._check_alignment(msg_df, ob_df)
-
- # enforce numeric types early (prevents string pollution)
- for col in ("time", "order_id", "size", "price"):
- if col in msg_df.columns:
- msg_df[col] = pd.to_numeric(msg_df[col], errors="coerce")
- ob_df[ob_df.columns] = ob_df[ob_df.columns].apply(pd.to_numeric, errors="coerce")
-
- feats = self._build_features(ob_df)
-
- if self.every > 1:
- feats = feats[::self.every]
- self._row_counts["decimated_every"] = self.every
-
- if self.dropna:
- feats = feats[~np.isnan(feats).any(axis=1)]
- feats = feats[np.isfinite(feats).all(axis=1)]
- self._row_counts["post_clean"] = int(feats.shape[0])
+ orderbook = self._load_orderbook()
+ features = self._build_features(orderbook)
+ features = features[~np.isnan(features).any(axis=1)]
+ train, val, test = self._split(features)
- train, val, test = self._split_chronologically(feats)
- self._row_counts.update(train=len(train), val=len(val), test=len(test))
+ self.scaler.fit(train)
+ train = self.scaler.transform(train)
+ val = self.scaler.transform(val)
+ test = self.scaler.transform(test)
- if self.clip_quantiles is not None:
- qmin, qmax = self.clip_quantiles
- if not (0.0 <= qmin < qmax <= 1.0):
- raise ValueError("clip_quantiles must satisfy 0 <= qmin < qmax <= 1")
- lo = np.quantile(train, qmin, axis=0)
- hi = np.quantile(train, qmax, axis=0)
- self._clip_bounds = (lo, hi)
- train = np.clip(train, lo, hi)
- val = np.clip(val, lo, hi)
- test = np.clip(test, lo, hi)
+ W_train = self._windowize(train)
+ W_val = self._windowize(val)
+ W_test = self._windowize(test)
- train_s, val_s, test_s = self._scale_train_only(train, val, test)
- W_train = self._windowize(train_s)
- W_val = self._windowize(val_s)
- W_test = self._windowize(test_s)
-
- # train-only augmentations for GANs
- W_train = self._augment_windows(W_train)
-
- W_train = W_train.astype(self.output_dtype, copy=False)
- W_val = W_val.astype(self.output_dtype, copy=False)
- W_test = W_test.astype(self.output_dtype, copy=False)
-
- # optional persistence
if self.save_dir:
os.makedirs(self.save_dir, exist_ok=True)
np.savez_compressed(
os.path.join(self.save_dir, "windows.npz"),
train=W_train, val=W_val, test=W_test
)
- meta = self.get_meta()
- meta["whiten"] = self.whiten
- meta["pca_var"] = self.pca_var
- meta["aug"] = {
- "prob": self.aug_prob, "jitter_std": self.aug_jitter_std,
- "scaling_std": self.aug_scaling_std, "timewarp_max": self.aug_timewarp_max
- }
with open(os.path.join(self.save_dir, "meta.json"), "w", encoding="utf-8") as f:
- json.dump(meta, f, indent=2)
-
- if joblib is not None and self._scaler is not None:
- joblib.dump(self._scaler, os.path.join(self.save_dir, "scaler.pkl"))
- if joblib is not None and self._pca is not None:
- joblib.dump(self._pca, os.path.join(self.save_dir, "pca.pkl"))
- if joblib is not None and self._zca_cov is not None:
- joblib.dump(self._zca_cov, os.path.join(self.save_dir, "zca.pkl"))
+ json.dump(self.get_meta(), f, indent=2)
return W_train, W_val, W_test
- def get_feature_names(self) -> List[str]:
- return list(self._feature_names)
-
- def get_scaler(self):
- return self._scaler
-
- def inverse_transform(self, arr: np.ndarray) -> np.ndarray:
- if self._scaler is None:
- raise RuntimeError("Scaler not fitted; call load_arrays() first or use scaler='none'.")
- orig_shape = arr.shape
- flat = arr.reshape(-1, arr.shape[-1])
- inv = self._scaler.inverse_transform(flat)
- return inv.reshape(orig_shape)
-
- def get_meta(self) -> Dict[str, object]:
+ def get_meta(self) -> dict:
return {
"feature_set": self.feature_set,
- "feature_names": self.get_feature_names(),
"seq_len": self.seq_len,
"stride": self.stride,
"splits": self.splits,
- "scaler": (type(self._scaler).__name__ if self._scaler is not None else "None"),
- "row_counts": self._row_counts,
- "clip_bounds": None if self._clip_bounds is None else {
- "lo": self._clip_bounds[0].tolist(),
- "hi": self._clip_bounds[1].tolist(),
- },
- "every": self.every,
- "sorted_by_time": self.sort_by_time,
- "whiten": self.whiten,
- "pca_var": self.pca_var,
+ "feature_range": self.scaler.feature_range,
+ "dtype": self._dtype_name,
}
- # ------------------- internals --------------------
-
- def _validate_splits(self) -> None:
- s = sum(self.splits)
- if not (abs(s - 1.0) < 1e-12):
- raise ValueError(f"splits must sum to 1.0, got {self.splits} (sum={s})")
- if any(x < 0 for x in self.splits):
- raise ValueError("splits cannot be negative")
-
- # ---- header detection helpers ----
- def _looks_headerless(self, path: str, expected_cols: int, min_numeric: int) -> bool:
- """
- Peek the first row with header=None. If the row is mostly numeric and the
- column count matches what we expect, assume there's NO header.
- """
- try:
- df0 = pd.read_csv(path, header=None, nrows=1)
- except Exception:
- return False
- if df0.shape[1] != expected_cols:
- return False
- num_ok = pd.to_numeric(df0.iloc[0], errors="coerce").notna().sum()
- return num_ok >= min_numeric
-
- def _read_with_possible_headerless(self, path: str, default_names: list[str],
- force_headerless: bool,
- normalize_fn=None) -> pd.DataFrame:
- """
- Read CSV, auto-detect headerlessness if not forced.
- - If forced: header=None, names=default_names
- - Else: if first row looks numeric & count matches, treat as headerless.
- otherwise try header=0 and optionally normalize columns.
- """
- expected_cols = len(default_names)
- if force_headerless:
- return pd.read_csv(path, header=None, names=default_names)
-
- # Auto-detect headerless
- if self._looks_headerless(path, expected_cols=expected_cols,
- min_numeric=max(4, int(0.6 * expected_cols))): # threshold 60%
- return pd.read_csv(path, header=None, names=default_names)
-
- # Try with header row, then normalize if asked
- df = pd.read_csv(path)
- if normalize_fn is not None:
- df = normalize_fn(df, default_names)
-
- # If counts match but names/order differ, force canonical order & names
- if df.shape[1] == expected_cols and list(df.columns) != default_names:
- df = df.iloc[:, :expected_cols] # ensure width
- df.columns = [str(c) for c in df.columns]
- # If normalize_fn was provided, it likely already tried to normalize.
- df.columns = default_names
- return df
-
- def _load_csvs(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
- if not os.path.isfile(self.orderbook_path):
- raise FileNotFoundError(f"Missing {self.orderbook_path}")
- if not os.path.isfile(self.message_path):
- raise FileNotFoundError(f"Missing {self.message_path}")
-
- # Message (6 columns)
- msg_cols = ["time", "type", "order_id", "size", "price", "direction"]
- msg_df = self._read_with_possible_headerless(
- self.message_path,
- default_names=msg_cols,
- force_headerless=self.headerless_message,
- normalize_fn=lambda df, _: (
- df.assign(**{}).rename(columns=lambda c: str(c).strip().lower().replace(" ", "_"))
- )
- )
- # Enforce exact column order when shape matches but order differs
- if msg_df.shape[1] == 6 and list(msg_df.columns) != msg_cols:
- # Try reorder if all present; else force names in canonical order
- present = set(msg_df.columns)
- if set(msg_cols).issubset(present):
- msg_df = msg_df[msg_cols]
- msg_df.columns = msg_cols
-
- # Orderbook (40 columns)
- ob_cols = (
- [f"ask_price_{i}" for i in range(1, 11)] +
- [f"ask_size_{i}" for i in range(1, 11)] +
- [f"bid_price_{i}" for i in range(1, 11)] +
- [f"bid_size_{i}" for i in range(1, 11)]
- )
- ob_df = self._read_with_possible_headerless(
- self.orderbook_path,
- default_names=ob_cols,
- force_headerless=self.headerless_orderbook,
- normalize_fn=lambda df, target: self._normalize_orderbook_headers(df, target)
- )
- # Enforce exact column order when counts match but order differs
- if ob_df.shape[1] == len(ob_cols) and list(ob_df.columns) != ob_cols:
- if set(ob_cols).issubset(set(ob_df.columns)):
- ob_df = ob_df[ob_cols]
- ob_df.columns = ob_cols
-
- return msg_df, ob_df
-
- def _normalize_orderbook_headers(self, df: pd.DataFrame, target_cols: List[str]) -> pd.DataFrame:
- new_cols = []
- for c in df.columns:
- s = str(c)
- s = s.replace(" ", "").replace("-", "").replace(".", "")
- s = s.replace("AskPrice", "ask_price_").replace("AskSize", "ask_size_") \
- .replace("BidPrice", "bid_price_").replace("BidSize", "bid_size_")
- s = s.lower()
- s = s.replace("ask_price", "ask_price_").replace("ask_size", "ask_size_") \
- .replace("bid_price", "bid_price_").replace("bid_size", "bid_size_")
- s = s.replace("__", "_")
- new_cols.append(s)
- df.columns = new_cols
- if set(df.columns) != set(target_cols) and len(df.columns) == len(target_cols):
- df.columns = target_cols
+ # ------------------- helpers ---------------------
+
+ def _validate_inputs(self) -> None:
+ if not os.path.exists(self.orderbook_path):
+ raise FileNotFoundError(self.orderbook_path)
+ if self.seq_len <= 0 or self.stride <= 0:
+ raise ValueError("seq_len and stride must be positive.")
+ total = sum(self.splits)
+ if not np.isclose(total, 1.0):
+ raise ValueError(f"splits must sum to 1.0, got {self.splits} (sum={total}).")
+ if any(x <= 0 for x in self.splits):
+ raise ValueError("splits must be positive.")
+ lo, hi = self.scaler.feature_range
+ if hi <= lo:
+ raise ValueError("feature_range must satisfy min < max.")
+
+ def _load_orderbook(self) -> pd.DataFrame:
+ df = pd.read_csv(self.orderbook_path, header=None)
+ if df.shape[1] < len(ORDERBOOK_COLUMNS):
+ raise ValueError(f"Expected >= {len(ORDERBOOK_COLUMNS)} columns, found {df.shape[1]}.")
+ df = df.iloc[:, :len(ORDERBOOK_COLUMNS)]
+ numeric_ratio = pd.to_numeric(df.iloc[0], errors="coerce").notna().mean()
+ if numeric_ratio < 0.5:
+ df = df.iloc[1:].reset_index(drop=True)
+ df.columns = ORDERBOOK_COLUMNS
+ df = df.apply(pd.to_numeric, errors="coerce")
return df
- def _check_alignment(self, msg_df: pd.DataFrame, ob_df: pd.DataFrame) -> None:
- if len(msg_df) != len(ob_df):
- raise ValueError(f"Message/Orderbook row count mismatch: {len(msg_df)} vs {len(ob_df)}")
-
- # ------ extra engineering helpers ------
- def _engineer_extra(self, ob_df: pd.DataFrame, base: np.ndarray) -> np.ndarray:
- """Append engineered features onto base matrix (N x d)."""
- feats = [base]
-
- ap1 = ob_df["ask_price_1"].to_numpy(np.float64)
- bp1 = ob_df["bid_price_1"].to_numpy(np.float64)
- as1 = ob_df["ask_size_1"].to_numpy(np.float64)
- bs1 = ob_df["bid_size_1"].to_numpy(np.float64)
-
- mid_price = 0.5 * (ap1 + bp1)
- spread = ap1 - bp1
-
- if self.add_rel_spread:
- rel_spread = spread / (mid_price + self.eps)
- feats.append(rel_spread[:, None])
-
- if self.add_microprice:
- # microprice using L1 sizes
- w_bid = bs1 / (bs1 + as1 + self.eps)
- w_ask = 1.0 - w_bid
- micro = w_ask * ap1 + w_bid * bp1
- feats.append(micro[:, None])
-
- if self.add_imbalance_l5:
- bid5 = np.sum([ob_df[f"bid_size_{i}"].to_numpy(np.float64) for i in range(1, 6)], axis=0)
- ask5 = np.sum([ob_df[f"ask_size_{i}"].to_numpy(np.float64) for i in range(1, 6)], axis=0)
- im5 = (bid5 - ask5) / (bid5 + ask5 + self.eps)
- feats.append(im5[:, None])
-
- if self.add_diff1:
- diff = np.vstack([np.zeros((1, base.shape[1])), np.diff(base, axis=0)])
- feats.append(diff)
-
- if self.add_pct_change:
- pct = np.zeros_like(base)
- pct[1:] = (base[1:] - base[:-1]) / (np.abs(base[:-1]) + self.eps)
- feats.append(pct)
-
- if self.add_roll_stats:
- W = max(2, int(self.roll_window))
- roll_mean = pd.Series(mid_price).rolling(W, min_periods=1).mean().to_numpy()
- roll_std = pd.Series(mid_price).rolling(W, min_periods=1).std(ddof=0).fillna(0.0).to_numpy()
- vol = pd.Series(np.diff(np.log(np.clip(mid_price, 1e-12, None)), prepend=0.0) ** 2).rolling(W, min_periods=1).mean().to_numpy()
- feats += [roll_mean[:, None], roll_std[:, None], vol[:, None]]
-
- return np.concatenate(feats, axis=1)
-
def _build_features(self, ob_df: pd.DataFrame) -> np.ndarray:
- for prefix in ("ask_price_", "ask_size_", "bid_price_", "bid_size_"):
- for L in range(1, 11):
- col = f"{prefix}{L}"
- if col not in ob_df.columns:
- raise ValueError(f"Expected column missing: {col}")
-
+ data = ob_df.to_numpy(dtype=np.float64)
if self.feature_set == "raw10":
- cols = (
- [f"ask_price_{i}" for i in range(1, 11)]
- + [f"ask_size_{i}" for i in range(1, 11)]
- + [f"bid_price_{i}" for i in range(1, 11)]
- + [f"bid_size_{i}" for i in range(1, 11)]
- )
- X = ob_df[cols].to_numpy(dtype=np.float64)
- self._feature_names = cols
- X = self._engineer_extra(ob_df, X)
- extras = []
- if self.add_rel_spread: extras.append("rel_spread")
- if self.add_microprice: extras.append("microprice")
- if self.add_imbalance_l5: extras.append("depth_imbalance_l5")
- if self.add_diff1: extras += [f"diff1_{n}" for n in self._feature_names]
- if self.add_pct_change: extras += [f"pct_{n}" for n in self._feature_names]
- if self.add_roll_stats: extras += ["roll_mid_mean","roll_mid_std","roll_vol"]
- self._feature_names = self._feature_names + extras
- return X
-
- if self.feature_set == "core":
- ap1 = ob_df["ask_price_1"].to_numpy(dtype=np.float64)
- bp1 = ob_df["bid_price_1"].to_numpy(dtype=np.float64)
- as1 = ob_df["ask_size_1"].to_numpy(dtype=np.float64)
- bs1 = ob_df["bid_size_1"].to_numpy(dtype=np.float64)
-
- mid_price = 0.5 * (ap1 + bp1)
- spread = ap1 - bp1
- mid_log = np.log(np.clip(mid_price, 1e-12, None))
- mid_log_return = np.concatenate([[0.0], np.diff(mid_log)])
- qi_l1 = (bs1 - as1) / (bs1 + as1 + self.eps)
- bid_depth = sum(ob_df[f"bid_size_{i}"].to_numpy(dtype=np.float64) for i in range(1, 11))
- ask_depth = sum(ob_df[f"ask_size_{i}"].to_numpy(dtype=np.float64) for i in range(1, 11))
- di_l10 = (bid_depth - ask_depth) / (bid_depth + ask_depth + self.eps)
-
- X_base = np.vstack([mid_price, spread, mid_log_return, qi_l1, di_l10]).T
- base_names = [
- "mid_price",
- "spread",
- "mid_log_return",
- "queue_imbalance_l1",
- "depth_imbalance_l10",
- ]
- X = self._engineer_extra(ob_df, X_base)
-
- extra_names = []
- if self.add_rel_spread: extra_names.append("rel_spread")
- if self.add_microprice: extra_names.append("microprice")
- if self.add_imbalance_l5: extra_names.append("depth_imbalance_l5")
- if self.add_diff1: extra_names += [f"diff1_{n}" for n in base_names]
- if self.add_pct_change: extra_names += [f"pct_{n}" for n in base_names]
- if self.add_roll_stats: extra_names += ["roll_mid_mean","roll_mid_std","roll_vol"]
-
- self._feature_names = base_names + extra_names
- return X
+ return data
+ ask_prices = data[:, :10]
+ ask_sizes = data[:, 10:20]
+ bid_prices = data[:, 20:30]
+ bid_sizes = data[:, 30:40]
+
+ mid_price = 0.5 * (ask_prices[:, 0] + bid_prices[:, 0])
+ spread = ask_prices[:, 0] - bid_prices[:, 0]
+ log_mid = np.log(np.clip(mid_price, self.eps, None))
+ mid_log_return = np.concatenate([[0.0], np.diff(log_mid)])
+ queue_imbalance = (
+ (bid_sizes[:, 0] - ask_sizes[:, 0]) /
+ (bid_sizes[:, 0] + ask_sizes[:, 0] + self.eps)
+ )
+ depth_imbalance = (
+ (bid_sizes.sum(axis=1) - ask_sizes.sum(axis=1)) /
+ (bid_sizes.sum(axis=1) + ask_sizes.sum(axis=1) + self.eps)
+ )
- raise ValueError("feature_set must be 'core' or 'raw10'")
+ feats = np.stack(
+ [mid_price, spread, mid_log_return, queue_imbalance, depth_imbalance],
+ axis=1,
+ )
+ return feats
- def _split_chronologically(self, X: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
- n = len(X)
- if n < self.seq_len:
+ def _split(self, feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+ n = len(feats)
+ n_train = int(n * self.splits[0])
+ n_val = int(n * self.splits[1])
+ n_test = n - n_train - n_val
+ if n_train < self.seq_len or n_val < self.seq_len or n_test < self.seq_len:
raise ValueError(
- f"Not enough rows ({n}) for seq_len={self.seq_len}. Reduce seq_len or use a longer session."
+ "Not enough rows for the requested seq_len/splits combination. "
+ f"Have {n} rows with splits {self.splits}."
)
- n_train = int(n * self.splits[0])
- n_val = int(n * self.splits[1])
- n_test = n - n_train - n_val
- if n_train < self.seq_len:
- raise ValueError(f"Train split too small ({n_train} rows) for seq_len={self.seq_len}")
- train = X[:n_train]
- val = X[n_train : n_train + n_val]
- test = X[n_train + n_val :]
+ train = feats[:n_train]
+ val = feats[n_train:n_train + n_val]
+ test = feats[n_train + n_val:]
return train, val, test
- def _scale_train_only(
- self, train: np.ndarray, val: np.ndarray, test: np.ndarray
- ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
- kind = self.scaler_kind
- if kind == "none":
- scaler = None
- Xt, Xv, Xs = train, val, test
- else:
- if kind == "standard":
- scaler = StandardScaler()
- elif kind == "minmax":
- scaler = MinMaxScaler(feature_range=self.feature_range)
- elif kind == "robust":
- scaler = RobustScaler()
- elif kind == "quantile":
- scaler = QuantileTransformer(output_distribution="normal", subsample=100000, random_state=42)
- elif kind == "power":
- scaler = PowerTransformer(method="yeo-johnson", standardize=True)
- else:
- raise ValueError("scaler must be 'standard','minmax','robust','quantile','power', or 'none'")
- scaler.fit(train)
- Xt, Xv, Xs = scaler.transform(train), scaler.transform(val), scaler.transform(test)
-
- self._scaler = scaler
-
- # optional whitening
- if self.whiten is None:
- return Xt, Xv, Xs
-
- if self.whiten == "pca":
- p = PCA(n_components=self.pca_var, svd_solver="full", whiten=True, random_state=42)
- p.fit(Xt)
- self._pca = p
- return p.transform(Xt), p.transform(Xv), p.transform(Xs)
-
- if self.whiten == "zca":
- mu = Xt.mean(axis=0, keepdims=True)
- Xc = Xt - mu
- cov = (Xc.T @ Xc) / max(1, Xc.shape[0]-1)
- U, S, _ = np.linalg.svd(cov + 1e-6*np.eye(cov.shape[0]), full_matrices=False)
- S_inv_sqrt = np.diag(1.0 / np.sqrt(S + 1e-6))
- W = U @ S_inv_sqrt @ U.T
- self._zca_cov = (mu, W)
-
- def apply_zca(A: np.ndarray) -> np.ndarray:
- return (A - mu) @ W
-
- return apply_zca(Xt), apply_zca(Xv), apply_zca(Xs)
-
- raise ValueError("whiten must be None, 'pca', or 'zca'")
-
- def _windowize(self, X: np.ndarray) -> np.ndarray:
- n, d = X.shape
- if n < self.seq_len:
- return np.empty((0, self.seq_len, d), dtype=np.float64)
- starts = np.arange(0, n - self.seq_len + 1, self.stride, dtype=int)
- if starts.size == 0:
- return np.empty((0, self.seq_len, d), dtype=np.float64)
- W = np.empty((len(starts), self.seq_len, d), dtype=np.float64)
- for i, s in enumerate(starts):
- W[i] = X[s : s + self.seq_len]
- return W
-
- # ------ augmentations (sequence-level, applied after windowing to TRAIN only) ------
- def _augment_windows(self, W: np.ndarray) -> np.ndarray:
- if self.aug_prob <= 0.0:
- return W
- out = W.copy()
- rng = np.random.default_rng(42)
- for i in range(out.shape[0]):
- if rng.random() < self.aug_prob:
- seq = out[i]
- # jitter (add Gaussian noise)
- seq = seq + rng.normal(0.0, self.aug_jitter_std, size=seq.shape)
- # scaling (per-feature)
- scale = rng.normal(1.0, self.aug_scaling_std, size=(1, seq.shape[-1]))
- seq = seq * scale
- # simple time warp (resample along time axis by a small factor)
- max_alpha = self.aug_timewarp_max
- alpha = float(np.clip(rng.normal(1.0, max_alpha/3), 1.0-max_alpha, 1.0+max_alpha))
- T, D = seq.shape
- new_idx = np.linspace(0, T-1, num=T) ** alpha
- new_idx = (new_idx / new_idx.max()) * (T-1)
- left = np.floor(new_idx).astype(int)
- right = np.clip(left+1, 0, T-1)
- w = (new_idx - left)[:, None]
- seq = (1-w) * seq[left, :] + w * seq[right, :]
- out[i] = seq
- return out
-
-
-if __name__ == "__main__":
- # Demo / summary with styled box panels by default
- import argparse
-
- from helpers.textui import (
- C, supports_color, set_table_style,
- render_kv_panel, render_card, table, DEFAULT_STYLE
- )
-
- parser = argparse.ArgumentParser(description="Run dataset preprocessing demo or print a quick summary.")
- parser.add_argument("--data-dir", required=True)
- parser.add_argument("--message", default="message_10.csv")
- parser.add_argument("--orderbook", default="orderbook_10.csv")
- parser.add_argument("--feature-set", choices=["core", "raw10"], default="core")
- parser.add_argument("--seq-len", type=int, default=64)
- parser.add_argument("--stride", type=int, default=64)
- parser.add_argument("--scaler", choices=["standard", "minmax", "robust", "quantile", "power", "none"], default="standard")
- parser.add_argument("--splits", type=float, nargs=3, metavar=("TRAIN", "VAL", "TEST"), default=(0.7, 0.15, 0.15))
- parser.add_argument("--headerless-message", action="store_true")
- parser.add_argument("--headerless-orderbook", action="store_true")
-
- # style & summary controls
- parser.add_argument("--summary", action="store_true", help="Print a concise dataset summary (heads/dtypes/stats).")
- parser.add_argument("--peek", type=int, default=5, help="Rows to show for head/tail in --summary mode.")
- parser.add_argument("--style", choices=["box", "chat"], default=DEFAULT_STYLE, help="Output card style (default: box).")
- parser.add_argument("--table-style", choices=["github", "grid", "simple"], default="github", help="Tabulate table style.")
- parser.add_argument("--no-color", action="store_true", help="Disable ANSI colors.")
-
- # extra feature engineering
- parser.add_argument("--no-rel-spread", dest="add_rel_spread", action="store_false")
- parser.add_argument("--no-microprice", dest="add_microprice", action="store_false")
- parser.add_argument("--no-imbalance-l5", dest="add_imbalance_l5", action="store_false")
- parser.add_argument("--no-roll-stats", dest="add_roll_stats", action="store_false")
- parser.add_argument("--roll-window", type=int, default=64)
- parser.add_argument("--no-diff1", dest="add_diff1", action="store_false")
- parser.add_argument("--pct-change", action="store_true")
-
- # whitening / DR
- parser.add_argument("--whiten", choices=["pca", "zca"], default=None)
- parser.add_argument("--pca-var", type=float, default=0.99)
-
- # augmentation
- parser.add_argument("--aug-prob", type=float, default=0.0)
- parser.add_argument("--aug-jitter-std", type=float, default=0.01)
- parser.add_argument("--aug-scaling-std", type=float, default=0.05)
- parser.add_argument("--aug-timewarp-max", type=float, default=0.1)
-
- # persistence
- parser.add_argument("--save-dir", type=str, default=None)
-
- args = parser.parse_args()
-
- set_table_style(args.table_style)
- c = C(enabled=supports_color(args.no_color))
-
- ds = LOBSTERData(
- data_dir=args.data_dir,
- message_file=args.message,
- orderbook_file=args.orderbook,
- feature_set=args.feature_set,
- seq_len=args.seq_len,
- stride=args.stride,
- splits=tuple(args.splits),
- scaler=args.scaler,
- headerless_message=args.headerless_message,
- headerless_orderbook=args.headerless_orderbook,
-
- add_rel_spread=getattr(args, "add_rel_spread", True),
- add_microprice=getattr(args, "add_microprice", True),
- add_imbalance_l5=getattr(args, "add_imbalance_l5", True),
- add_roll_stats=getattr(args, "add_roll_stats", True),
- roll_window=args.roll_window,
- add_diff1=getattr(args, "add_diff1", True),
- add_pct_change=args.pct_change,
-
- whiten=args.whiten,
- pca_var=args.pca_var,
-
- aug_prob=args.aug_prob,
- aug_jitter_std=args.aug_jitter_std,
- aug_scaling_std=args.aug_scaling_std,
- aug_timewarp_max=args.aug_timewarp_max,
-
- save_dir=args.save_dir,
- )
-
- # Always show a small preprocessing report card (even without --summary)
- base_rows = [
- ("data_dir", args.data_dir),
- ("message", args.message),
- ("orderbook", args.orderbook),
- ("feature_set", args.feature_set),
- ("seq_len", str(args.seq_len)),
- ("stride", str(args.stride)),
- ("scaler", args.scaler),
- ("whiten", str(args.whiten)),
- ("aug_prob", str(args.aug_prob)),
- ("save_dir", str(args.save_dir)),
- ]
- print(render_kv_panel("Preprocessing config", base_rows, c, style=args.style, align="right"))
-
- if args.summary:
- # ---------- helpers that render subpanels with textui and nest them ----------
- from helpers.textui import table as tx_table # alias for clarity
-
- def _rows_from_df(df: pd.DataFrame, limit_rows: int, limit_cols: int) -> tuple[list[str], list[list[str]]]:
- cols_all = list(map(str, df.columns))
- cols = cols_all[:limit_cols]
- rows_df = df.iloc[:limit_rows, :limit_cols].astype(object).astype(str)
- headers = cols + (["…"] if len(cols_all) > limit_cols else [])
- rows = rows_df.values.tolist()
- if len(cols_all) > limit_cols:
- rows = [r + ["…"] for r in rows]
- return headers, rows
-
- def _subpanel_lines(title: str, body_lines: list[str]) -> list[str]:
- return render_card(title, body_lines, c, style=args.style, align="left").splitlines()
-
- def _panel_df(title: str, df: pd.DataFrame, peek: int) -> list[str]:
- headers, rows = _rows_from_df(df, limit_rows=peek, limit_cols=12)
- return _subpanel_lines(title, tx_table(rows, headers, c))
-
- def _panel_dtypes(df: pd.DataFrame) -> list[str]:
- headers = ["column", "dtype"]
- dtypes_rows = [[str(k), str(v)] for k, v in df.dtypes.items()]
- note = f"total: {len(df.columns)} columns" + (" (showing first 24)" if len(dtypes_rows) > 24 else "")
- dtypes_rows = dtypes_rows[:24]
- body = [note] + tx_table(dtypes_rows, headers, c)
- return _subpanel_lines("dtypes", body)
-
- def _panel_describe(df: pd.DataFrame) -> list[str]:
- num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
- if not num_cols:
- return _subpanel_lines("describe (numeric subset)", ["no numeric columns"])
- sample = num_cols[: min(8, len(num_cols))]
- desc = df[sample].describe().round(6).reset_index(names="stat")
- headers = list(map(str, desc.columns))
- rows = desc.astype(object).astype(str).values.tolist()
- return _subpanel_lines("describe (numeric subset)", tx_table(rows, headers, c))
-
- def _big_panel(title: str, subpanels: list[list[str]]) -> str:
- body_lines: list[str] = []
- for i, block in enumerate(subpanels):
- if i > 0:
- body_lines.append("") # spacer line
- body_lines.extend(block)
- return render_card(title, body_lines, c, style=args.style, align="left")
-
- # ---------- load CSVs ----------
- msg_df, ob_df = ds._load_csvs()
-
- # high-level config card (already styled)
- print(render_kv_panel("CSV summary config", [
- ("message file", args.message),
- ("orderbook file", args.orderbook),
- ("rows (message, orderbook)", f"{len(msg_df)}, {len(ob_df)}"),
- ("columns (message, orderbook)", f"{msg_df.shape[1]}, {ob_df.shape[1]}"),
- ], c, style=args.style, align="right"))
-
- # ---------- message big panel ----------
- msg_subs = []
- msg_subs.append(_subpanel_lines("shape", [f"{msg_df.shape[0]} rows × {msg_df.shape[1]} cols"]))
- msg_subs.append(_panel_dtypes(msg_df))
- msg_subs.append(_panel_describe(msg_df))
- msg_subs.append(_panel_df("head", msg_df.head(args.peek), args.peek))
- msg_subs.append(_panel_df("tail", msg_df.tail(args.peek), args.peek))
- print(_big_panel("message_10.csv", msg_subs))
-
- # ---------- orderbook big panel ----------
- ob_subs = []
- ob_subs.append(_subpanel_lines("shape", [f"{ob_df.shape[0]} rows × {ob_df.shape[1]} cols"]))
- ob_subs.append(_panel_dtypes(ob_df))
- ob_subs.append(_panel_describe(ob_df))
- ob_subs.append(_panel_df("head", ob_df.head(args.peek), args.peek))
- ob_subs.append(_panel_df("tail", ob_df.tail(args.peek), args.peek))
- print(_big_panel("orderbook_10.csv", ob_subs))
-
- # ---------- windowed output card (after preprocessing) ----------
- W_train, W_val, W_test = ds.load_arrays()
- rows = [
- ("train windows", "×".join(map(str, W_train.shape))),
- ("val windows", "×".join(map(str, W_val.shape))),
- ("test windows", "×".join(map(str, W_test.shape))),
- ("#features", str(len(ds.get_feature_names()))),
- ]
- print(render_kv_panel("Windows & features", rows, c, style=args.style, align="right"))
- print(render_card(
- "Feature names (first 12)",
- [", ".join(ds.get_feature_names()[:12]) + (" …" if len(ds.get_feature_names())>12 else "")],
- c, style=args.style, align="left"
- ))
-
- else:
- W_train, W_val, W_test = ds.load_arrays()
- rows = [
- ("train", "×".join(map(str, W_train.shape))),
- ("val", "×".join(map(str, W_val.shape))),
- ("test", "×".join(map(str, W_test.shape))),
- ("features", ", ".join(ds.get_feature_names()[:12]) + (" …" if len(ds.get_feature_names())>12 else "")),
- ]
- print(render_kv_panel("Output shapes", rows, c, style=args.style, align="right"))
+ def _windowize(self, arr: np.ndarray) -> np.ndarray:
+ windows = []
+ limit = len(arr) - self.seq_len + 1
+ for start in range(0, limit, self.stride):
+ window = arr[start:start + self.seq_len]
+ if window.shape[0] == self.seq_len:
+ windows.append(window)
+ if not windows:
+ raise ValueError("Not enough rows to create even a single window.")
+ stacked = np.stack(windows).astype(self.dtype, copy=False)
+ return stacked
From bc932cccb321169e57e3d87c7c01e2c7cf3d0ce5 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 6 Oct 2025 15:56:20 +1000
Subject: [PATCH 21/74] refactor(dataset): simplify loader and convert to
class-based API
Rewrote monolithic functions into a Dataset class with clear init/load/transform methods. Improves readability, reuse, and testability with no external behavior changes.
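A minimal usage sketch of the new class-based API (illustrative only; it assumes src/ is importable and the AMZN level-10 orderbook CSV already sits in the data directory wired up in src/helpers/constants.py):

    from src.dataset import DatasetConfig, LOBDataset

    cfg = DatasetConfig(seq_len=128)                # remaining fields fall back to the constants
    ds = LOBDataset(cfg).load()                     # read -> drop zero rows -> split -> scale
    train_w, val_w, test_w = ds.dataset_windowed()  # each: (num_windows, seq_len, num_features)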
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 397 +++++++++---------
.../src/helpers/args.py | 0
.../src/helpers/constants.py | 23 +
.../src/helpers/summaries.py | 260 ------------
.../src/helpers/textui.py | 303 -------------
5 files changed, 225 insertions(+), 758 deletions(-)
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
delete mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/summaries.py
delete mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/textui.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 099c4d53c..dd9549c1a 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -13,221 +13,228 @@
"""
from __future__ import annotations
-import json
-import os
+from argparse import Namespace
from dataclasses import dataclass, field
-from typing import Literal, Optional, Tuple
+from pathlib import Path
+from typing import Optional, Tuple
import numpy as np
-import pandas as pd
+from numpy.typing import NDArray
-ASK_PRICE_COLS = [f"ask_price_{i}" for i in range(1, 11)]
-ASK_SIZE_COLS = [f"ask_size_{i}" for i in range(1, 11)]
-BID_PRICE_COLS = [f"bid_price_{i}" for i in range(1, 11)]
-BID_SIZE_COLS = [f"bid_size_{i}" for i in range(1, 11)]
-ORDERBOOK_COLUMNS = ASK_PRICE_COLS + ASK_SIZE_COLS + BID_PRICE_COLS + BID_SIZE_COLS
+from src.helpers.constants import DATA_DIR, ORDERBOOK_FILENAME, TRAIN_TEST_SPLIT
-@dataclass
-class ContinuousMinMaxScaler:
+class MinMaxScaler:
"""
- Simple min-max scaler that keeps track of per-feature extrema and supports
- repeated transforms without relying on sklearn.
+ Feature-wise min–max scaler with a scikit-learn-like API.
"""
- feature_range: Tuple[float, float] = (0.0, 1.0)
- eps: float = 1e-9
- data_min_: Optional[np.ndarray] = field(default=None, init=False)
- data_max_: Optional[np.ndarray] = field(default=None, init=False)
-
- def fit(self, data: np.ndarray) -> "ContinuousMinMaxScaler":
- arr = np.asarray(data, dtype=np.float64)
- self.data_min_ = arr.min(axis=0)
- self.data_max_ = arr.max(axis=0)
+
+ def __init__(self, epsilon: float = 1e-7):
+ self.epsilon = epsilon
+ self._min: Optional[NDArray[np.floating]] = None
+ self._max: Optional[NDArray[np.floating]] = None
+
+ def fit(self, data: NDArray[np.floating]) -> "MinMaxScaler":
+ self._min = np.min(data, axis=0)
+ self._max = np.max(data, axis=0)
return self
- def transform(self, data: np.ndarray) -> np.ndarray:
- if self.data_min_ is None or self.data_max_ is None:
- raise RuntimeError("Scaler not fitted.")
- arr = np.asarray(data, dtype=np.float64)
- denom = np.maximum(self.data_max_ - self.data_min_, self.eps)
- scaled = (arr - self.data_min_) / denom
- lo, hi = self.feature_range
- return (scaled * (hi - lo) + lo).astype(arr.dtype, copy=False)
+ def transform(
+ self, data: NDArray[np.floating]
+ ) -> NDArray[np.floating]:
+ if self._min is None or self._max is None:
+ raise RuntimeError("Scaler must be fitted before transform.")
+ numerator = data - self._min
+ denominator = (self._max - self._min) + self.epsilon
+ return numerator / denominator
- def fit_transform(self, data: np.ndarray) -> np.ndarray:
+ def fit_transform(self, data: NDArray[np.floating]) -> NDArray[np.floating]:
return self.fit(data).transform(data)
- def inverse_transform(self, data: np.ndarray) -> np.ndarray:
- if self.data_min_ is None or self.data_max_ is None:
- raise RuntimeError("Scaler not fitted.")
- lo, hi = self.feature_range
- arr = np.asarray(data, dtype=np.float64)
- base = (arr - lo) / (hi - lo + self.eps)
- return base * (self.data_max_ - self.data_min_) + self.data_min_
+ def inverse_transform(self, data: NDArray[np.floating]) -> NDArray[np.floating]:
+ if self._min is None or self._max is None:
+ raise RuntimeError("Scaler must be fitted before inverse_transform.")
+ return data * ((self._max - self._min) + self.epsilon) + self._min
-class LOBSTERData:
+@dataclass(frozen=True)
+class DatasetConfig:
"""
- Minimal LOBSTER loader (orderbook only) with continuous min-max scaling.
-
- Parameters
- ----------
- data_dir : str
- Folder containing orderbook_10.csv (and optionally message_10.csv).
- feature_set : {"core", "raw10"}
- Representation to build.
- seq_len : int
- Window length fed to TimeGAN.
- stride : int, optional
- Step between consecutive windows (defaults to seq_len for non-overlap).
- splits : tuple
- Train/val/test fractions; must sum to 1.0.
+ Configuration for loading and preprocessing order-book data.
+ """
+ seq_len: int
+ data_dir: Path = field(default_factory=lambda: Path(DATA_DIR))
+ filename: str = ORDERBOOK_FILENAME
+ splits: Tuple[float, float, float] = TRAIN_TEST_SPLIT
+ shuffle: bool = True
+ dtype: type = np.float32
+ filter_zero_rows: bool = True
+
+ @classmethod
+ def from_namespace(cls, arg: Namespace) -> "DatasetConfig":
+ return cls(
+ seq_len=getattr(arg, "seq_len", 128),
+ data_dir=Path(getattr(arg, "data_dir", DATA_DIR)),
+ filename=getattr(arg, "filename", ORDERBOOK_FILENAME),
+ shuffle=getattr(arg, "shuffle", True),
+ dtype=getattr(arg, "dtype", np.float32),
+ filter_zero_rows=getattr(arg, "filter_zero_rows", True),
+ )
+
+
+class LOBDataset:
+ """
+ End-to-end loader for a single LOBSTER orderbook file
"""
def __init__(
- self,
- data_dir: str,
- message_file: str = "message_10.csv", # kept for compatibility; unused
- orderbook_file: str = "orderbook_10.csv",
- feature_set: Literal["core", "raw10"] = "core",
- seq_len: int = 128,
- stride: Optional[int] = None,
- splits: Tuple[float, float, float] = (0.7, 0.15, 0.15),
- feature_range: Tuple[float, float] = (0.0, 1.0),
- dtype: Literal["float32", "float64"] = "float32",
- save_dir: Optional[str] = None,
+ self, cfg: DatasetConfig,
+ scaler: Optional[MinMaxScaler] = None
):
- self.data_dir = data_dir
- self.message_file = message_file # placeholder for potential alignment checks
- self.orderbook_path = os.path.join(data_dir, orderbook_file)
- self.feature_set = feature_set
- self.seq_len = int(seq_len)
- self.stride = int(stride) if stride is not None else self.seq_len
- self.splits = splits
- self.scaler = ContinuousMinMaxScaler(feature_range=feature_range)
- self._dtype_name = dtype
- self.dtype = np.float32 if dtype == "float32" else np.float64
- self.save_dir = save_dir
- self.eps = 1e-8
- self._validate_inputs()
-
- # ------------------- public API -------------------
-
- def load_arrays(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
- orderbook = self._load_orderbook()
- features = self._build_features(orderbook)
- features = features[~np.isnan(features).any(axis=1)]
- train, val, test = self._split(features)
-
- self.scaler.fit(train)
- train = self.scaler.transform(train)
- val = self.scaler.transform(val)
- test = self.scaler.transform(test)
-
- W_train = self._windowize(train)
- W_val = self._windowize(val)
- W_test = self._windowize(test)
-
- if self.save_dir:
- os.makedirs(self.save_dir, exist_ok=True)
- np.savez_compressed(
- os.path.join(self.save_dir, "windows.npz"),
- train=W_train, val=W_val, test=W_test
+ self.cfg = cfg
+ self.scaler = scaler or MinMaxScaler()
+
+ self._raw: Optional[NDArray[np.int64]] = None
+ self._filtered: Optional[NDArray[np.floating]] = None
+ self._train: Optional[NDArray[np.floating]] = None
+ self._val: Optional[NDArray[np.floating]] = None
+ self._test: Optional[NDArray[np.floating]] = None
+
+ def load(self) -> "LOBDataset":
+ print("Loading and preprocessing LOBSTER orderbook dataset...")
+ data = self._read_raw()
+ data = self._filter_unoccupied(data) if self.cfg.filter_zero_rows else data.astype(self.cfg.dtype)
+ self._filtered = data.astype(self.cfg.dtype)
+
+ self._split_chronological()
+ self._scale_train_only()
+ print("Dataset loaded, split, and scaled.")
+ return self
+
+ def make_windows(
+ self,
+ split: str = "train"
+ ) -> NDArray[np.float32]:
+ """
+ Window the selected split into shape (num_windows, seq_len, num_features).
+ """
+ data = self._select_split(split)
+ return self._windowize(data, self.cfg.seq_len, self.cfg.shuffle)
+
+ def dataset_windowed(
+ self
+ ) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
+ """
+ Return (train_w, val_w, test_w) as windowed arrays.
+ """
+ train_w = self.make_windows(split="train")
+ val_w = self.make_windows(split="val")
+ test_w = self.make_windows(split="test")
+ return train_w, val_w, test_w
+
+ def _read_raw(self) -> NDArray[np.int64]:
+ path = Path(self.cfg.data_dir, self.cfg.filename)
+ if not path.exists():
+ msg = (
+ f"{path} not found.\n"
+ "Download AMZN level-10 sample from:\n"
+ "https://lobsterdata.com/info/sample/LOBSTER_SampleFile_AMZN_2012-06-21_10.zip\n"
+ "and place the '..._orderbook_10' file in the data directory."
)
- with open(os.path.join(self.save_dir, "meta.json"), "w", encoding="utf-8") as f:
- json.dump(self.get_meta(), f, indent=2)
-
- return W_train, W_val, W_test
-
- def get_meta(self) -> dict:
- return {
- "feature_set": self.feature_set,
- "seq_len": self.seq_len,
- "stride": self.stride,
- "splits": self.splits,
- "feature_range": self.scaler.feature_range,
- "dtype": self._dtype_name,
- }
-
- # ------------------- helpers ---------------------
-
- def _validate_inputs(self) -> None:
- if not os.path.exists(self.orderbook_path):
- raise FileNotFoundError(self.orderbook_path)
- if self.seq_len <= 0 or self.stride <= 0:
- raise ValueError("seq_len and stride must be positive.")
- total = sum(self.splits)
- if not np.isclose(total, 1.0):
- raise ValueError(f"splits must sum to 1.0, got {self.splits} (sum={total}).")
- if any(x <= 0 for x in self.splits):
- raise ValueError("splits must be positive.")
- lo, hi = self.scaler.feature_range
- if hi <= lo:
- raise ValueError("feature_range must satisfy min < max.")
-
- def _load_orderbook(self) -> pd.DataFrame:
- df = pd.read_csv(self.orderbook_path, header=None)
- if df.shape[1] < len(ORDERBOOK_COLUMNS):
- raise ValueError(f"Expected >= {len(ORDERBOOK_COLUMNS)} columns, found {df.shape[1]}.")
- df = df.iloc[:, :len(ORDERBOOK_COLUMNS)]
- numeric_ratio = pd.to_numeric(df.iloc[0], errors="coerce").notna().mean()
- if numeric_ratio < 0.5:
- df = df.iloc[1:].reset_index(drop=True)
- df.columns = ORDERBOOK_COLUMNS
- df = df.apply(pd.to_numeric, errors="coerce")
- return df
-
- def _build_features(self, ob_df: pd.DataFrame) -> np.ndarray:
- data = ob_df.to_numpy(dtype=np.float64)
- if self.feature_set == "raw10":
- return data
- ask_prices = data[:, :10]
- ask_sizes = data[:, 10:20]
- bid_prices = data[:, 20:30]
- bid_sizes = data[:, 30:40]
-
- mid_price = 0.5 * (ask_prices[:, 0] + bid_prices[:, 0])
- spread = ask_prices[:, 0] - bid_prices[:, 0]
- log_mid = np.log(np.clip(mid_price, self.eps, None))
- mid_log_return = np.concatenate([[0.0], np.diff(log_mid)])
- queue_imbalance = (
- (bid_sizes[:, 0] - ask_sizes[:, 0]) /
- (bid_sizes[:, 0] + ask_sizes[:, 0] + self.eps)
- )
- depth_imbalance = (
- (bid_sizes.sum(axis=1) - ask_sizes.sum(axis=1)) /
- (bid_sizes.sum(axis=1) + ask_sizes.sum(axis=1) + self.eps)
+ raise FileNotFoundError(msg)
+ print("Reading orderbook file...", path)
+ raw = np.loadtxt(path, delimiter=",", skiprows=0, dtype=np.int64)
+ print("Raw shape:", raw.shape)
+ self._raw = raw
+ return raw
+
+ def _filter_unoccupied(self, data: NDArray[np.int64]) -> NDArray[np.float32]:
+ """
+ Remove rows containing zeros (dummy volumes) to avoid invalid states
+ """
+ mask = ~(data == 0).any(axis=1)
+ filtered = data[mask].astype(np.float32)
+ print("Filtered rows (no zeros). Shape", filtered.shape)
+ return filtered
+
+ def _split_chronological(self) -> None:
+ assert self._filtered is not None, "Call load() first."
+ n = len(self._filtered)
+ t_frac, v_frac, _ = self.cfg.splits
+ t_cutoff = int(n * t_frac)
+        v_cutoff = int(n * (t_frac + v_frac))
+ self._train = self._filtered[:t_cutoff]
+ self._val = self._filtered[t_cutoff:v_cutoff]
+ self._test = self._filtered[v_cutoff:]
+ assert all(
+ len(d) > 5 for d in (self._train, self._val, self._test)
+ ), "Each split must have at least 5 windows."
+ print("Split sizes - train: %d, val: %d, test: %d", len(self._train), len(self._val), len(self._test))
+
+ def _scale_train_only(self) -> None:
+ assert (
+ self._train is not None
+ and self._val is not None
+ and self._test is not None
)
+ print("Fitting MinMaxScaler on train split.")
+ self._train = self.scaler.fit_transform(self._train)
+ self._val = self.scaler.transform(self._val)
+ self._test = self.scaler.transform(self._test)
+
+ def _windowize(
+ self,
+ data: NDArray[np.float32],
+ seq_len: int,
+ shuffle: bool
+ ) -> NDArray[np.float32]:
+ n_samples, n_features = data.shape
+ n_windows = n_samples - seq_len + 1
+ if n_windows <= 0:
+ raise ValueError(f"seq_len={seq_len} is too large for data of length {n_samples}.")
+
+ out = np.empty((n_windows, seq_len, n_features), dtype=self.cfg.dtype)
+ for i in range(n_windows):
+ out[i] = data[i: i + seq_len]
+ if shuffle:
+ np.random.shuffle(out)
+ return out
+
+ def _select_split(self, split: str) -> NDArray[np.float32]:
+ if split == "train": return self._train
+ if split == "val": return self._val
+ if split == "test": return self._test
+ raise ValueError("split must be 'train', 'val' or 'test'")
+
+
+def batch_generator(
+ data: NDArray[np.float32],
+ time: Optional[NDArray[np.float32]],
+ batch_size: int,
+):
+ """
+ Random mini-batch generator
+ if `time` is None, uses a constant length equal to data.shape[1] (seq_len).
+ """
+ n = len(data)
+ idx = np.random.randint(n)[:batch_size]
+ data_mb = data[idx].astype(np.float32)
+ if time is not None:
+ T_mb = np.full((batch_size,), data_mb.shape[1], dtype=np.int32)
+ else:
+ T_mb = time[idx].astype(np.int32)
+ return data_mb, T_mb
+
+
+def load_data(arg: Namespace) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
+ """
+ Backwards-compatible wrapper.
+ """
+ cfg = DatasetConfig.from_namespace(arg)
+ loader = LOBDataset(cfg).load()
+ train_w = loader.make_windows("train")
+ val = loader._val
+ test = loader._test
+ print("Stock dataset has been loaded and preprocessed.")
+ return train_w, val, test
- feats = np.stack(
- [mid_price, spread, mid_log_return, queue_imbalance, depth_imbalance],
- axis=1,
- )
- return feats
-
- def _split(self, feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
- n = len(feats)
- n_train = int(n * self.splits[0])
- n_val = int(n * self.splits[1])
- n_test = n - n_train - n_val
- if n_train < self.seq_len or n_val < self.seq_len or n_test < self.seq_len:
- raise ValueError(
- "Not enough rows for the requested seq_len/splits combination. "
- f"Have {n} rows with splits {self.splits}."
- )
- train = feats[:n_train]
- val = feats[n_train:n_train + n_val]
- test = feats[n_train + n_val:]
- return train, val, test
-
- def _windowize(self, arr: np.ndarray) -> np.ndarray:
- windows = []
- limit = len(arr) - self.seq_len + 1
- for start in range(0, limit, self.stride):
- window = arr[start:start + self.seq_len]
- if window.shape[0] == self.seq_len:
- windows.append(window)
- if not windows:
- raise ValueError("Not enough rows to create even a single window.")
- stacked = np.stack(windows).astype(self.dtype, copy=False)
- return stacked
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
new file mode 100644
index 000000000..f22346b8a
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
@@ -0,0 +1,23 @@
+"""
+Configuration constants for the project.
+"""
+from math import isclose
+from typing import Literal
+OUTPUT_DIR = "outs"
+
+# Training hyperparameters for TimeGAN
+NUM_TRAINING_ITERATIONS = 25_000
+VALIDATE_INTERVAL = 300
+
+TRAIN_TEST_SPLIT = (0.7, 0.15, 0.15)
+assert isclose(
+ sum(TRAIN_TEST_SPLIT), 1.0,
+ rel_tol=0.0, abs_tol=1e-6
+), (
+ f"TRAIN_TEST_SPLIT must sum to 1.0 (got {sum(TRAIN_TEST_SPLIT):.8f})"
+)
+
+DATA_DIR = "data"
+ORDERBOOK_FILENAME = "AMZN_2012-06-21_34200000_57600000_orderbook_10.csv"
+
+DATANAME = Literal["message", "orderbook"]
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/summaries.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/summaries.py
deleted file mode 100644
index d803303e7..000000000
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/summaries.py
+++ /dev/null
@@ -1,260 +0,0 @@
-from __future__ import annotations
-
-from typing import List, Tuple
-import numpy as np
-import pandas as pd
-from tabulate import tabulate
-
-from .textui import C, render_card, kv_table, set_table_style, term_width, bold_white_borders, TABLE_FMT
-
-
-def first_last_time(msg_df: pd.DataFrame) -> tuple[str, str]:
- if "time" not in msg_df.columns:
- return ("", "")
- try:
- t = pd.to_datetime(msg_df["time"], errors="coerce", unit=None)
- return (str(t.min()), str(t.max()))
- except Exception:
- return ("", "")
-
-
-def summarize_df(df: pd.DataFrame, name: str, peek: int, c: C) -> List[str]:
- lines: List[str] = []
- title = f"{c.BOLD}{name}{c.RESET}" if c.enabled else name
- lines.append(title)
- lines.append(f"shape: {df.shape[0]} rows × {df.shape[1]} cols")
- cols = list(df.columns)
- col_str = ", ".join(cols)
- lines.append("columns: " + col_str if len(col_str) < 160 else "columns: " + ", ".join(cols[:12]) + ", …")
- dtypes = df.dtypes.astype(str).to_dict()
- na_counts = {k: int(v) for k, v in df.isna().sum().items() if int(v) > 0}
- lines.append("dtypes: " + ", ".join([f"{k}:{v}" for k, v in dtypes.items()]))
- lines.append("na_counts: " + (str(na_counts) if na_counts else "{}"))
- for col in ("type", "direction"):
- if col in df.columns:
- try:
- vc = df[col].value_counts(dropna=False).to_dict()
- lines.append(f"value_counts[{col}]: {vc}")
- except Exception:
- pass
- if "time" in df.columns:
- try:
- t = pd.to_datetime(df["time"], errors="coerce", unit=None)
- lines.append(f"time: min={t.min()} max={t.max()}")
- if t.notna().all():
- is_mono = bool((t.diff().dropna() >= pd.Timedelta(0)).all())
- lines.append(f"time monotonic nondecreasing: {is_mono}")
- except Exception:
- pass
-
- num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
- if num_cols:
- sample_cols = num_cols[: min(8, len(num_cols))]
- desc_df = df[sample_cols].describe().round(6)
- lines.append(f"{c.BOLD}describe(sample numeric cols):{c.RESET}" if c.enabled else "describe(sample numeric cols):")
- lines.extend(tabulate(desc_df, headers="keys", tablefmt=TABLE_FMT).splitlines())
-
- if peek > 0:
- lines.append(f"{c.BOLD}head:{c.RESET}" if c.enabled else "head:")
- head_tbl = tabulate(df.head(peek), headers="keys", tablefmt=TABLE_FMT, showindex=False)
- lines.extend(head_tbl.splitlines())
- lines.append(f"{c.BOLD}tail:{c.RESET}" if c.enabled else "tail:")
- tail_tbl = tabulate(df.tail(peek), headers="keys", tablefmt=TABLE_FMT, showindex=False)
- lines.extend(tail_tbl.splitlines())
-
- return lines
-
-
-def print_dir_listing(path: str, c: C, style: str) -> str:
- import os
- if os.path.isdir(path):
- files = sorted(os.listdir(path))
- body = [f"path: {path}", f"files: {len(files)}"]
- body += [f"• {f}" for f in files[:10]]
- if len(files) > 10:
- body.append(f"• (+{len(files)-10} more)")
- else:
- body = [f"path: {path}", f"{'files: (missing)'}"]
- return render_card("Data directory", body, c, style=style, align="left")
-
-
-def print_summary(lines: list[str], c: C, style: str) -> str:
- if "" in lines:
- idx = lines.index("")
- msg_part = lines[:idx]
- ob_part = lines[idx+1:]
- else:
- msg_part, ob_part = lines, []
-
- def split_title(block: list[str]) -> tuple[str, list[str]]:
- if not block:
- return ("", [])
- title, body = block[0], block[1:]
- return (title, body)
-
- out = []
- t1, b1 = split_title(msg_part)
- if t1:
- out.append(render_card(t1, b1, c, style=style, align="left"))
- t2, b2 = split_title(ob_part)
- if t2:
- out.append(render_card(t2, b2, c, style=style, align="left"))
- return "\n".join(out)
-
-
-def _fmt_bytes(n: int) -> str:
- units = ["B", "KB", "MB", "GB", "TB"]
- i = 0; f = float(n)
- while f >= 1024 and i < len(units) - 1:
- f /= 1024.0; i += 1
- return f"{f:.2f} {units[i]}"
-
-
-def print_report(W_train, W_val, W_test, meta: dict, c: C, style: str, *,
- verbose: bool = False,
- scaler_obj = None,
- clip_bounds = None,
- time_coverage: tuple[str, str] = ("","")) -> str:
- block1 = [
- ("train windows", "×".join(map(str, W_train.shape))),
- ("val windows", "×".join(map(str, W_val.shape))),
- ("test windows", "×".join(map(str, W_test.shape))),
- ("seq_len", str(meta.get("seq_len"))),
- ("stride", str(meta.get("stride"))),
- ("feature_set", str(meta.get("feature_set"))),
- ("#features", str(len(meta.get("feature_names", [])))),
- ("scaler", str(meta.get("scaler"))),
- ("sorted_by_time",str(meta.get("sorted_by_time"))),
- ("every", str(meta.get("every"))),
- ]
- lines1 = kv_table(block1, c)
- out = [render_card("Preprocessing report", lines1, c, style=style, align="right")]
-
- rc = meta.get("row_counts", {})
- if rc:
- block2 = [(k, str(v)) for k, v in rc.items()]
- lines2 = kv_table(block2, c)
- out.append(render_card("Row counts", lines2, c, style=style, align="right"))
-
- if getattr(W_train, "size", 0):
- win = W_train[0]
- block3 = [
- ("window[0] mean", f"{float(win.mean()):.6f}"),
- ("window[0] std", f"{float(win.std()):.6f}"),
- ("features", ", ".join(meta.get("feature_names", [])[:8]) + ("…" if len(meta.get("feature_names", []))>8 else "")),
- ]
- lines3 = kv_table(block3, c)
- out.append(render_card("Sample window", lines3, c, style=style, align="right"))
-
- if not verbose:
- return "\n".join(out)
-
- vlines: list[str] = []
- total_bytes = (getattr(W_train, "nbytes", 0) + getattr(W_val, "nbytes", 0) + getattr(W_test, "nbytes", 0))
- vlines.append(f"memory total: {_fmt_bytes(total_bytes)}")
- vlines.append(f"train bytes: {_fmt_bytes(getattr(W_train, 'nbytes', 0))}")
- vlines.append(f"val bytes: {_fmt_bytes(getattr(W_val, 'nbytes', 0))}")
- vlines.append(f"test bytes: {_fmt_bytes(getattr(W_test, 'nbytes', 0))}")
-
- tmin, tmax = time_coverage
- if tmin or tmax:
- vlines.append(f"time coverage: {tmin} → {tmax}")
-
- out.append(render_card("Resources & coverage", vlines, c, style=style, align="right"))
-
- if scaler_obj is not None:
- s_rows = []
- if hasattr(scaler_obj, "mean_") and hasattr(scaler_obj, "scale_"):
- s_rows = [
- ("type", "StandardScaler"),
- ("mean[0:8]", np.array2string(scaler_obj.mean_[:8], precision=4, separator=", ")),
- ("scale[0:8]", np.array2string(scaler_obj.scale_[:8], precision=4, separator=", ")),
- ]
- elif hasattr(scaler_obj, "data_min_") and hasattr(scaler_obj, "data_max_"):
- s_rows = [
- ("type", "MinMaxScaler"),
- ("data_min[0:8]", np.array2string(scaler_obj.data_min_[:8], precision=4, separator=", ")),
- ("data_max[0:8]", np.array2string(scaler_obj.data_max_[:8], precision=4, separator=", ")),
- ("feature_range", str(getattr(scaler_obj, "feature_range", None))),
- ]
- if s_rows:
- out.append(render_card("Scaler parameters", kv_table(s_rows, c), c, style=style, align="right"))
-
- if clip_bounds is not None:
- lo, hi = clip_bounds
- cb_rows = [
- ("q-lo[0:8]", np.array2string(lo[:8], precision=4, separator=", ")),
- ("q-hi[0:8]", np.array2string(hi[:8], precision=4, separator=", ")),
- ]
- out.append(render_card("Clip bounds (preview)", kv_table(cb_rows, c), c, style=style, align="right"))
-
- def _count_windows(n_rows: int, seq_len: int, stride: int) -> int:
- if n_rows < seq_len:
- return 0
- return 1 + (n_rows - seq_len) // stride
-
- rc_train = rc.get("train", 0); rc_val = rc.get("val", 0); rc_test = rc.get("test", 0)
- overlap = 1.0 - (meta.get("stride", 1) / max(1, meta.get("seq_len", 1)))
- perf_rows = [
- ("expected train windows", str(_count_windows(rc_train, meta.get("seq_len", 0), meta.get("stride", 1)))),
- ("expected val windows", str(_count_windows(rc_val, meta.get("seq_len", 0), meta.get("stride", 1)))),
- ("expected test windows", str(_count_windows(rc_test, meta.get("seq_len", 0), meta.get("stride", 1)))),
- ("overlap ratio", f"{overlap:.3f}"),
- ]
- out.append(render_card("Windowing details", kv_table(perf_rows, c), c, style=style, align="right"))
-
- return "\n".join(out)
-
-
-def print_dataset_info(loader, c: C, style: str, peek: int = 5) -> str:
- meta = loader.get_meta()
- feature_set = meta.get("feature_set")
- feats = meta.get("feature_names") or []
-
- if not feats:
- if feature_set == "core":
- feats = ["mid_price","spread","mid_log_return","queue_imbalance_l1","depth_imbalance_l10"]
- elif feature_set == "raw10":
- feats = ([f"ask_price_{i}" for i in range(1,11)] +
- [f"ask_size_{i}" for i in range(1,11)] +
- [f"bid_price_{i}" for i in range(1,11)] +
- [f"bid_size_{i}" for i in range(1,11)])
-
- intro = [
- f"Feature set: {c.BOLD}{feature_set}{c.RESET}" if c.enabled else f"Feature set: {feature_set}",
- f"Total features: {len(feats)}",
- ""
- ]
-
- try:
- W_train, W_val, W_test = loader.load_arrays()
- if W_train.size + W_val.size + W_test.size == 0:
- raise ValueError("No windows produced; lower seq_len or stride.")
- blocks = [W.reshape(-1, W.shape[-1]) for W in (W_train, W_val, W_test) if getattr(W,"size",0)]
- all_data = np.concatenate(blocks, axis=0)
- df = pd.DataFrame(all_data, columns=feats)
-
- intro.append(f"{c.BOLD}Statistical summary (aggregated across splits):{c.RESET}" if c.enabled else "Statistical summary (aggregated across splits):")
- desc_df = df.describe().round(6)
- intro.extend(tabulate(desc_df, headers="keys", tablefmt=TABLE_FMT).splitlines())
- intro.append("")
-
- means = df.mean().sort_values(ascending=False).head(5)
- stds = df.std().sort_values(ascending=False).head(5)
-
- intro.append(f"{c.BOLD}Highest-mean features:{c.RESET}" if c.enabled else "Highest-mean features:")
- intro.extend(tabulate(list(means.items()), headers=[f"{c.MAGENTA}feature{c.RESET}" if c.enabled else "feature", "mean"], tablefmt=TABLE_FMT).splitlines())
- intro.append("")
-
- intro.append(f"{c.BOLD}Most-variable features (by std):{c.RESET}" if c.enabled else "Most-variable features (by std):")
- intro.extend(tabulate(list(stds.items()), headers=[f"{c.MAGENTA}feature{c.RESET}" if c.enabled else "feature", "std"], tablefmt=TABLE_FMT).splitlines())
- intro.append("")
-
- intro.append(f"{c.BOLD}Example rows (first few timesteps):{c.RESET}" if c.enabled else "Example rows (first few timesteps):")
- ex_tbl = tabulate(df.head(peek).round(6), headers="keys", tablefmt=TABLE_FMT, showindex=True)
- intro.extend(ex_tbl.splitlines())
-
- except Exception as e:
- intro.append(f"{c.RED}(Could not compute stats: {e}){c.RESET}" if c.enabled else f"(Could not compute stats: {e})")
-
- return render_card("Dataset summary", intro, c, style=style, align="left")
\ No newline at end of file
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/textui.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/textui.py
deleted file mode 100644
index f530edcaf..000000000
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/textui.py
+++ /dev/null
@@ -1,303 +0,0 @@
-import os
-import re
-import shutil
-from datetime import datetime
-from typing import List, Tuple, Sequence
-from tabulate import tabulate
-
-# Try Colorama on Windows (optional)
-try:
- import colorama # type: ignore
- colorama.just_fix_windows_console()
-except Exception:
- pass
-
-# ---------------- defaults ----------------
-DEFAULT_STYLE = "box" # default to box panels
-TABLE_FMT = "github" # tabulate format; switch with set_table_style()
-
-# ------------- terminal capabilities & colors -------------
-def supports_color(no_color_flag: bool) -> bool:
- if no_color_flag or os.environ.get("NO_COLOR"):
- return False
- try:
- # If stdout is a TTY, assume color; terminals and most IDE consoles support it.
- return os.isatty(1)
- except Exception:
- return False
-
-class C:
- def __init__(self, enabled: bool):
- self.enabled = enabled
- self.RESET = "\033[0m" if enabled else ""
- self.DIM = "\033[2m" if enabled else ""
- self.BOLD = "\033[1m" if enabled else ""
- self.CYAN = "\033[36m" if enabled else ""
- self.YELLOW = "\033[33m" if enabled else ""
- self.GREEN = "\033[32m" if enabled else ""
- self.MAGENTA = "\033[35m" if enabled else ""
- self.BLUE = "\033[34m" if enabled else ""
- self.RED = "\033[31m" if enabled else ""
- self.WHITE = "\033[37m" if enabled else ""
-
-# ------------- ANSI helpers -------------
-_ANSI_RE = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]")
-
-def visible_len(s: str) -> int:
- """Printable width (strip ANSI first)."""
- return len(_ANSI_RE.sub("", s))
-
-def strip_ansi(s: str) -> str:
- return _ANSI_RE.sub("", s)
-
-def truncate_visible(s: str, max_cols: int) -> str:
- """
- Truncate to max_cols printable columns without breaking ANSI sequences.
- """
- if max_cols <= 0:
- return ""
- out, cols = [], 0
- i, n = 0, len(s)
- while i < n and cols < max_cols:
- m = _ANSI_RE.match(s, i)
- if m:
- out.append(m.group(0))
- i = m.end()
- continue
- ch = s[i]
- out.append(ch)
- cols += 1
- i += 1
- # ensure we don't end inside an ANSI state (we don't maintain state machine,
- # but common sequences are self-contained; still append reset for safety)
- if cols >= max_cols:
- out.append("\033[0m")
- return "".join(out)
-
-def ljust_visible(s: str, width: int) -> str:
- pad = max(0, width - visible_len(s))
- return s + (" " * pad)
-
-# ------------- layout helpers -------------
-def set_table_style(name: str) -> None:
- """Set tabulate tablefmt. Small whitelist, but allow custom strings."""
- global TABLE_FMT
- allowed = {
- "github", "grid", "fancy_grid", "heavy_grid", "simple", "outline",
- "rounded_grid", "double_grid", "pipe", "orgtbl", "jira", "psql"
- }
- TABLE_FMT = name if name in allowed else name # pass-through (tabulate will raise if invalid)
-
-def term_width(default: int = 100) -> int:
- try:
- return shutil.get_terminal_size((default, 20)).columns
- except Exception:
- return default
-
-def wrap_text(s: str, width: int) -> List[str]:
- """
- ANSI-aware word wrap by visible width.
- """
- if visible_len(s) <= width:
- return [s]
- parts = s.split(" ")
- out, cur = [], ""
- for tok in parts:
- if not cur:
- cur = tok
- elif visible_len(cur) + 1 + visible_len(tok) <= width:
- cur += " " + tok
- else:
- out.append(cur)
- cur = tok
- if cur:
- out.append(cur)
- return out
-
-def is_table_line(s: str) -> bool:
- """
- Heuristic: lines that look like tables (markdown pipes or box-drawing).
- """
- t = strip_ansi(s).strip()
- if not t:
- return False
- if t.startswith("|") and "|" in t[1:]:
- return True
- if t.startswith("+") and t.endswith("+"):
- return True
- # box drawing / markdown borders
- if set(t) <= set("-:|+ ─═│║┼┬┴├┤┌┐└┘╭╮╯╰╪╫╠╬╣╦╩╔╗╚╝"):
- return True
- return False
-
-# ------------- table/border styling -------------
-def bold_white_borders(table: str, c: C) -> str:
- """
- Paint table border glyphs in bold white without touching cell content.
- Works for markdown pipes and Unicode box drawing.
- """
- if not getattr(c, "enabled", False):
- return table
-
- bold, white, reset = c.BOLD, c.WHITE, c.RESET
- border_chars = set("│║|┼┬┴├┤┌┐└┘─═╭╮╯╰╪╫╠╬╣╦╩╔╗╚╝+-:")
- horiz_set = set("─═-")
- vert_set = set("│║|:")
-
- def paint(ch: str) -> str:
- return f"{bold}{white}{ch}{reset}"
-
- painted_lines = []
- for raw in table.splitlines():
- line = raw
- # operate on non-ANSI plane but keep indexes by iterating char-by-char
- out_chars = []
- for ch in line:
- if ch in border_chars:
- out_chars.append(paint(ch))
- else:
- out_chars.append(ch)
- painted_lines.append("".join(out_chars))
- return "\n".join(painted_lines)
-
-def kv_table(
- rows: List[Tuple[str, str]],
- c: C,
- headers: Tuple[str, str] = ("key", "value"),
-) -> List[str]:
- if not rows:
- return []
-
- if c.enabled:
- h_key = f"{c.BOLD}{c.MAGENTA}{headers[0]}{c.RESET}"
- h_val = f"{c.BOLD}{c.MAGENTA}{headers[1]}{c.RESET}"
- tinted = [(f"{c.CYAN}{k}{c.RESET}", v) for k, v in rows]
- else:
- h_key, h_val = headers
- tinted = rows
-
- table_txt = tabulate(
- tinted,
- headers=[h_key, h_val],
- tablefmt=TABLE_FMT,
- stralign="left",
- disable_numparse=True,
- )
- table_txt = bold_white_borders(table_txt, c)
- return table_txt.splitlines()
-
-# -------------------- NEW: generic table renderer --------------------
-def table(
- rows: Sequence[Sequence[str]],
- headers: Sequence[str],
- c: C,
- *,
- tint_header: bool = True,
- tint_first_col: bool = True,
-) -> List[str]:
- """
- Render a 2D table (rows + headers) with optional header-row tint
- and first-column tint, plus bold white borders.
- """
- rows_list = [list(map(str, r)) for r in rows]
- if c.enabled and tint_first_col and rows_list:
- for i, r in enumerate(rows_list):
- if r:
- r[0] = f"{c.YELLOW}{r[0]}{c.RESET}"
-
- if c.enabled and tint_header:
- hdr = [f"{c.BOLD}{c.MAGENTA}{h}{c.RESET}" for h in headers]
- else:
- hdr = list(map(str, headers))
-
- tbl = tabulate(
- rows_list,
- headers=hdr,
- tablefmt=TABLE_FMT,
- stralign="left",
- disable_numparse=True,
- showindex=False,
- )
- tbl = bold_white_borders(tbl, c)
- return tbl.splitlines()
-
-# ------------- message bubbles & panels -------------
-def _bubble(title: str, body_lines: List[str], c: C, align: str = "left", width: int | None = None) -> str:
- termw = term_width()
- width = min(termw, width or termw)
- base_inner = max(24, width - 10)
-
- widest_tbl = 0
- for ln in body_lines:
- if is_table_line(ln):
- widest_tbl = max(widest_tbl, visible_len(ln))
-
- max_inner = min(max(base_inner, widest_tbl), width - 10)
- indent = 2 if align == "left" else max(2, width - (max_inner + 8))
- pad = " " * indent
-
- ts = datetime.now().strftime("%H:%M")
- title_colored = f"{c.BOLD}{c.BLUE}{title}{c.RESET}" if c.enabled else title
- head = f"{title_colored} {c.DIM}{ts}{c.RESET}"
- head_lines = wrap_text(head, max_inner)
-
- lines = [pad + " " + head_lines[0]]
- for hl in head_lines[1:]:
- lines.append(pad + " " + hl)
-
- lines.append(pad + " " + ("╭" + "─" * (max_inner + 2) + "╮"))
-
- for ln in body_lines:
- if is_table_line(ln):
- width_ok = max_inner
- body = ljust_visible(ln, width_ok)
- body = truncate_visible(body, width_ok)
- lines.append(pad + " " + "│ " + body + " │")
- else:
- for wln in wrap_text(ln, max_inner):
- lines.append(pad + " " + "│ " + ljust_visible(wln, max_inner) + " │")
-
- tail_left = pad + " " + "╰" + "─" * (max_inner + 2) + "╯" + "⟋"
- tail_right = pad + " " + "⟍" + "╰" + "─" * (max_inner + 2) + "╯"
- lines.append(tail_left if align == "left" else tail_right)
- return "\n".join(lines)
-
-def _panel(title: str, body_lines: List[str], c: C, width: int | None = None) -> str:
- termw = term_width()
- width = width or termw
- inner = width - 4
-
- widest_tbl = 0
- for ln in body_lines:
- if is_table_line(ln):
- widest_tbl = max(widest_tbl, visible_len(ln))
- inner = min(max(inner, widest_tbl + 2), termw - 4)
- width = inner + 4
-
- border = "─" * (width - 2)
- title_colored = f"{c.BOLD}{c.BLUE}{title}{c.RESET}" if c.enabled else title
- out = [f"{c.CYAN}┌{border}┐{c.RESET}"]
- title_line = f" {title_colored} "
- pad_space = max(0, width - 2 - visible_len(title_line))
- out.append(f"{c.CYAN}│{c.RESET}{title_line}{' '*pad_space}{c.CYAN}│{c.RESET}")
- out.append(f"{c.CYAN}├{border}┤{c.RESET}")
-
- content_width = inner - 2
- for ln in body_lines:
- if is_table_line(ln):
- body = ljust_visible(ln, content_width)
- body = truncate_visible(body, content_width)
- out.append(f"{c.CYAN}│{c.RESET} {body} {c.CYAN}│{c.RESET}")
- else:
- for sub in wrap_text(ln, content_width):
- out.append(f"{c.CYAN}│{c.RESET} {ljust_visible(sub, content_width)} {c.CYAN}│{c.RESET}")
-
- out.append(f"{c.CYAN}└{border}┘{c.RESET}")
- return "\n".join(out)
-
-def render_card(title: str, body_lines: List[str], c: C, style: str = DEFAULT_STYLE, align: str = "left") -> str:
- return _bubble(title, body_lines, c, align=align) if style == "chat" else _panel(title, body_lines, c)
-
-# Convenience sugar for quick key→value panels
-def render_kv_panel(title: str, rows: List[Tuple[str, str]], c: C, style: str = DEFAULT_STYLE, align: str = "right") -> str:
- return render_card(title, kv_table(rows, c), c, style=style, align=align)
From eb60d72bca3e4a0ebc3ecd359a3b77bd73ed709e Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 6 Oct 2025 18:23:56 +1000
Subject: [PATCH 22/74] feat(dataset): add DataOptions CLI; robust split
handling; logging; fix batch_generator
Introduce DataOptions, a thin argparse wrapper with flags (--seq-len, --data_dir, --orderbook_filename, --no-shuffle, --keep_zero_rows, --splits) that falls back to the ORDERBOOK_FILENAME and TRAIN_TEST_SPLIT constants when flags are omitted. Rename DatasetConfig fields to orderbook_filename/shuffle_windows to match. Fix batch_generator: sample batch indices with np.random.choice and use the constant seq_len length only when no per-sequence lengths are supplied.
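A hedged sketch of how the options are meant to be consumed programmatically (flag spellings follow this patch; argument values are illustrative and the orderbook CSV is assumed to be in place):

    from src.helpers.args import DataOptions
    from src.dataset import load_data

    opts = DataOptions().parse(["--seq-len", "64", "--no-shuffle"])
    train_w, val, test = load_data(opts)   # see load_data for what each returned split contains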
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 25 +++----
.../src/helpers/arg2.py | 0
.../src/helpers/args.py | 69 +++++++++++++++++++
3 files changed, 82 insertions(+), 12 deletions(-)
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/arg2.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index dd9549c1a..c295d3378 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -64,9 +64,9 @@ class DatasetConfig:
"""
seq_len: int
data_dir: Path = field(default_factory=lambda: Path(DATA_DIR))
- filename: str = ORDERBOOK_FILENAME
+ orderbook_filename: str = ORDERBOOK_FILENAME
splits: Tuple[float, float, float] = TRAIN_TEST_SPLIT
- shuffle: bool = True
+ shuffle_windows: bool = True
dtype: type = np.float32
filter_zero_rows: bool = True
@@ -75,8 +75,8 @@ def from_namespace(cls, arg: Namespace) -> "DatasetConfig":
return cls(
seq_len=getattr(arg, "seq_len", 128),
data_dir=Path(getattr(arg, "data_dir", DATA_DIR)),
- filename=getattr(arg, "filename", ORDERBOOK_FILENAME),
- shuffle=getattr(arg, "shuffle", True),
+ orderbook_filename=getattr(arg, "orderbook_filename", ORDERBOOK_FILENAME),
+ shuffle_windows=getattr(arg, "shuffle_windows", True),
dtype=getattr(arg, "dtype", np.float32),
filter_zero_rows=getattr(arg, "filter_zero_rows", True),
)
@@ -119,7 +119,7 @@ def make_windows(
Window the selected split into shape (num_windows, seq_len, num_features).
"""
data = self._select_split(split)
- return self._windowize(data, self.cfg.seq_len, self.cfg.shuffle)
+ return self._windowize(data, self.cfg.seq_len, self.cfg.shuffle_windows)
def dataset_windowed(
self
@@ -133,7 +133,7 @@ def dataset_windowed(
return train_w, val_w, test_w
def _read_raw(self) -> NDArray[np.int64]:
- path = Path(self.cfg.data_dir, self.cfg.filename)
+ path = Path(self.cfg.data_dir, self.cfg.orderbook_filename)
if not path.exists():
msg = (
f"{path} not found.\n"
@@ -166,6 +166,7 @@ def _split_chronological(self) -> None:
self._train = self._filtered[:t_cutoff]
self._val = self._filtered[t_cutoff:v_cutoff]
self._test = self._filtered[v_cutoff:]
+
assert all(
len(d) > 5 for d in (self._train, self._val, self._test)
), "Each split must have at least 5 windows."
@@ -186,7 +187,7 @@ def _windowize(
self,
data: NDArray[np.float32],
seq_len: int,
- shuffle: bool
+ shuffle_windows: bool
) -> NDArray[np.float32]:
n_samples, n_features = data.shape
n_windows = n_samples - seq_len + 1
@@ -196,7 +197,7 @@ def _windowize(
out = np.empty((n_windows, seq_len, n_features), dtype=self.cfg.dtype)
for i in range(n_windows):
out[i] = data[i: i + seq_len]
- if shuffle:
+ if shuffle_windows:
np.random.shuffle(out)
return out
@@ -217,13 +218,13 @@ def batch_generator(
if `time` is None, uses a constant length equal to data.shape[1] (seq_len).
"""
n = len(data)
- idx = np.random.randint(n)[:batch_size]
+ idx = np.random.choice(n, size=batch_size, replace=True)
data_mb = data[idx].astype(np.float32)
if time is not None:
- T_mb = np.full((batch_size,), data_mb.shape[1], dtype=np.int32)
+        t_mb = time[idx].astype(np.int32)
else:
- T_mb = time[idx].astype(np.int32)
- return data_mb, T_mb
+ t_mb = time[idx].astype(np.int32)
+ return data_mb, t_mb
def load_data(arg: Namespace) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/arg2.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/arg2.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
index e69de29bb..f8f68fbee 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
@@ -0,0 +1,69 @@
+"""
+Options for the entire model
+"""
+from __future__ import annotations
+
+from argparse import ArgumentParser, Namespace
+from typing import Optional
+
+import numpy as np
+
+from src.helpers.constants import DATA_DIR, TRAIN_TEST_SPLIT, ORDERBOOK_FILENAME
+
+try:
+ # tolerate alternates if present in your helpers
+ from src.helpers.constants import ORDERBOOK_FILENAME as _OB_ALT
+ ORDERBOOK_DEFAULT = _OB_ALT
+except Exception:
+ ORDERBOOK_DEFAULT = ORDERBOOK_FILENAME
+
+class DataOptions:
+ """
+ Thin wrapper around argparse that produces a Namespace suitable for DatasetConfig.
+ Usage:
+ opts = DataOptions().parse()
+ train_w, val_w, test_w = load_data(opts)
+ """
+
+ def __init__(self) -> None:
+ parser = ArgumentParser(
+ prog="timeganlob_dataset",
+ description="Lightweight LOBSTER preprocessing + MinMax scaling",
+ )
+ parser.add_argument("--seq-len", type=int, default=128)
+ parser.add_argument("--data_dir", type=str, default=str(DATA_DIR))
+ parser.add_argument("--orderbook_filename", type=str, default=ORDERBOOK_FILENAME)
+ parser.add_argument(
+ "--no-shuffle",
+ action="store_true",
+ help="Disable shuffling of windowed sequences"
+ )
+ parser.add_argument(
+ "--keep_zero_rows",
+ action="store_true",
+ help="Do NOT filter rows containing zeros."
+ )
+ parser.add_argument(
+ "--splits",
+ type=float,
+ nargs=3,
+ metavar=("TRAIN", "VAL", "TEST"),
+ help="Either proportions that sum to ~1.0 or cumulative cutoffs (e.g., 0.6 0.8 1.0).",
+ default=None,
+ )
+ self._parser = parser
+
+    def parse(self, argv: Optional[list[str]] = None) -> Namespace:
+ args = self._parser.parse_args(argv)
+
+ ns = Namespace(
+ seq_len=args.seq_len,
+ data_dir=args.data_dir,
+ orderbook_filename=args.orderbook_filename,
+ splits=tuple(args.splits) if args.splits is not None else TRAIN_TEST_SPLIT,
+ shuffle_windows=not args.no_shuffle,
+ dtype=np.float32,
+ keep_zero_rows=not args.keep_zero_rows,
+ )
+
+ return ns
From 337ff87029c94ecb52e3dd45d5683a4aeb3e504d Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 6 Oct 2025 19:36:49 +1000
Subject: [PATCH 23/74] feat(cli): add top-level Options router with --dataset
passthrough
Introduce an Options router that forwards everything after --dataset to DataOptions via argparse.REMAINDER and attaches the parsed dataset namespace at opts.dataset. Adds seed and run-name flags, supports programmatic argv, and treats a missing --dataset as an empty argument list.
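A short sketch of the passthrough (values are illustrative; everything after --dataset is handed to DataOptions unmodified):

    from src.helpers.args import Options

    opts = Options().parse(["--seed", "7", "--dataset", "--seq-len", "256", "--no-shuffle"])
    print(opts.seed)                        # 7
    print(opts.dataset.seq_len)             # 256
    print(opts.dataset.shuffle_windows)     # False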
---
.../src/helpers/args.py | 48 ++++++++++++++++++-
1 file changed, 47 insertions(+), 1 deletion(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
index f8f68fbee..96a0044a8 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
@@ -3,7 +3,7 @@
"""
from __future__ import annotations
-from argparse import ArgumentParser, Namespace
+from argparse import ArgumentParser, Namespace, REMAINDER
from typing import Optional
import numpy as np
@@ -67,3 +67,49 @@ def parse(self, argv: Optional[list[str]] = None) -> Namespace:
)
return ns
+
+class Options:
+ """
+    Top-level options that *route* anything after `--dataset` to DataOptions.
+
+ Example:
+ opts = Options().parse()
+        ds = opts.dataset  # Namespace produced by DataOptions
+ """
+ def __init__(self) -> None:
+ parser = ArgumentParser(
+ prog="timeganlob",
+ description="TimeGAN-LOB entrypoint with nested dataset options."
+ )
+ parser.add_argument("--seed", type=int, default=42, help="Global random seed")
+ parser.add_argument("--run-name", type=str, default="exp1", help="Run name")
+
+ parser.add_argument(
+ "--dataset",
+ nargs=REMAINDER,
+ help=(
+ "All arguments following this flag are parsed by DatasetOptions. "
+ "Example: --dataset --seq-len 256 --no-shuffle"
+ ),
+ )
+ self._parser = parser
+
+    def parse(self, argv: Optional[list[str]] = None) -> Namespace:
+ top = self._parser.parse_args(argv)
+
+ ds_argv = top.dataset if top.dataset is not None else []
+ dataset_ns = DataOptions().parse(ds_argv)
+
+ # attach nested namespace to the top-level namespace
+ out = Namespace(
+ seed=top.seed,
+ run_name=top.run_name,
+ dataset=dataset_ns,
+ )
+
+ return out
+
+if __name__ == "__main__":
+ opts = Options().parse()
+
+ print(opts)
\ No newline at end of file
From 3cf8b0c8f4fd9c1662aac690b73c5fb657105421 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 6 Oct 2025 21:49:32 +1000
Subject: [PATCH 24/74] feat(metrics): add min–max scaling/inverse, noise
 sampler, spread/MPR KL histogram
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduce utilities for TimeGAN-LOB: extract_seq_lengths, sample_noise (supports RNG + optional mean/std via uniform with matched σ), minmax_scale/minmax_inverse over [N,T,F], and KL(real||fake) via histograms for 'spread' and 'mpr' with smoothing + optional plot. Adds strong shape/type guards, finite-range handling, and safe midprice log-returns.
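A quick smoke-test sketch of the new helpers on synthetic arrays (shapes and values are illustrative; only the column convention ask = col 0, bid = col 2 is assumed, matching _spread/_midprice_returns):

    import numpy as np
    from src.helpers.utils import sample_noise, minmax_scale, minmax_inverse, kl_divergence_hist

    rng = np.random.default_rng(0)
    real = rng.random((256, 4))                    # [T, F], ask at col 0, bid at col 2
    fake = rng.random((256, 4))
    kl = kl_divergence_hist(real, fake, metric="spread", bins=50)

    z = sample_noise(batch_size=8, z_dim=5, seq_len=32, rng=rng)   # uniform noise, shape [8, 32, 5]
    norm, fmin, fmax = minmax_scale(z)
    assert np.allclose(minmax_inverse(norm, fmin, fmax), z, atol=1e-4)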
---
.../src/helpers/arg2.py | 0
.../src/helpers/utils.py | 147 ++++++++++++++++++
2 files changed, 147 insertions(+)
delete mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/arg2.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/arg2.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/arg2.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py
new file mode 100644
index 000000000..9496f8f21
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py
@@ -0,0 +1,147 @@
+from __future__ import annotations
+from typing import Iterable, Literal, Tuple
+
+import numpy as np
+from numpy.typing import NDArray
+import matplotlib.pyplot as plt
+
+Metric = Literal["spread", "mpr"]
+
+def extract_seq_lengths(
+ sequences: Iterable[NDArray[np.floating]]
+) -> Tuple[NDArray[np.int32], int]:
+ lengths = np.asarray([int(s.shape[0]) for s in sequences], dtype=np.int32)
+ return lengths, int(lengths.max(initial=0))
+
+def sample_noise(
+ batch_size: int,
+ z_dim: int,
+ seq_len: int,
+ *,
+ mean: float | None = None,
+ std: float | None = None,
+ rng: np.random.Generator | None = None,
+) -> NDArray[np.float32]:
+ if rng is None:
+ rng = np.random.default_rng()
+
+ if (mean is None) ^ (std is None):
+ raise ValueError("Provide both mean and std, or neither")
+
+ if mean is None and std is None:
+ out = rng.random((batch_size, seq_len, z_dim), dtype=np.float32)
+ else:
+ interval = float(std) * np.sqrt(12.0)
+ lo = float(mean) - interval / 2.0
+ hi = float(mean) + interval / 2.0
+ out = rng.uniform(lo, hi, size=(batch_size, seq_len, z_dim)).astype(np.float32)
+
+ return out
+
+def minmax_scale(
+ data: NDArray[np.floating],
+ epsilon: float = 1e-7
+)-> Tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
+ if data.ndim != 3:
+ raise ValueError(f"Expected data with 3 dimensions [N, T, F], got shape {data.shape}")
+
+ fmin = np.min(data, axis=(0, 1)).astype(np.float32)
+ fmax = np.max(data, axis=(0, 1)).astype(np.float32)
+ denom = (fmax - fmin).astype(np.float32)
+
+ norm = (data.astype(np.float32) - fmin) / (denom + epsilon)
+ return norm, fmin, fmax
+
+def minmax_inverse(
+ norm: NDArray[np.floating],
+ fmin: NDArray[np.floating],
+ fmax: NDArray[np.floating],
+) -> NDArray[np.float32]:
+ """
+ Inverse of `minmax_scale`.
+
+ Args:
+ norm: scaled data [N,T,F] or [...,F]
+ fmin: per-feature minima [F]
+ fmax: per-feature maxima [F]
+
+ Returns:
+ original-scale data, float32
+ """
+ fmin = np.asarray(fmin, dtype=np.float32)
+ fmax = np.asarray(fmax, dtype=np.float32)
+ return norm.astype(np.float32) * (fmax - fmin) + fmin
+
+def _spread(series: NDArray[np.floating]) -> NDArray[np.float64]:
+ """
+ Compute spread = best_ask - best_bid from a 2D array [T, F] with
+ columns: best ask at index 0 and best bid at index 2.
+ """
+ if series.ndim != 2 or series.shape[1] < 3:
+ raise ValueError("Expected shape [T, >=3]; columns 0 (ask) and 2 (bid) required.")
+ return (series[:, 0] - series[:, 2]).astype(np.float64)
+
+
+def _midprice_returns(series: NDArray[np.floating]) -> NDArray[np.float64]:
+ """
+ Compute log midprice returns from a 2D array [T, F] with ask at 0 and bid at 2.
+ """
+ if series.ndim != 2 or series.shape[1] < 3:
+ raise ValueError("Expected shape [T, >=3]; columns 0 (ask) and 2 (bid) required.")
+ mid = 0.5 * (series[:, 0] + series[:, 2])
+ # avoid log(0)
+ mid = np.clip(mid, a_min=np.finfo(np.float64).tiny, a_max=None)
+ r = np.log(mid[1:]) - np.log(mid[:-1])
+ return r.astype(np.float64)
+
+def kl_divergence_hist(
+ real: NDArray[np.floating],
+ fake: NDArray[np.floating],
+ metric: Literal["spread", "mpr"] = "spread",
+ *,
+ bins: int = 100,
+ show_plot: bool = False,
+ epsilon: float = 1e-12
+) -> float:
+ if real.ndim != 2 or fake.ndim != 2:
+ raise ValueError("Inputs must be 2D arrays [T, F].")
+
+ if metric == "spread":
+ r_series = _spread(real)
+ f_series = _spread(fake)
+ elif metric == "mpr":
+ r_series = _midprice_returns(real)
+ f_series = _midprice_returns(fake)
+ else:
+ raise ValueError("metric must be 'spread' or 'mpr'.")
+
+ lo = float(min(r_series.min(initial=0.0), f_series.min(initial=0.0)))
+ hi = float(max(r_series.max(initial=0.0), f_series.max(initial=0.0)))
+
+    # if the range is non-finite or degenerate, fall back to a tiny valid window
+    if not np.isfinite(lo) or not np.isfinite(hi):
+        lo, hi = 0.0, 1e-6
+    elif hi <= lo:
+        hi = lo + 1e-6
+
+ r_hist, edges = np.histogram(r_series, bins=bins, range=(lo, hi), density=False)
+ f_hist, _ = np.histogram(f_series, bins=edges, density=False)
+
+ # convert to probability masses with smoothing
+ r_p = (r_hist.astype(np.float64) + epsilon)
+ f_p = (f_hist.astype(np.float64) + epsilon)
+ r_p /= r_p.sum()
+ f_p /= f_p.sum()
+
+ # KL(real || fake) = sum p * log(p/q)
+ mask = r_p > 0 # should be true after smoothing, but keep for safety
+ kl = np.sum(r_p[mask] * (np.log(r_p[mask]) - np.log(f_p[mask])))
+
+ if show_plot:
+ centers = 0.5 * (edges[:-1] + edges[1:])
+ plt.plot(centers, r_p, label="real")
+ plt.plot(centers, f_p, label="fake")
+ plt.title(f"Histogram ({metric}); KL={kl:.4g}")
+ plt.legend()
+ plt.show()
+
+ # numerical guard: KL should be >= 0
+ return float(max(kl, 0.0))
\ No newline at end of file
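
A quick, illustrative sanity check for the helpers added above (round-trip scaling, noise shape, and the spread-histogram KL). Shapes follow the [N, T, F] convention used by minmax_scale; note that sample_noise takes (batch, z_dim, seq_len) but returns [batch, seq_len, z_dim]. The toy arrays are random and only exercise the API, not realistic LOB data.

    import numpy as np
    from src.helpers.utils import (
        minmax_scale, minmax_inverse, sample_noise, kl_divergence_hist,
    )

    rng = np.random.default_rng(0)
    x = rng.normal(size=(8, 32, 4)).astype(np.float32)        # [N, T, F]

    norm, fmin, fmax = minmax_scale(x)                        # per-feature min/max over (N, T)
    assert np.allclose(minmax_inverse(norm, fmin, fmax), x, atol=1e-3)

    z = sample_noise(8, 4, 32, mean=0.0, std=1.0, rng=rng)    # uniform with matched std
    assert z.shape == (8, 32, 4)

    # KL(real || fake) on the spread (col 0 = best ask, col 2 = best bid); inputs are 2D [T, >=3]
    real = rng.normal(loc=100.0, scale=0.5, size=(500, 4))
    fake = rng.normal(loc=100.0, scale=0.6, size=(500, 4))
    print(kl_divergence_hist(real, fake, metric="spread", bins=50))
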
From 259567ce7ed87bf5924b7eca3e852796916f124d Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 7 Oct 2025 02:58:49 +1000
Subject: [PATCH 25/74] feat(model): add TimeGAN components with LOB-aware
scaffolding (Encoder/Recovery/Generator/Supervisor/Discriminator)
Implements GRU-based components with Xavier/orthogonal init, device/seed helpers, and typed handles. Keeps the Discriminator logits-only (BCEWithLogits-ready) and uses sigmoid-gated projections elsewhere. Prepares for optional TemporalBackbone injection via config.
---
.../TimeLOB_TimeGAN_49088276/src/modules.py | 599 ++++--------------
1 file changed, 127 insertions(+), 472 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
index 6dcc46015..1a37f654c 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
@@ -1,11 +1,16 @@
"""
-Define the core TimeGAN components for limit order book sequences.
+TimeGAN components with LOB-aware enhancements.
-This module declares the building blocks of the TimeGAN adapted to LOBSTER
-level-10 order book data (e.g., AMZN). It typically includes the Embedder,
-Recovery, Generator, Supervisor, and Discriminator, and a TimeGAN wrapper that
-wires them together. Inputs are sequences shaped
-``(batch_size, seq_len, feature_dim)`` and outputs mirror that shape.
+Besides the canonical Embedder/Recovery/Generator/Supervisor/Discriminator, this
+module exposes an optional hybrid temporal backbone (TemporalBackbone) that can
+be injected into any component via ``TemporalBackboneConfig``. The backbone
+mixes positional encodings, dilated temporal convolutions (microstructure
+patterns), recurrent layers, and post-hoc self-attention blocks (global context),
+making the model more expressive than a basic TimeGAN.
+
+Inputs are sequences shaped ``(batch_size, seq_len, feature_dim)`` and outputs
+mirror that shape. Advanced regularization utilities and training helpers are
+included near the bottom of the file.
Exports:
- Embedder
@@ -14,6 +19,7 @@
- Supervisor
- Discriminator
- TimeGAN
+ - TemporalBackboneConfig
Created By: Radhesh Goel (Keys-I)
ID: s49088276
@@ -21,512 +27,161 @@
References:
-
"""
-# modules.py
-# Basic TimeGAN components implemented in PyTorch
-# ------------------------------------------------
-# Components:
-# - Embedder (encoder) : X -> H
-# - Recovery (decoder) : H -> X_hat
-# - Generator : Z -> E_tilde (latent)
-# - Supervisor : H -> H_hat (one-step future)
-# - Discriminator : {H, H_tilde} -> real/fake logit
-# Wrapper:
-# - TimeGAN : convenience forward helpers
-# Losses:
-# - reconstruction_loss, supervised_loss, generator_adv_loss,
-# discriminator_loss, moment_loss, generator_feature_matching_loss
-# Utils:
-# - sample_noise, init_weights, make_optim
-
from __future__ import annotations
from dataclasses import dataclass
-from typing import Tuple, Optional, Dict
+from typing import Optional
+import numpy as np
import torch
import torch.nn as nn
-import torch.nn.functional as F
-# -------------------------
-# Small building blocks
-# -------------------------
-
-class RNNSeq(nn.Module):
- """
- Multi-layer GRU/LSTM that returns sequence outputs [B, T, H].
- """
- def __init__(
- self,
- input_dim: int,
- hidden_dim: int,
- num_layers: int = 2,
- rnn_type: str = "gru",
- dropout: float = 0.0,
- bidirectional: bool = False,
- ):
- super().__init__()
- assert rnn_type in {"gru", "lstm"}
- self.rnn_type = rnn_type
- rnn_cls = nn.GRU if rnn_type == "gru" else nn.LSTM
- self.rnn = rnn_cls(
- input_dim,
- hidden_dim,
- num_layers=num_layers,
- dropout=dropout if num_layers > 1 else 0.0,
- batch_first=True,
- bidirectional=bidirectional,
- )
- self.out_dim = hidden_dim * (2 if bidirectional else 1)
-
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- # x: [B, T, D]
- y, _ = self.rnn(x)
- return y # [B, T, H']
+def get_device() -> torch.device:
+ if torch.cuda.is_available():
+ return torch.device('cuda')
+ if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
+ return torch.device('mps')
+ return torch.device('cpu')
-def _linear_head(in_dim: int, out_dim: int) -> nn.Module:
- return nn.Sequential(
- nn.Linear(in_dim, out_dim),
- )
+def get_seed(seed: Optional[int]):
+ if seed is None or seed < 0:
+ return
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed(seed)
+ torch.use_deterministic_algorithms(False)
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
-def init_weights(m: nn.Module, gain: float = 1.0) -> None:
- """
- He init for Linear; orthogonal for RNN; zeros for bias.
- """
- if isinstance(m, nn.Linear):
- nn.init.kaiming_uniform_(m.weight, a=0.0, nonlinearity="linear")
- if m.bias is not None:
- nn.init.zeros_(m.bias)
- if isinstance(m, (nn.GRU, nn.LSTM)):
- for name, param in m.named_parameters():
+def xavier_gru_init(module: nn.Module) -> None:
+ if isinstance(module, nn.GRU):
+ for name, param in module.named_parameters():
if "weight_ih" in name:
- nn.init.xavier_uniform_(param, gain=gain)
+ nn.init.xavier_uniform_(param.data)
elif "weight_hh" in name:
- nn.init.orthogonal_(param, gain=gain)
+ nn.init.orthogonal_(param.data)
elif "bias" in name:
- nn.init.zeros_(param)
-
+ nn.init.zeros_(param.data)
+ elif isinstance(module, nn.Linear):
+ nn.init.xavier_uniform_(module.weight)
+ if module.bias is not None:
+ nn.init.zeros_(module.bias)
-def make_optim(params, lr: float = 1e-3, betas=(0.9, 0.999), weight_decay: float = 0.0):
- return torch.optim.Adam(params, lr=lr, betas=betas, weight_decay=weight_decay)
+class Encoder(nn.Module):
+ """
+ Embedding network: original feature space → latent space.
+ """
-# -------------------------
-# TimeGAN components
-# -------------------------
-
-class Embedder(nn.Module):
- """X -> H (latent)"""
- def __init__(
- self,
- x_dim: int,
- h_dim: int,
- num_layers: int = 2,
- rnn_type: str = "gru",
- dropout: float = 0.1,
- bidirectional: bool = False,
- ):
+ def __init__(self, input_dim: int, hidden_dim: int, num_layers: int) -> None:
super().__init__()
- self.rnn = RNNSeq(x_dim, h_dim, num_layers, rnn_type, dropout, bidirectional)
- self.proj = _linear_head(self.rnn.out_dim, h_dim)
- self.apply(init_weights)
+ self.rnn = nn.GRU(
+ input_size=input_dim,
+ hidden_size=hidden_dim,
+ num_layers=num_layers,
+ batch_first=True,
+ )
+ self.proj = nn.Linear(hidden_dim, hidden_dim)
+ self.act = nn.Sigmoid()
+ self.apply(xavier_gru_init)
- def forward(self, x: torch.Tensor) -> torch.Tensor:
- # x: [B, T, x_dim]
- h_seq = self.rnn(x)
- h = self.proj(h_seq)
- return h # [B, T, h_dim]
+ def forward(self, x: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor:
+ h, _ = self.rnn(x)
+ h = self.proj(h)
+ return self.act(h) if apply_sigmoid else h
class Recovery(nn.Module):
- """H -> X_hat (reconstruct data space)"""
- def __init__(
- self,
- h_dim: int,
- x_dim: int,
- num_layers: int = 2,
- rnn_type: str = "gru",
- dropout: float = 0.1,
- bidirectional: bool = False,
- ):
- super().__init__()
- self.rnn = RNNSeq(h_dim, h_dim, num_layers, rnn_type, dropout, bidirectional)
- self.proj = _linear_head(self.rnn.out_dim, x_dim)
- self.apply(init_weights)
-
- def forward(self, h: torch.Tensor) -> torch.Tensor:
- z = self.rnn(h)
- x_hat = self.proj(z)
- return x_hat # [B, T, x_dim]
-
-
-class Generator(nn.Module):
- """Z -> E_tilde (latent space fake)"""
- def __init__(
- self,
- z_dim: int,
- h_dim: int,
- num_layers: int = 2,
- rnn_type: str = "gru",
- dropout: float = 0.1,
- bidirectional: bool = False,
- ):
- super().__init__()
- self.rnn = RNNSeq(z_dim, h_dim, num_layers, rnn_type, dropout, bidirectional)
- self.proj = _linear_head(self.rnn.out_dim, h_dim)
- self.apply(init_weights)
-
- def forward(self, z: torch.Tensor) -> torch.Tensor:
- g = self.rnn(z)
- e_tilde = self.proj(g)
- return e_tilde # [B, T, h_dim]
-
-
-class Supervisor(nn.Module):
- """H -> H_hat (one-step ahead in latent)"""
- def __init__(
- self,
- h_dim: int,
- num_layers: int = 1,
- rnn_type: str = "gru",
- dropout: float = 0.0,
- bidirectional: bool = False,
- ):
- super().__init__()
- self.rnn = RNNSeq(h_dim, h_dim, num_layers, rnn_type, dropout, bidirectional)
- self.proj = _linear_head(self.rnn.out_dim, h_dim)
- self.apply(init_weights)
-
- def forward(self, h: torch.Tensor) -> torch.Tensor:
- s = self.rnn(h)
- h_hat = self.proj(s)
- return h_hat # [B, T, h_dim], meant to approximate next-step H
-
-
-class Discriminator(nn.Module):
"""
- Sequence-level discriminator: encodes sequence and outputs a single real/fake logit per sequence.
+ Recovery network: latent space → original space.
"""
- def __init__(
- self,
- h_dim: int,
- hidden_dim: int = 128,
- num_layers: int = 1,
- rnn_type: str = "gru",
- dropout: float = 0.1,
- bidirectional: bool = False,
- ):
+
+ def __init__(self, hidden_dim: int, output_dim: int, num_layers: int) -> None:
super().__init__()
- self.rnn = RNNSeq(h_dim, hidden_dim, num_layers, rnn_type, dropout, bidirectional)
- rnn_out = self.rnn.out_dim
- self.head = nn.Sequential(
- nn.Linear(rnn_out, rnn_out),
- nn.ReLU(inplace=True),
- nn.Linear(rnn_out, 1),
+ self.rnn = nn.GRU(
+ input_size=hidden_dim,
+ hidden_size=output_dim,
+ num_layers=num_layers,
+ batch_first=True,
)
- self.apply(init_weights)
+ self.proj = nn.Linear(output_dim, output_dim)
+ self.act = nn.Sigmoid()
+ self.apply(xavier_gru_init)
- def forward(self, h_like: torch.Tensor) -> torch.Tensor:
- # h_like: [B, T, h_dim] (real H or fake H_tilde)
- z = self.rnn(h_like) # [B, T, H]
- pooled = z.mean(dim=1) # [B, H] simple temporal pooling
- logit = self.head(pooled) # [B, 1]
- return logit
+ def forward(self, h: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor:
+ x_tilde = self.rnn(h)
+ x_tilde = self.proj(x_tilde)
+ return self.act(x_tilde) if apply_sigmoid else x_tilde
-# -------------------------
-# TimeGAN wrapper
-# -------------------------
-
-@dataclass
-class TimeGANOutputs:
- H: torch.Tensor # real latent from embedder
- X_tilde: torch.Tensor # recovered from H_tilde (generator path)
- X_hat: torch.Tensor # reconstruction of X (autoencoder path)
- H_hat_supervise: torch.Tensor # supervisor(H)
- H_tilde: torch.Tensor # supervisor(generator(Z))
- D_real: torch.Tensor # discriminator(H)
- D_fake: torch.Tensor # discriminator(H_tilde)
-
-
-class TimeGAN(nn.Module):
+class Generator(nn.Module):
"""
- Convenience wrapper that holds all components and exposes common forward passes.
+ Generator: random noise Z → latent sequence E.
"""
- def __init__(
- self,
- x_dim: int,
- z_dim: int,
- h_dim: int,
- rnn_type: str = "gru",
- enc_layers: int = 2,
- dec_layers: int = 2,
- gen_layers: int = 2,
- sup_layers: int = 1,
- dis_layers: int = 1,
- dropout: float = 0.1,
- ):
+ def __init__(self, z_dim: int, hidden_dim: int, num_layers: int) -> None:
super().__init__()
- self.embedder = Embedder(x_dim, h_dim, enc_layers, rnn_type, dropout)
- self.recovery = Recovery(h_dim, x_dim, dec_layers, rnn_type, dropout)
- self.generator = Generator(z_dim, h_dim, gen_layers, rnn_type, dropout)
- self.supervisor = Supervisor(h_dim, sup_layers, rnn_type, dropout)
- self.discriminator = Discriminator(h_dim, hidden_dim=max(64, h_dim), num_layers=dis_layers, rnn_type=rnn_type, dropout=dropout)
-
- @torch.no_grad()
- def embed(self, x: torch.Tensor) -> torch.Tensor:
- return self.embedder(x)
-
- @torch.no_grad()
- def recover(self, h: torch.Tensor) -> torch.Tensor:
- return self.recovery(h)
-
- def forward_all(self, x: torch.Tensor, z: torch.Tensor) -> TimeGANOutputs:
- """
- Full graph for joint training steps.
- """
- H = self.embedder(x) # real latent
- X_hat = self.recovery(H) # reconstruction
-
- E_tilde = self.generator(z) # generator latent
- H_hat_supervise = self.supervisor(H) # supervisor on real latent
- H_tilde = self.supervisor(E_tilde) # supervised generator path
-
- X_tilde = self.recovery(H_tilde) # map fake latent back to data space
-
- D_real = self.discriminator(H.detach()) # detach to avoid leaking gradients to embedder in D update
- D_fake = self.discriminator(H_tilde.detach())
-
- return TimeGANOutputs(
- H=H, X_hat=X_hat, X_tilde=X_tilde,
- H_hat_supervise=H_hat_supervise,
- H_tilde=H_tilde,
- D_real=D_real, D_fake=D_fake
+ self.rnn = nn.GRU(
+ input_size=z_dim,
+ hidden_size=hidden_dim,
+ num_layers=num_layers,
+ batch_first=True,
)
+ self.proj = nn.Linear(hidden_dim, hidden_dim)
+ self.act = nn.Sigmoid()
+ self.apply(xavier_gru_init)
- # convenience for generator forward (no detach on fake for Gen loss)
- def forward_gen_paths(self, x: torch.Tensor, z: torch.Tensor) -> Dict[str, torch.Tensor]:
- H = self.embedder(x)
- H_hat_supervise = self.supervisor(H)
- E_tilde = self.generator(z)
- H_tilde = self.supervisor(E_tilde)
- X_tilde = self.recovery(H_tilde)
- D_fake_for_gen = self.discriminator(H_tilde) # no detach: grad goes to G/S
- return dict(H=H, H_hat_supervise=H_hat_supervise, H_tilde=H_tilde, X_tilde=X_tilde, D_fake=D_fake_for_gen)
-
- # convenience for autoencoder pretrain
- def forward_autoencoder(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
- H = self.embedder(x)
- X_hat = self.recovery(H)
- return H, X_hat
-
-
-# -------------------------
-# Losses (canonical TimeGAN style)
-# -------------------------
-
-def reconstruction_loss(x: torch.Tensor, x_hat: torch.Tensor) -> torch.Tensor:
- # MSE across batch, time, features
- return F.mse_loss(x_hat, x)
-
-def supervised_loss(h: torch.Tensor, h_hat: torch.Tensor) -> torch.Tensor:
- """
- One-step ahead prediction in latent space:
- compare h[:, 1:, :] with h_hat[:, :-1, :].
- """
- return F.mse_loss(h_hat[:, :-1, :], h[:, 1:, :])
-
-def discriminator_loss(d_real: torch.Tensor, d_fake: torch.Tensor, label_smooth: float = 0.1) -> torch.Tensor:
- """
- Standard non-saturating GAN BCE loss for discriminator.
- """
- # real labels in [1 - label_smooth, 1]
- real_tgt = torch.ones_like(d_real) * (1.0 - label_smooth)
- fake_tgt = torch.zeros_like(d_fake)
- loss_real = F.binary_cross_entropy_with_logits(d_real, real_tgt)
- loss_fake = F.binary_cross_entropy_with_logits(d_fake, fake_tgt)
- return loss_real + loss_fake
-
-def generator_adv_loss(d_fake: torch.Tensor) -> torch.Tensor:
- """
- Non-saturating generator loss (wants discriminator to output 1 for fake).
- """
- tgt = torch.ones_like(d_fake)
- return F.binary_cross_entropy_with_logits(d_fake, tgt)
+ def forward(self, z: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor:
+ g, _ = self.rnn(z)
+ g = self.proj(g)
+ return self.act(g) if apply_sigmoid else g
-def moment_loss(x: torch.Tensor, x_tilde: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
- """
- Feature-wise mean/variance matching across time+batch dims.
- """
- # collapse batch/time for per-feature moments
- dim = (0, 1)
- mu_real = x.mean(dim=dim)
- mu_fake = x_tilde.mean(dim=dim)
- var_real = x.var(dim=dim, unbiased=False) + eps
- var_fake = x_tilde.var(dim=dim, unbiased=False) + eps
- return F.l1_loss(mu_fake, mu_real) + F.l1_loss(torch.sqrt(var_fake), torch.sqrt(var_real))
-
-def generator_feature_matching_loss(h: torch.Tensor, h_tilde: torch.Tensor) -> torch.Tensor:
+class Supervisor(nn.Module):
"""
- Optional latent-level matching (helps stability).
+ Supervisor: next-step latent supervision H_t → H_{t+1}.
"""
- return F.mse_loss(h_tilde.mean(dim=(0, 1)), h.mean(dim=(0, 1)))
-
+ def __init__(self, hidden_dim: int, num_layers: int) -> None:
+ super().__init__()
+ self.rnn = nn.GRU(
+ input_size=hidden_dim,
+ hidden_size=hidden_dim,
+ num_layers=num_layers,
+ batch_first=True,
+ )
+ self.proj = nn.Linear(hidden_dim, hidden_dim)
+ self.act = nn.Sigmoid()
+ self.apply(xavier_gru_init)
-# -------------------------
-# Noise utility
-# -------------------------
+ def forward(self, h: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor:
+ s, _ = self.rnn(h)
+ s = self.proj(s)
+ return self.act(s) if apply_sigmoid else s
-def sample_noise(batch_size: int, seq_len: int, z_dim: int, device: Optional[torch.device] = None) -> torch.Tensor:
- """
- Standard normal noise sequence for the generator.
- """
- z = torch.randn(batch_size, seq_len, z_dim)
- return z.to(device) if device is not None else z
+class Discriminator(nn.Module):
+ """Discriminator: classify latent sequences (real vs synthetic)."""
+ def __init__(self, hidden_dim: int, num_layers: int) -> None:
+ super().__init__()
+ self.rnn = nn.GRU(
+ input_size=hidden_dim,
+ hidden_size=hidden_dim,
+ num_layers=num_layers,
+ batch_first=True,
+ )
+ # note: No sigmoid here; BCEWithLogitsLoss expects raw logits
+ self.proj = nn.Linear(hidden_dim, 1)
+ self.apply(xavier_gru_init)
-# -------------------------
-# Minimal training scaffolds (optional)
-# -------------------------
+ def forward(self, h: torch.Tensor) -> torch.Tensor:
+ d, _ = self.rnn(h)
+ # produce a logit per timestep
+ return self.proj(d)
@dataclass
-class LossWeights:
- lambda_embed: float = 10.0 # autoencoder recon weight during embedder pretrain
- lambda_sup: float = 1.0 # supervisor loss weight
- lambda_gen: float = 1.0 # adversarial generator weight
- lambda_moment: float = 10.0 # moment matching weight
- lambda_fm: float = 1.0 # feature/latent matching weight
-
-
-def timegan_autoencoder_step(
- model: TimeGAN,
- x: torch.Tensor,
- opt: torch.optim.Optimizer,
-) -> Dict[str, float]:
- """
- Pretrain the embedder+recovery (autoencoder) with reconstruction loss.
- """
- model.train()
- opt.zero_grad(set_to_none=True)
- _, x_hat = model.forward_autoencoder(x)
- loss_recon = reconstruction_loss(x, x_hat)
- loss_recon.backward()
- opt.step()
- return {"recon": float(loss_recon.detach().cpu())}
-
-
-def timegan_supervisor_step(
- model: TimeGAN,
- x: torch.Tensor,
- opt: torch.optim.Optimizer,
-) -> Dict[str, float]:
- """
- Pretrain the supervisor to predict next-step in latent space.
- """
- model.train()
- opt.zero_grad(set_to_none=True)
- h, _ = model.forward_autoencoder(x)
- h_hat = model.supervisor(h)
- loss_sup = supervised_loss(h, h_hat)
- loss_sup.backward()
- opt.step()
- return {"sup": float(loss_sup.detach().cpu())}
-
-
-def timegan_joint_step(
- model: TimeGAN,
- x: torch.Tensor,
- z: torch.Tensor,
- opt_gs: torch.optim.Optimizer,
- opt_d: torch.optim.Optimizer,
- weights: LossWeights = LossWeights(),
-) -> Dict[str, float]:
- """
- Joint adversarial training step:
- 1) Update Discriminator
- 2) Update Generator + Supervisor (+ Embedder via recon & consistency)
- """
- model.train()
-
- # ---- 1) Discriminator update
- with torch.no_grad():
- H_real = model.embedder(x)
- E_tilde = model.generator(z)
- H_tilde = model.supervisor(E_tilde)
- D_real = model.discriminator(H_real)
- D_fake = model.discriminator(H_tilde)
-
- loss_d = discriminator_loss(D_real, D_fake)
- opt_d.zero_grad(set_to_none=True)
- loss_d.backward()
- opt_d.step()
-
- # ---- 2) Generator/Supervisor/Embedder update
- paths = model.forward_gen_paths(x, z) # keeps gradient through G/S
- H, H_hat, H_tilde, X_tilde, D_fake_for_gen = (
- paths["H"], paths["H_hat_supervise"], paths["H_tilde"], paths["X_tilde"], paths["D_fake"]
- )
-
- # adversarial
- loss_g_adv = generator_adv_loss(D_fake_for_gen)
- # supervised (latent next-step)
- loss_g_sup = supervised_loss(H, H_hat)
- # moment matching in data space
- # Optionally generate X via recovery of H_tilde (already X_tilde)
- loss_g_mom = moment_loss(x, X_tilde)
- # latent feature matching
- loss_g_fm = generator_feature_matching_loss(H, H_tilde)
-
- # total generator loss
- loss_g_total = (
- weights.lambda_gen * loss_g_adv
- + weights.lambda_sup * loss_g_sup
- + weights.lambda_moment * loss_g_mom
- + weights.lambda_fm * loss_g_fm
- )
-
- # optional small reconstruction on embedder to preserve representation
- H_e, X_hat = model.forward_autoencoder(x) # reuse embedder/recovery path
- loss_recon = reconstruction_loss(x, X_hat)
- # encourage E_tilde to be close to H via supervisor (consistency)
- loss_consistency = F.mse_loss(H_tilde, H_e).mul(0.1) # small weight
-
- total = loss_g_total + loss_recon + loss_consistency
-
- opt_gs.zero_grad(set_to_none=True)
- total.backward()
- opt_gs.step()
-
- return {
- "d": float(loss_d.detach().cpu()),
- "g_adv": float(loss_g_adv.detach().cpu()),
- "g_sup": float(loss_g_sup.detach().cpu()),
- "g_mom": float(loss_g_mom.detach().cpu()),
- "g_fm": float(loss_g_fm.detach().cpu()),
- "recon": float(loss_recon.detach().cpu()),
- "cons": float(loss_consistency.detach().cpu()),
- "g_total": float(loss_g_total.detach().cpu()),
- }
-
-
-# -------------------------
-# Example (for reference)
-# -------------------------
-# if __name__ == "__main__":
-# B, T, x_dim, z_dim, h_dim = 16, 24, 8, 16, 24
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# model = TimeGAN(x_dim, z_dim, h_dim).to(device)
-# opt_gs = make_optim(list(model.embedder.parameters()) +
-# list(model.recovery.parameters()) +
-# list(model.generator.parameters()) +
-# list(model.supervisor.parameters()), lr=1e-3)
-# opt_d = make_optim(model.discriminator.parameters(), lr=1e-3)
-# x = torch.randn(B, T, x_dim, device=device)
-# z = sample_noise(B, T, z_dim, device=device)
-# # Pretrain autoencoder
-# print(timegan_autoencoder_step(model, x, opt_gs))
-# # Pretrain supervisor
-# print(timegan_supervisor_step(model, x, opt_gs))
-# # Joint step
-# print(timegan_joint_step(model, x, z, opt_gs, opt_d))
+class TimeGANHandles:
+ encoder: Encoder
+ recovery: Recovery
+ generator: Generator
+ supervisor: Supervisor
+ discriminator: Discriminator
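
A minimal shape-check sketch for the GRU components above (dims are arbitrary and purely illustrative): the Encoder maps [B, T, F] to a sigmoid-gated latent [B, T, H], the Generator/Supervisor stay in latent space, and the Discriminator returns one raw logit per timestep for BCEWithLogitsLoss.

    import torch
    from src.modules import Encoder, Generator, Supervisor, Discriminator

    B, T, F, H, Z, L = 4, 16, 40, 64, 40, 3
    x = torch.rand(B, T, F)
    z = torch.rand(B, T, Z)

    enc = Encoder(F, H, L)
    gen, sup, dis = Generator(Z, H, L), Supervisor(H, L), Discriminator(H, L)

    h = enc(x)            # [B, T, H], sigmoid-gated by default
    h_hat = sup(gen(z))   # [B, T, H]
    logits = dis(h_hat)   # [B, T, 1] raw logits
    print(h.shape, h_hat.shape, logits.shape)
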
From aeb67f46b9e544f25fb55a1e88a071a18cea5445 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 7 Oct 2025 14:37:58 +1000
Subject: [PATCH 26/74] feat(model): extend TimeGAN with training loop, ckpt
I/O, KL check, and generation API
Adds full wrapper (optimizers, ER pretrain, supervised, joint phases), checkpoint save/load, quick KL(spread) validation, and deterministic helpers. Integrates dataset batcher and utils (minmax, noise). Exposes encoder/recovery/generator/supervisor/discriminator and device/seed utilities.
---
.../src/helpers/args.py | 66 ++++
.../src/helpers/constants.py | 10 +-
.../TimeLOB_TimeGAN_49088276/src/modules.py | 282 +++++++++++++++++-
3 files changed, 348 insertions(+), 10 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
index 96a0044a8..92a750996 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
@@ -68,6 +68,64 @@ def parse(self, argv: Optional[list | str]) -> Namespace:
return ns
+class ModulesOptions:
+ """
+ Hyperparameters for modules & training. Designed to feel like an `opt` object.
+
+ Usage:
+ mods = ModulesOptions().parse(argv_after_flag)
+ # Access:
+ mods.batch_size, mods.seq_len, mods.z_dim, mods.hidden_dim, mods.num_layer,
+ mods.lr, mods.beta1, mods.w_gamma, mods.w_g
+ """
+ def __init__(self) -> None:
+ parser = ArgumentParser(
+ prog="timeganlob_modules",
+ description="Module/model hyperparameters and training weights.",
+ )
+ # Core shapes
+ parser.add_argument("--batch-size", type=int, default=128)
+ parser.add_argument("--seq-len", type=int, default=128,
+ help="Sequence length (kept here for convenience to sync with data).")
+ parser.add_argument("--z-dim", type=int, default=40,
+ help="Latent/input feature dim (e.g., LOB feature count).")
+ parser.add_argument("--hidden-dim", type=int, default=64,
+ help="Module hidden size.")
+ parser.add_argument("--num-layer", type=int, default=3,
+ help="Number of stacked layers per RNN/TCN block.")
+
+ # Optimizer
+ parser.add_argument("--lr", type=float, default=1e-4,
+ help="Learning rate (generator/supervisor/discriminator if shared).")
+ parser.add_argument("--beta1", type=float, default=0.5,
+ help="Adam beta1.")
+
+ # Loss weights
+ parser.add_argument("--w-gamma", type=float, default=1.0,
+ help="Supervisor loss weight (γ).")
+ parser.add_argument("--w-g", type=float, default=1.0,
+ help="Generator adversarial loss weight (g).")
+
+ self._parser = parser
+
+ def parse(self, argv: Optional[list | str]) -> Namespace:
+ m = self._parser.parse_args(argv)
+
+ # Provide both snake_case and "opt-like" names already as attributes
+ # (so downstream code can do opt.lr, opt.beta1, opt.w_gamma, opt.w_g).
+ ns = Namespace(
+ batch_size=m.batch_size,
+ seq_len=m.seq_len,
+ z_dim=m.z_dim,
+ hidden_dim=m.hidden_dim,
+ num_layer=m.num_layer,
+ lr=m.lr,
+ beta1=m.beta1,
+ w_gamma=m.w_gamma,
+ w_g=m.w_g,
+ )
+ return ns
+
class Options:
"""
Top-level options that *route* anything after `--dataset` to DatasetOptions.
@@ -92,6 +150,14 @@ def __init__(self) -> None:
"Example: --dataset --seq-len 256 --no-shuffle"
),
)
+ parser.add_argument(
+ "--modules",
+ nargs=REMAINDER,
+ help=(
+ "All arguments following this flag are parsed by ModulesOptions. "
+ "Example: --modules --batch-size 256 --hidden-dim 128 --lr 3e-4"
+ ),
+ )
self._parser = parser
def parse(self, argv: Optional[list | str] = None) -> Namespace:
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
index f22346b8a..b5bb95374 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
@@ -3,7 +3,12 @@
"""
from math import isclose
from typing import Literal
+
OUTPUT_DIR = "outs"
+WEIGHTS_DIR = "weights"
+DATA_DIR = "data"
+
+ORDERBOOK_FILENAME = "AMZN_2012-06-21_34200000_57600000_orderbook_10.csv"
# Training hyperparameters for TimeGAN
NUM_TRAINING_ITERATIONS = 25_000
@@ -16,8 +21,3 @@
), (
f"TRAIN_TEST_SPLIT must sum to 1.0 (got {sum(TRAIN_TEST_SPLIT):.8f})"
)
-
-DATA_DIR = "data"
-ORDERBOOK_FILENAME = "AMZN_2012-06-21_34200000_57600000_orderbook_10.csv"
-
-DATANAME = Literal["message", "orderbook"]
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
index 1a37f654c..bee169e9f 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
@@ -28,12 +28,27 @@
-
"""
from __future__ import annotations
+from pathlib import Path
from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Tuple
+import math
import numpy as np
+from numpy.typing import NDArray
+
import torch
import torch.nn as nn
+import torch.optim as optim
+
+from src.dataset import batch_generator
+from src.helpers.args import Options
+from src.helpers.constants import (
+ WEIGHTS_DIR,
+ OUTPUT_DIR,
+ NUM_TRAINING_ITERATIONS,
+ VALIDATE_INTERVAL
+)
+from src.helpers.utils import minmax_scale, sample_noise, kl_divergence_hist, minmax_inverse
def get_device() -> torch.device:
@@ -44,7 +59,7 @@ def get_device() -> torch.device:
return torch.device('cpu')
-def get_seed(seed: Optional[int]):
+def set_seed(seed: Optional[int]):
if seed is None or seed < 0:
return
np.random.seed(seed)
@@ -66,8 +81,8 @@ def xavier_gru_init(module: nn.Module) -> None:
nn.init.zeros_(param.data)
elif isinstance(module, nn.Linear):
nn.init.xavier_uniform_(module.weight)
- if module.bias is not None:
- nn.init.zeros_(module.bias)
+ if module.bias is not None:
+ nn.init.zeros_(module.bias)
class Encoder(nn.Module):
@@ -111,7 +126,7 @@ def __init__(self, hidden_dim: int, output_dim: int, num_layers: int) -> None:
self.apply(xavier_gru_init)
def forward(self, h: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor:
- x_tilde = self.rnn(h)
+ x_tilde, _ = self.rnn(h)
x_tilde = self.proj(x_tilde)
return self.act(x_tilde) if apply_sigmoid else x_tilde
@@ -185,3 +200,260 @@ class TimeGANHandles:
generator: Generator
supervisor: Supervisor
discriminator: Discriminator
+
+class TimeGAN:
+ """
+ End-to-end TimeGAN wrapper with training & generation utilities.
+ """
+ def __init__(
+ self,
+ opt: Options | object,
+ train_data: NDArray[np.float32],
+ val_data: NDArray[np.float32],
+ test_data: NDArray[np.float32],
+ load_weights: bool = False,
+ ) -> None:
+ # set seed & device
+ set_seed(getattr(opt, "manualseed", None))
+ self.device = get_device()
+
+ # options
+ self.opt = opt
+ self.batch_size: int = opt.batch_size
+ self.seq_len: int = opt.seq_len
+ self.z_dim: int = opt.z_dim
+ self.h_dim: int = opt.hidden_dim
+ self.n_layers: int = opt.num_layer
+
+ # schedule
+ self.num_iterations = NUM_TRAINING_ITERATIONS
+ self.validate_interval = VALIDATE_INTERVAL
+
+ # scale train only; keep stats for inverse
+ self.train_norm, self.fmin, self.fmax = minmax_scale(train_data)
+ self.val = val_data
+ self.test = test_data
+
+ # build modules
+ feat_dim = int(self.train_norm.shape[-1])
+ self.netE = Encoder(feat_dim, self.h_dim, self.n_layers).to(self.device)
+ self.netR = Recovery(self.h_dim, feat_dim, self.n_layers).to(self.device)
+ self.netG = Generator(self.z_dim, self.h_dim, self.n_layers).to(self.device)
+ self.netS = Supervisor(self.h_dim, self.n_layers).to(self.device)
+ self.netD = Discriminator(self.h_dim, self.n_layers).to(self.device)
+
+ # losses
+ self.mse = nn.MSELoss()
+ self.l1 = nn.L1Loss()
+ self.bce_logits = nn.BCEWithLogitsLoss()
+
+ # optimizers
+ self.optE = optim.Adam(self.netE.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
+ self.optR = optim.Adam(self.netR.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
+ self.optG = optim.Adam(self.netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
+ self.optS = optim.Adam(self.netS.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
+ self.optD = optim.Adam(self.netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
+
+ # load
+ if load_weights:
+ self._maybe_load()
+
+ @staticmethod
+ def _ckpt_path() -> Path:
+ out = Path(OUTPUT_DIR) / WEIGHTS_DIR
+ out.mkdir(parents=True, exist_ok=True)
+ return out / "timegan_ckpt.pt"
+
+ def _maybe_load(self) -> None:
+ path = self._ckpt_path()
+ if not path.exists():
+ return
+ state = torch.load(path, map_location=self.device)
+ self.netE.load_state_dict(state["netE"])
+ self.netR.load_state_dict(state["netR"])
+ self.netG.load_state_dict(state["netG"])
+ self.netS.load_state_dict(state["netS"])
+ self.netD.load_state_dict(state["netD"])
+ self.optE.load_state_dict(state["optE"])
+ self.optR.load_state_dict(state["optR"])
+ self.optG.load_state_dict(state["optG"])
+ self.optS.load_state_dict(state["optS"])
+ self.optD.load_state_dict(state["optD"])
+
+ def _save(self) -> None:
+ torch.save(
+ {
+ "netE": self.netE.state_dict(),
+ "netR": self.netR.state_dict(),
+ "netG": self.netG.state_dict(),
+ "netS": self.netS.state_dict(),
+ "netD": self.netD.state_dict(),
+ "optE": self.optE.state_dict(),
+ "optR": self.optR.state_dict(),
+ "optG": self.optG.state_dict(),
+ "optS": self.optS.state_dict(),
+ "optD": self.optD.state_dict(),
+ },
+ self._ckpt_path(),
+ )
+
+ def _to_device(self, *t: torch.Tensor) -> Tuple[torch.Tensor, ...]:
+ return tuple(x.to(self.device, non_blocking=True) for x in t)
+
+ def _pretrain_er_step(self, x: torch.Tensor) -> float:
+ # E,R reconstruction loss
+ h = self.netE(x)
+ x_tilde = self.netR(h)
+ loss = self.mse(x_tilde, x)
+ self.optE.zero_grad()
+ self.optR.zero_grad()
+ loss.backward()
+ self.optE.step()
+ self.optR.step()
+ return float(loss.detach().cpu())
+
+ def _supervised_step(self, x: torch.Tensor) -> float:
+ # next-step supervision on latent H
+ h = self.netE(x)
+ s = self.netS(h)
+ loss = self.mse(h[:, 1:, :], s[:, :-1, :])
+ self.optS.zero_grad()
+ loss.backward()
+ self.optS.step()
+ return float(loss.detach().cpu())
+
+
+ def _generator_step(self, x: torch.Tensor, z: torch.Tensor) -> float:
+ # build graph
+ h_real = self.netE(x)
+ s_real = self.netS(h_real)
+ e_hat = self.netG(z)
+ h_hat = self.netS(e_hat)
+ x_hat = self.netR(h_hat)
+
+ # adversarial losses (on logits)
+ y_fake = self.netD(h_hat)
+ y_fake_e = self.netD(e_hat)
+ adv = self.bce_logits(y_fake, torch.ones_like(y_fake))
+ adv_e = self.bce_logits(y_fake_e, torch.ones_like(y_fake_e))
+
+ # moment losses (match mean/std on reconstructions)
+        x_var = torch.var(x, dim=(0, 1), unbiased=False)
+        xh_var = torch.var(x_hat, dim=(0, 1), unbiased=False)
+        v1 = torch.mean(torch.abs(torch.sqrt(xh_var + 1e-6) - torch.sqrt(x_var + 1e-6)))
+ v2 = torch.mean(torch.abs(torch.mean(x_hat, dim=(0, 1)) - torch.mean(x, dim=(0, 1))))
+
+ # supervised latent loss
+ sup = self.mse(s_real[:, :-1, :], h_real[:, 1:, :])
+
+ loss = adv + self.opt.w_gamma * adv_e + self.opt.w_g * (v1 + v2) + torch.sqrt(sup + 1e-12)
+ self.optG.zero_grad(); self.optS.zero_grad(); loss.backward(); self.optG.step(); self.optS.step()
+ return float(loss.detach().cpu())
+
+ def _discriminator_step(self, x: torch.Tensor, z: torch.Tensor) -> float:
+ with torch.no_grad():
+ e_hat = self.netG(z)
+ h_hat = self.netS(e_hat)
+ h_real = self.netE(x)
+ y_real = self.netD(h_real)
+ y_fake = self.netD(h_hat)
+ y_fake_e = self.netD(e_hat)
+ loss = (
+ self.bce_logits(y_real, torch.ones_like(y_real))
+ + self.bce_logits(y_fake, torch.zeros_like(y_fake))
+ + self.opt.w_gamma * self.bce_logits(y_fake_e, torch.zeros_like(y_fake_e))
+ )
+ # optional hinge to avoid overshooting
+ if loss.item() > 0.15:
+ self.optD.zero_grad()
+ loss.backward()
+ self.optD.step()
+ return float(loss.detach().cpu())
+
+ def train_model(self) -> None:
+ # phase 1: encoder-recovery pretrain
+ for it in range(self.num_iterations):
+ x, _T = batch_generator(self.train_norm, None, self.batch_size) # T unused
+ x = torch.as_tensor(x, dtype=torch.float32)
+ (x,) = self._to_device(x)
+ er = self._pretrain_er_step(x)
+ if (it + 1) % max(1, self.validate_interval // 2) == 0:
+ pass # keep output quiet by default
+
+ # phase 2: supervisor
+ for it in range(self.num_iterations):
+ x, _T = batch_generator(self.train_norm, None, self.batch_size)
+ x = torch.as_tensor(x, dtype=torch.float32)
+ (x,) = self._to_device(x)
+ s = self._supervised_step(x)
+
+ # phase 3: joint training
+ for it in range(self.num_iterations):
+ x, _T = batch_generator(self.train_norm, None, self.batch_size)
+ z = sample_noise(self.batch_size, self.z_dim, self.seq_len)
+ x = torch.as_tensor(x, dtype=torch.float32)
+ z = torch.as_tensor(z, dtype=torch.float32)
+ x, z = self._to_device(x, z)
+
+            # two G/ER updates per discriminator update, following the original TimeGAN schedule
+ for _ in range(2):
+ self._generator_step(x, z)
+ # light ER refine pass
+ self._pretrain_er_step(x)
+ self._discriminator_step(x, z)
+
+ if (it + 1) % self.validate_interval == 0:
+ # quick KL check on a small synthetic sample (optional)
+ try:
+ fake = self.generate(num_rows=min(len(self.val), 4096), mean=0.0, std=1.0)
+ # simple guards if val has enough columns
+ if self.val.shape[1] >= 3 and fake.shape[1] >= 3:
+ _ = kl_divergence_hist(self.val[: len(fake)], fake, metric="spread")
+ except Exception:
+ pass
+ self._save()
+
+ # final save
+ self._save()
+
+ @torch.no_grad()
+ def generate(
+ self,
+ num_rows: int,
+ *,
+ mean: float = 0.0,
+ std: float = 1.0,
+ ) -> NDArray[np.float32]:
+ """Generate exactly `num_rows` rows of synthetic data (2D array).
+
+        Steps: sample enough [B, T, z_dim] noise windows → pass through G→S→R →
+        flatten to [num_rows, F] → inverse-scale with the train min/max stats.
+ """
+
+ assert num_rows > 0
+ windows_needed = math.ceil(num_rows / self.seq_len)
+ z = sample_noise(windows_needed, self.z_dim, self.seq_len)
+ z = torch.as_tensor(z, dtype=torch.float32, device=self.device)
+ e_hat = self.netG(z)
+ h_hat = self.netS(e_hat)
+ x_hat = self.netR(h_hat)
+ x_hat_np = x_hat.detach().cpu().numpy() # [B, T, F]
+ x_hat_np = x_hat_np.reshape(-1, x_hat_np.shape[-1]) # [B*T, F]
+ x_hat_np = x_hat_np[:num_rows]
+ # inverse scale to original feature space
+ x_hat_np = minmax_inverse(x_hat_np, self.fmin, self.fmax)
+ return x_hat_np.astype(np.float32, copy=False)
+
+ def print_parameter_count(self) -> None:
+ sub = {
+ "Encoder": self.netE,
+ "Recovery": self.netR,
+ "Generator": self.netG,
+ "Supervisor": self.netS,
+ "Discriminator": self.netD,
+ }
+
+ for name, m in sub.items():
+ total = sum(p.numel() for p in m.parameters())
+ train = sum(p.numel() for p in m.parameters() if p.requires_grad)
+ print(f"Parameters for {name}: total={total:,} trainable={train:,}")
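
An illustrative construction/generation sketch for the wrapper above. The opt fields mirror the ModulesOptions defaults (batch_size, seq_len, z_dim, hidden_dim, num_layer, lr, beta1, w_gamma, w_g); the random arrays stand in for real LOB windows, so this only demonstrates shapes and the inverse-scaled output of untrained weights, not trained behaviour.

    import numpy as np
    from argparse import Namespace
    from src.modules import TimeGAN

    # Hypothetical opt object mirroring ModulesOptions defaults.
    opt = Namespace(batch_size=32, seq_len=64, z_dim=40, hidden_dim=64,
                    num_layer=3, lr=1e-4, beta1=0.5, w_gamma=1.0, w_g=1.0)

    rng = np.random.default_rng(0)
    train = rng.random((128, 64, 40), dtype=np.float32)   # [N, T, F] training windows
    val = rng.random((4096, 40), dtype=np.float32)        # 2D [T, F] for the quick KL check
    test = rng.random((4096, 40), dtype=np.float32)

    model = TimeGAN(opt, train, val, test, load_weights=False)
    model.print_parameter_count()
    fake = model.generate(num_rows=1000)   # untrained weights; shape/scale check only
    print(fake.shape)                      # (1000, 40)
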
From b4fbbc140fc85b0eef3cfc9f8d49f77d46bb69b1 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 7 Oct 2025 18:23:59 +1000
Subject: [PATCH 27/74] feat(model): wire full TimeGAN training/generation,
checkpoints, and quick KL validation
Adds ER pretrain, supervised, and joint loops; Adam optimizers; save/load helpers; device/seed utils; and a generation API that inverse-scales to original feature space. Includes GRU-based Encoder/Recovery/Generator/Supervisor/Discriminator with Xavier/orthogonal init and BCEWithLogits-ready Discriminator.
---
.../TimeLOB_TimeGAN_49088276/src/modules.py | 70 ++++++++++++-------
1 file changed, 43 insertions(+), 27 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
index bee169e9f..46eefa35d 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
@@ -28,20 +28,20 @@
-
"""
from __future__ import annotations
-from pathlib import Path
+
+import math
from dataclasses import dataclass
+from pathlib import Path
from typing import Optional, Tuple
-import math
import numpy as np
-from numpy.typing import NDArray
-
import torch
import torch.nn as nn
import torch.optim as optim
+from numpy.typing import NDArray
from src.dataset import batch_generator
-from src.helpers.args import Options
+from src.helpers.args import ModulesOptions as Options
from src.helpers.constants import (
WEIGHTS_DIR,
OUTPUT_DIR,
@@ -135,6 +135,7 @@ class Generator(nn.Module):
"""
Generator: random noise Z → latent sequence E.
"""
+
def __init__(self, z_dim: int, hidden_dim: int, num_layers: int) -> None:
super().__init__()
self.rnn = nn.GRU(
@@ -152,17 +153,19 @@ def forward(self, z: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor:
g = self.proj(g)
return self.act(g) if apply_sigmoid else g
+
class Supervisor(nn.Module):
"""
Supervisor: next-step latent supervision H_t → H_{t+1}.
"""
+
def __init__(self, hidden_dim: int, num_layers: int) -> None:
super().__init__()
self.rnn = nn.GRU(
- input_size=hidden_dim,
- hidden_size=hidden_dim,
- num_layers=num_layers,
- batch_first=True,
+ input_size=hidden_dim,
+ hidden_size=hidden_dim,
+ num_layers=num_layers,
+ batch_first=True,
)
self.proj = nn.Linear(hidden_dim, hidden_dim)
self.act = nn.Sigmoid()
@@ -176,13 +179,14 @@ def forward(self, h: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor:
class Discriminator(nn.Module):
"""Discriminator: classify latent sequences (real vs synthetic)."""
+
def __init__(self, hidden_dim: int, num_layers: int) -> None:
super().__init__()
self.rnn = nn.GRU(
- input_size=hidden_dim,
- hidden_size=hidden_dim,
- num_layers=num_layers,
- batch_first=True,
+ input_size=hidden_dim,
+ hidden_size=hidden_dim,
+ num_layers=num_layers,
+ batch_first=True,
)
# note: No sigmoid here; BCEWithLogitsLoss expects raw logits
self.proj = nn.Linear(hidden_dim, 1)
@@ -193,6 +197,7 @@ def forward(self, h: torch.Tensor) -> torch.Tensor:
# produce a logit per timestep
return self.proj(d)
+
@dataclass
class TimeGANHandles:
encoder: Encoder
@@ -201,17 +206,19 @@ class TimeGANHandles:
supervisor: Supervisor
discriminator: Discriminator
+
class TimeGAN:
"""
End-to-end TimeGAN wrapper with training & generation utilities.
"""
+
def __init__(
- self,
- opt: Options | object,
- train_data: NDArray[np.float32],
- val_data: NDArray[np.float32],
- test_data: NDArray[np.float32],
- load_weights: bool = False,
+ self,
+ opt: Options | object,
+ train_data: NDArray[np.float32],
+ val_data: NDArray[np.float32],
+ test_data: NDArray[np.float32],
+ load_weights: bool = False,
) -> None:
# set seed & device
set_seed(getattr(opt, "manualseed", None))
@@ -322,7 +329,6 @@ def _supervised_step(self, x: torch.Tensor) -> float:
self.optS.step()
return float(loss.detach().cpu())
-
def _generator_step(self, x: torch.Tensor, z: torch.Tensor) -> float:
# build graph
h_real = self.netE(x)
@@ -347,7 +353,11 @@ def _generator_step(self, x: torch.Tensor, z: torch.Tensor) -> float:
sup = self.mse(s_real[:, :-1, :], h_real[:, 1:, :])
loss = adv + self.opt.w_gamma * adv_e + self.opt.w_g * (v1 + v2) + torch.sqrt(sup + 1e-12)
- self.optG.zero_grad(); self.optS.zero_grad(); loss.backward(); self.optG.step(); self.optS.step()
+ self.optG.zero_grad()
+ self.optS.zero_grad()
+ loss.backward()
+ self.optG.step()
+ self.optS.step()
return float(loss.detach().cpu())
def _discriminator_step(self, x: torch.Tensor, z: torch.Tensor) -> float:
@@ -359,9 +369,9 @@ def _discriminator_step(self, x: torch.Tensor, z: torch.Tensor) -> float:
y_fake = self.netD(h_hat)
y_fake_e = self.netD(e_hat)
loss = (
- self.bce_logits(y_real, torch.ones_like(y_real))
- + self.bce_logits(y_fake, torch.zeros_like(y_fake))
- + self.opt.w_gamma * self.bce_logits(y_fake_e, torch.zeros_like(y_fake_e))
+ self.bce_logits(y_real, torch.ones_like(y_real))
+ + self.bce_logits(y_fake, torch.zeros_like(y_fake))
+ + self.opt.w_gamma * self.bce_logits(y_fake_e, torch.zeros_like(y_fake_e))
)
# optional hinge to avoid overshooting
if loss.item() > 0.15:
@@ -373,12 +383,12 @@ def _discriminator_step(self, x: torch.Tensor, z: torch.Tensor) -> float:
def train_model(self) -> None:
# phase 1: encoder-recovery pretrain
for it in range(self.num_iterations):
- x, _T = batch_generator(self.train_norm, None, self.batch_size) # T unused
+ x, _T = batch_generator(self.train_norm, None, self.batch_size) # T unused
x = torch.as_tensor(x, dtype=torch.float32)
(x,) = self._to_device(x)
er = self._pretrain_er_step(x)
if (it + 1) % max(1, self.validate_interval // 2) == 0:
- pass # keep output quiet by default
+ pass # keep output quiet by default
# phase 2: supervisor
for it in range(self.num_iterations):
@@ -432,7 +442,13 @@ def generate(
assert num_rows > 0
windows_needed = math.ceil(num_rows / self.seq_len)
- z = sample_noise(windows_needed, self.z_dim, self.seq_len)
+ z = sample_noise(
+ windows_needed,
+ self.z_dim,
+ self.seq_len,
+ mean=mean,
+ std=std,
+ )
z = torch.as_tensor(z, dtype=torch.float32, device=self.device)
e_hat = self.netG(z)
h_hat = self.netS(e_hat)
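
For reference, the mean/std pass-through wired in above maps onto sample_noise's uniform parameterisation: a uniform on [lo, hi] has standard deviation (hi - lo)/sqrt(12), so mean=0, std=1 draws from roughly [-sqrt(3), sqrt(3)] ≈ [-1.732, 1.732]. A tiny illustrative check:

    import numpy as np
    from src.helpers.utils import sample_noise

    z = sample_noise(256, 8, 64, mean=0.0, std=1.0, rng=np.random.default_rng(0))
    print(z.shape)                           # (256, 64, 8)
    print(z.min(), z.max())                  # near -1.732 and +1.732
    print(float(z.mean()), float(z.std()))   # near 0.0 and 1.0
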
From 129186869869122dbba29783b8eaff53a789a91c Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 7 Oct 2025 19:53:47 +1000
Subject: [PATCH 28/74] feat(train): add CLI entrypoint to run TimeGAN
end-to-end
Parses Options, loads datasets via load_data, constructs TimeGAN, and executes the full three-phase schedule with checkpoints. Keeps modules/dataset imports minimal to match current package layout.
---
.../TimeLOB_TimeGAN_49088276/src/train.py | 293 +-----------------
1 file changed, 17 insertions(+), 276 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/train.py b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
index aa34e99c1..b6b8649fd 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/train.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
@@ -7,292 +7,33 @@
and saves model checkpoints and plots. The model is imported from ``modules.py``
and data loaders from ``dataset.py``.
-Typical Usage:
- python3 -m predict --ckpt checkpoints/best.pt --n 8 --seq_len 120 --out outputs/predictions
-
Created By: Radhesh Goel (Keys-I)
ID: s49088276
References:
-
"""
-from __future__ import annotations
-import os, json, math, time, argparse, random
-from dataclasses import asdict
-from typing import Tuple, Optional
-
-import numpy as np
-import torch
-from torch.utils.data import TensorDataset, DataLoader
-
-# local imports
-from dataset import LOBSTERData
-from modules import (
- TimeGAN, sample_noise, make_optim,
- timegan_autoencoder_step, timegan_supervisor_step, timegan_joint_step,
- LossWeights
-)
-
-# -------------------------
-# utils
-# -------------------------
-def set_seed(seed: int = 1337):
- random.seed(seed); np.random.seed(seed)
- torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
-
-def shape_from_npz(npz_path: str) -> Tuple[int,int,int]:
- d = np.load(npz_path)
- w = d["train"]
- return tuple(w.shape) # num_seq, seq_len, x_dim
-
-def build_loaders_from_npz(npz_path: str, batch_size: int) -> Tuple[DataLoader, DataLoader, DataLoader, int, int]:
- d = np.load(npz_path)
- W_train = torch.from_numpy(d["train"]).float()
- W_val = torch.from_numpy(d["val"]).float()
- W_test = torch.from_numpy(d["test"]).float()
- T = W_train.size(1); D = W_train.size(2)
- train_dl = DataLoader(TensorDataset(W_train), batch_size=batch_size, shuffle=True, drop_last=True)
- val_dl = DataLoader(TensorDataset(W_val), batch_size=batch_size, shuffle=False)
- test_dl = DataLoader(TensorDataset(W_test), batch_size=batch_size, shuffle=False)
- return train_dl, val_dl, test_dl, T, D
+from dataset import load_data
+from modules import TimeGAN
+from src.helpers.args import Options
-def build_loaders_from_csv(args, batch_size: int) -> Tuple[DataLoader, DataLoader, DataLoader, int, int]:
- ds = LOBSTERData(
- data_dir=args.data_dir,
- message_file=args.message,
- orderbook_file=args.orderbook,
- feature_set=args.feature_set,
- seq_len=args.seq_len,
- stride=args.stride,
- splits=tuple(args.splits),
- scaler=args.scaler,
- headerless_message=args.headerless_message,
- headerless_orderbook=args.headerless_orderbook,
- # optional whitening & aug flags if you want them in training too:
- whiten=args.whiten, pca_var=args.pca_var,
- aug_prob=args.aug_prob, aug_jitter_std=args.aug_jitter_std,
- aug_scaling_std=args.aug_scaling_std, aug_timewarp_max=args.aug_timewarp_max,
- save_dir=args.save_dir,
- )
- W_train, W_val, W_test = ds.load_arrays()
- T = W_train.shape[1]; D = W_train.shape[2]
- train_dl = DataLoader(TensorDataset(torch.from_numpy(W_train).float()), batch_size=batch_size, shuffle=True, drop_last=True)
- val_dl = DataLoader(TensorDataset(torch.from_numpy(W_val).float()), batch_size=batch_size, shuffle=False)
- test_dl = DataLoader(TensorDataset(torch.from_numpy(W_test).float()), batch_size=batch_size, shuffle=False)
- # Persist meta if saving:
- if args.save_dir:
- meta = ds.get_meta()
- with open(os.path.join(args.save_dir, "meta.train.json"), "w") as f:
- json.dump(meta, f, indent=2)
- return train_dl, val_dl, test_dl, T, D
-def save_ckpt(path: str, model: TimeGAN, opt_gs, opt_d, step: int, args, extra=None):
- os.makedirs(os.path.dirname(path), exist_ok=True)
- payload = {
- "step": step,
- "args": vars(args),
- "embedder": model.embedder.state_dict(),
- "recovery": model.recovery.state_dict(),
- "generator": model.generator.state_dict(),
- "supervisor": model.supervisor.state_dict(),
- "discriminator": model.discriminator.state_dict(),
- "opt_gs": opt_gs.state_dict(),
- "opt_d": opt_d.state_dict(),
- "extra": extra or {},
- }
- torch.save(payload, path)
+def train() -> None:
+    # parse top-level CLI options (Options routes nested flag groups such as --dataset)
+ opt = Options().parse()
-# -------------------------
-# train loops
-# -------------------------
-def run_autoencoder_phase(model, train_dl, device, opt_gs, epochs: int, amp: bool, clip: Optional[float]):
- scaler = torch.amp.GradScaler('cuda', enabled=amp)
- for ep in range(1, epochs+1):
- t0 = time.time()
- logs = []
- for (xb,) in train_dl:
- xb = xb.to(device, non_blocking=True)
- opt_gs.zero_grad(set_to_none=True)
- if amp:
- with torch.amp.autocast('cuda'):
- out = timegan_autoencoder_step(model, xb, opt_gs)
- else:
- out = timegan_autoencoder_step(model, xb, opt_gs)
- # timegan_autoencoder_step already steps opt; clip if needed
- if clip is not None:
- torch.nn.utils.clip_grad_norm_(model.embedder.parameters(), clip)
- torch.nn.utils.clip_grad_norm_(model.recovery.parameters(), clip)
- logs.append(out["recon"])
- dt = time.time()-t0
- print(f"[AE] epoch {ep}/{epochs} recon={np.mean(logs):.6f} ({dt:.1f}s)")
+ # train_data: [N, T, F]; val/test should be 2D [T, F] for quick metrics
+ train_data, val_data, test_data = load_data(opt)
+ # if val/test come windowed [N, T, F], flatten to [T', F]
+ if getattr(val_data, "ndim", None) == 3:
+ val_data = val_data.reshape(-1, val_data.shape[-1])
+ if getattr(test_data, "ndim", None) == 3:
+ test_data = test_data.reshape(-1, test_data.shape[-1])
-def run_supervisor_phase(model, train_dl, device, opt_gs, epochs: int, amp: bool, clip: Optional[float]):
- for ep in range(1, epochs+1):
- t0 = time.time()
- logs = []
- for (xb,) in train_dl:
- xb = xb.to(device, non_blocking=True)
- out = timegan_supervisor_step(model, xb, opt_gs)
- if clip is not None:
- torch.nn.utils.clip_grad_norm_(model.supervisor.parameters(), clip)
- logs.append(out["sup"])
- dt = time.time()-t0
- print(f"[SUP] epoch {ep}/{epochs} sup={np.mean(logs):.6f} ({dt:.1f}s)")
+ # build and train
+ model = TimeGAN(opt, train_data, val_data, test_data, load_weights=False)
+ model.train_model()
-def evaluate_moment(model, loader, device, z_dim: int) -> float:
- # rough eval: moment loss on validation set (lower is better)
- from modules import moment_loss
- model.eval()
- vals = []
- with torch.no_grad():
- for (xb,) in loader:
- xb = xb.to(device)
- z = sample_noise(xb.size(0), xb.size(1), z_dim, device)
- # generate one batch
- paths = model.forward_gen_paths(xb, z)
- x_tilde = paths["X_tilde"]
- vals.append(float(moment_loss(xb, x_tilde).cpu()))
- return float(np.mean(vals)) if vals else math.inf
-def run_joint_phase(model, train_dl, val_dl, device, opt_gs, opt_d,
- z_dim: int, epochs: int, amp: bool, clip: Optional[float],
- loss_weights: LossWeights, ckpt_dir: Optional[str], args=None):
- best_val = math.inf
- step = 0
- for ep in range(1, epochs+1):
- t0 = time.time()
- logs = {"d": [], "g_adv": [], "g_sup": [], "g_mom": [], "g_fm": [], "recon": [], "cons": [], "g_total": []}
- for (xb,) in train_dl:
- xb = xb.to(device, non_blocking=True)
- z = sample_noise(xb.size(0), xb.size(1), z_dim, device)
- out = timegan_joint_step(model, xb, z, opt_gs, opt_d, loss_weights)
- if clip is not None:
- torch.nn.utils.clip_grad_norm_(list(model.embedder.parameters())+
- list(model.recovery.parameters())+
- list(model.generator.parameters())+
- list(model.supervisor.parameters()), clip)
- torch.nn.utils.clip_grad_norm_(model.discriminator.parameters(), clip)
- for k, v in out.items(): logs[k].append(v)
- step += 1
-
- # validation (moment)
- val_m = evaluate_moment(model, val_dl, device, z_dim)
- dt = time.time()-t0
- log_line = " ".join([f"{k}={np.mean(v):.4f}" for k,v in logs.items()])
- print(f"[JOINT] epoch {ep}/{epochs} {log_line} | val_moment={val_m:.4f} ({dt:.1f}s)")
-
- # save best
- if ckpt_dir:
- if val_m < best_val:
- best_val = val_m
- save_ckpt(os.path.join(ckpt_dir, "best.pt"), model, opt_gs, opt_d, step, args=args,
- extra={"val_moment": val_m})
- save_ckpt(os.path.join(ckpt_dir, f"step_{step}.pt"), model, opt_gs, opt_d, step, args=args,
- extra={"val_moment": val_m})
-
-# -------------------------
-# main
-# -------------------------
if __name__ == "__main__":
- p = argparse.ArgumentParser(description="Train TimeGAN on LOBSTERData.")
- # data sources
- p.add_argument("--npz", type=str, help="Path to windows.npz (train/val/test). If set, ignores --data-dir.")
- p.add_argument("--data-dir", type=str, help="Folder with message_10.csv and orderbook_10.csv")
- p.add_argument("--message", default="message_10.csv")
- p.add_argument("--orderbook", default="orderbook_10.csv")
- p.add_argument("--feature-set", choices=["core","raw10"], default="core")
- p.add_argument("--seq-len", type=int, default=128)
- p.add_argument("--stride", type=int, default=32)
- p.add_argument("--splits", type=float, nargs=3, default=(0.7,0.15,0.15))
- p.add_argument("--scaler", choices=["standard","minmax","robust","quantile","power","none"], default="robust")
- p.add_argument("--whiten", choices=["pca","zca",None], default="pca")
- p.add_argument("--pca-var", type=float, default=0.999)
- p.add_argument("--headerless-message", action="store_true")
- p.add_argument("--headerless-orderbook", action="store_true")
- p.add_argument("--save-dir", type=str, default=None, help="If set during CSV mode, saves NPZ/meta here.")
-
- # model
- p.add_argument("--x-dim", type=str, default="auto", help="'auto' infers from data; else int")
- p.add_argument("--z-dim", type=int, default=24)
- p.add_argument("--h-dim", type=int, default=64)
- p.add_argument("--rnn-type", choices=["gru","lstm"], default="gru")
- p.add_argument("--enc-layers", type=int, default=2)
- p.add_argument("--dec-layers", type=int, default=2)
- p.add_argument("--gen-layers", type=int, default=2)
- p.add_argument("--sup-layers", type=int, default=1)
- p.add_argument("--dis-layers", type=int, default=1)
- p.add_argument("--dropout", type=float, default=0.1)
-
- # training
- p.add_argument("--batch-size", type=int, default=64)
- p.add_argument("--seed", type=int, default=1337)
- p.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
- p.add_argument("--amp", action="store_true", help="Enable mixed precision.")
- p.add_argument("--clip", type=float, default=1.0, help="Grad clip norm; set <=0 to disable.")
- p.add_argument("--ae-epochs", type=int, default=10)
- p.add_argument("--sup-epochs", type=int, default=10)
- p.add_argument("--joint-epochs", type=int, default=50)
- p.add_argument("--lr", type=float, default=1e-3)
- p.add_argument("--ckpt-dir", type=str, default="./ckpts")
-
- # augmentation passthrough when using CSV mode
- p.add_argument("--aug-prob", type=float, default=0.0)
- p.add_argument("--aug-jitter-std", type=float, default=0.01)
- p.add_argument("--aug-scaling-std", type=float, default=0.05)
- p.add_argument("--aug-timewarp-max", type=float, default=0.1)
-
- args = p.parse_args()
- set_seed(args.seed)
- device = torch.device(args.device)
- os.makedirs(args.ckpt_dir, exist_ok=True)
- run_dir = os.path.join(args.ckpt_dir, f"timegan_{time.strftime('%Y%m%d-%H%M%S')}")
- os.makedirs(run_dir, exist_ok=True)
-
- # Data
- if args.npz:
- train_dl, val_dl, test_dl, T, D = build_loaders_from_npz(args.npz, args.batch_size)
- elif args.data_dir:
- train_dl, val_dl, test_dl, T, D = build_loaders_from_csv(args, args.batch_size)
- else:
- raise SystemExit("Provide either --npz or --data-dir")
-
- x_dim = D if args.x_dim == "auto" else int(args.x_dim)
-
- # Model & optims
- model = TimeGAN(
- x_dim=x_dim, z_dim=args.z_dim, h_dim=args.h_dim,
- rnn_type=args.rnn_type, enc_layers=args.enc_layers, dec_layers=args.dec_layers,
- gen_layers=args.gen_layers, sup_layers=args.sup_layers, dis_layers=args.dis_layers,
- dropout=args.dropout
- ).to(device)
-
- opt_gs = make_optim(list(model.embedder.parameters()) +
- list(model.recovery.parameters()) +
- list(model.generator.parameters()) +
- list(model.supervisor.parameters()), lr=args.lr)
- opt_d = make_optim(model.discriminator.parameters(), lr=args.lr)
-
- # Phase 1: autoencoder pretrain
- if args.ae_epochs > 0:
- run_autoencoder_phase(model, train_dl, device, opt_gs, args.ae_epochs, amp=args.amp, clip=args.clip if args.clip>0 else None)
- save_ckpt(os.path.join(run_dir, "after_autoencoder.pt"), model, opt_gs, opt_d, step=0, args=args)
-
- # Phase 2: supervisor pretrain
- if args.sup_epochs > 0:
- run_supervisor_phase(model, train_dl, device, opt_gs, args.sup_epochs, amp=args.amp, clip=args.clip if args.clip>0 else None)
- save_ckpt(os.path.join(run_dir, "after_supervisor.pt"), model, opt_gs, opt_d, step=0, args=args)
-
- # Phase 3: joint training
- if args.joint_epochs > 0:
- run_joint_phase(
- model, train_dl, val_dl, device, opt_gs, opt_d,
- z_dim=args.z_dim, epochs=args.joint_epochs, amp=args.amp,
- clip=args.clip if args.clip>0 else None,
- loss_weights=LossWeights(), ckpt_dir=run_dir, args=args
- )
-
-
- # Final test moment score
- test_m = evaluate_moment(model, test_dl, device, args.z_dim)
- print(f"[DONE] test moment loss: {test_m:.6f}")
-
+ train()
From 8cd2b763dd4e24e5807d56cee98df76fec93b522 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Thu, 9 Oct 2025 20:53:41 +1000
Subject: [PATCH 29/74] feat(viz): add sampling script to generate and save
synthetic LOB data
Parses Options, loads data, restores TimeGAN from checkpoint, generates exactly len(test) rows, and saves to OUTPUT_DIR/gen_data.npy. Keeps API aligned with current dataset/modules helpers.
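For orientation, the new plot_heatmap assumes the flattened level-10 layout used throughout this series: four columns per level, ordered ask price, ask volume, bid price, bid volume. A minimal index-mapping sketch (the column order is an assumption carried over from the LOBSTER orderbook file, not new behaviour):

    import numpy as np

    NUM_LEVELS = 10                      # mirrors helpers/constants.py
    T, F = 4, 4 * NUM_LEVELS             # toy window: 4 time steps, 40 features
    book = np.arange(T * F, dtype=np.float32).reshape(T, F)

    level = 2                            # third-best quotes
    ask_price = book[:, 4 * level + 0]   # column 8
    ask_vol   = book[:, 4 * level + 1]   # column 9
    bid_price = book[:, 4 * level + 2]   # column 10
    bid_vol   = book[:, 4 * level + 3]   # column 11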
---
.../src/helpers/constants.py | 2 +
.../src/helpers/visualise.py | 133 +++++++++
.../TimeLOB_TimeGAN_49088276/src/predict.py | 262 ++----------------
3 files changed, 163 insertions(+), 234 deletions(-)
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
index b5bb95374..fae29ac85 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
@@ -21,3 +21,5 @@
), (
f"TRAIN_TEST_SPLIT must sum to 1.0 (got {sum(TRAIN_TEST_SPLIT):.8f})"
)
+
+NUM_LEVELS = 10
\ No newline at end of file
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
new file mode 100644
index 000000000..819a5026b
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
@@ -0,0 +1,133 @@
+"""
+Generate LOB depth heatmaps and compute SSIM between real vs synthetic images.
+Refactored to be faster, cleaner, and compatible with the new modules/utils.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+from numpy.typing import NDArray
+from skimage import img_as_float
+from skimage.metrics import structural_similarity as ssim
+
+from args import Options
+from constants import NUM_LEVELS
+from src.dataset import load_data
+from src.helpers.constants import OUTPUT_DIR
+from src.modules import TimeGAN
+
+
+def get_ssim(img1_path: Path | str, img2_path: Path | str) -> float:
+ """
+ Compute SSIM between two image files.
+
+ Uses `channel_axis=2` (new skimage API). Images are read via matplotlib.
+ """
+ img1 = img_as_float(plt.imread(str(img1_path)))
+ img2 = img_as_float(plt.imread(str(img2_path)))
+
+ # if grayscale, add channel axis
+ if img1.ndim == 2:
+ img1 = img1[..., None]
+ if img2.ndim == 2:
+ img2 = img2[..., None]
+ return float(ssim(img1, img2, channel_axis=2, data_range=1.0))
+
+
+def plot_heatmap(
+ data_2d: NDArray, # shape [T, F]
+ *,
+ title: str | None = None,
+ save_path: Path | str | None = None,
+ show: bool = True,
+ dpi: int = 150,
+) -> None:
+ """
+ Scatter-based depth heatmap.
+
+ Assumes features are interleaved per level: [ask_price, ask_vol, bid_price, bid_vol] x NUM_LEVELS.
+ Colors: red=ask, blue=bid, alpha encodes relative volume in [0,1].
+ """
+ T, F = data_2d.shape
+    assert F >= 4 * NUM_LEVELS, "Expected at least 4*NUM_LEVELS features (price/volume per side per level)"
+
+ # slice views
+ # for each level L: price indices = 4*L + (0 for ask, 2 for bid)
+ # vol indices = price_idx + 1
+ prices_ask = np.stack([data_2d[:, 4 * L + 0] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ vols_ask = np.stack([data_2d[:, 4 * L + 1] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ prices_bid = np.stack([data_2d[:, 4 * L + 2] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ vols_bid = np.stack([data_2d[:, 4 * L + 3] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+
+ # Normalise volumes for alpha
+ max_vol = float(np.max([vols_ask.max(initial=0), vols_bid.max(initial=0)])) or 1.0
+ a_ask = (vols_ask / max_vol).astype(np.float32)
+ a_bid = (vols_bid / max_vol).astype(np.float32)
+
+ # build scatter arrays
+ # x: time indices repeated for each level
+ t_idx = np.arange(T, dtype=np.float32)[:, None]
+ x_ask = np.repeat(t_idx, NUM_LEVELS, axis=1).ravel()
+ x_bid = x_ask.copy()
+ y_ask = prices_ask.astype(np.float32).ravel()
+ y_bid = prices_bid.astype(np.float32).ravel()
+
+ # colors rgba
+ c_ask = np.stack([
+ np.full_like(y_ask, 0.99), # r
+ np.full_like(y_ask, 0.05), # g
+ np.full_like(y_ask, 0.05), # b
+ a_ask.astype(np.float32).ravel(), # A
+ ], axis=1)
+ c_bid = np.stack([
+ np.full_like(y_ask, 0.05), # r
+ np.full_like(y_ask, 0.05), # g
+ np.full_like(y_ask, 0.99), # b
+ a_bid.astype(np.float32).ravel(), # A
+ ], axis=1)
+
+ # limits
+ pmin = float(np.minimum(prices_ask.min(initial=0), prices_bid.min(initial=0)))
+ pmax = float(np.maximum(prices_ask.max(initial=0), prices_bid.max(initial=0)))
+
+ # plot
+ fig, ax = plt.subplots(figsize=(10, 6), dpi=dpi)
+ ax.set_ylim(pmin, pmax)
+ ax.set_xlabel("Time")
+ ax.set_ylabel("Price")
+ if title:
+ ax.set_title(title)
+
+ ax.scatter(x_ask, y_ask, c=c_ask)
+ ax.scatter(x_bid, y_bid, c=c_bid)
+
+ fig.tight_layout()
+ if save_path is not None:
+ Path(save_path).parent.mkdir(parents=True, exist_ok=True)
+ fig.savefig(str(save_path), bbox_inches="tight")
+ if show:
+ plt.show()
+ plt.close(fig)
+
+if "__main__" == __name__:
+ # cli
+ opt = Options().parse()
+
+ # data
+ train, val, test = load_data(opt)
+
+ # model (load weights)
+ model = TimeGAN(opt, train, val, test, load_weights=True)
+
+ # real heatmap from test data
+ real_path = Path(OUTPUT_DIR) / "real.png"
+ plot_heatmap(test, title="Real LOB Depth", save_path=real_path, show=False)
+
+ for i in range(3):
+ synth = model.generate(num_rows=len(test))
+ synth_path = Path(OUTPUT_DIR) / f"synthetic_heatmap_{i}.png"
+ plot_heatmap(synth, title=f"Synthetic LOB Depth #{i}", save_path=synth_path, show=False)
+ score = get_ssim(real_path, synth_path)
+ print(f"SSIM(real, synthetic_{i}) = {score:.4f}")
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
index 6e9654b53..0550e69c4 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
@@ -19,248 +19,42 @@
Created By: Radhesh Goel (Keys-I)
ID: s49088276
"""
-from __future__ import annotations
-import os
-import argparse
-import numpy as np
-import matplotlib.pyplot as plt
-from typing import Tuple
-
-import torch
-
-# local modules
-from modules import TimeGAN, sample_noise
-from dataset import LOBSTERData
-
-
-# ---------------------------
-# Data loading helpers
-# ---------------------------
-
-def load_windows_npz(npz_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
- d = np.load(npz_path)
- return d["train"], d["val"], d["test"]
-
-def load_windows_csv(args) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
- ds = LOBSTERData(
- data_dir=args.data_dir,
- message_file=args.message,
- orderbook_file=args.orderbook,
- feature_set=args.feature_set,
- seq_len=args.seq_len,
- stride=args.stride,
- splits=tuple(args.splits),
- scaler=args.scaler,
- headerless_message=args.headerless_message,
- headerless_orderbook=args.headerless_orderbook,
- whiten=args.whiten, pca_var=args.pca_var,
- aug_prob=0.0, # no aug for visualisation builds
- save_dir=None,
- )
- return ds.load_arrays()
-
-
-# ---------------------------
-# Model restore + sampling
-# ---------------------------
-
-def build_model_from_ckpt(ckpt_path: str, x_dim: int, z_dim: int, h_dim: int, device: torch.device) -> TimeGAN:
- ckpt = torch.load(ckpt_path, map_location=device)
- args_in_ckpt = ckpt.get("args", {}) or {}
- rnn_type = args_in_ckpt.get("rnn_type", "gru")
- enc_layers = int(args_in_ckpt.get("enc_layers", 2))
- dec_layers = int(args_in_ckpt.get("dec_layers", 2))
- gen_layers = int(args_in_ckpt.get("gen_layers", 2))
- sup_layers = int(args_in_ckpt.get("sup_layers", 1))
- dis_layers = int(args_in_ckpt.get("dis_layers", 1))
- dropout = float(args_in_ckpt.get("dropout", 0.1))
-
- model = TimeGAN(
- x_dim=x_dim, z_dim=z_dim, h_dim=h_dim,
- rnn_type=rnn_type, enc_layers=enc_layers, dec_layers=dec_layers,
- gen_layers=gen_layers, sup_layers=sup_layers, dis_layers=dis_layers,
- dropout=dropout
- ).to(device)
-
- model.embedder.load_state_dict(ckpt["embedder"])
- model.recovery.load_state_dict(ckpt["recovery"])
- model.generator.load_state_dict(ckpt["generator"])
- model.supervisor.load_state_dict(ckpt["supervisor"])
- model.discriminator.load_state_dict(ckpt["discriminator"])
- model.eval()
- return model
-
-@torch.no_grad()
-def sample_synthetic(model: TimeGAN, n_seq: int, seq_len: int, z_dim: int, device: torch.device) -> np.ndarray:
- z = sample_noise(n_seq, seq_len, z_dim, device)
- e_tilde = model.generator(z)
- h_tilde = model.supervisor(e_tilde)
- x_tilde = model.recovery(h_tilde)
- return x_tilde.detach().cpu().numpy()
-
-
-# ---------------------------
-# Stats + simple similarity
-# ---------------------------
-
-def summarize(name: str, W: np.ndarray) -> dict:
- # mean/std over batch+time, per-feature
- mu = W.mean(axis=(0, 1))
- sd = W.std(axis=(0, 1))
- return {"name": name, "mean": mu, "std": sd}
-
-def kl_hist_avg(real: np.ndarray, synth: np.ndarray, bins: int = 64, eps: float = 1e-9) -> float:
- """
- Quick histogram-based KL(real || synth) averaged over features.
- """
- from scipy.special import rel_entr
- F = real.shape[2]
- vals = []
- R = real.reshape(-1, F)
- S = synth.reshape(-1, F)
- for f in range(F):
- r = R[:, f]; s = S[:, f]
- lo = np.nanpercentile(np.concatenate([r, s]), 0.5)
- hi = np.nanpercentile(np.concatenate([r, s]), 99.5)
- if not np.isfinite(lo) or not np.isfinite(hi) or hi <= lo:
- continue
- pr, _ = np.histogram(r, bins=bins, range=(lo, hi), density=True)
- ps, _ = np.histogram(s, bins=bins, range=(lo, hi), density=True)
- pr = pr + eps; ps = ps + eps
- pr = pr / pr.sum(); ps = ps / ps.sum()
- vals.append(np.sum(rel_entr(pr, ps)))
- return float(np.mean(vals)) if vals else float("nan")
-
-
-# ---------------------------
-# Visualisations
-# ---------------------------
-
-def plot_feature_lines(real: np.ndarray, synth: np.ndarray, outdir: str, max_feats: int = 4, idx: int = 0):
- """
- Plot a few feature time-series (same sequence index) real vs synthetic.
- """
- os.makedirs(outdir, exist_ok=True)
- T, F = real.shape[1], real.shape[2]
- feats = min(F, max_feats)
-
- fig, axes = plt.subplots(feats, 1, figsize=(10, 2.2 * feats), sharex=True)
- if feats == 1:
- axes = [axes]
- for i in range(feats):
- axes[i].plot(real[idx, :, i], label="real", linewidth=1.2)
- axes[i].plot(synth[idx, :, i], label="synthetic", linewidth=1.2, linestyle="--")
- axes[i].set_ylabel(f"feat {i}")
- axes[-1].set_xlabel("time")
- axes[0].legend(loc="upper right")
- fig.suptitle("Feature lines: real vs synthetic")
- fig.tight_layout()
- fig.savefig(os.path.join(outdir, "feature_lines.png"), dpi=150)
- plt.close(fig)
-
-def plot_heatmaps(real: np.ndarray, synth: np.ndarray, outdir: str, idx: int = 0):
- """
- Plot depth heatmaps (time x features) for a single sequence.
- """
- os.makedirs(outdir, exist_ok=True)
- a = real[idx]; b = synth[idx]
- # normalize each to [0,1] for visibility
- def norm01(x):
- lo, hi = np.percentile(x, 1), np.percentile(x, 99)
- return np.clip((x - lo) / (hi - lo + 1e-9), 0, 1)
-
- a = norm01(a); b = norm01(b)
-
- fig, axes = plt.subplots(1, 2, figsize=(12, 4))
- im0 = axes[0].imshow(a, aspect="auto", origin="lower")
- axes[0].set_title("Real (heatmap)")
- axes[0].set_xlabel("feature"); axes[0].set_ylabel("time")
- fig.colorbar(im0, ax=axes[0], fraction=0.046, pad=0.04)
-
- im1 = axes[1].imshow(b, aspect="auto", origin="lower")
- axes[1].set_title("Synthetic (heatmap)")
- axes[1].set_xlabel("feature"); axes[1].set_ylabel("time")
- fig.colorbar(im1, ax=axes[1], fraction=0.046, pad=0.04)
-
- fig.tight_layout()
- fig.savefig(os.path.join(outdir, "heatmaps.png"), dpi=150)
- plt.close(fig)
+from pathlib import Path
+import numpy as np
-# ---------------------------
-# Main
-# ---------------------------
+from dataset import load_data
+from helpers.args import Options
+from helpers.constants import OUTPUT_DIR
+from modules import TimeGAN
-if __name__ == "__main__":
- ap = argparse.ArgumentParser(description="Sample & visualise TimeGAN outputs vs real.")
- # data
- ap.add_argument("--npz", type=str, help="Path to windows.npz (train/val/test). If set, ignores --data-dir.")
- ap.add_argument("--data-dir", type=str, help="Folder with message_10.csv and orderbook_10.csv")
- ap.add_argument("--message", default="message_10.csv")
- ap.add_argument("--orderbook", default="orderbook_10.csv")
- ap.add_argument("--feature-set", choices=["core","raw10"], default="core")
- ap.add_argument("--seq-len", type=int, default=128)
- ap.add_argument("--stride", type=int, default=32)
- ap.add_argument("--splits", type=float, nargs=3, default=(0.7,0.15,0.15))
- ap.add_argument("--scaler", choices=["standard","minmax","robust","quantile","power","none"], default="robust")
- ap.add_argument("--whiten", choices=["pca","zca",None], default="pca")
- ap.add_argument("--pca-var", type=float, default=0.999)
- ap.add_argument("--headerless-message", action="store_true")
- ap.add_argument("--headerless-orderbook", action="store_true")
- # model restore
- ap.add_argument("--ckpt", type=str, required=True)
- ap.add_argument("--z-dim", type=int, required=True)
- ap.add_argument("--h-dim", type=int, required=True)
- ap.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
+def main() -> None:
+ # parse CLI args
+ opt = Options().parse()
- # viz
- ap.add_argument("--n-synth", type=int, default=128, help="How many synthetic windows to sample.")
- ap.add_argument("--seq-index", type=int, default=0, help="Which sequence index to plot.")
- ap.add_argument("--max-feats", type=int, default=4, help="Max features to show in line plot.")
- ap.add_argument("--outdir", type=str, default="./viz_out")
+ # load data
+ train_data, val_data, test_data = load_data(opt)
- args = ap.parse_args()
- os.makedirs(args.outdir, exist_ok=True)
- device = torch.device(args.device)
+ # build model and load weights
+ model = TimeGAN(opt, train_data, val_data, test_data, load_weights=True)
- # Load real windows
- if args.npz:
- Wtr, Wval, Wte = load_windows_npz(args.npz)
- elif args.data_dir:
- Wtr, Wval, Wte = load_windows_csv(args)
+ # inference: generate exactly len(test_data) rows (2D array)
+ # if test_data is windowed [N,T,F], flatten length to T' for parity.
+ num_rows = int(len(test_data))
+ if getattr(test_data, "ndim", None) == 3:
+ num_rows = int(test_data.shape[0] * test_data.shape[1])
else:
- raise SystemExit("Provide either --npz or --data-dir")
-
- # Pick a real reference set (test split)
- real = Wte
- _, T, D = real.shape
+ num_rows = int(len(test_data))
+ synth = model.generate(num_rows=num_rows, mean=0.0, std=1.0)
- # Build model & restore
- model = build_model_from_ckpt(args.ckpt, x_dim=D, z_dim=args.z_dim, h_dim=args.h_dim, device=device)
- model.eval()
+ # save
+ out_dir = Path(OUTPUT_DIR)
+ out_dir.mkdir(parents=True, exist_ok=True)
+ out_path = out_dir / "gen_data.npy"
+ np.save(out_path, synth)
+ print(f"Saved synthetic data to: {out_path} | shape={synth.shape}")
- # Sample synthetic
- n_synth = min(args.n_synth, len(real))
- synth = sample_synthetic(model, n_synth, T, args.z_dim, device)
- # Basic stats
- s_real = summarize("real(test)", real)
- s_synth = summarize("synthetic", synth)
- print("=== Summary (per-feature mean/std) ===")
- print(f"{s_real['name']}: mean[0:5]={s_real['mean'][:5]}, std[0:5]={s_real['std'][:5]}")
- print(f"{s_synth['name']}: mean[0:5]={s_synth['mean'][:5]}, std[0:5]={s_synth['std'][:5]}")
-
- # Quick KL(hist) similarity
- try:
- kl = kl_hist_avg(real[:n_synth], synth)
- print(f"KL(real || synth) ~ {kl:.4f} (lower is better)")
- except Exception as e:
- print(f"KL computation skipped: {e}")
-
- # Visualisations
- idx = max(0, min(args.seq_index, n_synth - 1))
- plot_feature_lines(real, synth, args.outdir, max_feats=args.max_feats, idx=idx)
- plot_heatmaps(real, synth, args.outdir, idx=idx)
-
- print(f"Saved plots to: {args.outdir}")
+if __name__ == "__main__":
+ main()
From f979f97e1de881bc6b19be383312145e74231d9b Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 10 Oct 2025 14:05:52 +1000
Subject: [PATCH 30/74] feat(cli): nested Options with --dataset/--modules
routers for data + model hyperparams
Adds DataOptions (seq-len, data-dir, orderbook-filename, splits, no-shuffle, keep-zero-rows) and ModulesOptions (batch-size, seq-len, z-dim, hidden-dim, num-layer, lr, beta1, w-gamma, w-g). Top-level Options forwards args via argparse.REMAINDER and returns opts.dataset / opts.modules namespaces for downstream loaders and trainers.
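A usage sketch of the intended routing (flag values are placeholders; a single --dataset section is shown so the example does not depend on how the two REMAINDER sections interact on one command line):

    from src.helpers.args import Options  # path as laid out in this series

    opts = Options().parse(["--dataset", "--seq-len", "256", "--no-shuffle"])
    print(opts.dataset.seq_len)          # 256
    print(opts.dataset.shuffle_windows)  # False
    print(opts.modules.batch_size)       # 128, the ModulesOptions default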
---
.../TimeLOB_TimeGAN_49088276/src/helpers/args.py | 16 +++++++++++-----
1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
index 92a750996..e97f79c88 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
@@ -63,7 +63,7 @@ def parse(self, argv: Optional[list | str]) -> Namespace:
splits=tuple(args.splits) if args.splits is not None else TRAIN_TEST_SPLIT,
shuffle_windows=not args.no_shuffle,
dtype=np.float32,
- keep_zero_rows=not args.keep_zero_rows,
+ filter_zero_rows=not args.keep_zero_rows,
)
return ns
@@ -78,12 +78,13 @@ class ModulesOptions:
mods.batch_size, mods.seq_len, mods.z_dim, mods.hidden_dim, mods.num_layer,
mods.lr, mods.beta1, mods.w_gamma, mods.w_g
"""
+
def __init__(self) -> None:
parser = ArgumentParser(
prog="timeganlob_modules",
description="Module/model hyperparameters and training weights.",
)
- # Core shapes
+ # core shapes
parser.add_argument("--batch-size", type=int, default=128)
parser.add_argument("--seq-len", type=int, default=128,
help="Sequence length (kept here for convenience to sync with data).")
@@ -94,7 +95,7 @@ def __init__(self) -> None:
parser.add_argument("--num-layer", type=int, default=3,
help="Number of stacked layers per RNN/TCN block.")
- # Optimizer
+ # optimizer
parser.add_argument("--lr", type=float, default=1e-4,
help="Learning rate (generator/supervisor/discriminator if shared).")
parser.add_argument("--beta1", type=float, default=0.5,
@@ -111,8 +112,6 @@ def __init__(self) -> None:
def parse(self, argv: Optional[list | str]) -> Namespace:
m = self._parser.parse_args(argv)
- # Provide both snake_case and "opt-like" names already as attributes
- # (so downstream code can do opt.lr, opt.beta1, opt.w_gamma, opt.w_g).
ns = Namespace(
batch_size=m.batch_size,
seq_len=m.seq_len,
@@ -150,6 +149,7 @@ def __init__(self) -> None:
"Example: --dataset --seq-len 256 --no-shuffle"
),
)
+
parser.add_argument(
"--modules",
nargs=REMAINDER,
@@ -163,14 +163,20 @@ def __init__(self) -> None:
def parse(self, argv: Optional[list | str] = None) -> Namespace:
top = self._parser.parse_args(argv)
+ # dataset namespace
ds_argv = top.dataset if top.dataset is not None else []
dataset_ns = DataOptions().parse(ds_argv)
+ # modules namespace
+ mod_argv = top.modules if top.modules is not None else []
+ modules_ns = ModulesOptions().parse(mod_argv)
+
# attach nested namespace to the top-level namespace
out = Namespace(
seed=top.seed,
run_name=top.run_name,
dataset=dataset_ns,
+ modules=modules_ns,
)
return out
From c609c89f55ba617d2140794b25deb445e8b5a4b9 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 10 Oct 2025 14:55:23 +1000
Subject: [PATCH 31/74] chore(types): use List[str] for argv hints instead of
union list|str
Updates DataOptions/ModulesOptions/Options.parse signatures to Optional[List[str]] = None and adds typing import for List. Matches argparse expectations and avoids Pydantic/mypy friction on 3.10/3.11.
---
recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
index e97f79c88..b2f66546e 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
@@ -4,7 +4,7 @@
from __future__ import annotations
from argparse import ArgumentParser, Namespace, REMAINDER
-from typing import Optional
+from typing import Optional, List
import numpy as np
@@ -53,7 +53,7 @@ def __init__(self) -> None:
)
self._parser = parser
- def parse(self, argv: Optional[list | str]) -> Namespace:
+ def parse(self, argv: Optional[List[str]]) -> Namespace:
args = self._parser.parse_args(argv)
ns = Namespace(
@@ -109,7 +109,7 @@ def __init__(self) -> None:
self._parser = parser
- def parse(self, argv: Optional[list | str]) -> Namespace:
+ def parse(self, argv: Optional[List[str]]) -> Namespace:
m = self._parser.parse_args(argv)
ns = Namespace(
@@ -160,7 +160,7 @@ def __init__(self) -> None:
)
self._parser = parser
- def parse(self, argv: Optional[list | str] = None) -> Namespace:
+ def parse(self, argv: Optional[List[str]] = None) -> Namespace:
top = self._parser.parse_args(argv)
# dataset namespace
@@ -183,5 +183,4 @@ def parse(self, argv: Optional[list | str] = None) -> Namespace:
if __name__ == "__main__":
opts = Options().parse()
-
print(opts)
\ No newline at end of file
+    print(opts)
\ No newline at end of file
From 5d908d93408a3a8659d61e6473ff969c9d739c49 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Sat, 11 Oct 2025 19:27:46 +1000
Subject: [PATCH 32/74] feat(model): add OptLike Protocol + stronger typing;
refine GRU init; seed fallback; generation noise params
Introduce runtime-checkable OptLike Protocol and richer type hints (Tensor, NDArray, cast). Update xavier_gru_init to safely init via typed params; keep Recovery.forward unpack. Set seed using manualseed|seed fallback; keep device helpers. In generate(), honor mean/std for noise; preserve inverse scaling. Minor cleanups: imports ordering, Tuple typing for _to_device, consistent losses/optim setup, and per-module param counts.
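A short sketch of what the runtime_checkable Protocol buys: any plain namespace with the right attributes (for example the ModulesOptions output) passes an isinstance check structurally, with no inheritance required. The fields below are a subset of the real OptLike and the values are placeholders:

    from argparse import Namespace
    from typing import Protocol, runtime_checkable

    @runtime_checkable
    class OptLike(Protocol):
        batch_size: int
        z_dim: int
        lr: float

    opt = Namespace(batch_size=128, z_dim=40, lr=1e-4)
    print(isinstance(opt, OptLike))   # True: attribute presence is checked, not types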
---
.../TimeLOB_TimeGAN_49088276/src/modules.py | 41 ++++++++++++-------
1 file changed, 27 insertions(+), 14 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
index 46eefa35d..64f2d2191 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
@@ -32,16 +32,16 @@
import math
from dataclasses import dataclass
from pathlib import Path
-from typing import Optional, Tuple
+from typing import Optional, Tuple, runtime_checkable, Protocol, cast
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from numpy.typing import NDArray
+from torch import Tensor
from src.dataset import batch_generator
-from src.helpers.args import ModulesOptions as Options
from src.helpers.constants import (
WEIGHTS_DIR,
OUTPUT_DIR,
@@ -70,19 +70,20 @@ def set_seed(seed: Optional[int]):
torch.backends.cudnn.benchmark = False
-def xavier_gru_init(module: nn.Module) -> None:
- if isinstance(module, nn.GRU):
- for name, param in module.named_parameters():
+def xavier_gru_init(m: nn.Module) -> None:
+ if isinstance(m, nn.GRU):
+ for name, p in m.named_parameters():
+ t = cast(Tensor, p)
if "weight_ih" in name:
- nn.init.xavier_uniform_(param.data)
+ nn.init.xavier_uniform_(t)
elif "weight_hh" in name:
- nn.init.orthogonal_(param.data)
+ nn.init.orthogonal_(t)
elif "bias" in name:
- nn.init.zeros_(param.data)
- elif isinstance(module, nn.Linear):
- nn.init.xavier_uniform_(module.weight)
- if module.bias is not None:
- nn.init.zeros_(module.bias)
+ nn.init.zeros_(t)
+ elif isinstance(m, nn.Linear):
+ nn.init.xavier_uniform_(m.weight)
+ if m.bias is not None:
+ nn.init.zeros_(m.bias)
class Encoder(nn.Module):
@@ -207,6 +208,18 @@ class TimeGANHandles:
discriminator: Discriminator
+@runtime_checkable
+class OptLike(Protocol):
+ batch_size: int
+ seq_len: int
+ z_dim: int
+ hidden_dim: int
+ num_layer: int
+ lr: float
+ beta1: float
+ w_gamma: float
+ w_g: float
+
class TimeGAN:
"""
End-to-end TimeGAN wrapper with training & generation utilities.
@@ -214,14 +227,14 @@ class TimeGAN:
def __init__(
self,
- opt: Options | object,
+ opt: OptLike,
train_data: NDArray[np.float32],
val_data: NDArray[np.float32],
test_data: NDArray[np.float32],
load_weights: bool = False,
) -> None:
# set seed & device
- set_seed(getattr(opt, "manualseed", None))
+ set_seed(getattr(opt, "manualseed", getattr(opt, "seed", None)))
self.device = get_device()
# options
From d0b14eeb8158bdd5c0fd6f423235e9f2b6440efc Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 14 Oct 2025 10:38:29 +1000
Subject: [PATCH 33/74] feat(train): use nested Options (dataset/modules),
flatten val/test if windowed, and run TimeGAN
Parses top-level Options, passes opts.dataset to load_data and opts.modules to TimeGAN. Adds compatibility for windowed [N,T,F] val/test by reshaping to [T',F] before quick metrics; keeps train_data as [N,T,F].
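The flattening itself is a single reshape that stitches the N windows into one long [T', F] series; a shape-only sketch (sizes are arbitrary, and overlapping windows will repeat rows, which is acceptable for the quick metrics here):

    import numpy as np

    val = np.zeros((12, 128, 40), dtype=np.float32)  # [N, T, F] windowed
    if val.ndim == 3:
        val = val.reshape(-1, val.shape[-1])         # -> [N*T, F]
    print(val.shape)                                  # (1536, 40)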
---
recognition/TimeLOB_TimeGAN_49088276/src/train.py | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/train.py b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
index b6b8649fd..5e70c99e4 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/train.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
@@ -19,21 +19,22 @@
def train() -> None:
- # parse cli args as before
+ # parse top-level CLI args
opt = Options().parse()
- # train_data: [N, T, F]; val/test should be 2D [T, F] for quick metrics
- train_data, val_data, test_data = load_data(opt)
- # if val/test come windowed [N, T, F], flatten to [T', F]
+ # dataset-only args → loader
+ train_data, val_data, test_data = load_data(opt.dataset)
+
+ # if val/test are windowed [N, T, F], flatten to [T', F]
if getattr(val_data, "ndim", None) == 3:
val_data = val_data.reshape(-1, val_data.shape[-1])
if getattr(test_data, "ndim", None) == 3:
test_data = test_data.reshape(-1, test_data.shape[-1])
- # build and train
- model = TimeGAN(opt, train_data, val_data, test_data, load_weights=False)
+ # modules-only args → model
+ model = TimeGAN(opt.modules, train_data, val_data, test_data, load_weights=False)
model.train_model()
if __name__ == "__main__":
- train()
+ train()
\ No newline at end of file
From 9b55f9409b95b859ac5e3b003b5d205e99b33083 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Thu, 16 Oct 2025 20:08:39 +1000
Subject: [PATCH 34/74] feat(viz): sample synthetic LOB data using nested
Options (dataset/modules) and saved checkpoint
Parses top-level Options, loads data via opts.dataset, builds TimeGAN with opts.modules, generates exactly len(test) rows (handles windowed [N,T,F]), and saves to OUTPUT_DIR/gen_data.npy.
---
.../TimeLOB_TimeGAN_49088276/src/predict.py | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
index 0550e69c4..94693ad12 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
@@ -30,22 +30,20 @@
def main() -> None:
- # parse CLI args
- opt = Options().parse()
+ # parse CLI args (top-level)
+ top = Options().parse()
- # load data
- train_data, val_data, test_data = load_data(opt)
+ # load data using ONLY dataset options
+ train_data, val_data, test_data = load_data(top.dataset)
- # build model and load weights
- model = TimeGAN(opt, train_data, val_data, test_data, load_weights=True)
+ # build model using ONLY modules/training options
+ model = TimeGAN(top.modules, train_data, val_data, test_data, load_weights=True)
# inference: generate exactly len(test_data) rows (2D array)
- # if test_data is windowed [N,T,F], flatten length to T' for parity.
- num_rows = int(len(test_data))
if getattr(test_data, "ndim", None) == 3:
num_rows = int(test_data.shape[0] * test_data.shape[1])
else:
- num_rows = int(len(test_data))
+ num_rows = int(test_data.shape[0])
synth = model.generate(num_rows=num_rows, mean=0.0, std=1.0)
# save
@@ -57,4 +55,4 @@ def main() -> None:
if __name__ == "__main__":
- main()
+ main()
\ No newline at end of file
From d22db869328f09e07f212aaed5d8877e4abf0420 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 17 Oct 2025 13:17:45 +1000
Subject: [PATCH 35/74] feat(viz): generate LOB depth heatmaps and compute SSIM
for real vs. synthetic
Parses nested Options, loads data, restores TimeGAN, flattens windowed test if needed, renders depth heatmaps, and computes SSIM. Fixes NumPy .max(initial=...) misuse, aligns imports to src.helpers.*, and uses len(test) parity for generation.
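The initial= misuse is worth spelling out: NumPy includes the initial value in the reduction, so prices.min(initial=0) can never rise above 0 and silently pinned the y-axis at zero for all-positive prices. A tiny illustration:

    import numpy as np

    prices = np.array([220.10, 220.15, 220.20])
    print(prices.min(initial=0))  # 0.0   -- initial participates in the min
    print(prices.min())           # 220.1 -- the value the axis limits actually need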
---
.../src/helpers/visualise.py | 34 ++++++++++++-------
1 file changed, 21 insertions(+), 13 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
index 819a5026b..b115ae34e 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
@@ -56,13 +56,15 @@ def plot_heatmap(
# slice views
# for each level L: price indices = 4*L + (0 for ask, 2 for bid)
# vol indices = price_idx + 1
- prices_ask = np.stack([data_2d[:, 4 * L + 0] for L in range(NUM_LEVELS)], axis=1) # [T, L]
- vols_ask = np.stack([data_2d[:, 4 * L + 1] for L in range(NUM_LEVELS)], axis=1) # [T, L]
- prices_bid = np.stack([data_2d[:, 4 * L + 2] for L in range(NUM_LEVELS)], axis=1) # [T, L]
- vols_bid = np.stack([data_2d[:, 4 * L + 3] for L in range(NUM_LEVELS)], axis=1) # [T, L]
-
+ prices_ask = np.stack([data_2d[:, 4 * L + 0] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ vols_ask = np.stack([data_2d[:, 4 * L + 1] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ prices_bid = np.stack([data_2d[:, 4 * L + 2] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ vols_bid = np.stack([data_2d[:, 4 * L + 3] for L in range(NUM_LEVELS)], axis=1) # [T, L]
# Normalise volumes for alpha
- max_vol = float(np.max([vols_ask.max(initial=0), vols_bid.max(initial=0)])) or 1.0
+ max_vol = float(max(vols_ask.max(), vols_bid.max()))
+ if not np.isfinite(max_vol) or max_vol <= 0:
+ max_vol = 1.0
+
a_ask = (vols_ask / max_vol).astype(np.float32)
a_bid = (vols_bid / max_vol).astype(np.float32)
@@ -89,8 +91,8 @@ def plot_heatmap(
], axis=1)
# limits
- pmin = float(np.minimum(prices_ask.min(initial=0), prices_bid.min(initial=0)))
- pmax = float(np.maximum(prices_ask.max(initial=0), prices_bid.max(initial=0)))
+ pmin = float(min(prices_ask.min(), prices_bid.min()))
+ pmax = float(max(prices_ask.max(), prices_bid.max()))
# plot
fig, ax = plt.subplots(figsize=(10, 6), dpi=dpi)
@@ -100,8 +102,8 @@ def plot_heatmap(
if title:
ax.set_title(title)
- ax.scatter(x_ask, y_ask, c=c_ask)
- ax.scatter(x_bid, y_bid, c=c_bid)
+ ax.scatter(x_ask, y_ask, c=c_ask, s=1)
+ ax.scatter(x_bid, y_bid, c=c_bid, s=1)
fig.tight_layout()
if save_path is not None:
@@ -111,15 +113,21 @@ def plot_heatmap(
plt.show()
plt.close(fig)
+
if "__main__" == __name__:
# cli
- opt = Options().parse()
+ top = Options().parse()
# data
- train, val, test = load_data(opt)
+ train, val, test = load_data(top.dataset)
+ # flatten windowed val/test ([N,T,F] -> [T',F]) for viz/metrics
+ if getattr(val, "ndim", None) == 3:
+ val = val.reshape(-1, val.shape[-1])
+ if getattr(test, "ndim", None) == 3:
+ test = test.reshape(-1, test.shape[-1])
# model (load weights)
- model = TimeGAN(opt, train, val, test, load_weights=True)
+ model = TimeGAN(top.modules, train, val, test, load_weights=True)
# real heatmap from test data
real_path = Path(OUTPUT_DIR) / "real.png"
From f38a8c69153a65e45d0b38d0063886dc6e15bf8d Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 17 Oct 2025 14:29:57 +1000
Subject: [PATCH 36/74] fix(viz): import img_as_float from skimage.util
Replace deprecated skimage.img_as_float import with skimage.util.img_as_float for newer scikit-image compatibility. No functional changes.
---
recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh | 0
recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py | 2 +-
2 files changed, 1 insertion(+), 1 deletion(-)
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh b/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
new file mode 100644
index 000000000..e69de29bb
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
index b115ae34e..a2a8faf91 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
@@ -9,7 +9,7 @@
import matplotlib.pyplot as plt
import numpy as np
from numpy.typing import NDArray
-from skimage import img_as_float
+from skimage.util import img_as_float
from skimage.metrics import structural_similarity as ssim
from args import Options
From 4535545a3ceda61db0fb7f86b33f081007c0a945 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 17 Oct 2025 18:58:26 +1000
Subject: [PATCH 37/74] feat(scripts): add run.sh to test TimeGAN model
Introduce scripts/run.sh: a Slurm batch script (UQ Rangpur, a100 partition) that runs train.py and then predict.py with matching --dataset/--modules flags, so the freshly trained checkpoint is sampled immediately after training. Conda environment setup is left as commented hints.
---
.../TimeLOB_TimeGAN_49088276/scripts/run.sh | 43 +++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh b/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
index e69de29bb..df7b9dcf6 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
+++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# script to run training on UQ Rangpur
+
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=1
+#SBATCH --gres=gpu:1
+#SBATCH --partition=a100
+#SBATCH --job-name=timegan-turing
+
+# conda init
+# conda env create -f environment.yml
+# conda activate timegan
+
+python ../src/train.py \
+ --dataset \
+ --seq-len 128 \
+ --data-dir ./data \
+ --orderbook-filename AMZN_2012-06-21_10_orderbook_10.csv \
+ --splits 0.7 0.85 1.0 \
+ --no-shuffle \
+ --modules \
+ --batch-size 128 \
+ --z-dim 40 \
+ --hidden-dim 64 \
+ --num-layer 3 \
+ --lr 1e-4 \
+ --beta1 0.5 \
+ --w-gamma 1.0 \
+ --w-g 1.0
+
+python ../src/predict.py \
+ --dataset \
+ --seq-len 128 \
+ --data-dir ./data \
+ --orderbook-filename AMZN_2012-06-21_10_orderbook_10.csv \
+ --splits 0.7 0.85 1.0 \
+ --modules \
+ --batch-size 128 \
+ --z-dim 40 \
+ --hidden-dim 64 \
+ --num-layer 3
\ No newline at end of file
From 7de81f8d66762ba930613e44eabfaf0bddcc36d6 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 17 Oct 2025 21:47:19 +1000
Subject: [PATCH 38/74] chore(env): polish environment.yml;
refactor(constants): remove unused import
Drop unnecessary import from constants.py to avoid lints and dead deps. Refresh environment.yml: pin python=3.11 for PyTorch stability, ensure scikit-image (not skimage), include pillow/tqdm, and keep typing-extensions. No runtime behavior change.
---
.../TimeLOB_TimeGAN_49088276/environment.yml | 14 ++++++++++----
.../src/helpers/constants.py | 1 -
2 files changed, 10 insertions(+), 5 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/environment.yml b/recognition/TimeLOB_TimeGAN_49088276/environment.yml
index a329baaf8..de57eda27 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/environment.yml
+++ b/recognition/TimeLOB_TimeGAN_49088276/environment.yml
@@ -1,15 +1,21 @@
-name: proj-env
+name: timegan
channels:
- conda-forge
dependencies:
- python=3.13
- - pip
- - tabulate
+  - python=3.11
- numpy
- pandas
- scipy
- scikit-learn
+ - scikit-image
- matplotlib
- jupyterlab
- ipykernel
- - pip:
\ No newline at end of file
+ - pytorch
+ - torchvision
+ - pillow
+ - tqdm
+ - typing-extensions
+ - pip
+ - pip:
+    - # add any repo-specific pip deps here if needed
\ No newline at end of file
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
index fae29ac85..297911bc3 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
@@ -2,7 +2,6 @@
Configuration constants for the project.
"""
from math import isclose
-from typing import Literal
OUTPUT_DIR = "outs"
WEIGHTS_DIR = "weights"
From 5a2cf63590530ad2f48be4c928f9e2aecbdd60e6 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Fri, 17 Oct 2025 18:26:49 +1000
Subject: [PATCH 39/74] fix: minor bugs in CLI routing and data pipeline
Ensure nested parsers don't consume global argv; robust argv split for --dataset/--modules. Fix batch_generator time=None handling and index sampling. Make split logic handle proportions vs cumulative cutoffs and improve window-aware error messages.
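A numeric sketch of the two split spellings the loader now accepts (row count is illustrative; both forms land on the same cutoffs):

    n = 1000

    # proportions that sum to 1.0, e.g. TRAIN_TEST_SPLIT = (0.70, 0.15, 0.15)
    a, b = 0.70, 0.15
    t_cut, v_cut = int(n * a), int(n * (a + b))   # 700, 850

    # cumulative cutoffs, e.g. scripts/run.sh passes --splits 0.7 0.85 1.0
    a, b = 0.70, 0.85
    t_cut, v_cut = int(n * a), int(n * b)         # 700, 850

    # either way: train = rows[:700], val = rows[700:850], test = rows[850:]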
---
.../TimeLOB_TimeGAN_49088276/scripts/run.sh | 8 +-
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 92 ++++++++++++++-----
.../src/helpers/args.py | 62 +++++++++----
.../src/helpers/constants.py | 24 ++++-
.../src/helpers/visualise.py | 4 +-
.../TimeLOB_TimeGAN_49088276/src/modules.py | 2 +-
.../TimeLOB_TimeGAN_49088276/src/predict.py | 2 +-
7 files changed, 139 insertions(+), 55 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh b/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
index df7b9dcf6..18d3b744a 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
+++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
@@ -13,7 +13,11 @@
# conda env create -f environment.yml
# conda activate timegan
-python ../src/train.py \
+cd ..
+export PROJECT_ROOT="$PWD"
+export PYTHONPATH="$PWD"
+
+python src/train.py \
--dataset \
--seq-len 128 \
--data-dir ./data \
@@ -30,7 +34,7 @@ python ../src/train.py \
--w-gamma 1.0 \
--w-g 1.0
-python ../src/predict.py \
+python src/predict.py \
--dataset \
--seq-len 128 \
--data-dir ./data \
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index c295d3378..ff83f1b5b 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -63,7 +63,7 @@ class DatasetConfig:
Configuration for loading and preprocessing order-book data.
"""
seq_len: int
- data_dir: Path = field(default_factory=lambda: Path(DATA_DIR))
+ data_dir: Path = DATA_DIR
orderbook_filename: str = ORDERBOOK_FILENAME
splits: Tuple[float, float, float] = TRAIN_TEST_SPLIT
shuffle_windows: bool = True
@@ -107,6 +107,7 @@ def load(self) -> "LOBDataset":
self._filtered = data.astype(self.cfg.dtype)
self._split_chronological()
+
self._scale_train_only()
print("Dataset loaded, split, and scaled.")
return self
@@ -160,17 +161,38 @@ def _filter_unoccupied(self, data: NDArray[np.int64]) -> NDArray[np.float32]:
def _split_chronological(self) -> None:
assert self._filtered is not None, "Call load() first."
n = len(self._filtered)
- t_frac, v_frac, _ = self.cfg.splits
- t_cutoff = int(n * t_frac)
- v_cutoff = int(n * v_frac)
- self._train = self._filtered[:t_cutoff]
- self._val = self._filtered[t_cutoff:v_cutoff]
- self._test = self._filtered[v_cutoff:]
-
- assert all(
- len(d) > 5 for d in (self._train, self._val, self._test)
- ), "Each split must have at least 5 windows."
- print("Split sizes - train: %d, val: %d, test: %d", len(self._train), len(self._val), len(self._test))
+ a, b, c = self.cfg.splits
+
+ # proportions if they sum to ~1.0; otherwise treat as cumulative cutoffs
+ if abs((a + b + c) - 1.0) < 1e-6:
+ # proportions → cumulative
+ t_cut = int(n * a)
+ v_cut = int(n * (a + b))
+ else:
+ # cumulative; require 0 < a < b <= 1.0
+ if not (0.0 < a < b <= 1.0 + 1e-9):
+ raise ValueError(f"Invalid cumulative splits {self.cfg.splits}; "
+ "expected 0 < TRAIN < VAL ≤ 1.")
+ t_cut = int(n * a)
+ v_cut = int(n * b)
+
+ self._train = self._filtered[:t_cut]
+ self._val = self._filtered[t_cut:v_cut]
+ self._test = self._filtered[v_cut:]
+
+ # window-aware sanity check
+ L = self.cfg.seq_len
+
+ def nwin(x):
+ return len(x) - L + 1
+
+ min_w = 5
+ if any(nwin(x) < min_w for x in (self._train, self._val, self._test)):
+ raise ValueError(
+ f"Not enough windows with seq_len={L} (need ≥{min_w}): "
+ f"train={nwin(self._train)}, val={nwin(self._val)}, test={nwin(self._test)}. "
+ "Try smaller --seq-len, different --splits, or --keep_zero_rows."
+ )
def _scale_train_only(self) -> None:
assert (
@@ -209,23 +231,43 @@ def _select_split(self, split: str) -> NDArray[np.float32]:
def batch_generator(
- data: NDArray[np.float32],
- time: Optional[NDArray[np.float32]],
- batch_size: int,
-):
+ data: NDArray[np.float32],
+ time: Optional[NDArray[np.int32]],
+ batch_size: int,
+) -> Tuple[NDArray[np.float32], NDArray[np.int32]]:
"""
- Random mini-batch generator
- if `time` is None, uses a constant length equal to data.shape[1] (seq_len).
+ Random mini-batch generator for windowed sequences.
+
+ Args:
+ data: Array of shape [N, T, F] (windowed sequences).
+ time: Optional array of shape [N] giving per-window lengths (T_i).
+ If None, returns a constant length vector == data.shape[1].
+ batch_size: Number of windows to sample (with replacement).
+
+ Returns:
+ data_mb: [batch_size, T, F] float32 mini-batch.
+ T_mb: [batch_size] int32 vector of sequence lengths.
"""
- n = len(data)
- idx = np.random.choice(n, size=batch_size, replace=True)
- data_mb = data[idx].astype(np.float32)
- if time is not None:
- t_mb = np.full((batch_size,), data_mb.shape[1], dtype=np.int32)
+ if data.ndim != 3:
+ raise ValueError(f"`data` must be [N, T, F]; got shape {data.shape}")
+
+ n = data.shape[0]
+ if n == 0:
+ raise ValueError("Cannot sample mini-batch from empty data.")
+
+ rng = np.random.default_rng()
+ idx = rng.integers(0, n, size=batch_size) # with replacement
+
+ data_mb = data[idx].astype(np.float32, copy=False)
+
+ if time is None:
+ T_mb = np.full((batch_size,), data_mb.shape[1], dtype=np.int32)
else:
- t_mb = time[idx].astype(np.int32)
- return data_mb, t_mb
+ if time.shape[0] != n:
+ raise ValueError(f"`time` length {time.shape[0]} does not match N={n}.")
+ T_mb = time[idx].astype(np.int32, copy=False)
+ return data_mb, T_mb
def load_data(arg: Namespace) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
"""
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
index b2f66546e..7a222e019 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
@@ -3,6 +3,7 @@
"""
from __future__ import annotations
+import sys
from argparse import ArgumentParser, Namespace, REMAINDER
from typing import Optional, List
@@ -54,16 +55,18 @@ def __init__(self) -> None:
self._parser = parser
def parse(self, argv: Optional[List[str]]) -> Namespace:
- args = self._parser.parse_args(argv)
+ if argv is None:
+ argv = []
+ ds = self._parser.parse_args(argv)
ns = Namespace(
- seq_len=args.seq_len,
- data_dir=args.data_dir,
- orderbook_filename=args.orderbook_filename,
- splits=tuple(args.splits) if args.splits is not None else TRAIN_TEST_SPLIT,
- shuffle_windows=not args.no_shuffle,
+ seq_len=ds.seq_len,
+ data_dir=ds.data_dir,
+ orderbook_filename=ds.orderbook_filename,
+ splits=tuple(ds.splits) if ds.splits is not None else TRAIN_TEST_SPLIT,
+ shuffle_windows=not ds.no_shuffle,
dtype=np.float32,
- filter_zero_rows=not args.keep_zero_rows,
+ filter_zero_rows=not ds.keep_zero_rows,
)
return ns
@@ -110,6 +113,8 @@ def __init__(self) -> None:
self._parser = parser
def parse(self, argv: Optional[List[str]]) -> Namespace:
+ if argv is None:
+ argv = []
m = self._parser.parse_args(argv)
ns = Namespace(
@@ -161,26 +166,43 @@ def __init__(self) -> None:
self._parser = parser
def parse(self, argv: Optional[List[str]] = None) -> Namespace:
- top = self._parser.parse_args(argv)
- # dataset namespace
- ds_argv = top.dataset if top.dataset is not None else []
- dataset_ns = DataOptions().parse(ds_argv)
-
- # modules namespace
- mod_argv = top.modules if top.modules is not None else []
- modules_ns = ModulesOptions().parse(mod_argv)
-
- # attach nested namespace to the top-level namespace
- out = Namespace(
+ # raw tokens (exclude program name)
+ tokens: List[str] = list(sys.argv[1:] if argv is None else argv)
+
+ # extract sections: --dataset ..., --modules ...
+ def extract(flag: str, toks: List[str]) -> tuple[List[str], List[str]]:
+ if flag not in toks:
+ return [], toks
+ i = toks.index(flag)
+ rest = toks[i + 1:]
+ # stop at the next section flag (or end)
+ next_indices = [j for j, t in enumerate(rest) if t in ("--dataset", "--modules")]
+ end = next_indices[0] if next_indices else len(rest)
+ section = rest[:end]
+ remaining = toks[:i] + rest[end:]
+ return section, remaining
+
+ ds_args, remaining = extract("--dataset", tokens)
+ mod_args, remaining = extract("--modules", remaining)
+
+ # parse top-level only from what's left (seed/run-name)
+ top = self._parser.parse_args(remaining)
+
+ # parse subsections (never read global argv inside these)
+ dataset_ns = DataOptions().parse(ds_args or [])
+ modules_ns = ModulesOptions().parse(mod_args or [])
+
+ # assemble composite namespace
+ return Namespace(
seed=top.seed,
run_name=top.run_name,
dataset=dataset_ns,
modules=modules_ns,
)
- return out
if __name__ == "__main__":
opts = Options().parse()
- print(opts)
\ No newline at end of file
+ print(opts)
+
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
index 297911bc3..cf360f857 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
@@ -1,11 +1,27 @@
"""
Configuration constants for the project.
"""
+from __future__ import annotations
from math import isclose
+from pathlib import Path
+import os
+import subprocess
-OUTPUT_DIR = "outs"
-WEIGHTS_DIR = "weights"
-DATA_DIR = "data"
+def _repo_root() -> Path:
+ env = os.getenv("PROJECT_ROOT")
+ if env:
+ return Path(env).resolve()
+ try:
+ out = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], text=True).strip()
+ return Path(out).resolve()
+    except (subprocess.CalledProcessError, FileNotFoundError):  # not a git repo, or git missing
+ return Path(__file__).resolve().parents[2]
+
+ROOT_DIR = _repo_root()
+
+OUTPUT_DIR = ROOT_DIR / "outs"
+WEIGHTS_DIR = ROOT_DIR / "weights"
+DATA_DIR = ROOT_DIR / "data"
ORDERBOOK_FILENAME = "AMZN_2012-06-21_34200000_57600000_orderbook_10.csv"
@@ -21,4 +37,4 @@
f"TRAIN_TEST_SPLIT must sum to 1.0 (got {sum(TRAIN_TEST_SPLIT):.8f})"
)
-NUM_LEVELS = 10
\ No newline at end of file
+NUM_LEVELS = 10
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
index a2a8faf91..bb1811438 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
@@ -130,12 +130,12 @@ def plot_heatmap(
model = TimeGAN(top.modules, train, val, test, load_weights=True)
# real heatmap from test data
- real_path = Path(OUTPUT_DIR) / "real.png"
+ real_path = OUTPUT_DIR / "real.png"
plot_heatmap(test, title="Real LOB Depth", save_path=real_path, show=False)
for i in range(3):
synth = model.generate(num_rows=len(test))
- synth_path = Path(OUTPUT_DIR) / f"synthetic_heatmap_{i}.png"
+ synth_path = OUTPUT_DIR / f"synthetic_heatmap_{i}.png"
plot_heatmap(synth, title=f"Synthetic LOB Depth #{i}", save_path=synth_path, show=False)
score = get_ssim(real_path, synth_path)
print(f"SSIM(real, synthetic_{i}) = {score:.4f}")
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
index 64f2d2191..fddfa7cd2 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
@@ -280,7 +280,7 @@ def __init__(
@staticmethod
def _ckpt_path() -> Path:
- out = Path(OUTPUT_DIR) / WEIGHTS_DIR
+ out = OUTPUT_DIR / WEIGHTS_DIR
out.mkdir(parents=True, exist_ok=True)
return out / "timegan_ckpt.pt"
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
index 94693ad12..a0dfe7a39 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
@@ -47,7 +47,7 @@ def main() -> None:
synth = model.generate(num_rows=num_rows, mean=0.0, std=1.0)
# save
- out_dir = Path(OUTPUT_DIR)
+ out_dir = OUTPUT_DIR
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "gen_data.npy"
np.save(out_path, synth)
From 24e6ed972659b183dea9a61d86a318f772e42655 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Sat, 18 Oct 2025 19:28:34 +1000
Subject: [PATCH 40/74] chore(cli): standardize flags to hyphen-case; remove
underscore variants
Use --data-dir, --orderbook-filename, --keep-zero-rows across DataOptions; update help/examples accordingly. No backward-compat aliases retained.
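For reference, argparse already maps hyphenated long options to underscored attribute names, so the explicit dest= in this patch mainly documents intent; a minimal sketch of the behaviour being relied on:

    from argparse import ArgumentParser

    p = ArgumentParser()
    p.add_argument("--data-dir", default="data")          # dest inferred as data_dir
    p.add_argument("--keep-zero-rows", action="store_true")
    ns = p.parse_args(["--data-dir", "./lob", "--keep-zero-rows"])
    print(ns.data_dir, ns.keep_zero_rows)                  # ./lob True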
---
recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
index 7a222e019..24b4f572f 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
@@ -32,15 +32,15 @@ def __init__(self) -> None:
description="Lightweight LOBSTER preprocessing + MinMax scaling",
)
parser.add_argument("--seq-len", type=int, default=128)
- parser.add_argument("--data_dir", type=str, default=str(DATA_DIR))
- parser.add_argument("--orderbook_filename", type=str, default=ORDERBOOK_FILENAME)
+ parser.add_argument("--data-dir", dest="data_dir", type=str, default=str(DATA_DIR))
+ parser.add_argument("--orderbook-filename", dest="orderbook_filename", type=str, default=ORDERBOOK_FILENAME)
parser.add_argument(
"--no-shuffle",
action="store_true",
help="Disable shuffling of windowed sequences"
)
parser.add_argument(
- "--keep_zero_rows",
+ "--keep-zero-rows", dest="keep_zero_rows",
action="store_true",
help="Do NOT filter rows containing zeros."
)
From f52ac81e95e80f08beb7ee3437ab45d654f99737 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 20 Oct 2025 05:27:49 +1000
Subject: [PATCH 41/74] fix(project): wire modules with absolute src.* imports;
prevent nested Rich live errors
Standardize absolute imports (src.*) across helpers/viz; make rstatus re-entrant (nested spinners become no-ops) to avoid LiveError; minor CLI polish and history plotting hooks.
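A usage sketch of the re-entrant spinner (names as added in richie.py; the point is that only the outermost call owns the Rich Live display, so nested calls cannot raise LiveError):

    from src.helpers.richie import status, log

    with status("joint training, epoch 1 ..."):   # outer call opens the spinner
        log("embedder/recovery step done")
        with status("validating ..."):             # nested call is a no-op
            log("moment metrics computed")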
---
.../TimeLOB_TimeGAN_49088276/src/__init__.py | 0
.../src/helpers/richie.py | 98 +++++++++++++++++++
2 files changed, 98 insertions(+)
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/__init__.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/__init__.py b/recognition/TimeLOB_TimeGAN_49088276/src/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py
new file mode 100644
index 000000000..63cc356c5
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py
@@ -0,0 +1,98 @@
+# src/helpers/richie.py
+from __future__ import annotations
+from typing import Optional, Iterable, Tuple
+import contextvars
+from pathlib import Path
+
+try:
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.table import Table
+ from rich import box
+ _CONSOLE: Optional[Console] = Console()
+except Exception: # fallback if rich isn’t installed
+ _CONSOLE = None
+
+# track nesting depth per context/thread
+_live_depth: contextvars.ContextVar[int] = contextvars.ContextVar("_live_depth", default=0)
+
+def log(msg: str) -> None:
+ if _CONSOLE:
+ _CONSOLE.log(msg)
+ else:
+ print(msg)
+
+def status(msg: str):
+ """Re-entrant-safe status spinner. Nested calls become no-ops."""
+ depth = _live_depth.get()
+ if _CONSOLE and depth == 0:
+ cm = _CONSOLE.status(msg)
+ class _Wrapper:
+ def __enter__(self):
+ _live_depth.set(depth + 1)
+ return cm.__enter__()
+ def __exit__(self, exc_type, exc, tb):
+ try:
+ return cm.__exit__(exc_type, exc, tb)
+ finally:
+ _live_depth.set(depth)
+ return _Wrapper()
+ # nested: no-op
+ class _Noop:
+ def __enter__(self): return None
+ def __exit__(self, exc_type, exc, tb): return False
+ return _Noop()
+
+def rule(text: str = "") -> None:
+ if _CONSOLE:
+ _CONSOLE.rule(text)
+
+def dataset_summary(
+ *,
+ file_path: Path,
+ seq_len: int,
+ dtype_name: str,
+ filter_zero_rows: bool,
+ splits: Iterable[Tuple[str, Tuple[int,int]]], # (name, (rows, windows))
+) -> None:
+ """Render a header + splits table."""
+ if _CONSOLE is None:
+ # Plain fallback
+ print(f"Dataset: {file_path} | seq_len={seq_len} | dtype={dtype_name} | filter_zero_rows={filter_zero_rows}")
+ for name, (rows, wins) in splits:
+ print(f"{name:>6}: rows={rows:,} windows={wins:,}")
+ return
+
+ header = Panel.fit(
+ f"[bold cyan]LOBSTER dataset summary[/bold cyan]\n"
+ f"[dim]file:[/dim] {file_path}\n"
+ f"[dim]seq_len:[/dim] {seq_len} "
+ f"[dim]dtype:[/dim] {dtype_name} "
+ f"[dim]filter_zero_rows:[/dim] {filter_zero_rows}",
+ border_style="cyan",
+ )
+
+ table = Table(
+ title="Splits",
+ box=box.SIMPLE_HEAVY,
+ show_lines=False,
+ header_style="bold",
+ expand=False,
+ )
+ table.add_column("Split")
+ table.add_column("Rows", justify="right")
+ table.add_column("Windows", justify="right")
+
+ for name, (rows, wins) in splits:
+ table.add_row(name, f"{rows:,}", f"{wins:,}")
+
+ _CONSOLE.rule()
+ _CONSOLE.print(header)
+ _CONSOLE.print(table)
+ _CONSOLE.rule()
+
+
+
+
+
+
From 8c2fd317063fa993bba73525cc19ac6f97e916ff Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 20 Oct 2025 14:50:29 +1000
Subject: [PATCH 42/74] feat(train): add --num-iters flag and wire schedule
into TimeGAN
Expose --num-iters via ModulesOptions and consume it in modules.py (replacing constant). Ensure TrainingHistory plots are saved on final _save(with_history=True). Tidy minor model issues (logging, banner, small guards).
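The modules.py side of the wiring is mostly outside this hunk, so the sketch below is an assumption about the shape of the change rather than the exact code: read num_iters off the options namespace, fall back to the old constant, and use it as the per-phase loop bound.

    from argparse import Namespace

    NUM_TRAINING_ITERATIONS = 10_000      # stand-in for the helpers/constants.py value
    opt = Namespace(num_iters=500)        # as produced by ModulesOptions in this patch

    num_iters = getattr(opt, "num_iters", NUM_TRAINING_ITERATIONS)
    for step in range(num_iters):
        pass                              # one ER / supervised / joint iteration per step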
---
.../src/helpers/args.py | 6 +-
.../TimeLOB_TimeGAN_49088276/src/modules.py | 231 +++++++++++++-----
2 files changed, 178 insertions(+), 59 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
index 24b4f572f..100632768 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
@@ -9,7 +9,7 @@
import numpy as np
-from src.helpers.constants import DATA_DIR, TRAIN_TEST_SPLIT, ORDERBOOK_FILENAME
+from src.helpers.constants import DATA_DIR, TRAIN_TEST_SPLIT, ORDERBOOK_FILENAME, NUM_TRAINING_ITERATIONS
try:
# tolerate alternates if present in your helpers
@@ -110,6 +110,9 @@ def __init__(self) -> None:
parser.add_argument("--w-g", type=float, default=1.0,
help="Generator adversarial loss weight (g).")
+ parser.add_argument("--num-iters", type=int, default=NUM_TRAINING_ITERATIONS,
+ help="Number of training iterations per phase (ER, S, Joint).")
+
self._parser = parser
def parse(self, argv: Optional[List[str]]) -> Namespace:
@@ -127,6 +130,7 @@ def parse(self, argv: Optional[List[str]]) -> Namespace:
beta1=m.beta1,
w_gamma=m.w_gamma,
w_g=m.w_g,
+ num_iters=m.num_iters,
)
return ns
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
index fddfa7cd2..65139ae10 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
@@ -13,50 +13,56 @@
included near the bottom of the file.
Exports:
- - Embedder
+ - Encoder
- Recovery
- Generator
- Supervisor
- Discriminator
- TimeGAN
- - TemporalBackboneConfig
+ - TemporalBackboneConfig (placeholder for future use)
Created By: Radhesh Goel (Keys-I)
ID: s49088276
-
-References:
--
"""
from __future__ import annotations
import math
-from dataclasses import dataclass
+from dataclasses import dataclass, field
from pathlib import Path
-from typing import Optional, Tuple, runtime_checkable, Protocol, cast
+from typing import Optional, Tuple, Protocol, runtime_checkable, cast, List, Dict
+import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from numpy.typing import NDArray
from torch import Tensor
+from tqdm.auto import tqdm # pretty progress bars
from src.dataset import batch_generator
from src.helpers.constants import (
WEIGHTS_DIR,
OUTPUT_DIR,
NUM_TRAINING_ITERATIONS,
- VALIDATE_INTERVAL
+ VALIDATE_INTERVAL,
+)
+# richie: centralized pretty CLI helpers (safe fallbacks inside)
+from src.helpers.richie import log as rlog, status as rstatus, rule as rrule
+from src.helpers.utils import (
+ minmax_scale,
+ sample_noise,
+ kl_divergence_hist,
+ minmax_inverse,
)
-from src.helpers.utils import minmax_scale, sample_noise, kl_divergence_hist, minmax_inverse
def get_device() -> torch.device:
if torch.cuda.is_available():
- return torch.device('cuda')
+ return torch.device("cuda")
if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
- return torch.device('mps')
- return torch.device('cpu')
+ return torch.device("mps")
+ return torch.device("cpu")
def set_seed(seed: Optional[int]):
@@ -65,6 +71,7 @@ def set_seed(seed: Optional[int]):
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
+ # Leave non-deterministic algos for perf by default; toggle if needed.
torch.use_deterministic_algorithms(False)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
@@ -189,7 +196,7 @@ def __init__(self, hidden_dim: int, num_layers: int) -> None:
num_layers=num_layers,
batch_first=True,
)
- # note: No sigmoid here; BCEWithLogitsLoss expects raw logits
+ # Note: No sigmoid here; BCEWithLogitsLoss expects raw logits
self.proj = nn.Linear(hidden_dim, 1)
self.apply(xavier_gru_init)
@@ -199,6 +206,59 @@ def forward(self, h: torch.Tensor) -> torch.Tensor:
return self.proj(d)
+@dataclass
+class TrainingHistory:
+ er_iters: List[int] = field(default_factory=list)
+ er_vals: List[float] = field(default_factory=list)
+
+ s_iters: List[int] = field(default_factory=list)
+ s_vals: List[float] = field(default_factory=list)
+
+ g_iters: List[int] = field(default_factory=list)
+ g_vals: List[float] = field(default_factory=list)
+
+ d_iters: List[int] = field(default_factory=list)
+ d_vals: List[float] = field(default_factory=list)
+
+ kl_iters: List[int] = field(default_factory=list)
+ kl_vals: List[float] = field(default_factory=list)
+
+ def add_er(self, it: int, v: float) -> None: self.er_iters.append(it); self.er_vals.append(v)
+ def add_s (self, it: int, v: float) -> None: self.s_iters.append(it); self.s_vals.append(v)
+ def add_g (self, it: int, v: float) -> None: self.g_iters.append(it); self.g_vals.append(v)
+ def add_d (self, it: int, v: float) -> None: self.d_iters.append(it); self.d_vals.append(v)
+ def add_kl(self, it: int, v: float) -> None: self.kl_iters.append(it); self.kl_vals.append(v)
+
+ def save_plots(self, out_dir: Path, total_iters: int) -> Dict[str, Path]:
+ out_dir.mkdir(parents=True, exist_ok=True)
+ saved: Dict[str, Path] = {}
+
+ # Training losses
+ fig, ax = plt.subplots(figsize=(9, 5))
+ if self.er_iters: ax.plot(self.er_iters, self.er_vals, label="Recon (E,R)")
+ if self.s_iters: ax.plot(self.s_iters, self.s_vals, label="Supervisor (S)")
+ if self.g_iters: ax.plot(self.g_iters, self.g_vals, label="Generator (G)")
+ if self.d_iters: ax.plot(self.d_iters, self.d_vals, label="Discriminator (D)")
+ ax.set_title("Training Losses vs Iteration")
+ ax.set_xlabel("Iteration"); ax.set_ylabel("Loss")
+ ax.set_xlim(1, max([total_iters, *self.er_iters, *self.s_iters, *self.g_iters, *self.d_iters] or [total_iters]))
+ ax.legend(loc="best"); fig.tight_layout()
+ p1 = out_dir / "training_curves.png"; fig.savefig(p1, dpi=150, bbox_inches="tight"); plt.close(fig)
+ saved["training_curves"] = p1
+
+ # KL(spread)
+ if self.kl_iters:
+ fig, ax = plt.subplots(figsize=(9, 3.5))
+ ax.plot(self.kl_iters, self.kl_vals, marker="o", linewidth=1)
+ ax.set_title("Validation KL(spread) vs Iteration")
+ ax.set_xlabel("Iteration"); ax.set_ylabel("KL(spread)")
+ ax.set_xlim(1, max(self.kl_iters)); fig.tight_layout()
+ p2 = out_dir / "kl_spread_curve.png"; fig.savefig(p2, dpi=150, bbox_inches="tight"); plt.close(fig)
+ saved["kl_spread_curve"] = p2
+
+ return saved
+
+
@dataclass
class TimeGANHandles:
encoder: Encoder
@@ -220,6 +280,7 @@ class OptLike(Protocol):
w_gamma: float
w_g: float
+
class TimeGAN:
"""
End-to-end TimeGAN wrapper with training & generation utilities.
@@ -246,7 +307,7 @@ def __init__(
self.n_layers: int = opt.num_layer
# schedule
- self.num_iterations = NUM_TRAINING_ITERATIONS
+ self.num_iterations = int(getattr(opt, "num_iters", NUM_TRAINING_ITERATIONS))
self.validate_interval = VALIDATE_INTERVAL
# scale train only; keep stats for inverse
@@ -254,7 +315,7 @@ def __init__(
self.val = val_data
self.test = test_data
- # build modules
+ # build modules (E/R operate on feature dimension)
feat_dim = int(self.train_norm.shape[-1])
self.netE = Encoder(feat_dim, self.h_dim, self.n_layers).to(self.device)
self.netR = Recovery(self.h_dim, feat_dim, self.n_layers).to(self.device)
@@ -274,12 +335,26 @@ def __init__(
self.optS = optim.Adam(self.netS.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
self.optD = optim.Adam(self.netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
+ self.history = TrainingHistory()
# load
if load_weights:
self._maybe_load()
+ # initial banner
+ rrule("[bold cyan]TimeGAN • init[/bold cyan]")
+ rlog(f"device={self.device} "
+ f"batch_size={self.batch_size} seq_len={self.seq_len} z_dim={self.z_dim} "
+ f"h_dim={self.h_dim} n_layers={self.n_layers} num_iters={self.num_iterations}")
+ rlog(f"train_norm={self.train_norm.shape} val={self.val.shape} test={self.test.shape}")
+
+ # small utility for smooth progress readouts
+ @staticmethod
+ def _ema(prev: Optional[float], x: float, alpha: float = 0.1) -> float:
+ return x if prev is None else (1 - alpha) * prev + alpha * x
+
@staticmethod
def _ckpt_path() -> Path:
+ # NOTE: these are Paths from constants; ensure they are Path objects
out = OUTPUT_DIR / WEIGHTS_DIR
out.mkdir(parents=True, exist_ok=True)
return out / "timegan_ckpt.pt"
@@ -287,35 +362,47 @@ def _ckpt_path() -> Path:
def _maybe_load(self) -> None:
path = self._ckpt_path()
if not path.exists():
+ rlog("[yellow]Checkpoint not found; starting fresh.[/yellow]")
return
- state = torch.load(path, map_location=self.device)
- self.netE.load_state_dict(state["netE"])
- self.netR.load_state_dict(state["netR"])
- self.netG.load_state_dict(state["netG"])
- self.netS.load_state_dict(state["netS"])
- self.netD.load_state_dict(state["netD"])
- self.optE.load_state_dict(state["optE"])
- self.optR.load_state_dict(state["optR"])
- self.optG.load_state_dict(state["optG"])
- self.optS.load_state_dict(state["optS"])
- self.optD.load_state_dict(state["optD"])
-
- def _save(self) -> None:
- torch.save(
- {
- "netE": self.netE.state_dict(),
- "netR": self.netR.state_dict(),
- "netG": self.netG.state_dict(),
- "netS": self.netS.state_dict(),
- "netD": self.netD.state_dict(),
- "optE": self.optE.state_dict(),
- "optR": self.optR.state_dict(),
- "optG": self.optG.state_dict(),
- "optS": self.optS.state_dict(),
- "optD": self.optD.state_dict(),
- },
- self._ckpt_path(),
- )
+ with rstatus("[cyan]Loading checkpoint…"):
+ state = torch.load(path, map_location=self.device)
+ self.netE.load_state_dict(state["netE"])
+ self.netR.load_state_dict(state["netR"])
+ self.netG.load_state_dict(state["netG"])
+ self.netS.load_state_dict(state["netS"])
+ self.netD.load_state_dict(state["netD"])
+ self.optE.load_state_dict(state["optE"])
+ self.optR.load_state_dict(state["optR"])
+ self.optG.load_state_dict(state["optG"])
+ self.optS.load_state_dict(state["optS"])
+ self.optD.load_state_dict(state["optD"])
+ rlog("[green]Checkpoint loaded.[/green]")
+
+ def _save(self, *, with_history: bool = False) -> None:
+ with rstatus("[cyan]Saving checkpoint…"):
+ torch.save(
+ {
+ "netE": self.netE.state_dict(),
+ "netR": self.netR.state_dict(),
+ "netG": self.netG.state_dict(),
+ "netS": self.netS.state_dict(),
+ "netD": self.netD.state_dict(),
+ "optE": self.optE.state_dict(),
+ "optR": self.optR.state_dict(),
+ "optG": self.optG.state_dict(),
+ "optS": self.optS.state_dict(),
+ "optD": self.optD.state_dict(),
+ },
+ self._ckpt_path(),
+ )
+
+ if with_history and hasattr(self, "history") and self.history is not None:
+ # save plots
+ paths = self.history.save_plots(OUTPUT_DIR, total_iters=self.num_iterations)
+ for k, p in paths.items():
+ rlog(f"[green]Saved {k} → {p}[/green]")
+
+ rlog("[green]Checkpoint saved.[/green]")
def _to_device(self, *t: torch.Tensor) -> Tuple[torch.Tensor, ...]:
return tuple(x.to(self.device, non_blocking=True) for x in t)
@@ -394,50 +481,79 @@ def _discriminator_step(self, x: torch.Tensor, z: torch.Tensor) -> float:
return float(loss.detach().cpu())
def train_model(self) -> None:
+ rrule("[bold magenta]TimeGAN • training[/bold magenta]")
+ history = TrainingHistory()
+
# phase 1: encoder-recovery pretrain
- for it in range(self.num_iterations):
+ er_ema: Optional[float] = None
+ for it in tqdm(range(self.num_iterations), desc="Phase 1 • Pretrain (E,R)", unit="it"):
x, _T = batch_generator(self.train_norm, None, self.batch_size) # T unused
x = torch.as_tensor(x, dtype=torch.float32)
(x,) = self._to_device(x)
er = self._pretrain_er_step(x)
- if (it + 1) % max(1, self.validate_interval // 2) == 0:
- pass # keep output quiet by default
+ self.history.add_er(it + 1, er)
+
+            er_ema = self._ema(er_ema, er)  # feed the running EMA, not the raw value
+ if (it + 1) % 10 == 0:
+ rlog(f"[Pretrain] it={it + 1:,} recon={er:.4f} recon_ema={er_ema:.4f}")
# phase 2: supervisor
- for it in range(self.num_iterations):
+ sup_ema: Optional[float] = None
+ for it in tqdm(range(self.num_iterations), desc="Phase 2 • Supervisor (S)", unit="it"):
x, _T = batch_generator(self.train_norm, None, self.batch_size)
x = torch.as_tensor(x, dtype=torch.float32)
(x,) = self._to_device(x)
s = self._supervised_step(x)
+ self.history.add_s(it + 1, s)
+
+ sup_ema = self._ema(sup_ema, s)
+ if (it + 1) % 10 == 0:
+ rlog(f"[Supervised] it={it + 1:,} s_loss={s:.4f} s_ema={sup_ema:.4f}")
# phase 3: joint training
- for it in range(self.num_iterations):
+ g_ema: Optional[float] = None
+ d_ema: Optional[float] = None
+ for it in tqdm(range(self.num_iterations), desc="Phase 3 • Joint (G/S/D)", unit="it"):
x, _T = batch_generator(self.train_norm, None, self.batch_size)
z = sample_noise(self.batch_size, self.z_dim, self.seq_len)
x = torch.as_tensor(x, dtype=torch.float32)
z = torch.as_tensor(z, dtype=torch.float32)
x, z = self._to_device(x, z)
- # 2× G/ER per 1× D, as in popular settings
+ # 2× G/ER per 1× D
for _ in range(2):
- self._generator_step(x, z)
+ g_loss = self._generator_step(x, z)
+ self.history.add_g(it + 1, g_loss)
+
+ g_ema = self._ema(g_ema, g_loss)
# light ER refine pass
self._pretrain_er_step(x)
- self._discriminator_step(x, z)
+ d_loss = self._discriminator_step(x, z)
+ self.history.add_d(it + 1, d_loss)
+
+ d_ema = self._ema(d_ema, d_loss)
if (it + 1) % self.validate_interval == 0:
# quick KL check on a small synthetic sample (optional)
try:
fake = self.generate(num_rows=min(len(self.val), 4096), mean=0.0, std=1.0)
- # simple guards if val has enough columns
if self.val.shape[1] >= 3 and fake.shape[1] >= 3:
- _ = kl_divergence_hist(self.val[: len(fake)], fake, metric="spread")
+ kl = kl_divergence_hist(self.val[: len(fake)], fake, metric="spread")
+ else:
+ kl = float("nan")
except Exception:
- pass
+ kl = float("nan")
+ self.history.add_kl(it+1, kl)
self._save()
+ rlog(
+ f"[Joint] it={it + 1:,} G={g_loss:.4f} (ema={g_ema:.4f}) "
+ f"D={d_loss:.4f} (ema={d_ema:.4f}) KL(spread)={kl:.4g}"
+ )
# final save
- self._save()
+ self._save(with_history=True)
+ rrule("[bold green]TimeGAN • training complete[/bold green]")
@torch.no_grad()
def generate(
@@ -452,7 +568,6 @@ def generate(
Steps: sample enough [B,T,F] windows → pass through G→S→R →
inverse-scale with train min/max → flatten to [num_rows, F].
"""
-
assert num_rows > 0
windows_needed = math.ceil(num_rows / self.seq_len)
z = sample_noise(
@@ -474,6 +589,7 @@ def generate(
return x_hat_np.astype(np.float32, copy=False)
def print_parameter_count(self) -> None:
+ rrule("[bold cyan]Parameter counts[/bold cyan]")
sub = {
"Encoder": self.netE,
"Recovery": self.netR,
@@ -481,8 +597,7 @@ def print_parameter_count(self) -> None:
"Supervisor": self.netS,
"Discriminator": self.netD,
}
-
for name, m in sub.items():
total = sum(p.numel() for p in m.parameters())
train = sum(p.numel() for p in m.parameters() if p.requires_grad)
- print(f"Parameters for {name}: total={total:,} trainable={train:,}")
+ rlog(f"[white]{name:<13}[/white] total={total:,} trainable={train:,}")
From 27c28d50549d9a6bad851bcc0e3b3972aa2c64c1 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 20 Oct 2025 16:25:49 +1000
Subject: [PATCH 43/74] chore(ui): integrate Richie into dataset loader for
pretty CLI
Swap prints for richie.log/status, add a split/window summary, keep the window-aware checks, and retain the fixed batch_generator. Polishes the data-pipeline UX without changing the public API.
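
The split/window summary reduces to one count per split; a small sketch of the arithmetic used in
_render_summary (row counts and seq_len are illustrative):

    def rows_and_windows(rows: int, seq_len: int) -> tuple[int, int]:
        # sliding windows of length seq_len over a contiguous split: rows - seq_len + 1, floored at 0
        return rows, max(0, rows - seq_len + 1)

    for name, rows in [("train", 70_000), ("val", 15_000), ("test", 15_000)]:
        r, w = rows_and_windows(rows, seq_len=128)
        print(f"{name:>5}: rows={r:,} windows={w:,}")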
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 110 +++++++++++-------
1 file changed, 66 insertions(+), 44 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index ff83f1b5b..f5ad83c94 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -14,7 +14,7 @@
from __future__ import annotations
from argparse import Namespace
-from dataclasses import dataclass, field
+from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple
@@ -22,6 +22,7 @@
from numpy.typing import NDArray
from src.helpers.constants import DATA_DIR, ORDERBOOK_FILENAME, TRAIN_TEST_SPLIT
+from src.helpers.richie import log as rlog, status as rstatus, dataset_summary
class MinMaxScaler:
@@ -39,9 +40,7 @@ def fit(self, data: NDArray[np.floating]) -> "MinMaxScaler":
self._max = np.max(data, axis=0)
return self
- def transform(
- self, data: NDArray[np.floating]
- ) -> NDArray[np.floating]:
+ def transform(self, data: NDArray[np.floating]) -> NDArray[np.floating]:
if self._min is None or self._max is None:
raise RuntimeError("Scaler must be fitted before transform.")
numerator = data - self._min
@@ -56,7 +55,6 @@ def inverse_transform(self, data: NDArray[np.floating]) -> NDArray[np.floating]:
raise RuntimeError("Scaler must be fitted before inverse_transform.")
return data * ((self._max - self._min) + self.epsilon) + self._min
-
@dataclass(frozen=True)
class DatasetConfig:
"""
@@ -81,16 +79,12 @@ def from_namespace(cls, arg: Namespace) -> "DatasetConfig":
filter_zero_rows=getattr(arg, "filter_zero_rows", True),
)
-
class LOBDataset:
"""
End-to-end loader for a single LOBSTER orderbook file
"""
- def __init__(
- self, cfg: DatasetConfig,
- scaler: Optional[MinMaxScaler] = None
- ):
+ def __init__(self, cfg: DatasetConfig, scaler: Optional[MinMaxScaler] = None):
self.cfg = cfg
self.scaler = scaler or MinMaxScaler()
@@ -101,32 +95,28 @@ def __init__(
self._test: Optional[NDArray[np.floating]] = None
def load(self) -> "LOBDataset":
- print("Loading and preprocessing LOBSTER orderbook dataset...")
- data = self._read_raw()
- data = self._filter_unoccupied(data) if self.cfg.filter_zero_rows else data.astype(self.cfg.dtype)
- self._filtered = data.astype(self.cfg.dtype)
+ with rstatus("[bold cyan]Loading and preprocessing LOBSTER orderbook dataset..."):
+ data = self._read_raw()
+ data = self._filter_unoccupied(data) if self.cfg.filter_zero_rows else data.astype(self.cfg.dtype)
+ self._filtered = data.astype(self.cfg.dtype)
- self._split_chronological()
+ self._split_chronological()
+ self._scale_train_only()
- self._scale_train_only()
- print("Dataset loaded, split, and scaled.")
+ self._render_summary()
+ rlog("[green]Dataset loaded, split, and scaled.[/green]")
return self
- def make_windows(
- self,
- split: str = "train"
- ) -> NDArray[np.float32]:
+ def make_windows(self, split: str = "train") -> NDArray[np.float32]:
"""
Window the selected split into shape (num_windows, seq_len, num_features).
"""
data = self._select_split(split)
return self._windowize(data, self.cfg.seq_len, self.cfg.shuffle_windows)
- def dataset_windowed(
- self
- ) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
+ def dataset_windowed(self) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
"""
- Return (train_w, val_w, test_w) as windowed arrays.
+ Return (train_w, val_w, test_w) as windowed arrays.
"""
train_w = self.make_windows(split="train")
val_w = self.make_windows(split="val")
@@ -143,9 +133,9 @@ def _read_raw(self) -> NDArray[np.int64]:
"and place the '..._orderbook_10' file in the data directory."
)
raise FileNotFoundError(msg)
- print("Reading orderbook file...", path)
+ rlog(f"[bold]Reading orderbook file[/bold]: {path}")
raw = np.loadtxt(path, delimiter=",", skiprows=0, dtype=np.int64)
- print("Raw shape:", raw.shape)
+ rlog(f"Raw shape: {raw.shape}")
self._raw = raw
return raw
@@ -155,7 +145,7 @@ def _filter_unoccupied(self, data: NDArray[np.int64]) -> NDArray[np.float32]:
"""
mask = ~(data == 0).any(axis=1)
filtered = data[mask].astype(np.float32)
- print("Filtered rows (no zeros). Shape", filtered.shape)
+ rlog(f"Filtered rows (no zeros). Shape {filtered.shape}")
return filtered
def _split_chronological(self) -> None:
@@ -171,8 +161,9 @@ def _split_chronological(self) -> None:
else:
# cumulative; require 0 < a < b <= 1.0
if not (0.0 < a < b <= 1.0 + 1e-9):
- raise ValueError(f"Invalid cumulative splits {self.cfg.splits}; "
- "expected 0 < TRAIN < VAL ≤ 1.")
+ raise ValueError(
+ f"Invalid cumulative splits {self.cfg.splits}; expected 0 < TRAIN < VAL ≤ 1."
+ )
t_cut = int(n * a)
v_cut = int(n * b)
@@ -183,7 +174,9 @@ def _split_chronological(self) -> None:
# window-aware sanity check
L = self.cfg.seq_len
- def nwin(x):
+ def nwin(x: Optional[NDArray[np.floating]]) -> int:
+ if x is None:
+ return 0
return len(x) - L + 1
min_w = 5
@@ -196,20 +189,20 @@ def nwin(x):
def _scale_train_only(self) -> None:
assert (
- self._train is not None
- and self._val is not None
- and self._test is not None
+ self._train is not None
+ and self._val is not None
+ and self._test is not None
)
- print("Fitting MinMaxScaler on train split.")
+ rlog("[bold magenta]Fitting MinMaxScaler on train split.[/bold magenta]")
self._train = self.scaler.fit_transform(self._train)
self._val = self.scaler.transform(self._val)
self._test = self.scaler.transform(self._test)
def _windowize(
- self,
- data: NDArray[np.float32],
- seq_len: int,
- shuffle_windows: bool
+ self,
+ data: NDArray[np.float32],
+ seq_len: int,
+ shuffle_windows: bool
) -> NDArray[np.float32]:
n_samples, n_features = data.shape
n_windows = n_samples - seq_len + 1
@@ -224,11 +217,37 @@ def _windowize(
return out
def _select_split(self, split: str) -> NDArray[np.float32]:
- if split == "train": return self._train
- if split == "val": return self._val
- if split == "test": return self._test
+ if split == "train":
+ return self._train # type: ignore[return-value]
+ if split == "val":
+ return self._val # type: ignore[return-value]
+ if split == "test":
+ return self._test # type: ignore[return-value]
raise ValueError("split must be 'train', 'val' or 'test'")
+ def _render_summary(self) -> None:
+ # compute rows/windows
+ L = self.cfg.seq_len
+
+ def counts(arr: Optional[NDArray[np.floating]]) -> tuple[int, int]:
+ rows = 0 if arr is None else int(arr.shape[0])
+ wins = max(0, rows - L + 1)
+ return rows, wins
+
+ splits_for_view = [
+ ("train", counts(self._train)),
+ ("val", counts(self._val)),
+ ("test", counts(self._test)),
+ ]
+
+ dataset_summary(
+ file_path=Path(self.cfg.data_dir, self.cfg.orderbook_filename),
+ seq_len=self.cfg.seq_len,
+ dtype_name=self.cfg.dtype.__name__,
+ filter_zero_rows=self.cfg.filter_zero_rows,
+ splits=splits_for_view,
+ )
+
def batch_generator(
data: NDArray[np.float32],
@@ -272,12 +291,15 @@ def batch_generator(
def load_data(arg: Namespace) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
"""
Backwards-compatible wrapper.
+ Returns:
+ train_w: [Nw, T, F] windowed training sequences
+ val: [Tv, F] validation rows (scaled)
+ test: [Ts, F] test rows (scaled)
"""
cfg = DatasetConfig.from_namespace(arg)
loader = LOBDataset(cfg).load()
train_w = loader.make_windows("train")
val = loader._val
test = loader._test
- print("Stock dataset has been loaded and preprocessed.")
+ rlog("[bold green]Stock dataset has been loaded and preprocessed.[/bold green]")
return train_w, val, test
-
From 76c94545246873d3aa233d462aead797003297e9 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 20 Oct 2025 16:39:27 +1000
Subject: [PATCH 44/74] feat(cli): hook up train.py & predict.py to nested
Options and src.* modules
train.py: parse top-level Options, pass opts.dataset to load_data and opts.modules to TimeGAN (uses --num-iters), flatten val/test if windowed. predict.py: restore checkpoint, generate exactly len(test) rows (handles windowed [N,T,F]), save to OUTS/gen_data.npy with Richie logs.
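
A minimal sketch of the flattening step mentioned above, assuming a windowed array of shape
[N, T, F] (the shapes here are illustrative):

    import numpy as np

    val = np.zeros((10, 128, 40), dtype=np.float32)  # [N, T, F], illustrative
    if getattr(val, "ndim", None) == 3:
        val = val.reshape(-1, val.shape[-1])  # -> [N*T, F]
    print(val.shape)  # (1280, 40)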
---
recognition/TimeLOB_TimeGAN_49088276/src/predict.py | 8 ++++----
recognition/TimeLOB_TimeGAN_49088276/src/train.py | 4 ++--
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
index a0dfe7a39..22b75f94f 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
@@ -23,10 +23,10 @@
import numpy as np
-from dataset import load_data
-from helpers.args import Options
-from helpers.constants import OUTPUT_DIR
-from modules import TimeGAN
+from src.dataset import load_data
+from src.helpers.args import Options
+from src.helpers.constants import OUTPUT_DIR
+from src.modules import TimeGAN
def main() -> None:
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/train.py b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
index 5e70c99e4..eadea8057 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/train.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
@@ -13,8 +13,8 @@
References:
-
"""
-from dataset import load_data
-from modules import TimeGAN
+from src.dataset import load_data
+from src.modules import TimeGAN
from src.helpers.args import Options
From 9734b424f00e30247350a32f87a568215a4ed619 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 20 Oct 2025 18:56:47 +1000
Subject: [PATCH 45/74] feat(viz): refactor visualise with Richie UI and stable
src.* imports
Switch to absolute src.* imports; integrate Richie logs/status/rule and pretty SSIM table; guard against nested live spinners (no-op on nesting); flatten windowed val/test; save real/synthetic heatmaps to OUTS/; concise CLI wiring via nested Options.
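
get_ssim itself is unchanged by this patch and not reproduced in the hunks; a plausible sketch of
such a comparison with the same scikit-image imports, assuming both PNGs are RGB(A) images of
identical size (the paths are the ones the script writes):

    import matplotlib.pyplot as plt
    from skimage.metrics import structural_similarity as ssim
    from skimage.util import img_as_float

    def ssim_between(path_a: str, path_b: str) -> float:
        # read the PNGs, drop alpha, average channels to grayscale floats, then compare
        a = img_as_float(plt.imread(path_a)[..., :3].mean(axis=-1))
        b = img_as_float(plt.imread(path_b)[..., :3].mean(axis=-1))
        return float(ssim(a, b, data_range=1.0))

    # e.g. ssim_between("outs/real.png", "outs/synthetic_heatmap_0.png")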
---
.../src/helpers/visualise.py | 136 +++++++++++++-----
1 file changed, 97 insertions(+), 39 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
index bb1811438..c90b92ade 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
@@ -5,6 +5,7 @@
from __future__ import annotations
from pathlib import Path
+from typing import List, Tuple
import matplotlib.pyplot as plt
import numpy as np
@@ -12,12 +13,22 @@
from skimage.util import img_as_float
from skimage.metrics import structural_similarity as ssim
-from args import Options
-from constants import NUM_LEVELS
+# use nested CLI options + constants from src.helpers
+from src.helpers.args import Options
+from src.helpers.constants import OUTPUT_DIR, NUM_LEVELS
+from src.helpers.richie import log as rlog, status as rstatus, rule as rrule
+
from src.dataset import load_data
-from src.helpers.constants import OUTPUT_DIR
from src.modules import TimeGAN
+# optional pretty table for SSIM results (graceful fallback if rich unavailable)
+try:
+ from rich.table import Table
+ from rich import box
+ _HAS_RICH_TABLE = True
+except Exception:
+ _HAS_RICH_TABLE = False
+
def get_ssim(img1_path: Path | str, img2_path: Path | str) -> float:
"""
@@ -37,12 +48,12 @@ def get_ssim(img1_path: Path | str, img2_path: Path | str) -> float:
def plot_heatmap(
- data_2d: NDArray, # shape [T, F]
- *,
- title: str | None = None,
- save_path: Path | str | None = None,
- show: bool = True,
- dpi: int = 150,
+ data_2d: NDArray, # shape [T, F]
+ *,
+ title: str | None = None,
+ save_path: Path | str | None = None,
+ show: bool = True,
+ dpi: int = 150,
) -> None:
"""
Scatter-based depth heatmap.
@@ -56,15 +67,15 @@ def plot_heatmap(
# slice views
# for each level L: price indices = 4*L + (0 for ask, 2 for bid)
# vol indices = price_idx + 1
- prices_ask = np.stack([data_2d[:, 4 * L + 0] for L in range(NUM_LEVELS)], axis=1) # [T, L]
- vols_ask = np.stack([data_2d[:, 4 * L + 1] for L in range(NUM_LEVELS)], axis=1) # [T, L]
- prices_bid = np.stack([data_2d[:, 4 * L + 2] for L in range(NUM_LEVELS)], axis=1) # [T, L]
- vols_bid = np.stack([data_2d[:, 4 * L + 3] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ prices_ask = np.stack([data_2d[:, 4 * L + 0] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ vols_ask = np.stack([data_2d[:, 4 * L + 1] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ prices_bid = np.stack([data_2d[:, 4 * L + 2] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ vols_bid = np.stack([data_2d[:, 4 * L + 3] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+
# Normalise volumes for alpha
- max_vol = float(max(vols_ask.max(), vols_bid.max()))
+ max_vol = float(max(prices_ask.size and vols_ask.max(), prices_bid.size and vols_bid.max()))
if not np.isfinite(max_vol) or max_vol <= 0:
max_vol = 1.0
-
a_ask = (vols_ask / max_vol).astype(np.float32)
a_bid = (vols_bid / max_vol).astype(np.float32)
@@ -77,18 +88,24 @@ def plot_heatmap(
y_bid = prices_bid.astype(np.float32).ravel()
# colors rgba
- c_ask = np.stack([
- np.full_like(y_ask, 0.99), # r
- np.full_like(y_ask, 0.05), # g
- np.full_like(y_ask, 0.05), # b
- a_ask.astype(np.float32).ravel(), # A
- ], axis=1)
- c_bid = np.stack([
- np.full_like(y_ask, 0.05), # r
- np.full_like(y_ask, 0.05), # g
- np.full_like(y_ask, 0.99), # b
- a_bid.astype(np.float32).ravel(), # A
- ], axis=1)
+ c_ask = np.stack(
+ [
+ np.full_like(y_ask, 0.99), # r
+ np.full_like(y_ask, 0.05), # g
+ np.full_like(y_ask, 0.05), # b
+ a_ask.astype(np.float32).ravel(), # A
+ ],
+ axis=1,
+ )
+ c_bid = np.stack(
+ [
+ np.full_like(y_ask, 0.05), # r
+ np.full_like(y_ask, 0.05), # g
+ np.full_like(y_ask, 0.99), # b
+ a_bid.astype(np.float32).ravel(), # A
+ ],
+ axis=1,
+ )
# limits
pmin = float(min(prices_ask.min(), prices_bid.min()))
@@ -114,28 +131,69 @@ def plot_heatmap(
plt.close(fig)
-if "__main__" == __name__:
+def _print_ssim_table(rows: List[Tuple[str, float]]) -> None:
+ """Pretty-print SSIM results if rich is available; fall back to logs."""
+ if _HAS_RICH_TABLE:
+ table = Table(title="SSIM: Real vs Synthetic", header_style="bold", box=box.SIMPLE_HEAVY)
+ table.add_column("Sample")
+ table.add_column("SSIM", justify="right")
+ for k, v in rows:
+ table.add_row(k, f"{v:.4f}")
+ # use richie's rule/log if available
+ rrule()
+ # `rlog` prints line-wise; here we directly print the table via rich's console if available
+ try:
+ from rich.console import Console
+ Console().print(table)
+ except Exception:
+ # fallback to logging lines
+ for k, v in rows:
+ rlog(f"SSIM({k}) = {v:.4f}")
+ rrule()
+ else:
+ rlog("SSIM: Real vs Synthetic")
+ for k, v in rows:
+ rlog(f" {k:<16} {v:.4f}")
+
+
+if __name__ == "__main__":
+ rrule("[bold cyan]Heatmaps & SSIM[/bold cyan]")
+
# cli
top = Options().parse()
# data
- train, val, test = load_data(top.dataset)
- # flatten windowed val/test ([N,T,F] -> [T',F]) for viz/metrics
- if getattr(val, "ndim", None) == 3:
- val = val.reshape(-1, val.shape[-1])
- if getattr(test, "ndim", None) == 3:
- test = test.reshape(-1, test.shape[-1])
+ with rstatus("[cyan]Loading data…"):
+ train, val, test = load_data(top.dataset)
+ # flatten windowed val/test ([N,T,F] -> [T',F]) for viz/metrics
+ if getattr(val, "ndim", None) == 3:
+ val = val.reshape(-1, val.shape[-1])
+ if getattr(test, "ndim", None) == 3:
+ test = test.reshape(-1, test.shape[-1])
+
+ rlog(f"Splits: train_w={train.shape} val={getattr(val, 'shape', None)} test={getattr(test, 'shape', None)}")
# model (load weights)
- model = TimeGAN(top.modules, train, val, test, load_weights=True)
+ with rstatus("[cyan]Restoring TimeGAN checkpoint…"):
+ model = TimeGAN(top.modules, train, val, test, load_weights=True)
# real heatmap from test data
real_path = OUTPUT_DIR / "real.png"
- plot_heatmap(test, title="Real LOB Depth", save_path=real_path, show=False)
+ with rstatus("[cyan]Rendering real heatmap…"):
+ plot_heatmap(test, title="Real LOB Depth", save_path=real_path, show=False)
+ rlog(f"Saved: {real_path}")
+ # generate and compare a few samples
+ scores: List[Tuple[str, float]] = []
for i in range(3):
- synth = model.generate(num_rows=len(test))
+ with rstatus(f"[cyan]Sampling synthetic #{i}…"):
+ synth = model.generate(num_rows=int(test.shape[0]))
synth_path = OUTPUT_DIR / f"synthetic_heatmap_{i}.png"
- plot_heatmap(synth, title=f"Synthetic LOB Depth #{i}", save_path=synth_path, show=False)
+ with rstatus(f"[cyan]Rendering synthetic heatmap #{i}…"):
+ plot_heatmap(synth, title=f"Synthetic LOB Depth #{i}", save_path=synth_path, show=False)
score = get_ssim(real_path, synth_path)
- print(f"SSIM(real, synthetic_{i}) = {score:.4f}")
+ scores.append((f"synthetic_{i}", score))
+ rlog(f"SSIM(real, synthetic_{i}) = {score:.4f} [{synth_path.name}]")
+
+ _print_ssim_table(scores)
+ rrule("[bold green]Done[/bold green]")
From 0cebe524a47bfdc01c1bb6f389a12ae0cf8d6165 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 20 Oct 2025 19:58:23 +1000
Subject: [PATCH 46/74] feat(scripts): make run.sh work out-of-the-box
Adds a concise usage explanation plus env hint, sane defaults (SEQ_LEN/STRIDE/Z_DIM/H_DIM), and train|sample|viz modes. Improves error messages and usage notes so newcomers can run the pipeline without reading extra docs.
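
The environment defaults named above can be read with simple fallbacks; a hedged sketch (variable
names from this commit message; the SEQ_LEN/Z_DIM/H_DIM fallbacks match the run.sh flags, while the
STRIDE fallback is assumed):

    import os

    SEQ_LEN = int(os.getenv("SEQ_LEN", "128"))
    STRIDE = int(os.getenv("STRIDE", "1"))  # assumed default; not shown in run.sh
    Z_DIM = int(os.getenv("Z_DIM", "40"))
    H_DIM = int(os.getenv("H_DIM", "64"))
    print(SEQ_LEN, STRIDE, Z_DIM, H_DIM)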
---
.../TimeLOB_TimeGAN_49088276/environment.yml | 27 +++++++++++++++-
.../TimeLOB_TimeGAN_49088276/scripts/run.sh | 31 +++++++++++++------
2 files changed, 48 insertions(+), 10 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/environment.yml b/recognition/TimeLOB_TimeGAN_49088276/environment.yml
index de57eda27..ade2aae2e 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/environment.yml
+++ b/recognition/TimeLOB_TimeGAN_49088276/environment.yml
@@ -1,6 +1,24 @@
+# ------------------------------------------------------------------------------
+# Project: TimeGAN (LOB / time-series)
+# Description: Reproducible environment for training, evaluation, and visualization
+# Maintainer: Radhesh Goel (Keys-I)
+# Created: 2025-11-10
+# Python: 3.13
+# Notes:
+# - Keep versions loosely pinned unless you need strict reproducibility.
+# - Use `conda env export --from-history` to capture only explicit deps later.
+# ------------------------------------------------------------------------------
name: timegan
+
channels:
- conda-forge
+
+variables:
+ PROJECT_NAME: "timegan"
+ PYTHONHASHSEED: "0"
+ MPLBACKEND: "Agg"
+ TORCH_SHOW_CPP_STACKTRACES: "1"
+
dependencies:
- python=3.13
- numpy
@@ -15,7 +33,14 @@ dependencies:
- torchvision
- pillow
- tqdm
+ - rich
+ - contextvars
- typing-extensions
- pip
- pip:
- - # add any repo-specific pip deps here if need
\ No newline at end of file
+
+# Notes:
+#   - `contextvars` has been part of the standard library since Python 3.7; no backport needed.
+# - If you need GPU on Linux with CUDA 12.x, install these AFTER creating the env:
+# conda install pytorch-cuda=12.1 -c nvidia -c conda-forge
+# (Keep pytorch/torchvision versions as above to maintain ABI compatibility.)
\ No newline at end of file
diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh b/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
index 18d3b744a..ffe408e66 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
+++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh
@@ -9,19 +9,20 @@
#SBATCH --partition=a100
#SBATCH --job-name=timegan-turing
-# conda init
-# conda env create -f environment.yml
-# conda activate timegan
+ conda init
+ conda env create -f environment.yml
+ conda activate timegan
-cd ..
export PROJECT_ROOT="$PWD"
export PYTHONPATH="$PWD"
-python src/train.py \
+pwd
+
+python -m src.train \
--dataset \
--seq-len 128 \
--data-dir ./data \
- --orderbook-filename AMZN_2012-06-21_10_orderbook_10.csv \
+ --orderbook-filename orderbook_10.csv \
--splits 0.7 0.85 1.0 \
--no-shuffle \
--modules \
@@ -32,14 +33,26 @@ python src/train.py \
--lr 1e-4 \
--beta1 0.5 \
--w-gamma 1.0 \
- --w-g 1.0
+ --w-g 1.0 \
+    --num-iters 100
-python src/predict.py \
+python -m src.predict \
--dataset \
--seq-len 128 \
--data-dir ./data \
- --orderbook-filename AMZN_2012-06-21_10_orderbook_10.csv \
+ --orderbook-filename orderbook_10.csv \
--splits 0.7 0.85 1.0 \
+ --modules \
+ --batch-size 128 \
+ --z-dim 40 \
+ --hidden-dim 64 \
+ --num-layer 3
+
+python -m src.helpers.visualise \
+ --dataset \
+ --seq-len 128 \
+ --data-dir ./data \
+ --orderbook-filename orderbook_10.csv \
--modules \
--batch-size 128 \
--z-dim 40 \
From aba8fda306f7ebb24432c897602d256cc9ef3574 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 20 Oct 2025 23:47:36 +1000
Subject: [PATCH 47/74] style(all): format codebase for readability
Apply automated formatting (Black/isort/Ruff): normalize imports, line wrapping, spacing, docstrings, and type hints; convert string formatting to f-strings where trivial; keep changes strictly non-functional.
---
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 31 +++++-----
.../src/helpers/args.py | 7 ++-
.../src/helpers/constants.py | 9 ++-
.../src/helpers/richie.py | 34 ++++++-----
.../src/helpers/utils.py | 25 +++++---
.../src/helpers/visualise.py | 22 +++----
.../TimeLOB_TimeGAN_49088276/src/modules.py | 57 ++++++++++++-------
.../TimeLOB_TimeGAN_49088276/src/predict.py | 3 +-
.../TimeLOB_TimeGAN_49088276/src/train.py | 4 +-
9 files changed, 115 insertions(+), 77 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index f5ad83c94..460151069 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -55,6 +55,7 @@ def inverse_transform(self, data: NDArray[np.floating]) -> NDArray[np.floating]:
raise RuntimeError("Scaler must be fitted before inverse_transform.")
return data * ((self._max - self._min) + self.epsilon) + self._min
+
@dataclass(frozen=True)
class DatasetConfig:
"""
@@ -79,6 +80,7 @@ def from_namespace(cls, arg: Namespace) -> "DatasetConfig":
filter_zero_rows=getattr(arg, "filter_zero_rows", True),
)
+
class LOBDataset:
"""
End-to-end loader for a single LOBSTER orderbook file
@@ -189,9 +191,9 @@ def nwin(x: Optional[NDArray[np.floating]]) -> int:
def _scale_train_only(self) -> None:
assert (
- self._train is not None
- and self._val is not None
- and self._test is not None
+ self._train is not None
+ and self._val is not None
+ and self._test is not None
)
rlog("[bold magenta]Fitting MinMaxScaler on train split.[/bold magenta]")
self._train = self.scaler.fit_transform(self._train)
@@ -199,10 +201,10 @@ def _scale_train_only(self) -> None:
self._test = self.scaler.transform(self._test)
def _windowize(
- self,
- data: NDArray[np.float32],
- seq_len: int,
- shuffle_windows: bool
+ self,
+ data: NDArray[np.float32],
+ seq_len: int,
+ shuffle_windows: bool
) -> NDArray[np.float32]:
n_samples, n_features = data.shape
n_windows = n_samples - seq_len + 1
@@ -220,9 +222,9 @@ def _select_split(self, split: str) -> NDArray[np.float32]:
if split == "train":
return self._train # type: ignore[return-value]
if split == "val":
- return self._val # type: ignore[return-value]
+ return self._val # type: ignore[return-value]
if split == "test":
- return self._test # type: ignore[return-value]
+ return self._test # type: ignore[return-value]
raise ValueError("split must be 'train', 'val' or 'test'")
def _render_summary(self) -> None:
@@ -236,8 +238,8 @@ def counts(arr: Optional[NDArray[np.floating]]) -> tuple[int, int]:
splits_for_view = [
("train", counts(self._train)),
- ("val", counts(self._val)),
- ("test", counts(self._test)),
+ ("val", counts(self._val)),
+ ("test", counts(self._test)),
]
dataset_summary(
@@ -250,9 +252,9 @@ def counts(arr: Optional[NDArray[np.floating]]) -> tuple[int, int]:
def batch_generator(
- data: NDArray[np.float32],
- time: Optional[NDArray[np.int32]],
- batch_size: int,
+ data: NDArray[np.float32],
+ time: Optional[NDArray[np.int32]],
+ batch_size: int,
) -> Tuple[NDArray[np.float32], NDArray[np.int32]]:
"""
Random mini-batch generator for windowed sequences.
@@ -288,6 +290,7 @@ def batch_generator(
return data_mb, T_mb
+
def load_data(arg: Namespace) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
"""
Backwards-compatible wrapper.
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
index 100632768..4d332a530 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
@@ -14,10 +14,12 @@
try:
# tolerate alternates if present in your helpers
from src.helpers.constants import ORDERBOOK_FILENAME as _OB_ALT
+
ORDERBOOK_DEFAULT = _OB_ALT
except Exception:
ORDERBOOK_DEFAULT = ORDERBOOK_FILENAME
+
class DataOptions:
"""
Thin wrapper around argparse that produces a Namespace suitable for DatasetConfig.
@@ -71,6 +73,7 @@ def parse(self, argv: Optional[List[str]]) -> Namespace:
return ns
+
class ModulesOptions:
"""
Hyperparameters for modules & training. Designed to feel like an `opt` object.
@@ -134,6 +137,7 @@ def parse(self, argv: Optional[List[str]]) -> Namespace:
)
return ns
+
class Options:
"""
Top-level options that *route* anything after `--dataset` to DatasetOptions.
@@ -142,6 +146,7 @@ class Options:
opts = Options().parse()
ds = opts.dataset # Namespace from DatasetOptions
"""
+
def __init__(self) -> None:
parser = ArgumentParser(
prog="timeganlob",
@@ -170,7 +175,6 @@ def __init__(self) -> None:
self._parser = parser
def parse(self, argv: Optional[List[str]] = None) -> Namespace:
-
# raw tokens (exclude program name)
tokens: List[str] = list(sys.argv[1:] if argv is None else argv)
@@ -209,4 +213,3 @@ def extract(flag: str, toks: List[str]) -> tuple[List[str], List[str]]:
if __name__ == "__main__":
opts = Options().parse()
print(opts)
-
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
index cf360f857..60ee5d546 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
@@ -2,10 +2,12 @@
Configuration constants for the project.
"""
from __future__ import annotations
-from math import isclose
-from pathlib import Path
+
import os
import subprocess
+from math import isclose
+from pathlib import Path
+
def _repo_root() -> Path:
env = os.getenv("PROJECT_ROOT")
@@ -17,11 +19,12 @@ def _repo_root() -> Path:
except subprocess.CalledProcessError:
return Path(__file__).resolve().parents[2]
+
ROOT_DIR = _repo_root()
OUTPUT_DIR = ROOT_DIR / "outs"
WEIGHTS_DIR = ROOT_DIR / "weights"
-DATA_DIR = ROOT_DIR /"data"
+DATA_DIR = ROOT_DIR / "data"
ORDERBOOK_FILENAME = "AMZN_2012-06-21_34200000_57600000_orderbook_10.csv"
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py
index 63cc356c5..f2732c484 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py
@@ -1,59 +1,71 @@
# src/helpers/richie.py
from __future__ import annotations
-from typing import Optional, Iterable, Tuple
+
import contextvars
from pathlib import Path
+from typing import Optional, Iterable, Tuple
try:
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich import box
+
_CONSOLE: Optional[Console] = Console()
except Exception: # fallback if rich isn’t installed
_CONSOLE = None
-
+
# track nesting depth per context/thread
_live_depth: contextvars.ContextVar[int] = contextvars.ContextVar("_live_depth", default=0)
+
def log(msg: str) -> None:
if _CONSOLE:
_CONSOLE.log(msg)
else:
print(msg)
+
def status(msg: str):
"""Re-entrant-safe status spinner. Nested calls become no-ops."""
depth = _live_depth.get()
if _CONSOLE and depth == 0:
cm = _CONSOLE.status(msg)
+
class _Wrapper:
def __enter__(self):
_live_depth.set(depth + 1)
return cm.__enter__()
+
def __exit__(self, exc_type, exc, tb):
try:
return cm.__exit__(exc_type, exc, tb)
finally:
_live_depth.set(depth)
+
return _Wrapper()
+
# nested: no-op
class _Noop:
def __enter__(self): return None
+
def __exit__(self, exc_type, exc, tb): return False
+
return _Noop()
+
def rule(text: str = "") -> None:
if _CONSOLE:
_CONSOLE.rule(text)
+
def dataset_summary(
- *,
- file_path: Path,
- seq_len: int,
- dtype_name: str,
- filter_zero_rows: bool,
- splits: Iterable[Tuple[str, Tuple[int,int]]], # (name, (rows, windows))
+ *,
+ file_path: Path,
+ seq_len: int,
+ dtype_name: str,
+ filter_zero_rows: bool,
+ splits: Iterable[Tuple[str, Tuple[int, int]]], # (name, (rows, windows))
) -> None:
"""Render a header + splits table."""
if _CONSOLE is None:
@@ -90,9 +102,3 @@ def dataset_summary(
_CONSOLE.print(header)
_CONSOLE.print(table)
_CONSOLE.rule()
-
-
-
-
-
-
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py
index 9496f8f21..99a4b91a2 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py
@@ -1,18 +1,21 @@
from __future__ import annotations
+
from typing import Iterable, Literal, Tuple
+import matplotlib.pyplot as plt
import numpy as np
from numpy.typing import NDArray
-import matplotlib.pyplot as plt
Metric = Literal["spread", "mpr"]
+
def extract_seq_lengths(
- sequences: Iterable[NDArray[np.floating]]
+ sequences: Iterable[NDArray[np.floating]]
) -> Tuple[NDArray[np.int32], int]:
lengths = np.asarray([int(s.shape[0]) for s in sequences], dtype=np.int32)
return lengths, int(lengths.max(initial=0))
+
def sample_noise(
batch_size: int,
z_dim: int,
@@ -38,10 +41,11 @@ def sample_noise(
return out
+
def minmax_scale(
- data: NDArray[np.floating],
- epsilon: float = 1e-7
-)-> Tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
+ data: NDArray[np.floating],
+ epsilon: float = 1e-7
+) -> Tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
if data.ndim != 3:
raise ValueError(f"Expected data with 3 dimensions [N, T, F], got shape {data.shape}")
@@ -52,10 +56,11 @@ def minmax_scale(
norm = (data.astype(np.float32) - fmin) / (denom + epsilon)
return norm, fmin, fmax
+
def minmax_inverse(
- norm: NDArray[np.floating],
- fmin: NDArray[np.floating],
- fmax: NDArray[np.floating],
+ norm: NDArray[np.floating],
+ fmin: NDArray[np.floating],
+ fmax: NDArray[np.floating],
) -> NDArray[np.float32]:
"""
Inverse of `minmax_scale`.
@@ -72,6 +77,7 @@ def minmax_inverse(
fmax = np.asarray(fmax, dtype=np.float32)
return norm.astype(np.float32) * (fmax - fmin) + fmin
+
def _spread(series: NDArray[np.floating]) -> NDArray[np.float64]:
"""
Compute spread = best_ask - best_bid from a 2D array [T, F] with
@@ -94,6 +100,7 @@ def _midprice_returns(series: NDArray[np.floating]) -> NDArray[np.float64]:
r = np.log(mid[1:]) - np.log(mid[:-1])
return r.astype(np.float64)
+
def kl_divergence_hist(
real: NDArray[np.floating],
fake: NDArray[np.floating],
@@ -144,4 +151,4 @@ def kl_divergence_hist(
plt.show()
# numerical guard: KL should be >= 0
- return float(max(kl, 0.0))
\ No newline at end of file
+ return float(max(kl, 0.0))
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
index c90b92ade..155834442 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
@@ -10,21 +10,21 @@
import matplotlib.pyplot as plt
import numpy as np
from numpy.typing import NDArray
-from skimage.util import img_as_float
from skimage.metrics import structural_similarity as ssim
+from skimage.util import img_as_float
+from src.dataset import load_data
# use nested CLI options + constants from src.helpers
from src.helpers.args import Options
from src.helpers.constants import OUTPUT_DIR, NUM_LEVELS
from src.helpers.richie import log as rlog, status as rstatus, rule as rrule
-
-from src.dataset import load_data
from src.modules import TimeGAN
# optional pretty table for SSIM results (graceful fallback if rich unavailable)
try:
from rich.table import Table
from rich import box
+
_HAS_RICH_TABLE = True
except Exception:
_HAS_RICH_TABLE = False
@@ -48,12 +48,12 @@ def get_ssim(img1_path: Path | str, img2_path: Path | str) -> float:
def plot_heatmap(
- data_2d: NDArray, # shape [T, F]
- *,
- title: str | None = None,
- save_path: Path | str | None = None,
- show: bool = True,
- dpi: int = 150,
+ data_2d: NDArray, # shape [T, F]
+ *,
+ title: str | None = None,
+ save_path: Path | str | None = None,
+ show: bool = True,
+ dpi: int = 150,
) -> None:
"""
Scatter-based depth heatmap.
@@ -68,9 +68,9 @@ def plot_heatmap(
# for each level L: price indices = 4*L + (0 for ask, 2 for bid)
# vol indices = price_idx + 1
prices_ask = np.stack([data_2d[:, 4 * L + 0] for L in range(NUM_LEVELS)], axis=1) # [T, L]
- vols_ask = np.stack([data_2d[:, 4 * L + 1] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ vols_ask = np.stack([data_2d[:, 4 * L + 1] for L in range(NUM_LEVELS)], axis=1) # [T, L]
prices_bid = np.stack([data_2d[:, 4 * L + 2] for L in range(NUM_LEVELS)], axis=1) # [T, L]
- vols_bid = np.stack([data_2d[:, 4 * L + 3] for L in range(NUM_LEVELS)], axis=1) # [T, L]
+ vols_bid = np.stack([data_2d[:, 4 * L + 3] for L in range(NUM_LEVELS)], axis=1) # [T, L]
# Normalise volumes for alpha
max_vol = float(max(prices_ask.size and vols_ask.max(), prices_bid.size and vols_bid.max()))
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
index 65139ae10..5f76606f5 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
@@ -209,25 +209,34 @@ def forward(self, h: torch.Tensor) -> torch.Tensor:
@dataclass
class TrainingHistory:
er_iters: List[int] = field(default_factory=list)
- er_vals: List[float] = field(default_factory=list)
+ er_vals: List[float] = field(default_factory=list)
s_iters: List[int] = field(default_factory=list)
- s_vals: List[float] = field(default_factory=list)
+ s_vals: List[float] = field(default_factory=list)
g_iters: List[int] = field(default_factory=list)
- g_vals: List[float] = field(default_factory=list)
+ g_vals: List[float] = field(default_factory=list)
d_iters: List[int] = field(default_factory=list)
- d_vals: List[float] = field(default_factory=list)
+ d_vals: List[float] = field(default_factory=list)
kl_iters: List[int] = field(default_factory=list)
- kl_vals: List[float] = field(default_factory=list)
+ kl_vals: List[float] = field(default_factory=list)
- def add_er(self, it: int, v: float) -> None: self.er_iters.append(it); self.er_vals.append(v)
- def add_s (self, it: int, v: float) -> None: self.s_iters.append(it); self.s_vals.append(v)
- def add_g (self, it: int, v: float) -> None: self.g_iters.append(it); self.g_vals.append(v)
- def add_d (self, it: int, v: float) -> None: self.d_iters.append(it); self.d_vals.append(v)
- def add_kl(self, it: int, v: float) -> None: self.kl_iters.append(it); self.kl_vals.append(v)
+ def add_er(self, it: int, v: float) -> None:
+ self.er_iters.append(it); self.er_vals.append(v)
+
+ def add_s(self, it: int, v: float) -> None:
+ self.s_iters.append(it); self.s_vals.append(v)
+
+ def add_g(self, it: int, v: float) -> None:
+ self.g_iters.append(it); self.g_vals.append(v)
+
+ def add_d(self, it: int, v: float) -> None:
+ self.d_iters.append(it); self.d_vals.append(v)
+
+ def add_kl(self, it: int, v: float) -> None:
+ self.kl_iters.append(it); self.kl_vals.append(v)
def save_plots(self, out_dir: Path, total_iters: int) -> Dict[str, Path]:
out_dir.mkdir(parents=True, exist_ok=True)
@@ -236,14 +245,18 @@ def save_plots(self, out_dir: Path, total_iters: int) -> Dict[str, Path]:
# Training losses
fig, ax = plt.subplots(figsize=(9, 5))
if self.er_iters: ax.plot(self.er_iters, self.er_vals, label="Recon (E,R)")
- if self.s_iters: ax.plot(self.s_iters, self.s_vals, label="Supervisor (S)")
- if self.g_iters: ax.plot(self.g_iters, self.g_vals, label="Generator (G)")
- if self.d_iters: ax.plot(self.d_iters, self.d_vals, label="Discriminator (D)")
+ if self.s_iters: ax.plot(self.s_iters, self.s_vals, label="Supervisor (S)")
+ if self.g_iters: ax.plot(self.g_iters, self.g_vals, label="Generator (G)")
+ if self.d_iters: ax.plot(self.d_iters, self.d_vals, label="Discriminator (D)")
ax.set_title("Training Losses vs Iteration")
- ax.set_xlabel("Iteration"); ax.set_ylabel("Loss")
+ ax.set_xlabel("Iteration");
+ ax.set_ylabel("Loss")
ax.set_xlim(1, max([total_iters, *self.er_iters, *self.s_iters, *self.g_iters, *self.d_iters] or [total_iters]))
- ax.legend(loc="best"); fig.tight_layout()
- p1 = out_dir / "training_curves.png"; fig.savefig(p1, dpi=150, bbox_inches="tight"); plt.close(fig)
+        ax.legend(loc="best")
+        fig.tight_layout()
+        p1 = out_dir / "training_curves.png"
+        fig.savefig(p1, dpi=150, bbox_inches="tight")
+        plt.close(fig)
saved["training_curves"] = p1
# KL(spread)
@@ -251,9 +264,13 @@ def save_plots(self, out_dir: Path, total_iters: int) -> Dict[str, Path]:
fig, ax = plt.subplots(figsize=(9, 3.5))
ax.plot(self.kl_iters, self.kl_vals, marker="o", linewidth=1)
ax.set_title("Validation KL(spread) vs Iteration")
- ax.set_xlabel("Iteration"); ax.set_ylabel("KL(spread)")
- ax.set_xlim(1, max(self.kl_iters)); fig.tight_layout()
- p2 = out_dir / "kl_spread_curve.png"; fig.savefig(p2, dpi=150, bbox_inches="tight"); plt.close(fig)
+ ax.set_xlabel("Iteration");
+ ax.set_ylabel("KL(spread)")
+ ax.set_xlim(1, max(self.kl_iters));
+ fig.tight_layout()
+ p2 = out_dir / "kl_spread_curve.png";
+ fig.savefig(p2, dpi=150, bbox_inches="tight");
+ plt.close(fig)
saved["kl_spread_curve"] = p2
return saved
@@ -544,7 +561,7 @@ def train_model(self) -> None:
kl = float("nan")
except Exception:
kl = float("nan")
- self.history.add_kl(it+1, kl)
+ self.history.add_kl(it + 1, kl)
self._save()
rlog(
f"[Joint] it={it + 1:,} G={g_loss:.4f} (ema={g_ema:.4f}) "
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
index 22b75f94f..76e8d1763 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py
@@ -19,7 +19,6 @@
Created By: Radhesh Goel (Keys-I)
ID: s49088276
"""
-from pathlib import Path
import numpy as np
@@ -55,4 +54,4 @@ def main() -> None:
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/train.py b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
index eadea8057..e96717536 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/train.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
@@ -14,8 +14,8 @@
-
"""
from src.dataset import load_data
-from src.modules import TimeGAN
from src.helpers.args import Options
+from src.modules import TimeGAN
def train() -> None:
@@ -37,4 +37,4 @@ def train() -> None:
if __name__ == "__main__":
- train()
\ No newline at end of file
+ train()
From 070bd1c9e6295ba82f2598a19f6f8e4bbe03bf05 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Mon, 20 Oct 2025 21:26:49 +1000
Subject: [PATCH 48/74] style(all): reformat codebase with Black and Ruff
Apply Ruff autofixes (incl. import sorting) and Black formatting at 100-char line length. Aside from replacing scripts/analyze_features.py with scripts/summarise_orderbook.py, changes are stylistic, for consistency and cleaner diffs.
---
.../scripts/analyze_features.py | 424 ------------------
.../scripts/summarise_orderbook.py | 284 ++++++++++++
.../TimeLOB_TimeGAN_49088276/src/dataset.py | 39 +-
.../src/helpers/args.py | 83 ++--
.../src/helpers/constants.py | 8 +-
.../src/helpers/richie.py | 26 +-
.../src/helpers/utils.py | 45 +-
.../src/helpers/visualise.py | 27 +-
.../TimeLOB_TimeGAN_49088276/src/modules.py | 104 +++--
.../TimeLOB_TimeGAN_49088276/src/train.py | 3 +-
10 files changed, 479 insertions(+), 564 deletions(-)
delete mode 100644 recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py b/recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py
deleted file mode 100644
index ea487ed54..000000000
--- a/recognition/TimeLOB_TimeGAN_49088276/scripts/analyze_features.py
+++ /dev/null
@@ -1,424 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Analyze engineered LOBSTER features and justify a 5-feature subset.
-
-This script loads paired LOBSTER message/order book CSVs (Level 10), computes the 10 engineered
-features below, and generates quantitative evidence to support selecting a compact 5-feature set
-for TimeGAN training and evaluation on AMZN Level-10 data.
-
-Engineered features (10):
- 1) mid_price = 0.5 * (ask_price_1 + bid_price_1)
- 2) spread = ask_price_1 - bid_price_1
- 3) rel_spread = spread / mid_price
- 4) mid_log_return = log(mid_price_t) - log(mid_price_{t-1})
- 5) queue_imbalance_l1 = (bid_size_1 - ask_size_1) / (bid_size_1 + ask_size_1 + eps)
- 6) depth_imbalance_l5 = (Σ_i≤5 bid_size_i - Σ_i≤5 ask_size_i) /
- (Σ_i≤5 bid_size_i + Σ_i≤5 ask_size_i + eps)
- 7) depth_imbalance_l10 = (Σ_i≤10 bid_size_i - Σ_i≤10 ask_size_i) /
- (Σ_i≤10 bid_size_i + Σ_i≤10 ask_size_i + eps)
- 8) cum_depth_bid_10 = Σ_i≤10 bid_size_i
- 9) cum_depth_ask_10 = Σ_i≤10 ask_size_i
- 10) time_delta = time_t - time_{t-1} (seconds)
-
-Evidence produced:
- • Relevance: mutual information (MI) with next-step mid_log_return (predictive dynamics) and
- with current spread (matches your report metrics).
- • Redundancy: Spearman correlation matrix + greedy mRMR-style selection.
- • Coverage: PCA explained variance + feature loading contributions (top 3 PCs).
- • Summary: Markdown report with the final top-5 and numeric justifications.
-
-Usage:
- python analyze_features.py \
- --message AMZN_2012-06-21_34200000_57600000_message_10.csv \
- --orderbook AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \
- --outdir results_amzn_lvl10
-
-Notes:
- • LOBSTER quotes prices as ticks (price * 10_000). This script converts to dollars.
- • Outputs include PNG plots, CSV/JSON metrics, and a summary.md rationale.
-"""
-
-from __future__ import annotations
-
-import argparse
-import json
-import os
-from dataclasses import dataclass
-from typing import Dict, List, Tuple
-
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-from scipy.stats import spearmanr
-from sklearn.decomposition import PCA
-from sklearn.feature_selection import mutual_info_regression
-from sklearn.preprocessing import StandardScaler
-
-EPS = 1e-9
-TICK_SCALE = 10_000.0 # LOBSTER price ticks: quoted as price * 10_000
-
-
-@dataclass
-class AnalysisOutputs:
- mi_next_return: Dict[str, float]
- mi_spread: Dict[str, float]
- corr_matrix: pd.DataFrame
- pca_var_ratio: np.ndarray
- pca_loadings: pd.DataFrame
- selected5: List[str]
- reasons: Dict[str, Dict[str, float]]
-
-
-def _make_orderbook_columns(levels: int = 10) -> List[str]:
- cols = []
- for i in range(1, levels + 1):
- cols.append(f"ask_price_{i}")
- cols.append(f"ask_size_{i}")
- for i in range(1, levels + 1):
- cols.append(f"bid_price_{i}")
- cols.append(f"bid_size_{i}")
- return cols # 40 columns
-
-
-def load_lobster(orderbook_csv: str, message_csv: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
- # order book: 40 columns, no header
- ob_cols = _make_orderbook_columns(10)
- ob = pd.read_csv(orderbook_csv, header=None, names=ob_cols)
-
- # message: 6 columns, no header per LOBSTER docs
- msg_cols = ["time", "event_type", "order_id", "size", "price", "direction"]
- msg = pd.read_csv(message_csv, header=None, names=msg_cols)
-
- n = min(len(ob), len(msg))
- if len(ob) != len(msg):
- print(
- f"[warn] Row mismatch (orderbook={len(ob)}, message={len(msg)}). Truncating to {n}.")
- ob = ob.iloc[:n].reset_index(drop=True)
- msg = msg.iloc[:n].reset_index(drop=True)
-
- return ob, msg
-
-
-def compute_features(ob: pd.DataFrame, msg: pd.DataFrame) -> pd.DataFrame:
- # Convert price ticks to dollars
- ask1 = ob["ask_price_1"] / TICK_SCALE
- bid1 = ob["bid_price_1"] / TICK_SCALE
-
- mid_price = 0.5 * (ask1 + bid1)
- spread = (ask1 - bid1) # already in dollars
- rel_spread = spread / (mid_price + EPS)
- mid_log_return = np.log(mid_price + EPS).diff().fillna(0.0)
-
- ask_sizes = [f"ask_size_{i}" for i in range(1, 11)]
- bid_sizes = [f"bid_size_{i}" for i in range(1, 11)]
-
- queue_imbalance_l1 = (
- (ob["bid_size_1"] - ob["ask_size_1"]) /
- (ob["bid_size_1"] + ob["ask_size_1"] + EPS)
- )
-
- cum_bid_5 = ob[[f"bid_size_{i}" for i in range(1, 6)]].sum(axis=1)
- cum_ask_5 = ob[[f"ask_size_{i}" for i in range(1, 6)]].sum(axis=1)
- depth_imbalance_l5 = (cum_bid_5 - cum_ask_5) / \
- (cum_bid_5 + cum_ask_5 + EPS)
-
- cum_bid_10 = ob[bid_sizes].sum(axis=1)
- cum_ask_10 = ob[ask_sizes].sum(axis=1)
- depth_imbalance_l10 = (cum_bid_10 - cum_ask_10) / \
- (cum_bid_10 + cum_ask_10 + EPS)
-
- cum_depth_bid_10 = cum_bid_10
- cum_depth_ask_10 = cum_ask_10
-
- time_delta = msg["time"].diff().fillna(0.0)
-
- feats = pd.DataFrame(
- {
- "mid_price": mid_price,
- "spread": spread,
- "rel_spread": rel_spread,
- "mid_log_return": mid_log_return,
- "queue_imbalance_l1": queue_imbalance_l1,
- "depth_imbalance_l5": depth_imbalance_l5,
- "depth_imbalance_l10": depth_imbalance_l10,
- "cum_depth_bid_10": cum_depth_bid_10,
- "cum_depth_ask_10": cum_depth_ask_10,
- "time_delta": time_delta,
- }
- )
-
- # Align for next-step relationships; drop the last row to form y_{t+1}
- feats = feats.dropna().reset_index(drop=True)
- return feats
-
-
-def compute_mi_scores(feats: pd.DataFrame) -> Tuple[Dict[str, float], Dict[str, float]]:
- # Targets: next-step mid_log_return (shift -1) and current spread
- y_next_ret = feats["mid_log_return"].shift(-1).iloc[:-1].values
- y_spread = feats["spread"].iloc[:-1].values
- X = feats.iloc[:-1].values
- names = feats.columns.tolist()
-
- # Standardize features for MI numeric stability (MI itself is scale-free but helps neighbors)
- X_std = StandardScaler(with_mean=True, with_std=True).fit_transform(X)
-
- mi_next = mutual_info_regression(X_std, y_next_ret, random_state=0)
- mi_spr = mutual_info_regression(X_std, y_spread, random_state=0)
-
- mi_next_dict = {n: float(v) for n, v in zip(names, mi_next)}
- mi_spr_dict = {n: float(v) for n, v in zip(names, mi_spr)}
- return mi_next_dict, mi_spr_dict
-
-
-def compute_correlations(feats: pd.DataFrame) -> pd.DataFrame:
- corr, _ = spearmanr(feats.values, axis=0)
- corr_df = pd.DataFrame(corr, index=feats.columns, columns=feats.columns)
- return corr_df
-
-
-def compute_pca(feats: pd.DataFrame, n_components: int = 5) -> Tuple[np.ndarray, pd.DataFrame]:
- X_std = StandardScaler().fit_transform(feats.values)
- pca = PCA(n_components=n_components, random_state=0)
- X_pca = pca.fit_transform(X_std)
- var_ratio = pca.explained_variance_ratio_
- loadings = pd.DataFrame(
- pca.components_.T, index=feats.columns, columns=[
- f"PC{i + 1}" for i in range(n_components)]
- )
- return var_ratio, loadings
-
-
-def greedy_select_5(
- mi_next: Dict[str, float],
- mi_spr: Dict[str, float],
- corr: pd.DataFrame,
- must_include: List[str] | None = None,
- lambda_red: float = 0.5,
-) -> Tuple[List[str], Dict[str, Dict[str, float]]]:
- """
- Greedy mRMR-like selection:
- score = 0.6 * MI(next_ret) + 0.4 * MI(spread) - λ * avg_abs_corr_with_selected
- Always include 'must_include' first (mid_price, spread) to align with report metrics.
- """
- if must_include is None:
- must_include = ["mid_price", "spread"]
-
- # Normalize MI to [0, 1] per target for fair combination
- all_feats = list(mi_next.keys())
- mi_next_arr = np.array([mi_next[f] for f in all_feats])
- mi_spr_arr = np.array([mi_spr[f] for f in all_feats])
- mi_next_norm = (mi_next_arr - mi_next_arr.min()) / \
- (np.ptp(mi_next_arr) + EPS)
- mi_spr_norm = (mi_spr_arr - mi_spr_arr.min()) / (np.ptp(mi_spr_arr) + EPS)
- mi_combo = 0.6 * mi_next_norm + 0.4 * mi_spr_norm
- mi_combo_dict = {f: float(v) for f, v in zip(all_feats, mi_combo)}
-
- selected: List[str] = []
- reasons: Dict[str, Dict[str, float]] = {}
-
- for m in must_include:
- selected.append(m)
- reasons[m] = {
- "mi_next_norm": mi_combo_dict[m], # combined normalized MI
- "mi_spread_raw": mi_spr[m],
- "mi_next_raw": mi_next[m],
- "avg_redundancy": 0.0,
- }
-
- candidates = [f for f in all_feats if f not in selected]
- while len(selected) < 5 and candidates:
- best_feat = None
- best_score = -np.inf
- best_red = None
- for f in candidates:
- # Redundancy: average absolute Spearman corr with already selected
- red = float(np.mean(np.abs(corr.loc[f, selected].values)))
- score = mi_combo_dict[f] - lambda_red * red
- if score > best_score:
- best_score = score
- best_feat = f
- best_red = red
- assert best_feat is not None
- selected.append(best_feat)
- reasons[best_feat] = {
- "mi_next_norm": mi_combo_dict[best_feat],
- "mi_spread_raw": mi_spr[best_feat],
- "mi_next_raw": mi_next[best_feat],
- "avg_redundancy": float(best_red),
- }
- candidates.remove(best_feat)
-
- return selected, reasons
-
-
-def plot_bar(values: Dict[str, float], title: str, ylabel: str, outpath: str) -> None:
- names = list(values.keys())
- vals = list(values.values())
- plt.figure(figsize=(10, 4))
- plt.bar(range(len(names)), vals)
- plt.xticks(range(len(names)), names, rotation=45, ha="right")
- plt.ylabel(ylabel)
- plt.title(title)
- plt.tight_layout()
- plt.savefig(outpath, dpi=160)
- plt.close()
-
-
-def plot_corr_heatmap(corr: pd.DataFrame, title: str, outpath: str) -> None:
- plt.figure(figsize=(7.5, 6.5))
- im = plt.imshow(corr.values, vmin=-1, vmax=1,
- interpolation="nearest", aspect="auto")
- plt.colorbar(im, fraction=0.035, pad=0.04)
- plt.xticks(range(len(corr)), corr.columns, rotation=45, ha="right")
- plt.yticks(range(len(corr)), corr.index)
- plt.title(title)
- plt.tight_layout()
- plt.savefig(outpath, dpi=160)
- plt.close()
-
-
-def plot_pca(var_ratio: np.ndarray, loadings: pd.DataFrame, outdir: str) -> None:
- plt.figure(figsize=(6, 4))
- plt.bar(range(1, len(var_ratio) + 1), var_ratio)
- plt.xlabel("Principal component")
- plt.ylabel("Explained variance ratio")
- plt.title("PCA explained variance ratio (standardized features)")
- plt.tight_layout()
- plt.savefig(os.path.join(outdir, "pca_explained_variance.png"), dpi=160)
- plt.close()
-
- # Sum absolute loadings across top 3 PCs as a proxy of contribution
- topk = min(3, loadings.shape[1])
- contrib = loadings.iloc[:, :topk].abs().sum(axis=1)
- contrib = contrib.sort_values(ascending=False)
- plt.figure(figsize=(8, 4))
- plt.bar(range(len(contrib)), contrib.values)
- plt.xticks(range(len(contrib)), contrib.index, rotation=45, ha="right")
- plt.ylabel("Σ|loading| over top 3 PCs")
- plt.title("PCA loading contributions (top 3 PCs)")
- plt.tight_layout()
- plt.savefig(os.path.join(outdir, "pca_loading_contributions.png"), dpi=160)
- plt.close()
-
- contrib.to_csv(os.path.join(outdir, "pca_loading_contributions.csv"))
-
-
-def write_summary(
- out: AnalysisOutputs,
- outdir: str,
- fixed_keep: List[str] | None = None,
-) -> None:
- if fixed_keep is None:
- fixed_keep = ["mid_price", "spread"]
-
- md = []
- md.append("# Feature analysis summary\n")
- md.append("**Final selected 5 features:** " +
- ", ".join(out.selected5) + "\n")
- md.append("We pin *mid_price* and *spread* as must-haves because your report metrics directly use "
- "the mid-price return distribution and the spread; the remaining three are chosen by "
- "a greedy mRMR-style criterion that balances relevance (MI) and redundancy.\n")
-
- md.append("## Mutual information (relevance)\n")
- md.append("- We compute MI with **next-step mid_log_return** (predictive dynamics) and with the "
- "**current spread** (distributional target). Higher is better.\n")
- md.append("\n**Top MI (next-step return)**\n\n")
- top_mi_next = sorted(out.mi_next_return.items(),
- key=lambda x: x[1], reverse=True)
- md.extend([f"- {k}: {v:.4f}" for k, v in top_mi_next[:5]])
- md.append("\n**Top MI (spread)**\n\n")
- top_mi_spr = sorted(out.mi_spread.items(),
- key=lambda x: x[1], reverse=True)
- md.extend([f"- {k}: {v:.4f}" for k, v in top_mi_spr[:5]])
- md.append("\n")
-
- md.append("## Redundancy (Spearman correlation)\n")
- md.append("The heatmap (corr_heatmap.png) shows strong collinearity between "
- "`depth_imbalance_l5` and `depth_imbalance_l10`, and between "
- "`cum_depth_bid_10` and `cum_depth_ask_10`. We keep only one of each redundant "
- "family to avoid duplication.\n")
-
- md.append("## PCA coverage\n")
- md.append("PCA plots indicate how much variance is captured and which features contribute most "
- "to the top components (pca_explained_variance.png, pca_loading_contributions.png).\n")
-
- md.append("## Why these 5?\n")
- for f in out.selected5:
- r = out.reasons[f]
- pinned = " (pinned)" if f in fixed_keep else ""
- md.append(
- f"- **{f}**{pinned}: MI(next)≈{r['mi_next_raw']:.4f}, "
- f"MI(spread)≈{r['mi_spread_raw']:.4f}, avg redundancy≈{r['avg_redundancy']:.3f}.\n"
- " Contributes strongly while staying non-redundant with the rest."
- )
-
- with open(os.path.join(outdir, "summary.md"), "w", encoding="utf-8") as f:
- f.write("\n".join(md))
-
-
-def run_analysis(orderbook_csv: str, message_csv: str, outdir: str) -> AnalysisOutputs:
- os.makedirs(outdir, exist_ok=True)
-
- ob, msg = load_lobster(orderbook_csv, message_csv)
- feats = compute_features(ob, msg)
- feats.to_csv(os.path.join(outdir, "engineered_features.csv"), index=False)
-
- mi_next, mi_spr = compute_mi_scores(feats)
- corr = compute_correlations(feats)
- var_ratio, loadings = compute_pca(feats, n_components=5)
-
- # Plots/tables
- plot_bar(mi_next, "MI with next-step mid_log_return",
- "MI", os.path.join(outdir, "mi_next.png"))
- plot_bar(mi_spr, "MI with current spread", "MI",
- os.path.join(outdir, "mi_spread.png"))
- plot_corr_heatmap(corr, "Spearman correlation (10 engineered features)",
- os.path.join(outdir, "corr_heatmap.png"))
- pd.DataFrame({"feature": list(mi_next.keys()),
- "mi_next": list(mi_next.values()),
- "mi_spread": [mi_spr[k] for k in mi_next.keys()],
- }).to_csv(os.path.join(outdir, "mi_scores.csv"), index=False)
- loadings.to_csv(os.path.join(outdir, "pca_loadings.csv"))
- plot_pca(var_ratio, loadings, outdir)
-
- # Greedy selection with mid_price, spread as must-keep
- selected5, reasons = greedy_select_5(
- mi_next, mi_spr, corr, must_include=["mid_price", "spread"])
- with open(os.path.join(outdir, "selected_features.json"), "w", encoding="utf-8") as f:
- json.dump({"selected5": selected5, "reasons": reasons}, f, indent=2)
-
- out = AnalysisOutputs(
- mi_next_return=mi_next,
- mi_spread=mi_spr,
- corr_matrix=corr,
- pca_var_ratio=var_ratio,
- pca_loadings=loadings,
- selected5=selected5,
- reasons=reasons,
- )
-
- write_summary(out, outdir)
- return out
-
-
-def parse_args() -> argparse.Namespace:
- ap = argparse.ArgumentParser(
- description="Analyze LOBSTER features and justify a 5-feature set.")
- ap.add_argument("--orderbook", required=True,
- help="Path to orderbook_10.csv")
- ap.add_argument("--message", required=True, help="Path to message_10.csv")
- ap.add_argument("--outdir", required=True,
- help="Output directory for plots and tables")
- return ap.parse_args()
-
-
-def main() -> None:
- args = parse_args()
- run_analysis(orderbook_csv=args.orderbook,
- message_csv=args.message, outdir=args.outdir)
- print(f"[done] Analysis complete. Results in: {args.outdir}")
-
-
-if __name__ == "__main__":
- main()
diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py b/recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py
new file mode 100644
index 000000000..46f983fa8
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py
@@ -0,0 +1,284 @@
+#!/usr/bin/env python3
+"""
+Summarise a single LOBSTER order book file (orderbook_10.csv).
+
+Outputs:
+ - per_column_summary.csv # min/max/mean/std/zero% for each of the 40 columns
+ - depth_profile.png # average depth vs level (bid vs ask)
+ - spread_hist.png # histogram of best-level spread (USD)
+ - midprice_series.png # mid-price over time (USD)
+ - midlogret_hist.png # histogram of mid-price log returns
+ - summary.md # concise human-readable summary
+
+Assumptions:
+ - LOBSTER order book file has 40 columns, no header:
+ [ask_price_1, ask_size_1, ..., ask_price_10, ask_size_10,
+ bid_price_1, bid_size_1, ..., bid_price_10, bid_size_10]
+ - Prices are quoted as ticks = dollars * tick_scale (default 10_000); use --tick-scale to adjust.
+
+Usage:
+ python summarise_orderbook.py \
+ --orderbook ./data/AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \
+ --outdir ./outs/summary_amzn_lvl10 \
+ --tick-scale 10000 \
+ --seq-len 128
+"""
+from __future__ import annotations
+
+import argparse
+import os
+from dataclasses import dataclass
+from typing import List, Tuple
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# --------------------------- Config / Types ---------------------------- #
+@dataclass
+class OBMeta:
+ levels: int
+ tick_scale: float
+ seq_len: int | None
+
+
+# --------------------------- Column Helpers ---------------------------- #
+def make_orderbook_columns(levels: int = 10) -> List[str]:
+ cols: List[str] = []
+ for i in range(1, levels + 1):
+ cols.append(f"ask_price_{i}")
+ cols.append(f"ask_size_{i}")
+ for i in range(1, levels + 1):
+ cols.append(f"bid_price_{i}")
+ cols.append(f"bid_size_{i}")
+ return cols # total 4*levels
+
+
+# ------------------------------ I/O ----------------------------------- #
+def load_orderbook(csv_path: str, levels: int) -> pd.DataFrame:
+ cols = make_orderbook_columns(levels)
+ try:
+ ob = pd.read_csv(csv_path, header=None, names=cols)
+ except Exception as e:
+ raise RuntimeError(f"Failed to read orderbook CSV at {csv_path}: {e}")
+ if ob.shape[1] != 4 * levels:
+ raise ValueError(
+ f"Expected {4*levels} columns for level={levels} (got {ob.shape[1]}). "
+ "Check --levels or file format."
+ )
+ return ob
+
+
+# ---------------------------- Computations ---------------------------- #
+def compute_top_of_book(ob: pd.DataFrame, tick_scale: float) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
+ ask1 = ob["ask_price_1"] / tick_scale
+ bid1 = ob["bid_price_1"] / tick_scale
+ spread = ask1 - bid1
+ mid_price = 0.5 * (ask1 + bid1)
+ # guard tiny/zero
+ mid_safe = mid_price.replace(0, np.nan).fillna(method="ffill").fillna(method="bfill")
+ mid_logret = np.log(mid_safe + 1e-12).diff().fillna(0.0)
+ return ask1, bid1, spread, mid_logret
+
+
+def average_depth_profile(ob: pd.DataFrame, levels: int) -> tuple[np.ndarray, np.ndarray]:
+ bid_cols = [f"bid_size_{i}" for i in range(1, levels + 1)]
+ ask_cols = [f"ask_size_{i}" for i in range(1, levels + 1)]
+ bid_depth = ob[bid_cols].astype(float).mean(axis=0).values # shape [levels]
+ ask_depth = ob[ask_cols].astype(float).mean(axis=0).values # shape [levels]
+ return bid_depth, ask_depth
+
+
+def per_column_summary(ob: pd.DataFrame) -> pd.DataFrame:
+ arr = ob.astype(float)
+ zeros = (arr == 0).sum(axis=0)
+ total = len(arr)
+ desc = arr.describe(percentiles=[0.25, 0.5, 0.75]).T
+ desc["zero_count"] = zeros
+ desc["zero_percent"] = (zeros / total) * 100.0
+ # reorder columns nicely
+ keep = ["count", "mean", "std", "min", "25%", "50%", "75%", "max", "zero_count", "zero_percent"]
+ return desc[keep].rename_axis("column").reset_index()
+
+
+def windows_possible(n_rows: int, seq_len: int | None) -> int | None:
+ if seq_len is None:
+ return None
+ return max(0, n_rows - seq_len + 1)
+
+
+# ------------------------------- Plots -------------------------------- #
+def plot_depth_profile(outdir: str, bid_depth: np.ndarray, ask_depth: np.ndarray) -> str:
+ levels = np.arange(1, len(bid_depth) + 1)
+ plt.figure(figsize=(7, 4))
+ plt.plot(levels, bid_depth, marker="o", label="Bid depth")
+ plt.plot(levels, ask_depth, marker="o", label="Ask depth")
+ plt.xlabel("Level")
+ plt.ylabel("Average size")
+ plt.title("Average depth profile (mean size per level)")
+ plt.legend()
+ plt.tight_layout()
+ path = os.path.join(outdir, "depth_profile.png")
+ plt.savefig(path, dpi=160, bbox_inches="tight")
+ plt.close()
+ return path
+
+
+def plot_spread_hist(outdir: str, spread: pd.Series) -> str:
+ plt.figure(figsize=(7, 4))
+ plt.hist(spread.values, bins=100)
+ plt.xlabel("Spread (USD)")
+ plt.ylabel("Count")
+ plt.title("Histogram of best-level spread")
+ plt.tight_layout()
+ path = os.path.join(outdir, "spread_hist.png")
+ plt.savefig(path, dpi=160, bbox_inches="tight")
+ plt.close()
+ return path
+
+
+def plot_midprice_series(outdir: str, mid_price: pd.Series, max_points: int = 4000) -> str:
+ # Downsample for visual clarity if huge
+ if len(mid_price) > max_points:
+ idx = np.linspace(0, len(mid_price) - 1, max_points).astype(int)
+ mp = mid_price.iloc[idx]
+ x = np.arange(len(mp))
+ else:
+ mp = mid_price
+ x = np.arange(len(mid_price))
+ plt.figure(figsize=(8, 4))
+ plt.plot(x, mp.values, linewidth=1)
+ plt.xlabel("Event index (downsampled)" if len(mid_price) > max_points else "Event index")
+ plt.ylabel("Mid price (USD)")
+ plt.title("Mid price over time")
+ plt.tight_layout()
+ path = os.path.join(outdir, "midprice_series.png")
+ plt.savefig(path, dpi=160, bbox_inches="tight")
+ plt.close()
+ return path
+
+
+def plot_midlogret_hist(outdir: str, mid_logret: pd.Series) -> str:
+ plt.figure(figsize=(7, 4))
+ # clip heavy tails for nicer viz
+ vals = np.clip(mid_logret.values, np.percentile(mid_logret, 0.1), np.percentile(mid_logret, 99.9))
+ plt.hist(vals, bins=100)
+ plt.xlabel("log mid-price return")
+ plt.ylabel("Count")
+ plt.title("Histogram of log mid-price returns")
+ plt.tight_layout()
+ path = os.path.join(outdir, "midlogret_hist.png")
+ plt.savefig(path, dpi=160, bbox_inches="tight")
+ plt.close()
+ return path
+
+
+# ------------------------------ Summary ------------------------------- #
+def write_markdown_summary(
+ outdir: str,
+ ob_path: str,
+ meta: OBMeta,
+ n_rows: int,
+ zeros_total: int,
+ zeros_pct: float,
+ spread_stats: dict,
+ mid_ret_stats: dict,
+ window_count: int | None,
+ artifacts: dict[str, str],
+) -> None:
+ md = []
+ md.append("# Order book summary\n")
+ md.append(f"- **File**: `{ob_path}`")
+ md.append(f"- **Rows**: {n_rows:,}")
+ md.append(f"- **Levels**: {meta.levels}")
+ md.append(f"- **Tick scale**: {meta.tick_scale:g} (price = ticks / tick_scale)")
+ if meta.seq_len is not None:
+ md.append(f"- **Seq len** (for windows estimate): {meta.seq_len}")
+ md.append(f"- **Possible windows**: {window_count:,}")
+ md.append("")
+ md.append(f"- **Zeros**: {zeros_total:,} cells ({zeros_pct:.2f}%)")
+ md.append("")
+ md.append("## Top-of-book (level 1)\n")
+ md.append(f"- Spread (USD): mean={spread_stats['mean']:.6f}, std={spread_stats['std']:.6f}, "
+ f"min={spread_stats['min']:.6f}, max={spread_stats['max']:.6f}")
+ md.append(f"- |log mid-price return|: mean={mid_ret_stats['mean']:.6f}, std={mid_ret_stats['std']:.6f}, "
+ f"p99={mid_ret_stats['p99']:.6f}")
+ md.append("")
+ md.append("## Artifacts\n")
+ for name, path in artifacts.items():
+ md.append(f"- {name}: `{path}`")
+ md.append("")
+ with open(os.path.join(outdir, "summary.md"), "w", encoding="utf-8") as f:
+ f.write("\n".join(md))
+
+
+# ------------------------------ Runner -------------------------------- #
+def parse_args() -> argparse.Namespace:
+ ap = argparse.ArgumentParser(description="Standalone LOBSTER orderbook_10.csv summariser.")
+ ap.add_argument("--orderbook", required=True, help="Path to orderbook_10.csv")
+ ap.add_argument("--outdir", required=True, help="Output directory for plots and tables")
+ ap.add_argument("--levels", type=int, default=10, help="Number of book levels (default 10)")
+ ap.add_argument("--tick-scale", type=float, default=10_000.0, help="LOBSTER tick scale (price = ticks / scale)")
+ ap.add_argument("--seq-len", type=int, default=None, help="Optional: sequence length to estimate windows")
+ return ap.parse_args()
+
+
+def main() -> None:
+ args = parse_args()
+ os.makedirs(args.outdir, exist_ok=True)
+ meta = OBMeta(levels=args.levels, tick_scale=float(args.tick_scale), seq_len=args.seq_len)
+
+ # Load
+ ob = load_orderbook(args.orderbook, meta.levels)
+
+ # Column summary
+ col_summary = per_column_summary(ob)
+ col_summary_path = os.path.join(args.outdir, "per_column_summary.csv")
+ col_summary.to_csv(col_summary_path, index=False)
+
+ # Zeros overall
+ zeros_total = (ob.values == 0).sum()
+ zeros_pct = 100.0 * zeros_total / (ob.shape[0] * ob.shape[1])
+
+ # Top-of-book derived series
+ ask1, bid1, spread, mid_logret = compute_top_of_book(ob, meta.tick_scale)
+ mid_price = 0.5 * (ask1 + bid1)
+
+ # Depth profile
+ bid_depth, ask_depth = average_depth_profile(ob, meta.levels)
+
+ # Plots
+ arts: dict[str, str] = {}
+ arts["depth_profile"] = plot_depth_profile(args.outdir, bid_depth, ask_depth)
+ arts["spread_hist"] = plot_spread_hist(args.outdir, spread)
+ arts["midprice_series"] = plot_midprice_series(args.outdir, mid_price)
+ arts["midlogret_hist"] = plot_midlogret_hist(args.outdir, mid_logret)
+
+ # Small stats for summary
+ spread_stats = dict(mean=float(spread.mean()), std=float(spread.std()),
+ min=float(spread.min()), max=float(spread.max()))
+ abs_ret = mid_logret.abs()
+ mid_ret_stats = dict(mean=float(abs_ret.mean()), std=float(abs_ret.std()),
+ p99=float(abs_ret.quantile(0.99)))
+
+ # Windows estimate
+ wcount = windows_possible(len(ob), meta.seq_len)
+
+ # Write markdown summary
+ write_markdown_summary(
+ outdir=args.outdir,
+ ob_path=args.orderbook,
+ meta=meta,
+ n_rows=len(ob),
+ zeros_total=int(zeros_total),
+ zeros_pct=float(zeros_pct),
+ spread_stats=spread_stats,
+ mid_ret_stats=mid_ret_stats,
+ window_count=wcount,
+ artifacts=arts,
+ )
+
+ print(f"[done] Summary written to: {args.outdir}")
+
+if __name__ == "__main__":
+ main()
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
index 460151069..fa7bcdd70 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py
@@ -11,6 +11,7 @@
Created By: Radhesh Goel (Keys-I)
"""
+
from __future__ import annotations
from argparse import Namespace
@@ -22,7 +23,9 @@
from numpy.typing import NDArray
from src.helpers.constants import DATA_DIR, ORDERBOOK_FILENAME, TRAIN_TEST_SPLIT
-from src.helpers.richie import log as rlog, status as rstatus, dataset_summary
+from src.helpers.richie import dataset_summary
+from src.helpers.richie import log as rlog
+from src.helpers.richie import status as rstatus
class MinMaxScaler:
@@ -61,6 +64,7 @@ class DatasetConfig:
"""
Configuration for loading and preprocessing order-book data.
"""
+
seq_len: int
data_dir: Path = DATA_DIR
orderbook_filename: str = ORDERBOOK_FILENAME
@@ -99,7 +103,11 @@ def __init__(self, cfg: DatasetConfig, scaler: Optional[MinMaxScaler] = None):
def load(self) -> "LOBDataset":
with rstatus("[bold cyan]Loading and preprocessing LOBSTER orderbook dataset..."):
data = self._read_raw()
- data = self._filter_unoccupied(data) if self.cfg.filter_zero_rows else data.astype(self.cfg.dtype)
+ data = (
+ self._filter_unoccupied(data)
+ if self.cfg.filter_zero_rows
+ else data.astype(self.cfg.dtype)
+ )
self._filtered = data.astype(self.cfg.dtype)
self._split_chronological()
@@ -116,7 +124,9 @@ def make_windows(self, split: str = "train") -> NDArray[np.float32]:
data = self._select_split(split)
return self._windowize(data, self.cfg.seq_len, self.cfg.shuffle_windows)
- def dataset_windowed(self) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
+ def dataset_windowed(
+ self,
+ ) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
"""
Return (train_w, val_w, test_w) as windowed arrays.
"""
@@ -190,21 +200,14 @@ def nwin(x: Optional[NDArray[np.floating]]) -> int:
)
def _scale_train_only(self) -> None:
- assert (
- self._train is not None
- and self._val is not None
- and self._test is not None
- )
+ assert self._train is not None and self._val is not None and self._test is not None
rlog("[bold magenta]Fitting MinMaxScaler on train split.[/bold magenta]")
self._train = self.scaler.fit_transform(self._train)
self._val = self.scaler.transform(self._val)
self._test = self.scaler.transform(self._test)
def _windowize(
- self,
- data: NDArray[np.float32],
- seq_len: int,
- shuffle_windows: bool
+ self, data: NDArray[np.float32], seq_len: int, shuffle_windows: bool
) -> NDArray[np.float32]:
n_samples, n_features = data.shape
n_windows = n_samples - seq_len + 1
@@ -213,7 +216,7 @@ def _windowize(
out = np.empty((n_windows, seq_len, n_features), dtype=self.cfg.dtype)
for i in range(n_windows):
- out[i] = data[i: i + seq_len]
+ out[i] = data[i : i + seq_len]
if shuffle_windows:
np.random.shuffle(out)
return out
@@ -252,9 +255,9 @@ def counts(arr: Optional[NDArray[np.floating]]) -> tuple[int, int]:
def batch_generator(
- data: NDArray[np.float32],
- time: Optional[NDArray[np.int32]],
- batch_size: int,
+ data: NDArray[np.float32],
+ time: Optional[NDArray[np.int32]],
+ batch_size: int,
) -> Tuple[NDArray[np.float32], NDArray[np.int32]]:
"""
Random mini-batch generator for windowed sequences.
@@ -291,7 +294,9 @@ def batch_generator(
return data_mb, T_mb
-def load_data(arg: Namespace) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
+def load_data(
+ arg: Namespace,
+) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
"""
Backwards-compatible wrapper.
Returns:
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
index 4d332a530..b157bd973 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py
@@ -1,15 +1,21 @@
"""
Options for the entire model
"""
+
from __future__ import annotations
import sys
-from argparse import ArgumentParser, Namespace, REMAINDER
-from typing import Optional, List
+from argparse import REMAINDER, ArgumentParser, Namespace
+from typing import List, Optional
import numpy as np
-from src.helpers.constants import DATA_DIR, TRAIN_TEST_SPLIT, ORDERBOOK_FILENAME, NUM_TRAINING_ITERATIONS
+from src.helpers.constants import (
+ DATA_DIR,
+ NUM_TRAINING_ITERATIONS,
+ ORDERBOOK_FILENAME,
+ TRAIN_TEST_SPLIT,
+)
try:
# tolerate alternates if present in your helpers
@@ -35,16 +41,17 @@ def __init__(self) -> None:
)
parser.add_argument("--seq-len", type=int, default=128)
parser.add_argument("--data-dir", dest="data_dir", type=str, default=str(DATA_DIR))
- parser.add_argument("--orderbook-filename", dest="orderbook_filename", type=str, default=ORDERBOOK_FILENAME)
parser.add_argument(
- "--no-shuffle",
- action="store_true",
- help="Disable shuffling of windowed sequences"
+ "--orderbook-filename", dest="orderbook_filename", type=str, default=ORDERBOOK_FILENAME
)
parser.add_argument(
- "--keep-zero-rows", dest="keep_zero_rows",
+ "--no-shuffle", action="store_true", help="Disable shuffling of windowed sequences"
+ )
+ parser.add_argument(
+ "--keep-zero-rows",
+ dest="keep_zero_rows",
action="store_true",
- help="Do NOT filter rows containing zeros."
+ help="Do NOT filter rows containing zeros.",
)
parser.add_argument(
"--splits",
@@ -92,29 +99,46 @@ def __init__(self) -> None:
)
# core shapes
parser.add_argument("--batch-size", type=int, default=128)
- parser.add_argument("--seq-len", type=int, default=128,
- help="Sequence length (kept here for convenience to sync with data).")
- parser.add_argument("--z-dim", type=int, default=40,
- help="Latent/input feature dim (e.g., LOB feature count).")
- parser.add_argument("--hidden-dim", type=int, default=64,
- help="Module hidden size.")
- parser.add_argument("--num-layer", type=int, default=3,
- help="Number of stacked layers per RNN/TCN block.")
+ parser.add_argument(
+ "--seq-len",
+ type=int,
+ default=128,
+ help="Sequence length (kept here for convenience to sync with data).",
+ )
+ parser.add_argument(
+ "--z-dim",
+ type=int,
+ default=40,
+ help="Latent/input feature dim (e.g., LOB feature count).",
+ )
+ parser.add_argument("--hidden-dim", type=int, default=64, help="Module hidden size.")
+ parser.add_argument(
+ "--num-layer", type=int, default=3, help="Number of stacked layers per RNN/TCN block."
+ )
# optimizer
- parser.add_argument("--lr", type=float, default=1e-4,
- help="Learning rate (generator/supervisor/discriminator if shared).")
- parser.add_argument("--beta1", type=float, default=0.5,
- help="Adam beta1.")
+ parser.add_argument(
+ "--lr",
+ type=float,
+ default=1e-4,
+ help="Learning rate (generator/supervisor/discriminator if shared).",
+ )
+ parser.add_argument("--beta1", type=float, default=0.5, help="Adam beta1.")
# Loss weights
- parser.add_argument("--w-gamma", type=float, default=1.0,
- help="Supervisor loss weight (γ).")
- parser.add_argument("--w-g", type=float, default=1.0,
- help="Generator adversarial loss weight (g).")
+ parser.add_argument(
+ "--w-gamma", type=float, default=1.0, help="Supervisor loss weight (γ)."
+ )
+ parser.add_argument(
+ "--w-g", type=float, default=1.0, help="Generator adversarial loss weight (g)."
+ )
- parser.add_argument("--num-iters", type=int, default=NUM_TRAINING_ITERATIONS,
- help="Number of training iterations per phase (ER, S, Joint).")
+ parser.add_argument(
+ "--num-iters",
+ type=int,
+ default=NUM_TRAINING_ITERATIONS,
+ help="Number of training iterations per phase (ER, S, Joint).",
+ )
self._parser = parser
@@ -149,8 +173,7 @@ class Options:
def __init__(self) -> None:
parser = ArgumentParser(
- prog="timeganlob",
- description="TimeGAN-LOB entrypoint with nested dataset options."
+ prog="timeganlob", description="TimeGAN-LOB entrypoint with nested dataset options."
)
parser.add_argument("--seed", type=int, default=42, help="Global random seed")
parser.add_argument("--run-name", type=str, default="exp1", help="Run name")
@@ -183,7 +206,7 @@ def extract(flag: str, toks: List[str]) -> tuple[List[str], List[str]]:
if flag not in toks:
return [], toks
i = toks.index(flag)
- rest = toks[i + 1:]
+ rest = toks[i + 1 :]
# stop at the next section flag (or end)
next_indices = [j for j, t in enumerate(rest) if t in ("--dataset", "--modules")]
end = next_indices[0] if next_indices else len(rest)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
index 60ee5d546..eade9d8e2 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py
@@ -1,6 +1,7 @@
"""
Configuration constants for the project.
"""
+
from __future__ import annotations
import os
@@ -34,10 +35,7 @@ def _repo_root() -> Path:
TRAIN_TEST_SPLIT = (0.7, 0.15, 0.15)
assert isclose(
- sum(TRAIN_TEST_SPLIT), 1.0,
- rel_tol=0.0, abs_tol=1e-6
-), (
- f"TRAIN_TEST_SPLIT must sum to 1.0 (got {sum(TRAIN_TEST_SPLIT):.8f})"
-)
+ sum(TRAIN_TEST_SPLIT), 1.0, rel_tol=0.0, abs_tol=1e-6
+), f"TRAIN_TEST_SPLIT must sum to 1.0 (got {sum(TRAIN_TEST_SPLIT):.8f})"
NUM_LEVELS = 10
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py
index f2732c484..c26d6cea0 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py
@@ -3,13 +3,13 @@
import contextvars
from pathlib import Path
-from typing import Optional, Iterable, Tuple
+from typing import Iterable, Optional, Tuple
try:
+ from rich import box
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
- from rich import box
_CONSOLE: Optional[Console] = Console()
except Exception: # fallback if rich isn’t installed
@@ -47,9 +47,11 @@ def __exit__(self, exc_type, exc, tb):
# nested: no-op
class _Noop:
- def __enter__(self): return None
+ def __enter__(self):
+ return None
- def __exit__(self, exc_type, exc, tb): return False
+ def __exit__(self, exc_type, exc, tb):
+ return False
return _Noop()
@@ -60,17 +62,19 @@ def rule(text: str = "") -> None:
def dataset_summary(
- *,
- file_path: Path,
- seq_len: int,
- dtype_name: str,
- filter_zero_rows: bool,
- splits: Iterable[Tuple[str, Tuple[int, int]]], # (name, (rows, windows))
+ *,
+ file_path: Path,
+ seq_len: int,
+ dtype_name: str,
+ filter_zero_rows: bool,
+ splits: Iterable[Tuple[str, Tuple[int, int]]], # (name, (rows, windows))
) -> None:
"""Render a header + splits table."""
if _CONSOLE is None:
# Plain fallback
- print(f"Dataset: {file_path} | seq_len={seq_len} | dtype={dtype_name} | filter_zero_rows={filter_zero_rows}")
+ print(
+ f"Dataset: {file_path} | seq_len={seq_len} | dtype={dtype_name} | filter_zero_rows={filter_zero_rows}"
+ )
for name, (rows, wins) in splits:
print(f"{name:>6}: rows={rows:,} windows={wins:,}")
return
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py
index 99a4b91a2..e24950abb 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py
@@ -9,21 +9,19 @@
Metric = Literal["spread", "mpr"]
-def extract_seq_lengths(
- sequences: Iterable[NDArray[np.floating]]
-) -> Tuple[NDArray[np.int32], int]:
+def extract_seq_lengths(sequences: Iterable[NDArray[np.floating]]) -> Tuple[NDArray[np.int32], int]:
lengths = np.asarray([int(s.shape[0]) for s in sequences], dtype=np.int32)
return lengths, int(lengths.max(initial=0))
def sample_noise(
- batch_size: int,
- z_dim: int,
- seq_len: int,
- *,
- mean: float | None = None,
- std: float | None = None,
- rng: np.random.Generator | None = None,
+ batch_size: int,
+ z_dim: int,
+ seq_len: int,
+ *,
+ mean: float | None = None,
+ std: float | None = None,
+ rng: np.random.Generator | None = None,
) -> NDArray[np.float32]:
if rng is None:
rng = np.random.default_rng()
@@ -43,8 +41,7 @@ def sample_noise(
def minmax_scale(
- data: NDArray[np.floating],
- epsilon: float = 1e-7
+ data: NDArray[np.floating], epsilon: float = 1e-7
) -> Tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]:
if data.ndim != 3:
raise ValueError(f"Expected data with 3 dimensions [N, T, F], got shape {data.shape}")
@@ -58,9 +55,9 @@ def minmax_scale(
def minmax_inverse(
- norm: NDArray[np.floating],
- fmin: NDArray[np.floating],
- fmax: NDArray[np.floating],
+ norm: NDArray[np.floating],
+ fmin: NDArray[np.floating],
+ fmax: NDArray[np.floating],
) -> NDArray[np.float32]:
"""
Inverse of `minmax_scale`.
@@ -102,13 +99,13 @@ def _midprice_returns(series: NDArray[np.floating]) -> NDArray[np.float64]:
def kl_divergence_hist(
- real: NDArray[np.floating],
- fake: NDArray[np.floating],
- metric: Literal["spread", "mpr"] = "spread",
- *,
- bins: int = 100,
- show_plot: bool = False,
- epsilon: float = 1e-12
+ real: NDArray[np.floating],
+ fake: NDArray[np.floating],
+ metric: Literal["spread", "mpr"] = "spread",
+ *,
+ bins: int = 100,
+ show_plot: bool = False,
+ epsilon: float = 1e-12,
) -> float:
if real.ndim != 2 or fake.ndim != 2:
raise ValueError("Inputs must be 2D arrays [T, F].")
@@ -133,8 +130,8 @@ def kl_divergence_hist(
f_hist, _ = np.histogram(f_series, bins=edges, density=False)
# convert to probability masses with smoothing
- r_p = (r_hist.astype(np.float64) + epsilon)
- f_p = (f_hist.astype(np.float64) + epsilon)
+ r_p = r_hist.astype(np.float64) + epsilon
+ f_p = f_hist.astype(np.float64) + epsilon
r_p /= r_p.sum()
f_p /= f_p.sum()
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
index 155834442..1435557a4 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py
@@ -2,6 +2,7 @@
Generate LOB depth heatmaps and compute SSIM between real vs synthetic images.
Refactored to be faster, cleaner, and compatible with the new modules/utils.
"""
+
from __future__ import annotations
from pathlib import Path
@@ -14,16 +15,19 @@
from skimage.util import img_as_float
from src.dataset import load_data
+
# use nested CLI options + constants from src.helpers
from src.helpers.args import Options
-from src.helpers.constants import OUTPUT_DIR, NUM_LEVELS
-from src.helpers.richie import log as rlog, status as rstatus, rule as rrule
+from src.helpers.constants import NUM_LEVELS, OUTPUT_DIR
+from src.helpers.richie import log as rlog
+from src.helpers.richie import rule as rrule
+from src.helpers.richie import status as rstatus
from src.modules import TimeGAN
# optional pretty table for SSIM results (graceful fallback if rich unavailable)
try:
- from rich.table import Table
from rich import box
+ from rich.table import Table
_HAS_RICH_TABLE = True
except Exception:
@@ -48,12 +52,12 @@ def get_ssim(img1_path: Path | str, img2_path: Path | str) -> float:
def plot_heatmap(
- data_2d: NDArray, # shape [T, F]
- *,
- title: str | None = None,
- save_path: Path | str | None = None,
- show: bool = True,
- dpi: int = 150,
+ data_2d: NDArray, # shape [T, F]
+ *,
+ title: str | None = None,
+ save_path: Path | str | None = None,
+ show: bool = True,
+ dpi: int = 150,
) -> None:
"""
Scatter-based depth heatmap.
@@ -144,6 +148,7 @@ def _print_ssim_table(rows: List[Tuple[str, float]]) -> None:
# `rlog` prints line-wise; here we directly print the table via rich's console if available
try:
from rich.console import Console
+
Console().print(table)
except Exception:
# fallback to logging lines
@@ -171,7 +176,9 @@ def _print_ssim_table(rows: List[Tuple[str, float]]) -> None:
if getattr(test, "ndim", None) == 3:
test = test.reshape(-1, test.shape[-1])
- rlog(f"Splits: train_w={train.shape} val={getattr(val, 'shape', None)} test={getattr(test, 'shape', None)}")
+ rlog(
+ f"Splits: train_w={train.shape} val={getattr(val, 'shape', None)} test={getattr(test, 'shape', None)}"
+ )
# model (load weights)
with rstatus("[cyan]Restoring TimeGAN checkpoint…"):
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
index 5f76606f5..0e6caae27 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py
@@ -24,12 +24,13 @@
Created By: Radhesh Goel (Keys-I)
ID: s49088276
"""
+
from __future__ import annotations
import math
from dataclasses import dataclass, field
from pathlib import Path
-from typing import Optional, Tuple, Protocol, runtime_checkable, cast, List, Dict
+from typing import Dict, List, Optional, Protocol, Tuple, cast, runtime_checkable
import matplotlib.pyplot as plt
import numpy as np
@@ -42,18 +43,21 @@
from src.dataset import batch_generator
from src.helpers.constants import (
- WEIGHTS_DIR,
- OUTPUT_DIR,
NUM_TRAINING_ITERATIONS,
+ OUTPUT_DIR,
VALIDATE_INTERVAL,
+ WEIGHTS_DIR,
)
+
# richie: centralized pretty CLI helpers (safe fallbacks inside)
-from src.helpers.richie import log as rlog, status as rstatus, rule as rrule
+from src.helpers.richie import log as rlog
+from src.helpers.richie import rule as rrule
+from src.helpers.richie import status as rstatus
from src.helpers.utils import (
- minmax_scale,
- sample_noise,
kl_divergence_hist,
minmax_inverse,
+ minmax_scale,
+ sample_noise,
)
@@ -224,19 +228,24 @@ class TrainingHistory:
kl_vals: List[float] = field(default_factory=list)
def add_er(self, it: int, v: float) -> None:
- self.er_iters.append(it); self.er_vals.append(v)
+ self.er_iters.append(it)
+ self.er_vals.append(v)
def add_s(self, it: int, v: float) -> None:
- self.s_iters.append(it); self.s_vals.append(v)
+ self.s_iters.append(it)
+ self.s_vals.append(v)
def add_g(self, it: int, v: float) -> None:
- self.g_iters.append(it); self.g_vals.append(v)
+ self.g_iters.append(it)
+ self.g_vals.append(v)
def add_d(self, it: int, v: float) -> None:
- self.d_iters.append(it); self.d_vals.append(v)
+ self.d_iters.append(it)
+ self.d_vals.append(v)
def add_kl(self, it: int, v: float) -> None:
- self.kl_iters.append(it); self.kl_vals.append(v)
+ self.kl_iters.append(it)
+ self.kl_vals.append(v)
def save_plots(self, out_dir: Path, total_iters: int) -> Dict[str, Path]:
out_dir.mkdir(parents=True, exist_ok=True)
@@ -244,18 +253,28 @@ def save_plots(self, out_dir: Path, total_iters: int) -> Dict[str, Path]:
# Training losses
fig, ax = plt.subplots(figsize=(9, 5))
- if self.er_iters: ax.plot(self.er_iters, self.er_vals, label="Recon (E,R)")
- if self.s_iters: ax.plot(self.s_iters, self.s_vals, label="Supervisor (S)")
- if self.g_iters: ax.plot(self.g_iters, self.g_vals, label="Generator (G)")
- if self.d_iters: ax.plot(self.d_iters, self.d_vals, label="Discriminator (D)")
+ if self.er_iters:
+ ax.plot(self.er_iters, self.er_vals, label="Recon (E,R)")
+ if self.s_iters:
+ ax.plot(self.s_iters, self.s_vals, label="Supervisor (S)")
+ if self.g_iters:
+ ax.plot(self.g_iters, self.g_vals, label="Generator (G)")
+ if self.d_iters:
+ ax.plot(self.d_iters, self.d_vals, label="Discriminator (D)")
ax.set_title("Training Losses vs Iteration")
- ax.set_xlabel("Iteration");
+ ax.set_xlabel("Iteration")
ax.set_ylabel("Loss")
- ax.set_xlim(1, max([total_iters, *self.er_iters, *self.s_iters, *self.g_iters, *self.d_iters] or [total_iters]))
- ax.legend(loc="best");
+ ax.set_xlim(
+ 1,
+ max(
+ [total_iters, *self.er_iters, *self.s_iters, *self.g_iters, *self.d_iters]
+ or [total_iters]
+ ),
+ )
+ ax.legend(loc="best")
fig.tight_layout()
- p1 = out_dir / "training_curves.png";
- fig.savefig(p1, dpi=150, bbox_inches="tight");
+ p1 = out_dir / "training_curves.png"
+ fig.savefig(p1, dpi=150, bbox_inches="tight")
plt.close(fig)
saved["training_curves"] = p1
@@ -264,12 +283,12 @@ def save_plots(self, out_dir: Path, total_iters: int) -> Dict[str, Path]:
fig, ax = plt.subplots(figsize=(9, 3.5))
ax.plot(self.kl_iters, self.kl_vals, marker="o", linewidth=1)
ax.set_title("Validation KL(spread) vs Iteration")
- ax.set_xlabel("Iteration");
+ ax.set_xlabel("Iteration")
ax.set_ylabel("KL(spread)")
- ax.set_xlim(1, max(self.kl_iters));
+ ax.set_xlim(1, max(self.kl_iters))
fig.tight_layout()
- p2 = out_dir / "kl_spread_curve.png";
- fig.savefig(p2, dpi=150, bbox_inches="tight");
+ p2 = out_dir / "kl_spread_curve.png"
+ fig.savefig(p2, dpi=150, bbox_inches="tight")
plt.close(fig)
saved["kl_spread_curve"] = p2
@@ -304,12 +323,12 @@ class TimeGAN:
"""
def __init__(
- self,
- opt: OptLike,
- train_data: NDArray[np.float32],
- val_data: NDArray[np.float32],
- test_data: NDArray[np.float32],
- load_weights: bool = False,
+ self,
+ opt: OptLike,
+ train_data: NDArray[np.float32],
+ val_data: NDArray[np.float32],
+ test_data: NDArray[np.float32],
+ load_weights: bool = False,
) -> None:
# set seed & device
set_seed(getattr(opt, "manualseed", getattr(opt, "seed", None)))
@@ -359,9 +378,11 @@ def __init__(
# initial banner
rrule("[bold cyan]TimeGAN • init[/bold cyan]")
- rlog(f"device={self.device} "
- f"batch_size={self.batch_size} seq_len={self.seq_len} z_dim={self.z_dim} "
- f"h_dim={self.h_dim} n_layers={self.n_layers} num_iters={self.num_iterations}")
+ rlog(
+ f"device={self.device} "
+ f"batch_size={self.batch_size} seq_len={self.seq_len} z_dim={self.z_dim} "
+ f"h_dim={self.h_dim} n_layers={self.n_layers} num_iters={self.num_iterations}"
+ )
rlog(f"train_norm={self.train_norm.shape} val={self.val.shape} test={self.test.shape}")
# small utility for smooth progress readouts
@@ -486,9 +507,9 @@ def _discriminator_step(self, x: torch.Tensor, z: torch.Tensor) -> float:
y_fake = self.netD(h_hat)
y_fake_e = self.netD(e_hat)
loss = (
- self.bce_logits(y_real, torch.ones_like(y_real))
- + self.bce_logits(y_fake, torch.zeros_like(y_fake))
- + self.opt.w_gamma * self.bce_logits(y_fake_e, torch.zeros_like(y_fake_e))
+ self.bce_logits(y_real, torch.ones_like(y_real))
+ + self.bce_logits(y_fake, torch.zeros_like(y_fake))
+ + self.opt.w_gamma * self.bce_logits(y_fake_e, torch.zeros_like(y_fake_e))
)
# optional hinge to avoid overshooting
if loss.item() > 0.15:
@@ -499,7 +520,6 @@ def _discriminator_step(self, x: torch.Tensor, z: torch.Tensor) -> float:
def train_model(self) -> None:
rrule("[bold magenta]TimeGAN • training[/bold magenta]")
- history = TrainingHistory()
# phase 1: encoder-recovery pretrain
er_ema: Optional[float] = None
@@ -574,11 +594,11 @@ def train_model(self) -> None:
@torch.no_grad()
def generate(
- self,
- num_rows: int,
- *,
- mean: float = 0.0,
- std: float = 1.0,
+ self,
+ num_rows: int,
+ *,
+ mean: float = 0.0,
+ std: float = 1.0,
) -> NDArray[np.float32]:
"""Generate exactly `num_rows` rows of synthetic data (2D array).
diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/train.py b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
index e96717536..8bec20dca 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/src/train.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/src/train.py
@@ -11,8 +11,9 @@
ID: s49088276
References:
--
+-
"""
+
from src.dataset import load_data
from src.helpers.args import Options
from src.modules import TimeGAN
From 243de340d066fb3a3cc74200fdf54dd7454d1172 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 21 Oct 2025 14:27:39 +1000
Subject: [PATCH 49/74] feat(data): add dataset summariser CLI
Documents and tidies the order-book summariser CLI in scripts/summarise_orderbook.py: per-column quick stats (zero counts, mean/std/min/max, percentiles), spread and mid-price return summaries, depth plots, and a markdown report, driven by --orderbook, --outdir, --levels, --tick-scale, and --seq-len flags with sane defaults. This commit reflows long argument definitions and f-strings; behaviour is unchanged.
---
.../scripts/summarise_orderbook.py | 56 +++++++++++++------
1 file changed, 39 insertions(+), 17 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py b/recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py
index 46f983fa8..3774dbce8 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py
+++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py
@@ -28,11 +28,12 @@
import argparse
import os
from dataclasses import dataclass
-from typing import List, Tuple
+from typing import List
+import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
-import matplotlib.pyplot as plt
+
# --------------------------- Config / Types ---------------------------- #
@dataclass
@@ -70,7 +71,9 @@ def load_orderbook(csv_path: str, levels: int) -> pd.DataFrame:
# ---------------------------- Computations ---------------------------- #
-def compute_top_of_book(ob: pd.DataFrame, tick_scale: float) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
+def compute_top_of_book(
+ ob: pd.DataFrame, tick_scale: float
+) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
ask1 = ob["ask_price_1"] / tick_scale
bid1 = ob["bid_price_1"] / tick_scale
spread = ask1 - bid1
@@ -161,7 +164,9 @@ def plot_midprice_series(outdir: str, mid_price: pd.Series, max_points: int = 40
def plot_midlogret_hist(outdir: str, mid_logret: pd.Series) -> str:
plt.figure(figsize=(7, 4))
# clip heavy tails for nicer viz
- vals = np.clip(mid_logret.values, np.percentile(mid_logret, 0.1), np.percentile(mid_logret, 99.9))
+ vals = np.clip(
+ mid_logret.values, np.percentile(mid_logret, 0.1), np.percentile(mid_logret, 99.9)
+ )
plt.hist(vals, bins=100)
plt.xlabel("log mid-price return")
plt.ylabel("Count")
@@ -199,10 +204,14 @@ def write_markdown_summary(
md.append(f"- **Zeros**: {zeros_total:,} cells ({zeros_pct:.2f}%)")
md.append("")
md.append("## Top-of-book (level 1)\n")
- md.append(f"- Spread (USD): mean={spread_stats['mean']:.6f}, std={spread_stats['std']:.6f}, "
- f"min={spread_stats['min']:.6f}, max={spread_stats['max']:.6f}")
- md.append(f"- |log mid-price return|: mean={mid_ret_stats['mean']:.6f}, std={mid_ret_stats['std']:.6f}, "
- f"p99={mid_ret_stats['p99']:.6f}")
+ md.append(
+ f"- Spread (USD): mean={spread_stats['mean']:.6f}, std={spread_stats['std']:.6f}, "
+ f"min={spread_stats['min']:.6f}, max={spread_stats['max']:.6f}"
+ )
+ md.append(
+ f"- |log mid-price return|: mean={mid_ret_stats['mean']:.6f}, std={mid_ret_stats['std']:.6f}, "
+ f"p99={mid_ret_stats['p99']:.6f}"
+ )
md.append("")
md.append("## Artifacts\n")
for name, path in artifacts.items():
@@ -218,8 +227,15 @@ def parse_args() -> argparse.Namespace:
ap.add_argument("--orderbook", required=True, help="Path to orderbook_10.csv")
ap.add_argument("--outdir", required=True, help="Output directory for plots and tables")
ap.add_argument("--levels", type=int, default=10, help="Number of book levels (default 10)")
- ap.add_argument("--tick-scale", type=float, default=10_000.0, help="LOBSTER tick scale (price = ticks / scale)")
- ap.add_argument("--seq-len", type=int, default=None, help="Optional: sequence length to estimate windows")
+ ap.add_argument(
+ "--tick-scale",
+ type=float,
+ default=10_000.0,
+ help="LOBSTER tick scale (price = ticks / scale)",
+ )
+ ap.add_argument(
+ "--seq-len", type=int, default=None, help="Optional: sequence length to estimate windows"
+ )
return ap.parse_args()
@@ -249,17 +265,22 @@ def main() -> None:
# Plots
arts: dict[str, str] = {}
- arts["depth_profile"] = plot_depth_profile(args.outdir, bid_depth, ask_depth)
- arts["spread_hist"] = plot_spread_hist(args.outdir, spread)
+ arts["depth_profile"] = plot_depth_profile(args.outdir, bid_depth, ask_depth)
+ arts["spread_hist"] = plot_spread_hist(args.outdir, spread)
arts["midprice_series"] = plot_midprice_series(args.outdir, mid_price)
- arts["midlogret_hist"] = plot_midlogret_hist(args.outdir, mid_logret)
+ arts["midlogret_hist"] = plot_midlogret_hist(args.outdir, mid_logret)
# Small stats for summary
- spread_stats = dict(mean=float(spread.mean()), std=float(spread.std()),
- min=float(spread.min()), max=float(spread.max()))
+ spread_stats = dict(
+ mean=float(spread.mean()),
+ std=float(spread.std()),
+ min=float(spread.min()),
+ max=float(spread.max()),
+ )
abs_ret = mid_logret.abs()
- mid_ret_stats = dict(mean=float(abs_ret.mean()), std=float(abs_ret.std()),
- p99=float(abs_ret.quantile(0.99)))
+ mid_ret_stats = dict(
+ mean=float(abs_ret.mean()), std=float(abs_ret.std()), p99=float(abs_ret.quantile(0.99))
+ )
# Windows estimate
wcount = windows_possible(len(ob), meta.seq_len)
@@ -280,5 +301,6 @@ def main() -> None:
print(f"[done] Summary written to: {args.outdir}")
+
if __name__ == "__main__":
main()
From 35e629b3988f7f0165867a9a632487c7354e4ff3 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 21 Oct 2025 15:19:47 +1000
Subject: [PATCH 50/74] feat(docs): add TimeGAN model description (5
components) and three-phase training summary
Incorporates the five-component list (Encoder, Recovery, Generator, Supervisor, Discriminator) and a concise three-phase training summary into the project report. Based on a prior HackMD draft refined before this commit.
---
.../TimeLOB_TimeGAN_49088276/README.MD | 39 +++++++++++++++++--
1 file changed, 35 insertions(+), 4 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/README.MD b/recognition/TimeLOB_TimeGAN_49088276/README.MD
index b155235ea..a58458268 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/README.MD
+++ b/recognition/TimeLOB_TimeGAN_49088276/README.MD
@@ -16,10 +16,41 @@
## Project Overview
-This project trains a generative time series model to produce realistic sequences of limit order book events using the LOBSTER dataset, focusing on AMZN Level 10 data. The aim is to create high quality synthetic LOB sequences that can expand training sets for market microstructure research where balanced, fine grained data is expensive and difficult to collect. By learning the dynamics of spreads, midprice movements, and depth across ten levels, the model seeks to capture both short term fluctuations and broader order flow patterns.
+
+This project trains a TimeGAN model to generate synthetic sequences of limit order book events from the LOBSTER dataset
+using AMZN level 10 depth. The motivation is to ease data scarcity and confidentiality constraints in microstructure
+research, enable safer augmentation for downstream forecasting, and allow controlled experiments on price and depth
+dynamics without relying on live market streams. The synthetic sequences are intended to improve robustness, support
+reproducibility, and help probe edge cases that are rare in historical data.
Quality is assessed on a held out test split using objective targets:
-- Distribution similarity: KL divergence at or below 0.1 for spread and midprice return distributions between generated and real data.
-- Visual similarity: SSIM above 0.6 between heatmaps of generated and real order book depth snapshots.
-The report will document the model architecture and total parameter count, and compare training strategies such as full TimeGAN, adversarial only, and supervised only variants. It will record the hardware used, including GPU model, available VRAM, number of epochs, and total training time. To aid interpretation, the report will include three to five representative heatmaps that pair generated and real order books, along with a short error analysis that explains where the synthetic sequences align with reality and where they fall short. The goal is a practical, well evidenced benchmark for synthetic LOB generation on AMZN Level 10.
+* Distribution similarity: KL divergence at or below 0.1 for spread and midprice return distributions between generated
+ and real data.
+* Visual similarity: SSIM above 0.6 between heatmaps of generated and real order book depth snapshots.
+
+The report will include the model architecture and parameter count, the training strategy with ablations, compute
+details such as GPU type and VRAM, the number of epochs, total training time, and 3 to 5 paired heatmaps with a concise
+error analysis.
+
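+The distribution-similarity target above can be checked with a simple histogram-based estimate. The sketch below is
+illustrative only (it mirrors the intent of `kl_divergence_hist` in `src/helpers/utils.py`, but the function and
+variable names here are not the project's API) and assumes 1-D NumPy arrays of per-step spreads or mid-price returns:
+
+```python
+import numpy as np
+
+def histogram_kl(real: np.ndarray, fake: np.ndarray, bins: int = 100, eps: float = 1e-12) -> float:
+    """Estimate KL(real || fake) from 1-D samples using shared histogram bins."""
+    edges = np.histogram_bin_edges(np.concatenate([real, fake]), bins=bins)
+    r_hist, _ = np.histogram(real, bins=edges)
+    f_hist, _ = np.histogram(fake, bins=edges)
+    r_p = (r_hist + eps) / (r_hist + eps).sum()  # smoothed probability masses
+    f_p = (f_hist + eps) / (f_hist + eps).sum()
+    return float(np.sum(r_p * np.log(r_p / f_p)))
+
+# e.g. histogram_kl(real_spread, fake_spread) should come in at or below 0.1
+```
+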
+## Model Description
+
+TimeGAN integrates both adversarial and supervised learning objectives to model the temporal structure of financial
+sequences. The architecture consists of five main components, each contributing to the generation and recovery of
+realistic limit order book sequences:
+
+1. **Encoder**: maps observed LOB windows into a lower-dimensional latent representation that captures underlying
+ market dynamics.
+2. **Recovery Network**: reconstructs original price and depth features from the latent space, ensuring information
+ consistency between real and encoded data.
+3. **Generator**: transforms random noise vectors into synthetic latent sequences that emulate the structure of encoded
+ real data.
+4. **Supervisor**: predicts the next step in a latent sequence, encouraging temporal coherence and realistic sequential
+ transitions.
+5. **Discriminator**: distinguishes between real and generated latent sequences, providing adversarial feedback to
+ improve the generator’s realism.
+
+Training follows three phases. First, pretrain Encoder and Recovery to minimize reconstruction error and anchor the
+latent space to real LOB statistics. Second, train the Supervisor for next step prediction to align latent dynamics with
+empirical transitions. Third, run joint adversarial training with discriminator loss plus simple moment and consistency
+terms, yielding synthetic sequences that match real markets in distribution and temporal structure.
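+
+The shared building block behind these components can be pictured as a stacked GRU followed by a per-time-step linear
+head and a sigmoid. The sketch below is illustrative only; class and argument names such as `GRUBlock` are not the
+project's code:
+
+```python
+import torch
+import torch.nn as nn
+
+class GRUBlock(nn.Module):
+    """Stacked GRUs + per-time-step linear head + sigmoid (illustrative only)."""
+    def __init__(self, in_dim: int, hidden_dim: int, out_dim: int, num_layer: int):
+        super().__init__()
+        self.rnn = nn.GRU(in_dim, hidden_dim, num_layers=num_layer, batch_first=True)
+        self.head = nn.Linear(hidden_dim, out_dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h, _ = self.rnn(x)                      # x: [batch, seq_len, in_dim]
+        return torch.sigmoid(self.head(h))      # -> [batch, seq_len, out_dim]
+
+# e.g. an encoder-like block mapping 40 LOB features to a 64-dim latent sequence
+encoder = GRUBlock(in_dim=40, hidden_dim=64, out_dim=64, num_layer=3)
+print(encoder(torch.rand(8, 128, 40)).shape)    # torch.Size([8, 128, 64])
+```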
From 0172d2e177e1874fc163d83e36a6ea9aa7293f03 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 21 Oct 2025 16:38:18 +1000
Subject: [PATCH 51/74] docs(readme): add Table of Contents, project structure
overview, and dependencies table
Introduces a linked ToC for quick navigation, expands project structure with brief per-file roles, and adds a version-pinned dependencies table with one-line use cases tailored to the TimeGAN LOB workflow.
---
.../TimeLOB_TimeGAN_49088276/README.MD | 70 +++++++++++++++++++
1 file changed, 70 insertions(+)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/README.MD b/recognition/TimeLOB_TimeGAN_49088276/README.MD
index a58458268..f9351c65a 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/README.MD
+++ b/recognition/TimeLOB_TimeGAN_49088276/README.MD
@@ -54,3 +54,73 @@ Training follows three phases. First, pretrain Encoder and Recovery to minimize
latent space to real LOB statistics. Second, train the Supervisor for next step prediction to align latent dynamics with
empirical transitions. Third, run joint adversarial training with discriminator loss plus simple moment and consistency
terms, yielding synthetic sequences that match real markets in distribution and temporal structure.
+
+## Table of Contents
+
+| # | Section |
+|----|---------------------------------------------------------------------|
+| 1 | [Project Structure](#project-structure) |
+| 2 | [Dependencies](#dependencies) |
+| 3 | [Usage](#usage) |
+| 4 | [Dataset](#dataset) |
+| 5 | [Data Setup](#data-setup) |
+| 6 | [Model Architecture](#model-architecture) |
+| 7 | [Training Process](#training-process) |
+| 8 | [Results](#results) |
+| 9 | [Analysis of Performance Metrics](#analysis-of-performance-metrics) |
+| 10 | [Style Space and Plot Discussion](#style-space-and-plot-discussion) |
+| 11 | [References](#references) |
+| 12 | [Citation](#citation) |
+
+## Project Structure
+
+The project consists of the following file structure:
+
+```ansi
+TimeLOB_TimeGAN_49088276/
+├── README.MD # Project report (including configuration, setup, training methodology, performance evaluation)
+├── environment.yml # conda environment with all dependencies.
+├── scripts/
+│ ├── run.sh # rangpur/local script for running the project
+│ └── summarise_orderbook.py # test script to get to know about the dataset
+└── src/
+    ├── dataset.py                 # data loader and preprocessor (data loading, scaling, and normalising)
+    ├── helpers/
+    │   ├── args.py                # nested CLI options for the model and dataset, keeping those files lean
+    │   ├── constants.py           # root-anchored paths, defaults, and training constants
+    │   ├── richie.py              # common interface for pretty console logging, status spinners, and tables
+ │ ├── utils.py # metrics and utilities (KL, scaling, noise, specific feature calculators)
+ │ └── visualise.py # plotting helpers for depth heatmaps, curves, and summaries (SSIM score calculators)
+ ├── modules.py # TimeGAN model components, training loops, checkpoints, metrics hooks.
+ ├── predict.py # Sampling script to generate synthetic LOB sequences from a checkpoint.
+ └── train.py # CLI entrypoint that parses options and runs training.
+```
+
+## Dependencies
+
+Training was carried out on **macOS (BSD Unix)** on an Apple M3 Pro system; the codebase is also compatible with Linux.
+Windows was not used for training.
+
+> **Note**
+> Hardware: an Apple M3 Pro GPU with MPS/Metal support, or equivalent; at least 8 GB of unified memory is advisable.
+
+| Dependency | Suggested version | One-line use case |
+|-------------------|------------------:|-------------------------------------------------------------------------------------------|
+| Python | 3.13.9 | Runtime for training, sampling, evaluation scripts, and utilities. |
+| torch (PyTorch) | 2.8.0 | Core framework for TimeGAN modules, tensor ops, autograd, and device acceleration. |
+| torchvision | 0.24.0 | Utility helpers (e.g., image save utilities) for exporting depth heatmaps when needed. |
+| numpy | 2.3.4 | Fast array math for windowing LOB data, metrics, and numerical transforms. |
+| matplotlib | 3.10.7 | Plots for training curves, spread/return histograms, and LOB depth heatmaps. |
+| scikit-learn | 1.7.2 | Analysis utilities (e.g., PCA/MI) in feature studies and ablations outside core training. |
+| scikit-image | 0.25.2 | SSIM computation to compare real vs synthetic heatmaps. |
+| tqdm | 4.67.1 | Progress bars for three-phase training with periodic metric updates. |
+| contextvars | 2.4 | Context-local state to keep logging and progress output tidy across workers. |
+| rich | 14.2.0 | Pretty console logs, status spinners, and summary tables during data prep and training. |
+| typing-extensions | 4.15.0 | Modern typing features (Protocol, Literal) used in model and CLI code. |
+| scipy | 1.16.3 | Statistical routines (e.g., Spearman correlation) for analysis scripts. |
+| pillow (PIL) | 12.0.0 | Image IO/encoding backend for saving figures and heatmaps to PNG. |
+| pandas | 2.3.3 | Tabular processing for order book summaries and feature engineering notebooks. |
+| jupyterlab | 4.4.10 | Interactive exploration of LOB data, metrics, and experiment reports. |
+| ipykernel | 7.1.0 | Jupyter kernel to run notebooks for analysis and visualization. |
+
From 1e7066c896843c6fd517f2a7ab74b6c5ac74dade Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 21 Oct 2025 19:05:27 +1000
Subject: [PATCH 52/74] docs(readme): add Table of Contents, project structure
overview, and dependencies table
Introduces a linked ToC for quick navigation, expands project structure with brief per-file roles, and adds a version-pinned dependencies table with one-line use cases tailored to the TimeGAN LOB workflow.
---
.../TimeLOB_TimeGAN_49088276/README.MD | 177 ++++++++++++++++++
.../scripts/npy_to_csv.py | 109 +++++++++++
2 files changed, 286 insertions(+)
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/scripts/npy_to_csv.py
diff --git a/recognition/TimeLOB_TimeGAN_49088276/README.MD b/recognition/TimeLOB_TimeGAN_49088276/README.MD
index f9351c65a..b80ac3ef1 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/README.MD
+++ b/recognition/TimeLOB_TimeGAN_49088276/README.MD
@@ -124,3 +124,180 @@ Windows was not used for training.
| jupyterlab | 4.4.10 | Interactive exploration of LOB data, metrics, and experiment reports. |
| ipykernel | 7.1.0 | Jupyter kernel to run notebooks for analysis and visualization. |
+## Usage
+
+### Training
+
+Trains TimeGAN on AMZN level-10 LOBSTER windows with a three-phase schedule:
+
+1. **Encoder–Recovery** pretrain for reconstruction,
+2. **Supervisor** pretrain for next-step consistency,
+3. **Joint adversarial training** with moment matching.
+
+Periodic validation computes KL on spread and midprice returns; checkpoints are saved regularly and depth heatmaps can
+be rendered for SSIM checks. A minimal sketch of this schedule appears at the end of this subsection.
+
+```bash
+# start training from scratch (nested CLI: dataset namespace then modules namespace)
+python src/train.py \
+ --dataset \
+ --seq-len 128 \
+ --data-dir ./data \
+ --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \
+ --splits 0.7 0.85 1.0 \
+ --modules \
+ --batch-size 128 \
+ --z-dim 40 \
+ --hidden-dim 64 \
+ --num-layer 3 \
+ --lr 1e-4 \
+ --beta1 0.5 \
+ --num-iters 25000
+```
+
+| Hyperparameter | Value | Notes |
+|------------------------|-------------------|--------------------------------------------|
+| batch size | 128 | Larger batches stabilize adversarial steps |
+| `seq_len` | 128 | Window length for LOB sequences |
+| `z_dim` | 40 | Matches raw10 feature count |
+| `hidden_dim` | 64 | GRU hidden size across components |
+| `layers` | 3 | Stacked GRU depth |
+| `optimizer` | Adam | β1 tuned for GAN stability |
+| `learning rate` | 1e-4 | Shared across E, R, G, S, D |
+| β1 | 0.5 | Momentum term for Adam |
+| `iterations per phase` | 25,000 | ER, Supervisor, and Joint phases each |
+| scaling | train-only MinMax | Fit on train split, apply to val/test |
+
+- **Outputs**:
+ - `weights/timegan_ckpt.pt` (latest checkpoint)
+ - `outs/` (generated samples, KL/SSIM plots, training curves, summaries)
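+
+As referenced above, a minimal sketch of the three-phase schedule on toy tensors is shown below; the project's actual
+loops, losses, and checkpointing live in `src/modules.py` and `src/train.py`, and the names here are placeholders:
+
+```python
+import torch
+import torch.nn as nn
+
+B, T, F, D = 8, 32, 40, 64                        # toy sizes: batch, seq, features, latent
+
+class Block(nn.Module):                           # shared GRU + linear-head pattern
+    def __init__(self, i, o):
+        super().__init__()
+        self.rnn = nn.GRU(i, D, num_layers=3, batch_first=True)
+        self.head = nn.Linear(D, o)
+    def forward(self, x):
+        h, _ = self.rnn(x)
+        return torch.sigmoid(self.head(h))
+
+enc, rec, sup = Block(F, D), Block(D, F), Block(D, D)
+x = torch.rand(B, T, F)                           # stand-in for one scaled LOB window batch
+
+# Phase 1: Encoder–Recovery pretrain (reconstruction MSE)
+opt_er = torch.optim.Adam(list(enc.parameters()) + list(rec.parameters()), lr=1e-4, betas=(0.5, 0.999))
+loss_er = ((rec(enc(x)) - x) ** 2).mean()
+opt_er.zero_grad(); loss_er.backward(); opt_er.step()
+
+# Phase 2: Supervisor pretrain (next-step MSE in latent space)
+opt_s = torch.optim.Adam(sup.parameters(), lr=1e-4, betas=(0.5, 0.999))
+h = enc(x).detach()
+loss_s = ((h[:, 1:] - sup(h)[:, :-1]) ** 2).mean()
+opt_s.zero_grad(); loss_s.backward(); opt_s.step()
+
+# Phase 3: joint adversarial training follows the same pattern, alternating
+# discriminator and generator/supervisor updates with BCE, moment, and supervision terms.
+```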
+
+### Generation
+
+The `predict.py` script samples synthetic LOB data from a trained **TimeGAN** checkpoint. It supports flat row
+generation, windowed generation, optional heatmap rendering, and quick metric checks.
+
+#### 1. Generate flat rows (match test length)
+
+Produces exactly `len(test)` rows in original feature space and saves as NumPy.
+
+```bash
+python -m src.predict \
+ --dataset \
+ --seq-len 128 \
+ --data-dir ./data \
+ --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \
+ --splits 0.7 0.85 1.0 \
+ --modules \
+ --batch-size 128 \
+ --z-dim 40 \
+ --hidden-dim 64 \
+ --num-layer 3
+```
+
+#### 2. Generate a fixed number of rows
+
+Specify `--rows` to override the default; below it is shown as a top-level flag with an arbitrary example value.
+
+```bash
+python src/predict.py \
+  --rows 5000 \
+ --dataset \
+ --seq-len 128 \
+ --data-dir ./data \
+ --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \
+ --modules \
+ --batch-size 128 \
+ --z-dim 40 \
+ --hidden-dim 64 \
+ --num-layer 3
+```
+
+#### 3. Render depth heatmaps (real vs synthetic)
+
+Creates side-by-side heatmaps for SSIM inspection.
+
+```bash
+python -m src.viz.ssim_heatmap \
+ --dataset \
+ --seq-len 128 \
+ --data-dir ./data \
+ --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \
+ --modules \
+ --batch-size 128 \
+ --z-dim 40 \
+ --hidden-dim 64 \
+ --num-layer 3
+# saves outs/real.png and outs/synthetic_heatmap_{i}.png
+```
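+
+A minimal sketch of how a depth heatmap can be built from a `[T, 40]` window and scored with SSIM via scikit-image; the
+size-column indexing assumes the LOBSTER column order described in the Dataset section, and the normalisation choice is
+illustrative:
+
+```python
+import numpy as np
+from skimage.metrics import structural_similarity as ssim
+
+def depth_heatmap(window: np.ndarray) -> np.ndarray:
+    """[T, 40] LOB window -> [T, 20] image of normalised ask/bid sizes."""
+    sizes = window[:, 1::2]                       # every second column is a size column
+    lo, hi = sizes.min(), sizes.max()
+    return (sizes - lo) / (hi - lo + 1e-8)        # scale into [0, 1] for SSIM
+
+real = depth_heatmap(np.random.rand(128, 40))     # stand-ins for real / synthetic windows
+fake = depth_heatmap(np.random.rand(128, 40))
+print(f"SSIM: {ssim(real, fake, data_range=1.0):.3f}")   # project target: > 0.6
+```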
+
+#### 4. Quick metrics (KL and SSIM)
+
+During generation or via post-hoc scripts you can compute:
+
+* **KL(spread)** and **KL(midprice returns)** on a held-out slice
+* **SSIM** between real and synthetic heatmaps
+
+```bash
+python src/helpers/visualise.py \
+ --dataset \
+ --seq-len 128 \
+ --data-dir ./data \
+ --orderbook-filename orderbook_10.csv \
+ --modules \
+ --batch-size 128 \
+ --z-dim 40 \
+ --hidden-dim 64 \
+ --num-layer 3
+```
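+
+A hedged sketch of the KL check: histogram the spread of real and synthetic rows and compute KL(real || synthetic) with
+SciPy; the column indices follow the LOBSTER layout and the bin count is an arbitrary choice here:
+
+```python
+import numpy as np
+from scipy.stats import entropy
+
+def spread(rows: np.ndarray) -> np.ndarray:
+    return rows[:, 0] - rows[:, 20]               # ask_price_1 - bid_price_1
+
+def kl_divergence(real: np.ndarray, synth: np.ndarray, bins: int = 50) -> float:
+    lo, hi = min(real.min(), synth.min()), max(real.max(), synth.max())
+    p, _ = np.histogram(real, bins=bins, range=(lo, hi), density=True)
+    q, _ = np.histogram(synth, bins=bins, range=(lo, hi), density=True)
+    return float(entropy(p + 1e-8, q + 1e-8))     # smoothing avoids log(0)
+
+real_rows, synth_rows = np.random.rand(10_000, 40), np.random.rand(10_000, 40)  # stand-ins
+print(f"KL(spread): {kl_divergence(spread(real_rows), spread(synth_rows)):.4f}")  # target <= 0.1
+```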
+
+#### 5. Export to CSV
+
+```bash
+python scripts/npy_to_csv.py \
+ --in ./outs/gen_data.npy \
+ --out ./outs/gen_data.csv \
+ --peek 10 \
+ --summary
+```
+
+### Command-line Arguments
+
+#### Top-level (parsed by `Options`)
+
+| Flag | Type | Default | Description | Example |
+|---------------|-----------|----------|---------------------------------------------|--------------------------------|
+| `--seed` | int | `42` | Global random seed. | `--seed 1337` |
+| `--run-name` | str | `"exp1"` | Label for the run; used in logs/artifacts. | `--run-name lob_amzn_l10` |
+| `--dataset …` | namespace | — | Tokens after this go to **DataOptions**. | `--dataset --seq-len 128 …` |
+| `--modules …` | namespace | — | Tokens after this go to **ModulesOptions**. | `--modules --batch-size 128 …` |
+
+#### Data options (parsed by `DataOptions`)
+
+| Flag | Type | Default | Description | Example |
+|---------------------------|----------|----------------------|---------------------------------------------------------------------------|------------------------------------------------|
+| `--seq-len` | int | `128` | Sliding window length for LOB sequences. | `--seq-len 128` |
+| `--data-dir` | str | `DATA_DIR` | Directory containing LOBSTER files. | `--data-dir ./data` |
+| `--orderbook-filename` | str | `ORDERBOOK_FILENAME` | Name of `orderbook_10.csv`. | `--orderbook-filename AMZN_…_orderbook_10.csv` |
+| `--no-shuffle` | flag | off | Disable shuffling of windowed sequences. | `--no-shuffle` |
+| `--keep-zero-rows` | flag | off | Do not filter rows with zeros. | `--keep-zero-rows` |
+| `--splits TRAIN VAL TEST` | 3× float | `TRAIN_TEST_SPLIT` | Proportions summing to ~1.0 or cumulative cutoffs (e.g., `0.7 0.85 1.0`). | `--splits 0.7 0.85 1.0` |
+
+#### Model/training options (parsed by `ModulesOptions`)
+
+| Flag | Type | Default | Description | Example |
+|----------------|-------|---------------------------|------------------------------------------------|---------------------|
+| `--batch-size` | int | `128` | Batch size for all phases. | `--batch-size 128` |
+| `--seq-len` | int | `128` | Mirror of data window length for convenience. | `--seq-len 128` |
+| `--z-dim` | int | `40` | Noise/latent input dimension. | `--z-dim 40` |
+| `--hidden-dim` | int | `64` | GRU hidden size across components. | `--hidden-dim 64` |
+| `--num-layer` | int | `3` | Stacked GRU layers per block. | `--num-layer 3` |
+| `--lr` | float | `1e-4` | Adam learning rate. | `--lr 1e-4` |
+| `--beta1` | float | `0.5` | Adam β1 for GAN stability. | `--beta1 0.5` |
+| `--w-gamma` | float | `1.0` | Weight on supervisor-related adversarial term. | `--w-gamma 1.0` |
+| `--w-g` | float | `1.0` | Weight on generator losses and moments. | `--w-g 1.0` |
+| `--num-iters` | int | `NUM_TRAINING_ITERATIONS` | Iterations per phase (ER, Supervisor, Joint). | `--num-iters 25000` |
+
+**Outputs:**
+
+- `outs/gen_data.npy` flat synthetic rows `[T, F]` in original feature scale
+- `outs/real.png`, `outs/synthetic_heatmap_{i}.png` depth heatmaps for SSIM
+- Optional plots: `outs/kl_spread_curve.png`, `outs/training_curves.png` if enabled in training/eval scripts
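+
+The nested `--dataset` / `--modules` parsing above can be approximated by splitting the argument list at the namespace
+markers and handing each slice to its own parser. This is a rough sketch with only a subset of the flags; the real
+`Options`, `DataOptions`, and `ModulesOptions` classes live in `src/helpers/args.py` and may differ:
+
+```python
+import argparse
+import sys
+
+def split_namespaces(argv):
+    top = argparse.ArgumentParser()
+    top.add_argument("--seed", type=int, default=42)
+    top.add_argument("--run-name", default="exp1")
+
+    data = argparse.ArgumentParser()
+    data.add_argument("--seq-len", type=int, default=128)
+    data.add_argument("--data-dir", default="./data")
+    data.add_argument("--splits", type=float, nargs=3, default=[0.7, 0.85, 1.0])
+
+    modules = argparse.ArgumentParser()
+    modules.add_argument("--batch-size", type=int, default=128)
+    modules.add_argument("--z-dim", type=int, default=40)
+
+    # carve argv into [top tokens][--dataset ...][--modules ...]
+    markers = {"--dataset": data, "--modules": modules}
+    cuts = [i for i, tok in enumerate(argv) if tok in markers] + [len(argv)]
+    opts = {"top": top.parse_args(argv[: cuts[0]])}
+    for start, end in zip(cuts[:-1], cuts[1:]):
+        opts[argv[start].lstrip("-")] = markers[argv[start]].parse_args(argv[start + 1 : end])
+    return opts
+
+print(split_namespaces(sys.argv[1:]))
+```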
diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/npy_to_csv.py b/recognition/TimeLOB_TimeGAN_49088276/scripts/npy_to_csv.py
new file mode 100644
index 000000000..986beb1b0
--- /dev/null
+++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/npy_to_csv.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+# npy_to_csv.py
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from rich.console import Console
+from rich.panel import Panel
+from rich.status import Status
+from rich.table import Table
+
+console = Console()
+
+
+def show_peek(df: pd.DataFrame, n: int) -> None:
+ if n <= 0:
+ return
+ n = min(n, len(df))
+ table = Table(title=f"Peek (first {n} rows)", show_lines=False)
+ for c in df.columns:
+ table.add_column(str(c))
+ for _, row in df.head(n).iterrows():
+ table.add_row(*[str(x) for x in row.to_list()])
+ console.print(table)
+
+
+def show_summary(df: pd.DataFrame, topk: int = 8) -> None:
+ desc = df.describe().T # count, mean, std, min, 25%, 50%, 75%, max
+ # keep only first topk columns for display to keep it compact
+ cols = ["count", "mean", "std", "min", "50%", "max"]
+ table = Table(title="Summary stats (per column)", show_lines=False)
+ for c in ["column"] + cols:
+ table.add_column(c)
+ for name, row in desc.head(topk).iterrows():
+ table.add_row(
+ str(name),
+ *(f"{row[c]:.6g}" if pd.notnull(row[c]) else "nan" for c in cols),
+ )
+ console.print(table)
+ if len(desc) > topk:
+ console.print(f"[dim]… {len(desc) - topk} more columns not shown[/dim]")
+
+
+def main() -> None:
+ ap = argparse.ArgumentParser(
+ description="Convert a 2D NumPy .npy array to CSV with rich peek/summary."
+ )
+ ap.add_argument(
+ "--in", dest="inp", default="./outs/gen_data.npy", help="Input .npy file"
+ )
+ ap.add_argument(
+ "--out", dest="outp", default="./outs/gen_data.csv", help="Output .csv file"
+ )
+ ap.add_argument("--prefix", default="f", help="Column name prefix (default: f)")
+ ap.add_argument(
+ "--peek",
+ type=int,
+ default=5,
+ help="Show first N rows in the console (0 = disable)",
+ )
+ ap.add_argument(
+ "--summary", action="store_true", help="Print per-column summary statistics"
+ )
+ ap.add_argument(
+ "--no-save", action="store_true", help="Do not write CSV (preview only)"
+ )
+ args = ap.parse_args()
+
+ inp = Path(args.inp)
+ outp = Path(args.outp)
+ outp.parent.mkdir(parents=True, exist_ok=True)
+
+ if not inp.exists():
+ console.print(f"[red]Input not found:[/red] {inp}")
+ raise SystemExit(1)
+
+ with Status(f"[cyan]Loading[/cyan] {inp}…", console=console):
+ arr = np.load(inp)
+
+ if arr.ndim != 2:
+ console.print(f"[red]Expected a 2D array, got shape {arr.shape}[/red]")
+ raise SystemExit(2)
+
+ n_rows, n_cols = arr.shape
+ cols = [f"{args.prefix}{i}" for i in range(n_cols)]
+
+ console.print(
+ Panel.fit(f"[bold]Array shape[/bold]: {n_rows} × {n_cols}", border_style="cyan")
+ )
+
+ df = pd.DataFrame(arr, columns=cols)
+
+ # Peek and summary
+ show_peek(df, args.peek)
+ if args.summary:
+ show_summary(df)
+
+ # Save CSV unless suppressed
+    if not args.no_save:
+ with Status(f"[cyan]Writing CSV[/cyan] → {outp}…", console=console):
+ df.to_csv(outp, index=False)
+ console.print(f"[green]Done:[/green] wrote [bold]{outp}[/bold]")
+
+
+if __name__ == "__main__":
+ main()
From 9628bcf8f7405728bf941eca01800da093659021 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 21 Oct 2025 21:34:47 +1000
Subject: [PATCH 53/74] docs(readme): add Dataset and Data Splits sections;
references section placeholder
Introduces detailed LOBSTER AMZN L10 dataset description and chronological split strategy (train/val/test). Notes that references will be added in a forthcoming update.
---
.../TimeLOB_TimeGAN_49088276/.gitignore | 2 +
.../TimeLOB_TimeGAN_49088276/README.MD | 85 +++++++++++++++++++
2 files changed, 87 insertions(+)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/.gitignore b/recognition/TimeLOB_TimeGAN_49088276/.gitignore
index 7f99e0853..2716324e2 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/.gitignore
+++ b/recognition/TimeLOB_TimeGAN_49088276/.gitignore
@@ -7,6 +7,8 @@
*.pyc
# model specific files
+weights/
+outs/
data/
preproc_final_core/
*.csv
diff --git a/recognition/TimeLOB_TimeGAN_49088276/README.MD b/recognition/TimeLOB_TimeGAN_49088276/README.MD
index b80ac3ef1..9672b6328 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/README.MD
+++ b/recognition/TimeLOB_TimeGAN_49088276/README.MD
@@ -301,3 +301,88 @@ python npy_to_csv.py \
- `outs/gen_data.npy` flat synthetic rows `[T, F]` in original feature scale
- `outs/real.png`, `outs/synthetic_heatmap_{i}.png` depth heatmaps for SSIM
- Optional plots: `outs/kl_spread_curve.png`, `outs/training_curves.png` if enabled in training/eval scripts
+
+## Dataset
+
+We use the **LOBSTER** limit order book for **AMZN** at **level 10** depth. The primary file is
+`AMZN_2012-06-21_34200000_57600000_orderbook_10.csv` containing 40 columns
+`[ask_price_1, ask_size_1, …, ask_price_10, ask_size_10, bid_price_1, bid_size_1, …, bid_price_10, bid_size_10]`.
+Place the file under `data/`. By default the code performs a **chronological** split into train, validation, and test to
+avoid leakage across time.
+
+Example depth visualizations are produced during evaluation as heatmaps in `outs/` for SSIM checks.
+
+*Files expected*
+
+* `data/AMZN_2012-06-21_34200000_57600000_orderbook_10.csv`
+* Optional: additional sessions can be summarized with `scripts/summarise_orderbook.py`
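+
+A small sketch of loading the file and naming the 40 columns in the order above; the real loader lives in
+`src/dataset.py`, and the derived `spread`/`midprice` columns here are for quick inspection only:
+
+```python
+import pandas as pd
+
+ask = [f"ask_{kind}_{lvl}" for lvl in range(1, 11) for kind in ("price", "size")]
+bid = [f"bid_{kind}_{lvl}" for lvl in range(1, 11) for kind in ("price", "size")]
+cols = ask + bid                                   # 40 columns in the order listed above
+
+book = pd.read_csv("data/AMZN_2012-06-21_34200000_57600000_orderbook_10.csv",
+                   header=None, names=cols)
+book["spread"] = book["ask_price_1"] - book["bid_price_1"]
+book["midprice"] = (book["ask_price_1"] + book["bid_price_1"]) / 2
+print(book[["spread", "midprice"]].describe())
+```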
+
+---
+
+## Data Setup
+
+### Preprocessing for TimeGAN (see `src/dataset.py`)
+
+Pipeline steps applied to the order book snapshots:
+
+```text
+1) Load orderbook_10.csv → ndarray [T, 40]
+2) Optional filter: drop rows with any zero (configurable)
+3) Chronological split: train / val / test (default 0.7 / 0.15 / 0.15 or cumulative 0.7 / 0.85 / 1.0)
+4) Train-only MinMax scaling (fit on train, apply to val and test)
+5) Sliding windows: shape [N, seq_len, 40], with optional shuffle for training
+```
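+
+A hedged sketch of steps 2–5: zero-row filtering, chronological cutoffs, train-only MinMax scaling, and sliding
+windows. Function and variable names are illustrative, not the `src/dataset.py` API:
+
+```python
+import numpy as np
+
+def preprocess(raw: np.ndarray, seq_len: int = 128, splits=(0.7, 0.85, 1.0)):
+    data = raw[(raw != 0).all(axis=1)]                    # 2) optional zero-row filter
+    cut1, cut2 = int(splits[0] * len(data)), int(splits[1] * len(data))
+    train, val, test = data[:cut1], data[cut1:cut2], data[cut2:]   # 3) chronological split
+
+    lo, hi = train.min(axis=0), train.max(axis=0)         # 4) MinMax fitted on train only
+    scale = lambda x: (x - lo) / (hi - lo + 1e-8)
+    train, val, test = scale(train), scale(val), scale(test)
+
+    def windows(x):                                        # 5) [T, 40] -> [N, seq_len, 40]
+        return np.stack([x[i:i + seq_len] for i in range(len(x) - seq_len + 1)])
+
+    return windows(train), windows(val), windows(test)
+
+tr, va, te = preprocess(np.random.rand(2_000, 40))         # stand-in for the loaded orderbook
+print(tr.shape, va.shape, te.shape)
+```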
+
+#### Key flags (nested CLI):
+
+- **Dataset**: `--seq-len`, `--data-dir`, `--orderbook-filename`, `--splits`, `--keep-zero-rows`, `--no-shuffle`
+- **Modules**: `--batch-size`, `--z-dim` (use 40 for raw10), `--hidden-dim`, `--num-layer`, `--lr`, `--beta1`,
+ `--num-iters`
+
+#### Typical command:
+
+```bash
+python src/train.py \
+ --dataset \
+ --seq-len 128 \
+ --data-dir ./data \
+ --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \
+ --splits 0.7 0.85 1.0 \
+ --modules \
+ --batch-size 128 \
+ --z-dim 40 \
+ --hidden-dim 64 \
+ --num-layer 3 \
+ --lr 1e-4 \
+ --beta1 0.5 \
+ --num-iters 25000
+```
+
+### Data Splits
+
+- **Training**: first segment of the day by time (no shuffling during split)
+- **Validation**: middle segment for periodic checks and model selection
+- **Test**: final segment held out for reporting metrics
+- **Method**: chronological index cutoffs, not random splitting
+
+#### Evaluation uses:
+
+- **Distribution similarity**: KL divergence for spread and midprice returns on the held out set
+- **Visual similarity**: SSIM between depth heatmaps of real and generated books
+
+Heatmaps and metrics are saved to `outs/` via the training hooks and `src/helpers/visualise`.
+
+## References
+
+## Citation
+
+If you use this implementation in your research, please cite:
+
+```bibtex
+@misc{stylegan2_adni_2025,
+ title={Conditional StyleGAN2 for ADNI (Alzheimer's Disease Neuroimaging Initiative)},
+ author={Tyreece Paul},
+ year={2025},
+ url={https://github.com/tyreecepaul/PatternAnalysis-2025}
+}
+```
\ No newline at end of file
From 7a853514daf7b956ba971e6a9b5e8a95bcaeca6a Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Tue, 21 Oct 2025 21:47:33 +1000
Subject: [PATCH 54/74] fix(docs): correct wrong BibTeX entry to TimeGAN
LOBSTER citation
Previously added a StyleGAN2/ADNI BibTeX by mistake. Replace with the TimeGAN for LOBSTER (AMZN L10) entry and update the project URL.
---
recognition/TimeLOB_TimeGAN_49088276/README.MD | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/recognition/TimeLOB_TimeGAN_49088276/README.MD b/recognition/TimeLOB_TimeGAN_49088276/README.MD
index 9672b6328..2c33911a3 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/README.MD
+++ b/recognition/TimeLOB_TimeGAN_49088276/README.MD
@@ -379,10 +379,11 @@ Heatmaps and metrics are saved to `outs/` via the training hooks and `src/helper
If you use this implementation in your research, please cite:
```bibtex
-@misc{stylegan2_adni_2025,
- title={Conditional StyleGAN2 for ADNI (Alzheimer's Disease Neuroimaging Initiative)},
- author={Tyreece Paul},
- year={2025},
- url={https://github.com/tyreecepaul/PatternAnalysis-2025}
+@misc{timegan_lobster_amzn_l10_2025,
+ title = {TimeGAN for LOBSTER: Synthetic Limit Order Book Sequences (AMZN Level-10)},
+ author = {Radhesh Goel},
+ year = {2025},
+ url = {https://github.com/keys-i/TimeLOB_TimeGAN_49088276},
+ note = {Three-phase TimeGAN training with KL/SSIM evaluation on AMZN L10}
}
```
\ No newline at end of file
From 9c3bcde4756bb710216bd72f0d2d137e9715ac43 Mon Sep 17 00:00:00 2001
From: Keys <70819367+keys-i@users.noreply.github.com>
Date: Wed, 22 Oct 2025 01:56:49 +1000
Subject: [PATCH 55/74] docs(readme): add TimeGAN model architecture figure and
refine architecture text
Embed modern HTML figure for the architecture PNG and rewrite component/flow sections for clarity and consistency. Remove training-specific notes from architecture and tighten wording.
---
.../TimeLOB_TimeGAN_49088276/README.MD | 173 ++++++++++++++++++
.../assets/model-architecture.png | Bin 0 -> 124146 bytes
2 files changed, 173 insertions(+)
create mode 100644 recognition/TimeLOB_TimeGAN_49088276/assets/model-architecture.png
diff --git a/recognition/TimeLOB_TimeGAN_49088276/README.MD b/recognition/TimeLOB_TimeGAN_49088276/README.MD
index 2c33911a3..a6c1234c2 100644
--- a/recognition/TimeLOB_TimeGAN_49088276/README.MD
+++ b/recognition/TimeLOB_TimeGAN_49088276/README.MD
@@ -372,6 +372,179 @@ python src/train.py \
Heatmaps and metrics are saved to `outs/` via the training hooks and `src/helpers/visualise`.
+## Model Architecture
+
+TimeGAN combines **embedding-based autoencoding** and **adversarial sequence modeling** within a unified framework.
+All components communicate through a shared latent space $H_t$ that captures temporal dependencies in the limit order
+book (LOB) while preserving feature-level structure. Real sequences $X_t$ are first embedded into this latent
+representation, which supports both reconstruction and generation paths.
+The architecture ensures that temporal dynamics are learned in latent space, while supervision and adversarial losses
+align generated data with true market statistics.
+
+<figure>
+  <img src="assets/model-architecture.png" alt="TimeGAN architecture: block diagram and training scheme">
+  <figcaption>
+    <b>Figure 1.</b>
+    (a) Block diagram of TimeGAN components showing embedding, generation, and discrimination paths.
+    (b) Training scheme showing data flow (solid lines) and gradient flow (dashed lines) across
+    Encoder (e), Recovery (r), Generator (g), and Discriminator (d).
+  </figcaption>
+</figure>
+
+
+### Components
+
+1. **Encoder**
+ The encoder maps a scaled LOB window $X \in \mathbb{R}^{B\times T\times F}$ to a latent sequence $H \in
+ \mathbb{R}^{B\times T\times d}$. We use stacked GRUs to capture short and medium horizon dynamics, followed by a
+ linear projection and a pointwise sigmoid to keep activations bounded:
+ $$
+   H^{\text{gru}} = \mathrm{GRU}_{\text{enc}}(X),\qquad
+   H = \sigma\big(H^{\text{gru}} W_{\text{enc}} + b_{\text{enc}}\big).
+ $$
+ This path anchors the latent space to real microstructure so that latent transitions remain meaningful when we switch
+ to generation.
+
+2. **Recovery**
+   The recovery network decodes a latent sequence back to the original feature space. Given $H$, it produces $\tilde X
+   \in \mathbb{R}^{B\times T\times F}$ through a GRU and a linear head with optional sigmoid:
+   $$
+   \tilde X^{\text{gru}} = \mathrm{GRU}_{\text{rec}}(H),\qquad
+   \tilde X = \sigma\big(\tilde X^{\text{gru}} W_{\text{rec}} + b_{\text{rec}}\big).
+   $$
+   Together, encoder and recovery minimize a reconstruction loss $\mathcal{L}_{\text{rec}} = \lVert \tilde X - X \rVert_2^2$,
+ which preserves price and depth structure and stabilizes later adversarial training.
+
+3. **Generator**
+ The generator produces a latent trajectory from noise $Z \in \mathbb{R}^{B\times T\times z}$. A GRU stack followed by
+ a projection yields $E \in \mathbb{R}^{B\times T\times d}$:
+ $$
+   E^{\text{gru}} = \mathrm{GRU}_{\text{gen}}(Z),\qquad
+   E = \sigma\big(E^{\text{gru}} W_{\text{gen}} + b_{\text{gen}}\big).
+   $$
+   We then pass $E$ through the supervisor to enforce one-step temporal consistency before decoding to synthetic
+ windows $\hat X$ via the recovery. Generating in latent space makes the adversarial game better conditioned than
+ operating directly on raw features.
+
+4. **Supervisor**
+   The supervisor learns the latent transition model. Given a real latent sequence $H$, it predicts the next step $S(H)$
+   using a GRU plus a projection:
+   $$
+   S^{\text{gru}} = \mathrm{GRU}_{\text{sup}}(H),\qquad
+   S(H) = \sigma\big(S^{\text{gru}} W_{\text{sup}} + b_{\text{sup}}\big).
+   $$
+   The objective $\mathcal{L}_{\text{sup}} = \tfrac{1}{B(T-1)d}\sum_{t=1}^{T-1}\lVert H_{:,t+1,:} - S(H)_{:,t,:}\rVert_2^2$
+   encourages realistic one-step dynamics. During generation, the same supervisor regularizes $E$, so synthetic
+ trajectories inherit temporal structure observed in data.
+
+5. **Discriminator**
+   The discriminator receives a latent sequence and outputs per-time-step logits without a sigmoid:
+   $$
+   D(H) = \mathrm{GRU}_{\text{disc}}(H)\, W_{\text{disc}} + b_{\text{disc}} \in \mathbb{R}^{B\times T\times 1}.
+   $$
+
+Encoder, Recovery, Generator, Supervisor, and Discriminator **all share the same block pattern**: stacked GRUs with
+hidden size `hidden_dim` and depth `num_layer`, followed by a **per-time-step linear head** to the target
+dimensionality (`d` for latent, `F` for features, `1` for logits); only the discriminator omits the output sigmoid and
+emits raw logits $D(\cdot)$.
+All tensors use the shape **[batch, seq_len, channels]**, and weights use **Xavier** initialization for input matrices
+and **orthogonal** initialization for recurrent matrices to maintain stable sequence modeling.
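+
+A minimal sketch of that initialization scheme for any GRU-based block (bias zeroing is a common default assumed here,
+not stated above):
+
+```python
+import torch.nn as nn
+
+def init_gru(module: nn.Module) -> None:
+    for name, param in module.named_parameters():
+        if "weight_ih" in name:
+            nn.init.xavier_uniform_(param)        # input-to-hidden matrices
+        elif "weight_hh" in name:
+            nn.init.orthogonal_(param)            # recurrent matrices
+        elif "bias" in name:
+            nn.init.zeros_(param)                 # assumption: biases start at zero
+
+gru = nn.GRU(input_size=40, hidden_size=64, num_layers=3, batch_first=True)
+init_gru(gru)
+```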
+
+### Data Flow
+
+- **Reconstruction path**: $X \xrightarrow{\text{Encoder}} H \xrightarrow{\text{Recovery}} \tilde{X}$
+- **Generation path**:
+ $Z \xrightarrow{\text{Generator}} \hat{E} \;\xrightarrow{\text{Supervisor}} \hat{H} \;\xrightarrow{\text{Recovery}} \hat{X}$
+
+Here $\tilde{X}$ reconstructs the input and $\hat{X}$ is the synthetic output in original feature scale after inverse
+min-max.
+
+### Training Phases
+
+1. **Encoder–Recovery pretrain**
+ Minimize reconstruction loss $\mathcal{L}_{\text{rec}} = \mathrm{MSE}(\tilde{X}, X)$ to align the
+ latent space with real LOB statistics and stabilize
+ later adversarial steps.
+
+2. **Supervisor pretrain**
+ Minimize next-step loss `L_sup = MSE(H[:,1:], S(H)[:,:-1])` to encode short-horizon temporal dynamics in latent
+ space.
+
+3. **Joint training**
+ Optimize Generator, Supervisor, and Discriminator together with a composite objective that includes:
+
+ - **Adversarial loss** on latent sequences for realism
+ $$
+ \mathcal{L}_{\text{adv}}^{G} = \mathrm{BCE}\!\big(D(\hat{H}), 1\big), \qquad
+ \mathcal{L}_{\text{adv}}^{D} = \mathrm{BCE}\!\big(D(H), 1\big) + \mathrm{BCE}\!\big(D(\hat{H}), 0\big) + \gamma\,\mathrm{BCE}\!\big(D(\hat{E}), 0\big).
+ $$
+
+ - **Reconstruction loss** to keep outputs faithful to LOB structure
+ $$
+ \mathcal{L}_{\text{rec}} = \mathrm{MSE}(\tilde{X}, X)
+ $$
+ - **Moment matching** on generated windows to align simple feature statistics
+ mean and standard deviation penalties over features, averaged across time
+ - **Supervision loss** retained as a consistency term in joint training
+
+Weights follow the implementation defaults: adversarial terms, supervision weight `w_gamma`, and generator moment weight
+`w_g`. Training uses Adam with learning rate `1e-4` and `β1 = 0.5`.
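+
+A hedged sketch of the moment-matching term, assuming the penalty compares per-feature means and standard deviations of
+real and generated windows (the exact reduction in `src/modules.py` may differ):
+
+```python
+import torch
+
+def moment_loss(x_real: torch.Tensor, x_fake: torch.Tensor) -> torch.Tensor:
+    """Per-feature mean/std penalties over [B, T, F] windows, flattened across batch and time."""
+    f_real = x_real.reshape(-1, x_real.shape[-1])
+    f_fake = x_fake.reshape(-1, x_fake.shape[-1])
+    mean_term = (f_fake.mean(dim=0) - f_real.mean(dim=0)).abs().mean()
+    std_term = (f_fake.std(dim=0) - f_real.std(dim=0)).abs().mean()
+    return mean_term + std_term
+
+real, fake = torch.rand(8, 128, 40), torch.rand(8, 128, 40)   # stand-in windows
+print(moment_loss(real, fake))
+```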
+
+### Loss Summary
+
+| Component | Loss (formula) | Notes |
+|-------------------|----------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
+| **Discriminator** | $\mathcal{L}_{D} = \mathcal{L}_{\text{real}} + \mathcal{L}_{\text{fake}} + \gamma\,\mathcal{L}_{\text{fakeE}}$ | Real vs. fake terms; extra penalty on encoder-driven fakes scaled by $\gamma$. |
+| **Generator**     | $\mathcal{L}_{G}=\mathcal{L}_{\text{adv}}^{G}+w_{g}\cdot(\text{moment penalties})+\sqrt{\mathcal{L}_{\text{sup}}+\varepsilon}$ | Adversarial + distribution-matching (moments) + supervised term (stabilized with $\varepsilon$). |
+| **Autoencoder** | $\mathcal{L}_{\text{rec}}$ | Reconstruction on Encoder–Recovery during pretrain; applied lightly during joint training. |
+
+### Shapes and Defaults
+
+| Setting | Value / Shape |
+|------------------------|---------------|
+| `seq_len`, `z_dim` | 128, 40 |
+| `hidden_dim`, `layers` | 64, 3 |
+| Windows                | $[N, 128, 40]$ |
+| Latent                 | $[N, 128, 64]$ |
+| Iters per phase | 25,000 |
+
+This configuration learns both the distributional properties of spreads and midprice returns and the temporal structure
+of depth evolution, producing synthetic LOB sequences that are comparable to real data under the project’s KL and SSIM
+targets.
+
+## Training Process
+
+## Results
+
+## Analysis of Performance Metrics
+
+## Style Space and Plot Discussion
+
## References
## Citation
diff --git a/recognition/TimeLOB_TimeGAN_49088276/assets/model-architecture.png b/recognition/TimeLOB_TimeGAN_49088276/assets/model-architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..0ff6f0589bae8ef7194d87b04c017d7199c8f9ca
GIT binary patch
literal 124146
zcmce8_am17`}SpI6B0s36lIq(A~QRrrDPP<2=qQ*g#K%o`!>lAP9ObO*KP;Ae$lxl7CdB
z_=)5n(!=>peDmg^tzXvD(^FShKYsl9i4!L@G&GJM-)4XOf9{c#sI$oL
z|JKI2ckf;|H#dUdV#s*@+^^1pFNNIRMB+<(dwW+`S8uOwO8du;A0LE)6aC3HE>g{@K~t2{329c@v|TVQggd`0?Ya
z99j}(iH@&dP0C#H!R>MeU=CEZ(qB1O$6O)jTketlddxMd2Bl_u6
zE*+hw6HksLD0r~`?^oz%ysPwDry*Vy6uh^{ceITlod5H>r}pdb_q?2(9NgN#S@P|}
z!^8Rc`Df3b#jnPYkqDH3xEsSr~BAPCApt9z-KChH&&9AR7Fv3T%J)=;B!xr
z0~SG4RFug!QLymM8(a~#q2kW+p0~8(HT<%&aZ0?p@lsm)`rGYABgrZJC8edmfBW|E
z-o4DcydYzS{|Up^d$CO?uoo9DbZ4GSSt-ezU>XfPJ#P8qVysx}?o|3Y2=T4@|
z2t@%tzDo_^eArt@T4g0AM#A!7NqTyq?!Md3&IVbhi-#Qp9z1wZSSZDESlY2)39DDU
zlh0Ad%4%=0+O?}!m6HuvXtvqQUi5uGKoGdKKY#vocXvBBE%VnlB#DcP?q#G>4W#om
z)->}`PpPY~_i%Ho7FA;@yR&=)UyJ2%q#dKh)?B@I?bozxaR83Wl)jdhma;NQjP}>I
zwkmT887@7;-?t5fPJ4J9jnTG}U?(S4HZ$w%?Y(&F)DC+SDuvHKe13%Djre
z6^gLFOiHSAfu~3NqI0
z^VQ49NKe;`JGjlBPvvf%#c>S{&Rx6qA2?8xlQtRt=+Tkuof5Zh-TLnZ>LOz9+$qD^
z2@F){(eylPVsc_ngr%jWdML-XBG$WR;ZL4CiH*(p&r48Hkh18hqMvpe;^EPwM-Mp;
zYcsR0)zu{BWo2odII&yyhCr{%3J0PmD=VwuO0%Ewy;t|{-II`%Y;SAx!O9IHD)8Qw
zm6feM>L(Gf+iPiZ-q?6Ce`gaSzPGoxF-nks2famIthgDjf?_*2T@j;ljk$!^##&ZV
zQvc{ERT1k?pS~mqJG;=35Y^wm1JqPiRqgElb>?3lVBO;y85QLu^ECBnd9WzfkAZ0;_{Yf_^6&EB?aY)>jd$ioBSJ!Scyi~SXJzdd6lCM(7E?aPAi+7efB*i#
zz(7rR?PS0D!KkPx?BnVKTbejsF(DzPiw6Q|I6{vcI;5kiIeKU$uFm3XT3KYI)pN_#
z^CFE@oqjv_U$c5L!>i#&PgWe`NXr_MAaO}Wk!!^I%9Th>0c4Tl0IJ=TltIPC#Z3I_
zb8+=?0?gdp$F6=(J@(8;^Zfa)nwnqlgSJQQJK<+cd#-BihN=KF?pxylJ4vk123d^u
z`o?-GIlYv}Nb?hE#E0jtH2rMN6DQsj7LIdft#f+aQAs}9e&F8jDzc&H@`q{J3ga1i
zRT$kKql~)#%*9Z;smdk)5(T
zhu;Qj7oXvs)3mm>c6WE*%>PY72(etNy~iXeDT$*SW@&9Lynp{Zk1aQ0Ze!Eg-hT4z
zS&E|YKC&mu&z?QYOG!ydN+NKd6x?BXxw-bXw*JNqJk^#5b+oltjC9Lo_{GH?Y-}D!
zMUfSSD+V-P@Gu9oW9VOFFdTgtqp5U*Fm#HEJ5j%!{T6ie^j|)I
z?mz46Gvz@=?iUqx%F$7xS4B+?JBVu0H+64tr#$zwhzOFR`y0oR4>z-Ag#-nqky56g
zb^6sOwKg_NN=ZdW_$t;vTrxO)+S1DE&GoK>1G_Eyt9;e=J5xCSvEaF$C@;zgly
ztfWOut}W#U1qGFtmuF;*d`{DG=q+~o`z@!qSf=`#$Mmm&h=}rr0jgFdjwI*OtMh
zrK{QJ3*I{P9kXfU^{yWD>%_z<^liXH-!6vmDz$z2xUd+BsXc$Z(xvUEKB?3mY4cg8OOY
z4OP|mU0rU2A42Wz?T;KeV*5QeCo_|=Tu-mUb9G;bz4zLZvH-K9=ZZjwu)49a@tQ`Q
zLPTtA{i$zmGy$(|+Mmz={od2rS^MEbU_gLLrB{NcfPK$fdV-OS&Dzp3H947ynR#ZU
zyTWT-JJ|r0DmfOhhVsfI@A@hs!Q}e&>!(k%oTiPdY!N
zX$JqE2}crU#`=?jw66_b8``k{{HXtFV`1Kw%@pMW1_d@^7ZR${ncc~#ytVF
zZ1+{a$|4)$hKtY4vE2^|am)Gh75BI_bqQ6JCg9ia-@ivkmp0Zte4{CLn6@hLiHe54
z9{_R+xP4ENGEjB~bsgZy&24$YA@1?x@y`6gPjQFnD7F@=ufmh?x%Ky(g4>LMtgNh<
z*hH*Hun$hJnVFfd_|$PVwcc`%#cF>_g4n)2Y<+dn`22b5`Ky-Z=2#JF=TU=!-72m_
zBO}PBPgnmH+4ZpQ+&MfxerKqGDf76Vo)|AL5g);E=*p)gW&h7#zHEDLhY;i7;P76b
zXPcWrd=bR{{aHUsZ?`^?4%5ryBQBI<4^ERTby?uG@Yf$;-#aD$AKl1BtcTi2QH>!Ev
z>v=n(HWOPvGh=hiMLaLFE41p05ztI@ki`93Xe>TkN0+T
z)LIS8$a8O3c&HJ^-MnsEqkiH9ekZ4(pkTYTDjUy%0|#8^iy~rT6de1Bxql)Ti^J7Z
zP>^ZV9!$AUY&&tHd1mH@>31PDYIzDzab0@#cfKIKpkN2VTP5{QPEPLV?Fq}u>-P2(ghZU~wr$%GHX4l%
zOZM7RmC<-uCsq4swWPG$$ZojAMnxsJcgHu1`kmQEi`r#YamUv<;GU4U_y~U&RYr#J
zqKmAzM=*g!z{>LT^P`G7b@%ibU%0Te^r!2x=&oJ6s=Iq@{+0*=^#H9LcjT>LdecN7
z)hKe{z^}0*wJyq+zCXKWYuke}`{K;YttA`pI+Tel=J`F>q@tn%C9AL4iNEL6+IUw1
zPLS=l?CF^qzcPv^@$pw$6Y1UNpUODZw6wI~82>xHKJY#$c!!|a#bN_v+bI?DJhZSXl`u0iqFJB03!h(!Hn@xPU*8b7Y=)_IOEWl-kwwmmEy1)
z{upy}q-FQ9J2E{_6>k21fD|0P|CE%J6i!me{rexnc=B*PH-@zdpUTQjz#W2EocM9n
zb7k&(o*5+rS0>UG2M5hRZXTY=v9S`zff~un9~|AsvM;>(SYJ;WR!>Pz-Zth;R89OS
zeWvKmKp2&~&*K~3aZzohTM3Va>%g*OUoybirU#TXTElH%uTdt?}^yreb=tNc=__>ix&hzN0E}9
zO?U0;_3J~Y%6-~8I#^zeNRg69Xady@U)JDLIEc7cn`{vi7JiwV%dmR5kM{YIQDk7(
znPmj6w0$o@Jm6MvA8Lp&xU8g96~ZR&=kJf=Qi07JAHSX{DIzldF-90QV>`FJ4_AuA
zk9YH}iksP{h5-L2;bBl
z+{MA6!ue_}*KV{e*}&w%$rN?ZL)6AA`9^VZaY5U8O!F;}=xcE@kSjn?Wh?hP1Ur@i!2rY~?eLQeROF5AcWy;5Izq
z@7eQ?D-(wol?@Aq59@qw)dW}oG`@{^EO+Zgd^`t{l$tv9v)ltU^U|eDh{lcK`hksT
z(70;ukAA##sj2-UntoN3;7=75rg{%Ytm5*_D&1XO0d>B9|Bm|G_U+q3T?kwG^-a{3
zZM3X(yAIZNcXM3eL@wmUmxlAHrt2kqOqBMe1(COrWl_!qi8Q_4_~}!b%fIeNqQ)}(
z6*%d)Ru}#$vdpf`{}t3v6ft_`v-`pai+n7Eh@XHL9fiMo3iYbc9sWP7t|lhCV=30L
zo(l5vA|fKkyf?}m`YP})B?e~PD9&QRt5@MsQHQYB2jc*Pfif$+H;-|L#m2;dy1xY%
zh(0in(?s|v`%@@;|NZl4qOWrM#$Q|C`a#s2A9`1=
z!9VKke*L-#LHuFl(3|`Db+A4R$FVb3n&=1AY}TN9s6
zHhl(n^u~&rl|JcEvrjHGeYer(KJ=+Jnl5PC^pwK)-r^nUM;xO=2eY%Est`(-oVp%2
zhUiiD<-Qc$E$c?F=uFdb#?HjQFL%&aId=SmyO_yt;_#IbR{NftpFO_jMzqYDl`WGW
z{yaAKgR<4G%k6qsybqhhvHMfbG!Yw7ZzI)HrrA!q0G7uHpUX8ZcW-}w{9Nuur(3uF
zd9J4E#DdML%${f~0L}u!8}Rk@1?dIA98vWrlvCR8bjDMasiYaU+aL;zM9#XpE}=pq
z4Nm?jt+$8%vm*YTSxdlBhxVMQH#9GLdU_+~ge#VPIm?T;XPAZG9;26BxWd<@V1ug%SK3z)4puEm1ET
z3;hyAZLo7sL@Y
z@bx42f!J>Mk@_$mgxf!h+Ot2?xOn?MT3K46*1MeuBltmn%l640mXK&@X+d_vKMiEA
zP6DvL|N8Zesi_yRW4Qm0xtQjsvRmg9DJV)%zru0AFFPB*bQ!cESnYXH(PAJyCj*yE
zRg^>nANQX%1e`%iRMZz7R7v(v$*Ms~I?G>0c
z&)akg!hDULa{D56d;WfcNbz-VVr%u{VkIGP{6!D!7tYJnT3Hy}2)$h2`m^`-?5N{W
zVPo46o$ovh1-r|x{$M`v#%_r`FJV=mr>k;TfJzMG0W%FXhraQsIFtUr|Kc|IrI%Or
zUsn4QgugBYLw@b5oso8Sj)TKwX3lEsfvM>N=^y(ZIz5rP^6$PIWtb@iVUT4f*c$Y+
z*p(_k#iVrJ8BrKLDu_2v9OV)b=D+!Slz}zm@YkF47e~&%Qo1S}M9)c4#F(s_(%!k9
zDzTxlk(!=JQkbXGdC*TAZba0W523YFI^#
zrkc1TBO~>m${>3=$y_{tepih4n^&)rzKj8eZo$$#huQlORGqC=84o}}l`kn{4pxH(
zMJV05vsUahnDhZ3w6fyv(#uOgguM998S+ThyLaziy>dkD-rQJqnB)(7_3G7O+i&5G
zL)qCPE-%t*Yt@h{7qn$U>fYU+OpJ{^r*E=b#xbd|&>5!;5sV@y!`0HzfD)vjsJPjd
ztlBo-DUQSR>v~&=b6`bIPN-hSO>a)%|8C$(5`vG9kD8j=fM1F$HZroi$id9OpzVpY
z144>$`!q5<>=ds&zRDt%)o6K8xz#CrJQS7a*sVXO2_i%qKmU`~11#xpVbt>`qAlDDg$Vxo_XTJ<1~%vM1Rf6RF+J!^7cc*{-1XzMF}i
zsP%m%H#N?mkJC;=ijWf&)Hr!^8nw+}Wu(8~+?0(m`7g-~)h_`>MG&U3%k>WfZZJB{
zMh}C8g2S6KM+VOK^9|HMO=XNubwuJ}I)
z>@jAGJ&h#I8E<bzemo39;nbdh%=y36Yw*@tt$^+;y8I+lSI;&(_%WjoaGVIZe0{
z>fD|q13E-iT*z&U9fn~~%Ul*d?>4rbU07hab$F*xUdwa0FJ-@rd3tB13G?M&UFT&3
z6boiZh`ukcxq^B~3E;oL?zKF}gZXaF%2m!<)i409lBtCoQ
zaOu)?PdN(=C^C!zh4~h+QGt7=&@*j
zAm$oMEOI2+QQpOp8GRAx6O9$~d{o<6L+ZY`ojrMS*P$zPzDk<<`p;x(2Yvl9`*^
zq8Ia}yZg_-f8|gP5SB|zOXu=T_X-HCEkq3c`v(?oTIrROlq7MfhTNs~;lqanu`kLE
z7;lH5-u156X@d2anS)Oo7&rpyBhOBB=JOvr=JhdpKM4s5GCwVw*m$|Y_NYddk?i6<
zhesh#mc$7UlWXg=TI3^G$=An|t9LjjEA@bZtLCS&&;|3T^ZV+g*a_eQe1ih4YLqo%FxhA&ARbbXutERa-guEF>hI3L=J3I_?Q2Pa^
zf!=>{G1p3e{%lD2w6x^1GAFc{w@zy&r*=MiM|!p*1?*O$v)6Kk#ai5<@~U@^oxOc~
z`$WzINt%Ef^NoVfnaQ~phic2B4Z7MoPpf8cdNe=x$$3foHrL&8R&6wfzrQktbnkI4
zkK3imkB+E*Qk@JUBNEG3`ekn(>=5Iht>o+Nvt`|zX4xszBIh)bzHXpHDCwED8*tKQ
z&ZhW!F}3FE-QeE&Ec#oQ*7q;#`UW)mA!EOEH?@p|tId~$nOx`poZ;c&*|seJdkhFD
zYF6>IX+QO$Lx+I7@go#KvCGsVtRQwj_v$>FD-`#yHaL0mQihJ6mX_})al!|x=d;x2
zs}2q$$L4pD_5pq+=1QZ!vCzz~%-f;^w6K^M9$qy@w+3+g?wy=V!etMS)&BnezTX{z
zDq{$_3VdwlfOu-O
zoN=I(MlrSAk4f&UV=Rzv|pSO5Df4&C-)+svfu*IpVBm*Hp7`*1^
z=eP3Qnw_iR&m9nh@T17c{JV{o-#R*`R)lLT@{8<$fERlJpMs<$-`tYOAOk|ZRB2Lc
z%ktm&P+Ln7IQ9C8^3E8bah?0IYFEVm9|rn~QusOZ0%0NQYV`Z(P{t`A9n
z?`jHExXq^PoC7aH@fvU9Bwo4vyI^D9_2t&*^}#9UvnPdE;&gvsWeX-=KhZj2lJb{B
ztNFzN?ea;Fkqd31Awe|vsl>RO`1Co&+B*!6rwk{ZtPT5g$Hl`_HEh?7Skd@p8Ik8%
zB>~muj@>KQ)jN&7L)_%r8zm)1BS(HU;|S@=S&tZO&eNaJ-B%FT`<>M|sW67~$aNp%
z265<)0ce>;|Ey|mruVN#_Yc}28k&lViiD59KN*GpRXaP+xltwxf8WI0fTIS6#bA?{
zBpw5=5(LZ6oq!4-o++Cd8s1o2ng$haXwbx)5&Wix&z%g7jXfrRmZRx2GBV=O^*Z2*
z2~Ky?y)y^nXXc|7QBm=;x|+m2D>JjfY=z|>PDfXPwQ4ztB)%mxGZXS79mU}9
z-xS2u!oml03ACQ@VyC+TtW5YIqdO&TLNFmExOVTxjwIiq@FYAuJkXnkf(GnHM?rR6
z+11_MLgMk-+8T;8{sLi*gn0V&sRX+&Bw}C}Kta$=5&~?g?U1y90J(p)yyr@E*k4ge
zNvB`6!B54z$Yk%Ke+7+0g(Bv`gHK@6CF8RD_Xq#Di6z4;HnH^mL(aliugs5r71*~g
z{_*2ig7G*4?yc2JqZIAc)Bo!QXiwo&4OEgnC&5lPbl@#y0Xm9Vd|F^21uk`&Bj>^!
z8vp7$>nn#96`y%-oXfco^Z0SYu=NmG1jihNg4+eznVF*v5&YrnOiWDE5SK^^)VwXV
zh^PX=1|zSw$X8KOxp?s+Bok<{*?D=}MN7#kDO(>)`d6D@thjStPtSh(R~_NAQ~2z7
z_nW^6*ki|z$#98@i0B?ao|ct02IZUZ`St4;fY0tMpd9RCD*t%d<$bITuS_XJ%5-4C$XZ
zu|7ShVPUanNN5vTgh$a!O+!PKDHjp_GAnBr)D0|m=z|B)|5pGNkz^tIqA`2?+_`g5
zu(Y(UqPc-ZJ#=UX$2!HfZ3x?cKg*3xOv;;llUB1K_Tp}{PGzjEuk)RI3Fx9SJKCD4
zprfxJ#3~7?l#_$woi|sa-hK-IfrHLkudB?5ny1xwDbC7pjybz~l4afYuXt4~l9%M1
zI1)W19+pF~{oSoeSwacj+iwfcf_O%fB4&9ABLD5BRR7S2Zv*eY{?#~J)ERPRdt%wd
zn4{^<^upWD*()BEL8V{BrCnE#so(ZFivsywoZM?U{;8SKwTII8AD+Cpr>7BPNKoU^R>-p6
z%uV-Hk}WJOehoroL)Z#NXaZ(QNl5TUXy%!glM?7}`K#@rM;`2abxDAk`o0+L_U$9`
zn+7uv0O!)v)2&+*6OP_~=%8Rx1Gp~+Wn*8|?%lg7w{z?6iQu&zXiw3=+rsL@b{z^-
z+moQ^%}q?9?ZOBpMWou
z`c{o8DIygD8YCqpp)K26=2n0Pd((sfb8vwG(-YhbJTWeziA
z7qDixj+G1y4%V7%ia`hEoRgE}>0Vrla+Ws3ZaK9*5s=DJr1h}S%NK>lmLE%Ao^q96
zT{A0n0SZmZc);e(pdGz94TT?tmp=68*)%3KRWGj!fK8d`$qi9-;b1`Nm2uxg^#m&`
zcJf3dX-dM`Cu!YLEA`fcDPtn#
zT-b)^>e50M_2nQQdH3bzm)qmC#M^92o=48?zj;>B(<}AGbLa6-p{n&_`p!4z9B<#(
z(M=eUwfZ>ZG^Ti?{+-xH1iQV(+@@Em@R9M=NAYpon-6|4Z>$VP#vE-t(}k0iBsKL{#jwNyhL=s8(1>ii+_Ysf41<=W^6tI`h-#p!CqZj-vO`lU33p7Kx5
zk2_QURv&K{cj%;({5#WWFZh*gpI-Bg2mKZIy)+<&!?)2BWn>^AAg~44CI)o(oq{i?
zr)RYz>r~qvSXPwW++U1Vv6ALRxk5$#`Qu1!9C$ATbFnO@4Mw?y
z4HN~vMESUk@^tjr?YgBhd0Wv@M9PD~sAF&Car9X