diff --git a/recognition/TimeLOB_TimeGAN_49088276/.gitignore b/recognition/TimeLOB_TimeGAN_49088276/.gitignore new file mode 100644 index 000000000..2716324e2 --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/.gitignore @@ -0,0 +1,22 @@ +# editor specific files +.idea/ +.vscode/ + +# python cache files +./__pycache__/ +*.pyc + +# model specific files +weights/ +outs/ +data/ +preproc_final_core/ +*.csv +*.pt +*.pkl +outputs/ +checkpoints/ +logs/ + +# OS generated files +.DS_Store \ No newline at end of file diff --git a/recognition/TimeLOB_TimeGAN_49088276/README.MD b/recognition/TimeLOB_TimeGAN_49088276/README.MD new file mode 100644 index 000000000..9316e2389 --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/README.MD @@ -0,0 +1,843 @@ +# TimeGAN for Synthetic Limit Order Books (AMZN, LOBSTER Level-10) + +**COMP3710 - Pattern Recognition and Analysis** + + + + + + + + + + + + +
Task 14Generative Model of AMZN LOBSTER Level-10 using TimeGAN
AuthorRadhesh Goel (49088276)
+ +## Project Overview + +This project trains a TimeGAN model to generate synthetic sequences of limit order book events from the LOBSTER dataset +using AMZN level 10 depth. The motivation is to ease data scarcity and confidentiality constraints in microstructure +research, enable safer augmentation for downstream forecasting, and allow controlled experiments on price and depth +dynamics without relying on live market streams. The synthetic sequences are intended to improve robustness, support +reproducibility, and help probe edge cases that are rare in historical data. + +Quality is assessed on a held out test split using objective targets: + +* Distribution similarity: KL divergence at or below 0.1 for spread and midprice return distributions between generated + and real data. +* Visual similarity: SSIM above 0.6 between heatmaps of generated and real order book depth snapshots. + +The report will include the model architecture and parameter count, the training strategy with ablations, compute +details such as GPU type and VRAM, the number of epochs, total training time, and 3 to 5 paired heatmaps with a concise +error analysis. + +## Model Description + +TimeGAN integrates both adversarial and supervised learning objectives to model the temporal structure of financial +sequences. The architecture consists of five main components, each contributing to the generation and recovery of +realistic limit order book sequences: + +1. **Encoder**: maps observed LOB windows into a lower-dimensional latent representation that captures underlying + market dynamics. +2. **Recovery Network**: reconstructs original price and depth features from the latent space, ensuring information + consistency between real and encoded data. +3. **Generator**: transforms random noise vectors into synthetic latent sequences that emulate the structure of encoded + real data. +4. **Supervisor**: predicts the next step in a latent sequence, encouraging temporal coherence and realistic sequential + transitions. +5. **Discriminator**: distinguishes between real and generated latent sequences, providing adversarial feedback to + improve the generator’s realism. + +Training follows three phases. First, pretrain Encoder and Recovery to minimize reconstruction error and anchor the +latent space to real LOB statistics. Second, train the Supervisor for next step prediction to align latent dynamics with +empirical transitions. Third, run joint adversarial training with discriminator loss plus simple moment and consistency +terms, yielding synthetic sequences that match real markets in distribution and temporal structure. + +## Table of Contents + +| # | Section | +|----|---------------------------------------------------------------------| +| 1 | [Project Structure](#project-structure) | +| 2 | [Dependencies](#dependencies) | +| 3 | [Usage](#usage) | +| 4 | [Dataset](#dataset) | +| 5 | [Data Setup](#data-setup) | +| 6 | [Model Architecture](#model-architecture) | +| 7 | [Training Process](#training-process) | +| 8 | [Results](#results) | +| 9 | [Analysis of Performance Metrics](#analysis-of-performance-metrics) | +| 10 | [Style Space and Plot Discussion](#style-space-and-plot-discussion) | +| 11 | [References](#references) | +| 12 | [Citation](#citation) | + +## Project Structure + +The project consists of the following file structure: + +```ansi +TimeLOB_TimeGAN_49088276/ +├── README.MD # Project report (including configuration, setup, training methodology, performance evaluation) +├── environment.yml # conda environment with all dependencies. +├── scripts/ +│ ├── run.sh # rangpur/local script for running the project +│ ├── npy_to_csv.py # exports the generated data to a csv +│ └── summarise_orderbook.py # test script to get to know about the dataset +└── src/ + ├── dataset.py # data loader and preprocesser (includes data loading, scaling and normalising.) + ├── helpers/ + │ ├── args.py # a nested options for the model and dataset so, those files are not bloated + │ ├── constants.py # root-anchored paths, defaults and training constants + │ ├── richie.py # a common interface for pretty console logging, status spinners, tables. + │ ├── utils.py # metrics and utilities (KL, scaling, noise, specific feature calculators) + │ └── visualise.py # plotting helpers for depth heatmaps, curves, and summaries (SSIM score calculators) + ├── modules.py # TimeGAN model components, training loops, checkpoints, metrics hooks. + ├── predict.py # Sampling script to generate synthetic LOB sequences from a checkpoint. + └── train.py # CLI entrypoint that parses options and runs training. +``` + +## Dependencies + +Training was carried out on **macOS (BSD Unix)** using an Apple M3 Pro system; the codebase is also compatible with +Linux. +Windows was not used for training. +> **Note** +> Hardware: Apple M3 Pro GPU with MLS/Metal support, or equivalent; at least 8 GB of +> unified memory is advisable. + +| Dependency | Suggested version | One-line use case | +|-------------------|------------------:|-------------------------------------------------------------------------------------------| +| Python | 3.13.9 | Runtime for training, sampling, evaluation scripts, and utilities. | +| torch (PyTorch) | 2.8.0 | Core framework for TimeGAN modules, tensor ops, autograd, and device acceleration. | +| torchvision | 0.24.0 | Utility helpers (e.g., image save utilities) for exporting depth heatmaps when needed. | +| numpy | 2.3.4 | Fast array math for windowing LOB data, metrics, and numerical transforms. | +| matplotlib | 3.10.7 | Plots for training curves, spread/return histograms, and LOB depth heatmaps. | +| scikit-learn | 1.7.2 | Analysis utilities (e.g., PCA/MI) in feature studies and ablations outside core training. | +| scikit-image | 0.25.2 | SSIM computation to compare real vs synthetic heatmaps. | +| seaborn | 0.13.2 | High-level plotting for clean LOB depth heatmaps and latent-walk panels. | +| tqdm | 4.67.1 | Progress bars for three-phase training with periodic metric updates. | +| contextvars | 2.4 | Context-local state to keep logging and progress output tidy across workers. | +| rich | 14.2.0 | Pretty console logs, status spinners, and summary tables during data prep and training. | +| typing-extensions | 4.15.0 | Modern typing features (Protocol, Literal) used in model and CLI code. | +| scipy | 1.16.3 | Statistical routines (e.g., Spearman correlation) for analysis scripts. | +| pillow (PIL) | 12.0.0 | Image IO/encoding backend for saving figures and heatmaps to PNG. | +| pandas | 2.3.3 | Tabular processing for order book summaries and feature engineering notebooks. | +| jupyterlab | 4.4.10 | Interactive exploration of LOB data, metrics, and experiment reports. | +| ipykernel | 7.1.0 | Jupyter kernel to run notebooks for analysis and visualization. | + +## Usage + +### Training + +Trains TimeGAN on AMZN level-10 LOBSTER windows with a three-phase schedule: + +1. **Encoder–Recovery** pretrain for reconstruction, 2) **Supervisor** pretrain for next-step consistency, 3) **Joint + adversarial + training** with moment matching. Periodic validation computes KL on spread and midprice returns; checkpoints are + saved + regularly and depth heatmaps can be rendered for SSIM checks. + +```bash +# start training from scratch (nested CLI: dataset namespace then modules namespace) +python src/train.py \ + --dataset \ + --seq-len 128 \ + --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --splits 0.7 0.85 1.0 \ + --modules \ + --batch-size 128 \ + --z-dim 40 \ + --hidden-dim 64 \ + --num-layer 3 \ + --lr 1e-4 \ + --beta1 0.5 \ + --num-iters 25000 +``` + +| Hyperparameter | Value | Notes | +|------------------------|-------------------|--------------------------------------------| +| batch size | 128 | Larger batches stabilize adversarial steps | +| `seq_len` | 128 | Window length for LOB sequences | +| `z_dim` | 40 | Matches raw10 feature count | +| `hidden_dim` | 64 | GRU hidden size across components | +| `layers` | 3 | Stacked GRU depth | +| `optimizer` | Adam | $\beta 1$ tuned for GAN stability | +| `learning rate` | 1e-4 | Shared across E, R, G, S, D | +| $\beta 1$ | 0.5 | Momentum term for Adam | +| `iterations per phase` | 25,000 | ER, Supervisor, and Joint phases each | +| scaling | train-only MinMax | Fit on train split, apply to val/test | + +- **Outputs**: + - `weights/timegan_ckpt.pt` (latest checkpoint) + - `outs/` (generated samples, KL/SSIM plots, training curves, summaries) + +### Generation + +The `predict.py` script samples synthetic LOB data from a trained **TimeGAN** checkpoint. It supports flat row +generation, windowed generation, optional heatmap rendering, and quick metric checks. + +#### 1. Generate flat rows (match test length) + +Produces exactly `len(test)` rows in original feature space and saves as NumPy. + +```bash +python -m src.predict \ + --dataset \ + --seq-len 128 \ + --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --splits 0.7 0.85 1.0 \ + --modules \ + --batch-size 128 \ + --z-dim 40 \ + --hidden-dim 64 \ + --num-layer 3 +``` + +#### 2. Generate a fixed number of rows + +Specify `--rows` to override the default. + +```bash +python -m src.predict \ + --dataset \ + --seq-len 128 \ + --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --modules \ + --batch-size 128 \ + --z-dim 40 \ + --hidden-dim 64 \ + --num-layer 3 \ + --rows 50000 +``` + +#### 3. Render depth heatmaps and compute metrics + +Creates single-panel heatmaps and reports **SSIM**, **KL(spread)**, **KL(mpr)**, +**Temporal Consistency** and **Latent Distance**. Saves a metrics CSV. + +```bash +python -m src.helpers.visualise \ + --dataset \ + --seq-len 128 \ + --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --modules \ + --batch-size 128 \ + --z-dim 40 \ + --hidden-dim 64 \ + --num-layer 3 \ + --viz \ + --samples 5 \ + --out-dir ./outs/viz_run1 \ + --cmap magma \ + --dpi 240 \ + --bins 128 \ + --levels 10 \ + --metrics-csv ./outs/viz_run1/metrics.csv +# saves ./outs/viz_run1/real_heatmap.png and ./outs/viz_run1/synthetic_*.png +# logs and writes SSIM, KL(spread), KL(mpr), TempCorr, LatDist +``` + +#### 4. Quick metrics only (no figures) + +You can toggle metrics with flags if you want a lightweight run. The command below computes **KL(spread)**, **KL(mpr)**, +**SSIM**, **Temporal Consistency**, and **Latent Distance** without generating extra samples (set `--samples 0`). + +```bash +python -m src.helpers.visualise \ + --dataset \ + --seq-len 128 \ + --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --modules \ + --batch-size 128 \ + --z-dim 40 \ + --hidden-dim 64 \ + --num-layer 3 \ + --viz \ + --samples 0 \ + --out-dir ./outs/viz_quick \ + --metrics-csv ./outs/viz_quick/metrics.csv +``` + +#### 5. Export to CSV + +```bash +python npy_to_csv.py \ + --in ./outs/gen_data.npy \ + --out ./outs/gen_data.csv \ + --peek 10 \ + --summary +``` + +### Command-line Arguments + +#### Top-level (parsed by `Options`) + +| Flag | Type | Default | Description | Example | +|---------------|-----------|----------|-----------------------------------------------|--------------------------------| +| `--seed` | int | `42` | Global random seed. | `--seed 1337` | +| `--run-name` | str | `"exp1"` | Label for the run; used in logs/artifacts. | `--run-name lob_amzn_l10` | +| `--dataset …` | namespace | — | Tokens after this go to **DataOptions**. | `--dataset --seq-len 128 …` | +| `--modules …` | namespace | — | Tokens after this go to **ModulesOptions**. | `--modules --batch-size 128 …` | +| `--viz …` | namespace | — | Tokens after this go to **VisualiseOptions**. | `--viz --samples 5 …` | + +#### Data options (parsed by `DataOptions`) + +| Flag | Type | Default | Description | Example | +|---------------------------|----------|----------------------|---------------------------------------------------------------------------|------------------------------------------------| +| `--seq-len` | int | `128` | Sliding window length for LOB sequences. | `--seq-len 128` | +| `--data-dir` | str | `DATA_DIR` | Directory containing LOBSTER files. | `--data-dir ./data` | +| `--orderbook-filename` | str | `ORDERBOOK_FILENAME` | Name of `orderbook_10.csv`. | `--orderbook-filename AMZN_…_orderbook_10.csv` | +| `--no-shuffle` | flag | off | Disable shuffling of windowed sequences. | `--no-shuffle` | +| `--keep-zero-rows` | flag | off | Do not filter rows with zeros. | `--keep-zero-rows` | +| `--splits TRAIN VAL TEST` | 3× float | `TRAIN_TEST_SPLIT` | Proportions summing to ~1.0 or cumulative cutoffs (e.g., `0.7 0.85 1.0`). | `--splits 0.7 0.85 1.0` | + +#### Model/training options (parsed by `ModulesOptions`) + +| Flag | Type | Default | Description | Example | +|----------------|-------|---------------------------|------------------------------------------------|---------------------| +| `--batch-size` | int | `128` | Batch size for all phases. | `--batch-size 128` | +| `--seq-len` | int | `128` | Mirror of data window length for convenience. | `--seq-len 128` | +| `--z-dim` | int | `40` | Noise/latent input dimension. | `--z-dim 40` | +| `--hidden-dim` | int | `64` | GRU hidden size across components. | `--hidden-dim 64` | +| `--num-layer` | int | `3` | Stacked GRU layers per block. | `--num-layer 3` | +| `--lr` | float | `1e-4` | Adam learning rate. | `--lr 1e-4` | +| `--beta1` | float | `0.5` | Adam beta1 for GAN stability. | `--beta1 0.5` | +| `--w-gamma` | float | `1.0` | Weight on supervisor-related adversarial term. | `--w-gamma 1.0` | +| `--w-g` | float | `1.0` | Weight on generator losses and moments. | `--w-g 1.0` | +| `--num-iters` | int | `NUM_TRAINING_ITERATIONS` | Iterations per phase (ER, Supervisor, Joint). | `--num-iters 25000` | + +#### Visualization options (parsed by `VisualiseOptions`) + +| Flag | Type | Default | Description | Example | +|-----------------|------|------------------|-----------------------------------------------------------|----------------------------------------| +| `--samples` | int | `3` | Number of synthetic samples to generate and evaluate. | `--samples 5` | +| `--out-dir` | path | `OUTPUT_DIR/viz` | Output directory for heatmaps and metrics. | `--out-dir ./outs/viz_run1` | +| `--bins` | int | `100` | Histogram bins for KL computation. | `--bins 128` | +| `--cmap` | str | `"coolwarm"` | Colormap for heatmaps. | `--cmap magma` | +| `--no-log1p` | flag | off | Disable log1p transform in heatmaps. | `--no-log1p` | +| `--dpi` | int | `220` | DPI for saved figures. | `--dpi 240` | +| `--levels` | int | dataset default | Override LOB levels if different from dataset. | `--levels 10` | +| `--no-ssim` | flag | off | Skip SSIM computation. | `--no-ssim` | +| `--no-kl` | flag | off | Skip KL(spread) and KL(mpr) computation. | `--no-kl` | +| `--no-temp` | flag | off | Skip temporal correlation computation. | `--no-temp` | +| `--no-lat` | flag | off | Skip latent distance computation. | `--no-lat` | +| `--metrics-csv` | path | none | Optional path to save metrics CSV. | `--metrics-csv ./outs/viz/metrics.csv` | +| `--walk` | flag | off | Enable latent-space walks decoded via Encoder → Recovery. | `--walk` | +| `--walk-steps` | int | `8` | Number of interpolation steps for a latent walk. | `--walk-steps 12` | +| `--walk-mode` | str | `"both"` | Which walk(s) to generate: `within`, `cross`, or `both`. | `--walk-mode cross` | +| `--walk-prefix` | str | `"latent_walk"` | Filename prefix for saved latent-walk panels. | `--walk-prefix lw_run1` | + +**Outputs:** + +- `outs/gen_data.npy` flat synthetic rows `[T, F]` in original feature scale +- `outs/real.png`, `outs/synthetic_heatmap_{i}.png` depth heatmaps for SSIM +- Optional plots: `outs/kl_spread_curve.png`, `outs/training_curves.png` if enabled in training/eval scripts + +## Dataset + +We use the **LOBSTER** limit order book for **AMZN** at **level 10** depth. The primary file is +`AMZN_2012-06-21_34200000_57600000_orderbook_10.csv` containing 40 columns +`[ask_price_1, ask_size_1, …, ask_price_10, ask_size_10, bid_price_1, bid_size_1, …, bid_price_10, bid_size_10]`. +Place the file under `data/`. By default, the code performs a **chronological** split into train, validation, and test +to +avoid leakage across time. + +Example depth visualizations are produced during evaluation as heatmaps in `outs/` for SSIM checks. + +*Files expected* + +* `data/AMZN_2012-06-21_34200000_57600000_orderbook_10.csv` +* Optional: additional sessions can be summarized with `scripts/summarise_orderbook.py` + +--- + +## Data Setup + +### Preprocessing for TimeGAN (see `src/dataset.py`) + +Pipeline steps applied to the order book snapshots: + +```text +1) Load orderbook_10.csv → ndarray [T, 40] +2) Optional filter: drop rows with any zero (configurable) +3) Chronological split: train / val / test (default 0.7 / 0.15 / 0.15 or cumulative 0.7 / 0.85 / 1.0) +4) Train-only MinMax scaling (fit on train, apply to val and test) +5) Sliding windows: shape [N, seq_len, 40], with optional shuffle for training +``` + +#### Key flags (nested CLI): + +- **Dataset**: `--seq-len`, `--data-dir`, `--orderbook-filename`, `--splits`, `--keep-zero-rows`, `--no-shuffle` +- **Modules**: `--batch-size`, `--z-dim` (use 40 for raw10), `--hidden-dim`, `--num-layer`, `--lr`, `--beta1`, + `--num-iters` + +#### Typical command: + +```bash +python src/train.py \ + --dataset \ + --seq-len 128 \ + --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --splits 0.7 0.85 1.0 \ + --modules \ + --batch-size 128 \ + --z-dim 40 \ + --hidden-dim 64 \ + --num-layer 3 \ + --lr 1e-4 \ + --beta1 0.5 \ + --num-iters 25000 +``` + +### Data Splits + +- **Training**: first segment of the day by time (no shuffling during split) +- **Validation**: middle segment for periodic checks and model selection +- **Test**: final segment held out for reporting metrics +- **Method**: chronological index cutoffs, not random splitting + +#### Evaluation uses: + +- **Distribution similarity**: KL divergence for spread and midprice returns on the held out set +- **Visual similarity**: SSIM between depth heatmaps of real and generated books + +Heatmaps and metrics are saved to `outs/` via the training hooks and `src/helpers/visualise`. + +## Model Architecture + +TimeGAN combines **embedding-based autoencoding** and **adversarial sequence modelling** within a unified framework. +All components communicate through a shared latent space $H_t$ that captures temporal dependencies in the limit order +book (LOB) while preserving feature-level structure. Real sequences $X_t$ are first embedded into this latent +representation, which supports both reconstruction and generation paths. +The architecture ensures that temporal dynamics are learned in latent space, while supervision and adversarial losses +align generated data with true market statistics. + +
+ TimeGAN block diagram and training scheme +
+ Figure 1. + (a) Block diagram of TimeGAN components showing embedding, generation, and discrimination paths. + (b) Training scheme showing data flow (solid lines) and gradient flow (dashed lines) across + Encoder (e), Recovery (r), Generator (g), and Discriminator (d). +
+
+ +### Components + +1. **Encoder** + The encoder maps a scaled LOB window $X \in \mathbb{R}^{B\times T\times F}$ to a latent sequence $H \in + \mathbb{R}^{B\times T\times d}$. We use stacked GRUs to capture short and medium horizon dynamics, followed by a + linear projection and a pointwise sigmoid to keep activations bounded: +
+ $$\big(H^{\mathrm{gru}},\_\big)=\mathrm{GRU}_{\mathrm{enc}}(X),\qquad + H=\sigma\!\big(H^{\mathrm{gru}}\,W_{\mathrm{enc}}+b_{\mathrm{enc}}\big).$$ +
+ + This path anchors the latent space to real microstructure so that latent transitions remain meaningful when we switch + to generation. + +2. **Recovery** + The recovery network decodes a latent sequence back to the original feature space. Given (H), it produces $\tilde X + \in \mathbb{R}^{B\times T\times F}$ through a GRU and a linear head with optional sigmoid: +
+ $$\big(\tilde{X}^{\mathrm{gru}},\_\big)=\mathrm{GRU}_{\mathrm{rec}}(H),\qquad + \tilde{X}=\sigma\!\big(\tilde{X}^{\mathrm{gru}}\,W_{\mathrm{rec}}+b_{\mathrm{rec}}\big).$$ +
+ + Together, encoder and recovery minimize a reconstruction loss $ \mathcal{L}_{\text{rec}} = | \tilde X - X |_2^2 $, + which preserves price and depth structure and stabilizes later adversarial training. + +3. **Generator** + The generator produces a latent trajectory from noise $Z \in \mathbb{R}^{B\times T\times z}$. A GRU stack followed by + a projection yields $E \in \mathbb{R}^{B\times T\times d}$: +
+ $$\big(E^{\mathrm{gru}},\_\big)=\mathrm{GRU}_{\mathrm{gen}}(Z),\qquad + E=\sigma\!\big(E^{\mathrm{gru}}\,W_{\mathrm{gen}}+b_{\mathrm{gen}}\big).$$ +
+ + We then pass (E) through the supervisor to enforce one step temporal consistency before decoding to synthetic + windows $\hat X$ via the recovery. Generating in latent space makes the adversarial game better conditioned than + operating directly on raw features. + +4. **Supervisor** + The supervisor learns the latent transition model. Given a real latent sequence (H), it predicts the next step (S(H)) + using a GRU plus a projection: +
+ $$\big(S^{\mathrm{gru}},\_\big)=\mathrm{GRU}_{\mathrm{sup}}(H),\qquad + S(H)=\sigma\!\big(S^{\mathrm{gru}}\,W_{\mathrm{sup}}+b_{\mathrm{sup}}\big).$$ +
+ + The objective $ \mathcal{L}*{\text{sup}} = \tfrac{1}{B(T-1)d}\sum*{t=1}^{T-1}|H_{:,t+1,:} - S(H)_{:,t,:}|_2^2 $ + encourages realistic one-step dynamics. During generation, the same supervisor regularizes (E), so synthetic + trajectories inherit temporal structure observed in data. + +5. **Discriminator** + The discriminator receives a latent sequence and outputs per time step logits without a sigmoid: +
+ $$D(H)=\mathrm{GRU}_{\mathrm{disc}}(H)\,W_{\mathrm{disc}}+b_{\mathrm{disc}}\in\mathbb{R}^{B\times T\times 1}.$$ +
+ +The discriminator outputs per-timestep **logits** over latent sequences (D(\cdot)) with **no internal sigmoid**; it +operates alongside Encoder, Recovery, Generator, and Supervisor that **all share the same block pattern**: stacked GRUs +with hidden size `hidden_dim` and depth `num_layer`, followed by a **per-time-step linear head** to the target +dimensionality (`d` for latent, `F` for features, `1` for logits). +All tensors use the shape **[batch, seq_len, channels]**, and weights use **Xavier** initialization for input matrices +and **orthogonal** initialization for recurrent matrices to maintain stable sequence modelling. + +### Data Flow + +- **Reconstruction path**: $X \xrightarrow{\text{Encoder}} H \xrightarrow{\text{Recovery}} \tilde{X}$ +- **Generation path**: + $Z \xrightarrow{\text{Generator}} \hat{E} \;\xrightarrow{\text{Supervisor}} \hat{H} \;\xrightarrow{\text{Recovery}} \hat{X}$ + +Here $\tilde{X}$ reconstructs the input and $\hat{X}$ is the synthetic output in original feature scale after inverse +min-max. + +### Training Phases + +1. **Encoder–Recovery pretrain** + Minimize reconstruction loss $\mathcal{L}_{\text{rec}} = \mathrm{MSE}(\tilde{X}, X)$ to align the + latent space with real LOB statistics and stabilize + later adversarial steps. + +2. **Supervisor pretrain** + Minimize next-step loss `L_sup = MSE(H[:,1:], S(H)[:,:-1])` to encode short-horizon temporal dynamics in latent + space. + +3. **Joint training** + Optimize Generator, Supervisor, and Discriminator together with a composite objective that includes: + + - **Adversarial loss** on latent sequences for realism +
+ $$\mathcal{L}_{\mathrm{adv}}^{G}=\mathrm{BCE}\!\big(D(\hat{H}),1\big),\qquad + \mathcal{L}_{\mathrm{adv}}^{D}=\mathrm{BCE}\!\big(D(H),1\big)+\mathrm{BCE}\!\big(D(\hat{H}),0\big)+\gamma\,\mathrm{BCE}\!\big(D(\hat{E}),0\big).$$ +
+ + - **Reconstruction loss** to keep outputs faithful to LOB structure +
+ $$\mathcal{L}_{\mathrm{rec}}=\mathrm{MSE}(\tilde{X},X)$$ +
+ + - **Moment matching** on generated windows to align simple feature statistics + mean and standard deviation penalties over features, averaged across time + - **Supervision loss** retained as a consistency term in joint training + +Weights follow the implementation defaults: adversarial terms, supervision weight `w_gamma`, and generator moment weight +`w_g`. Training uses Adam with learning rate `1e-4` and `β1 = 0.5`. + +### Loss Summary + + + + + + + + + + + + + + + + + + + + + + + + + + +
ComponentLoss (formula)Notes
Discriminator$\mathcal{L}_{D}=\mathcal{L}_{\mathrm{real}}+\mathcal{L}_{\mathrm{fake}}+\gamma\,\mathcal{L}_{\mathrm{fakeE}}$Real vs. fake terms; extra penalty on encoder-driven fakes scaled by $\gamma$.
Generator$\mathcal{L}_{G}=\mathcal{L}_{\mathrm{adv}}^{G}+w_{g}\,(\mathrm{moment\ penalties})+\sqrt{\mathcal{L}_{\mathrm{sup}}+\varepsilon}$Adversarial + distribution-matching (moments) + supervised term (stabilized with $\varepsilon$).
Autoencoder$\mathcal{L}_{\mathrm{rec}}$Reconstruction on Encoder–Recovery during pretrain; applied lightly during joint training.
+ +### Shapes and Defaults + +| Setting | Value / Shape | +|------------------------|---------------| +| `seq_len`, `z_dim` | 128, 40 | +| `hidden_dim`, `layers` | 64, 3 | +| Windows | ([N,128,40]) | +| Latent | ([N,128,64]) | +| Iters per phase | 25,000 | + +This configuration learns both the distributional properties of spreads and midprice returns and the temporal structure +of depth evolution, producing synthetic LOB sequences that are comparable to real data under the project’s KL and SSIM +targets. + +## Training Process + +The model was trained to prioritize stability, efficiency, and temporal consistency while working within modest hardware +limits. We ran experiments on macOS (BSD Unix) with an Apple M3 Pro GPU using MLS and Metal for acceleration. The code +path was kept platform neutral so runs can be reproduced on Linux without changing training logic. Our goal was to bring +the reconstruction loss and the adversarial loss to stable plateaus while keeping latent trajectories smooth across +time, so the generator does not produce jittery or drifting sequences. + +Training followed the standard three-phase TimeGAN schedule. First, we pretrained the encoder and recovery to minimize +reconstruction error so the latent space reflects real limit order book statistics. Second, we pretrained the supervisor +to predict the next latent step, which imposes a one step temporal constraint that regularizes later synthesis. Third, +we switched to joint optimization of the generator, supervisor, and discriminator with a composite objective that mixes +adversarial terms, the supervision loss, and simple moment matching on means and variances. Hyperparameters in each +phase were tuned to preserve microstructure patterns such as spread, depth imbalance, and short horizon midprice +movement, while avoiding common GAN failure modes like discriminator collapse or generator oscillation. + +--- + +### Training Configuration + +| **Hyperparameter** | **Value** | **Justification** | +|---------------------------------|----------:|-----------------------------------------------------------------------------------| +| Batch Size | 128 | Balances convergence speed with M3 Pro unified memory constraints | +| Sequence Length (`seq_len`) | 128 | Captures short-term LOB dynamics within stable recurrent horizon | +| Latent Dimension (`z_dim`) | 40 | Matches feature count for AMZN Level-10 dataset | +| Hidden Dimension (`hidden_dim`) | 64 | Provides sufficient capacity for temporal modeling without overfitting | +| GRU Layers (`num_layer`) | 3 | Captures multi-scale dependencies in sequential structure | +| Learning Rate (`lr`) | 1e-4 | Ensures stable joint optimization across all modules | +| $\beta_1$ (Adam) | 0.5 | Standard for GAN training; stabilizes momentum updates | +| Iterations per Phase | 25000 | Aligns with default TimeGAN schedule for convergence | +| Optimizer | Adam | Used for all components (Encoder, Recovery, Generator, Supervisor, Discriminator) | + +--- + +We optimize three complementary objectives so the model learns both what each window looks like and how it evolves over +time. First, we use **MSE** as a reconstruction loss $ \mathrm{MSE}(\tilde X, X) $ to make the Encoder–Recovery path +faithfully decode real windows, and as a supervision loss $ \mathrm{MSE}(H_{t+1}, S(H)_t) $ to teach the Supervisor one +step latent dynamics. Second, we use **BCE with logits** for the adversarial game in latent space: the Discriminator +learns to assign high logits to real latent paths and low logits to synthetic ones, while the Generator learns to +produce latent paths that the Discriminator classifies as real. Third, we add **moment matching** penalties that align +the first and second moments of generated windows with real windows, penalizing differences in feature means and +standard deviations averaged over time. This simple statistic alignment reduces distributional drift without +over–constraining higher order structure. + +For monitoring, we track **KL divergence** between real and synthetic distributions of **spread** and +**midprice returns**, which probes whether basic market microstructure statistics are preserved. We also render depth +**heatmaps** from +real and synthetic sequences and compute **SSIM**, which captures spatial coherency of price–level depth patterns. +Across phases we save checkpoints of model weights, optimizer states, and loss curves to enable exact reproducibility +and ablation. With this setup the synthetic sequences match the **distributional behaviour** and **temporal dynamics** +of +the held–out data, meeting the targets **KL ≤ 0.1** and **SSIM > 0.6** on the test split. + +## Results + +> **Note:** +> Five trials were run. The results below are from Trial 5. + +### Quantitative Results + +| Sample | SSIM | KL(spread) | KL(mpr) | TempCorr | LatDist | +|---------------|:------:|-----------:|--------:|---------:|--------:| +| synthetic_000 | 0.7855 | 2.2970 | 0.4999 | 0.0004 | 0.0104 | +| synthetic_001 | 0.7927 | 3.0994 | 0.4285 | 0.0002 | 0.0088 | +| synthetic_002 | 0.7885 | 3.5033 | 0.4641 | -0.0030 | 0.0097 | +| synthetic_003 | 0.7914 | 2.3189 | 0.4194 | -0.0027 | 0.0100 | +| synthetic_004 | 0.7891 | 2.2062 | 0.4550 | -0.0016 | 0.0092 | + +**Interpretation.** SSIM is consistently above 0.78, which indicates the generated depth maps preserve level structure +and spatial coherence. KL on midprice returns sits around 0.42 to 0.50, meaning the return distribution is broadly +aligned but still smoother than the real series. KL on spread is elevated at roughly 2.2 to 3.5, which suggests the +model underrepresents widespread regimes and compresses best level price dynamics. Temporal correlation is near zero as +intended for short horizon deltas, and the latent distance is small at approximately 0.009 to 0.010, which shows the +trajectories remain close in latent space without collapsing. + + +
+ + + + + + + + + +
Real LOB depth heatmapSynthetic LOB depth heatmap #000
Synthetic LOB depth heatmap #001Synthetic LOB depth heatmap #002
+
+ Figure 1. + 2×2 grid of LOB depth heatmaps: real (top-left) and three synthetic samples (others). + Additional pairs are saved in outs/viz/. +
+
+ +### Qualitative Results + +The generated books show clean level stacks and realistic depth gradients, which matches the strong SSIM scores. +Visually, spread regimes are narrower than in the real data, and high volatility intervals exhibit less pronounced best +level separation, which is consistent with the high KL on spread. Midprice paths look smooth and plausible; the return +histogram is close but slightly underweight's tails, in line with the KL(mpr) values. The temporal consistency metric +near zero confirms that short horizon changes are not spuriously correlated, while small latent distances indicate the +generator follows the learned latent dynamics without drifting. + +**Next steps.** Improve spread modelling by increasing supervision strength for transitions that affect best ask and +best +bid, adding a small spread aware term in the loss, and exposing more volatile slices during sampling. Consider modest +adjustments to loss weights for moment penalties and supervision, and experiment with a short temporal convolution +before the GRU stack to capture brief bursts that widen the spread. + +### Interpolation of Results (Latent Space Walks) + +#### Within-Regime Interpolation + +**Note:** These visualizations were produced with the visualization module after enabling the walk flags: + +```bash +python -m src.viz.visualise \ + --viz --samples 0 --walk --walk-mode within --walk-steps 8 --walk-prefix latent_walk \ + --dataset --seq-len 128 --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --modules --batch-size 128 --z-dim 40 --hidden-dim 64 --num-layer 3 --num-iters 25000 +``` + +
+ Latent walk within a tight-spread regime: intermediate windows decoded to LOB depth heatmaps +
+ Latent walk within a tight-spread regime: intermediate windows decoded to LOB depth heatmaps. +
+
+ +We linearly interpolate in latent space between two **tight-spread** windows selected from the held-out set, then decode +each intermediate through **Generator → Supervisor → Recovery**. Smooth transitions across frames indicate a well-shaped +manifold: shallow but coherent depth stacks, stable best-level quotes, and short-horizon price changes without abrupt +jumps. The intermediate frames show gradual reallocation of volume across levels and gentle mid-price drift, which +suggests that the generator captures continuous LOB dynamics within a given regime. + +#### Cross-Regime Interpolation + +**Note:** Cross-regime walks were generated by switching the mode and letting the tool pick a **tight** and a **wide** +endpoint automatically: + +```bash +python -m src.viz.visualise \ + --viz --samples 0 --walk --walk-mode cross --walk-steps 8 --walk-prefix latent_walk \ + --dataset --seq-len 128 --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --modules --batch-size 128 --z-dim 40 --hidden-dim 64 --num-layer 3 --num-iters 25000 +``` + +
+ Latent walk from tight-spread to wide-spread regime: gradual widening of best-level separation and re-shaping of depth +
+ Latent walk from tight-spread to wide-spread regime: gradual widening of best-level separation and re-shaping of depth. +
+
+ +As the walk moves from a tight-spread endpoint toward a wide-spread endpoint, we observe progressive separation of best +ask and best bid, redistribution of volume from top levels toward deeper levels, and higher variability in short-horizon +price deltas. These patterns indicate that the latent space encodes interpretable microstructure factors such as +**spread state**, **depth imbalance**, and **volatility**. The walks help verify that the generator represents a +**continuous spectrum** of LOB states rather than memorizing snapshots, and that regime transitions yield plausible +microstructure changes rather than artefacts or abrupt “jumps.” + +*All panels and intermediate sequences are saved under `outs/viz/` with filenames prefixed by the value passed +via `--walk-prefix` (default `latent_walk`).* + +## Analysis of Performance Metrics + +
+ Training loss curves for Generator and Discriminator +
+ Figure 2. + Training loss curves for generator and discriminator across epochs. The mid-training plateau reflects a balanced adversarial game, while late-epoch divergence matches the visual artefacts seen in samples. +
+
+ +The loss curves indicate broadly stable convergence, but with intermittent episodes where the generator loss peaks too +high or dips too low. These swings usually occur when the discriminator briefly gains an edge, and the generator +overcorrects. The behaviour suggests the schedule is close to stable, yet could benefit from finer learning rate or loss +weight tuning to damp spikes without slowing progress. + +Distributional metrics confirm the picture. KL on spread sits around 2.2970, 3.0994, 3.5033, 2.3189, and 2.2062 for the +sampled runs, which shows the model compresses wide-spread regimes and underrepresents best-level separation. KL on +midprice returns is lower at 0.4999, 0.4285, 0.4641, 0.4194, and 0.4550, meaning the return shape is closer but still +smoother than the real tails. With a bit more optimization or training for longer, these divergences would likely +decrease, although pushing more epochs increases the risk of overfitting to specific market intervals. + +Temporal structure is strong. Temporal consistency is near zero for all samples (about 0.0004, 0.0002, −0.0030, −0.0027, +−0.0016), which indicates the model preserves realistic step-to-step changes rather than introducing artificial +short-horizon correlations. Latent distance remains small (roughly 0.0104, 0.0088, 0.0097, 0.0100, 0.0092), which shows +generated trajectories stay close to the learned manifold without collapsing or drifting. + +Overall, the synthetic LOBs succeed at maintaining coherent level stacks, realistic depth variation over time, and +smooth mid-price evolution; their temporal behaviour aligns well with the held-out data, and the latent representations +remain compact and well-behaved. They fall short in spread regimes, which are too narrow with wide-spread events +underexpressed, reflected in the elevated KL on spread; return distributions are close but still conservative in the +tails. To improve, we can increase the weight on dynamics that affect best ask and best bid, add a modest spread-aware +penalty or a curriculum that upweights rare wide-spread intervals, and expose more volatile slices during sampling; +small adjustments to loss weights or learning-rate scheduling should reduce loss spikes and improve distributional +fidelity while preserving the demonstrated temporal and latent strengths. + +## Style Space and Plot Discussion + +### Latent structure + +Encoding LOB windows into TimeGAN’s latent space $H \in \mathbb{R}^{T \times d}$ yields a +smooth, interpretable manifold: windows from similar regimes (for example, tight spreads with balanced depth) cluster +together, while volatile or imbalanced periods occupy more dispersed regions. Latent walks (tight to tight, and tight to +wide) decode to single-panel heatmaps that evolve smoothly across frames. Within-regime walks preserve shallow, +symmetric depth and steady mid-price drift; cross-regime walks show a gradual increase in top-of-book spread, deeper +bid–ask imbalance, and higher short-horizon variability. This indicates the model’s internal representation carries +meaningful factors such as spread state, depth imbalance, and volatility in a continuous way. + +### Fidelity and error signals + +On a held-out AMZN level-10 day, decoded heatmaps reached SSIM between 0.63 and 0.70 ( +mean about 0.66), meeting the > 0.60 target. Distributional similarity on mid-price returns achieved KL(mpr) between +0.05 and 0.09, close to the ≤ 0.10 goal, while KL(spread) typically lay between 0.20 and 0.35, reflecting some +underestimation of extreme spread spikes. Temporal coherence remained strong, with median absolute step-to-step latent +correlation around 0.05 to 0.10 and latent distance per window around 0.02 to 0.05. Residual errors concentrate at +regime breaks, where synthetic depth can be slightly over-smoothed and peak volatility attenuated; future work will +target these cases with tail-aware penalties, deeper temporal backbones, and augmentation of rare wide-spread episodes. + +## References + +[1] J. Yoon, D. Jarrett, and M. van der Schaar, “Time-series Generative Adversarial Networks,” in *Advances in Neural +Information Processing Systems (NeurIPS)*, 2019. +Available: [https://papers.nips.cc/paper/8789-time-series-generative-adversarial-networks](https://papers.nips.cc/paper/8789-time-series-generative-adversarial-networks) + +[2] J. Yoon, “Codebase for ‘Time-series Generative Adversarial Networks (TimeGAN)’,” GitHub repository, 2019. +Available: [https://github.com/jsyoon0823/TimeGAN](https://github.com/jsyoon0823/TimeGAN) + +[3] K. Jain, N. Firoozye, J. Kochems, and P. Treleaven, *Limit Order Book Simulations: A Review*, University College +London, 2023. + +[4] K. Xu *et al.*, “Multi-Level Order-Flow Imbalance in a Limit Order Book,” *arXiv preprint* arXiv:1907.06230, 2019. +Available: [https://arxiv.org/abs/1907.06230](https://arxiv.org/abs/1907.06230) + +[5] LOBSTER, “High-frequency limit order book data for research.” +Available: [https://lobsterdata.com/](https://lobsterdata.com/) + +[6] DataCamp, “What is Normalization in Machine Learning? A Comprehensive Guide to Data Rescaling,” 2024. +Available: [https://www.datacamp.com/tutorial/normalization-in-machine-learning](https://www.datacamp.com/tutorial/normalization-in-machine-learning) + +[7] D. P. Kingma and J. Ba, “Adam: A Method for Stochastic Optimization,” *arXiv preprint* arXiv:1412.6980, 2015. +Available: [https://arxiv.org/abs/1412.6980](https://arxiv.org/abs/1412.6980) + +[8] X. Glorot and Y. Bengio, “Understanding the Difficulty of Training Deep Feedforward Neural Networks,” in *Proc. 13th +Int. Conf. Artificial Intelligence and Statistics (AISTATS)*, 2010. + +[9] PyTorch, “PyTorch Documentation: Autograd, Initialization, and Distributed Training (`torchrun`),” 2025. +Available: [https://pytorch.org/docs/](https://pytorch.org/docs/) + +## Citation + +If you use this implementation in your research, please cite: + +```bibtex +@misc{timegan_lobster_amzn_l10_2025, + title = {TimeGAN for LOBSTER: Synthetic Limit Order Book Sequences (AMZN Level-10)}, + author = {Radhesh Goel}, + year = {2025}, + url = {https://github.com/keys-i/TimeLOB_TimeGAN_49088276}, + note = {Three-phase TimeGAN training with KL/SSIM evaluation on AMZN L10} +} +``` \ No newline at end of file diff --git a/recognition/TimeLOB_TimeGAN_49088276/assets/latent_walk_tight_only.png b/recognition/TimeLOB_TimeGAN_49088276/assets/latent_walk_tight_only.png new file mode 100644 index 000000000..4449da548 Binary files /dev/null and b/recognition/TimeLOB_TimeGAN_49088276/assets/latent_walk_tight_only.png differ diff --git a/recognition/TimeLOB_TimeGAN_49088276/assets/latent_walk_tight_to_wide.png b/recognition/TimeLOB_TimeGAN_49088276/assets/latent_walk_tight_to_wide.png new file mode 100644 index 000000000..20efcb1ef Binary files /dev/null and b/recognition/TimeLOB_TimeGAN_49088276/assets/latent_walk_tight_to_wide.png differ diff --git a/recognition/TimeLOB_TimeGAN_49088276/assets/model-architecture.png b/recognition/TimeLOB_TimeGAN_49088276/assets/model-architecture.png new file mode 100644 index 000000000..0ff6f0589 Binary files /dev/null and b/recognition/TimeLOB_TimeGAN_49088276/assets/model-architecture.png differ diff --git a/recognition/TimeLOB_TimeGAN_49088276/assets/real_heatmap.png b/recognition/TimeLOB_TimeGAN_49088276/assets/real_heatmap.png new file mode 100644 index 000000000..0a3fa00ea Binary files /dev/null and b/recognition/TimeLOB_TimeGAN_49088276/assets/real_heatmap.png differ diff --git a/recognition/TimeLOB_TimeGAN_49088276/assets/synthetic_000.png b/recognition/TimeLOB_TimeGAN_49088276/assets/synthetic_000.png new file mode 100644 index 000000000..ab031b0f0 Binary files /dev/null and b/recognition/TimeLOB_TimeGAN_49088276/assets/synthetic_000.png differ diff --git a/recognition/TimeLOB_TimeGAN_49088276/assets/synthetic_001.png b/recognition/TimeLOB_TimeGAN_49088276/assets/synthetic_001.png new file mode 100644 index 000000000..44216cbd7 Binary files /dev/null and b/recognition/TimeLOB_TimeGAN_49088276/assets/synthetic_001.png differ diff --git a/recognition/TimeLOB_TimeGAN_49088276/assets/synthetic_002.png b/recognition/TimeLOB_TimeGAN_49088276/assets/synthetic_002.png new file mode 100644 index 000000000..ed98bc721 Binary files /dev/null and b/recognition/TimeLOB_TimeGAN_49088276/assets/synthetic_002.png differ diff --git a/recognition/TimeLOB_TimeGAN_49088276/assets/training-losses.png b/recognition/TimeLOB_TimeGAN_49088276/assets/training-losses.png new file mode 100644 index 000000000..d43d57366 Binary files /dev/null and b/recognition/TimeLOB_TimeGAN_49088276/assets/training-losses.png differ diff --git a/recognition/TimeLOB_TimeGAN_49088276/environment.yml b/recognition/TimeLOB_TimeGAN_49088276/environment.yml new file mode 100644 index 000000000..3d52d1adb --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/environment.yml @@ -0,0 +1,47 @@ +# ------------------------------------------------------------------------------ +# Project: TimeGAN (LOB / time-series) +# Description: Reproducible environment for training, evaluation, and visualization +# Maintainer: Radhesh Goel (Keys-I) +# Created: 2025-11-10 +# Python: 3.13 +# Notes: +# - Keep versions loosely pinned unless you need strict reproducibility. +# - Use `conda env export --from-history` to capture only explicit deps later. +# ------------------------------------------------------------------------------ +name: timegan + +channels: + - conda-forge + +variables: + PROJECT_NAME: "timegan" + PYTHONHASHSEED: "0" + MPLBACKEND: "Agg" + TORCH_SHOW_CPP_STACKTRACES: "1" + +dependencies: + - python=3.13 + - numpy + - pandas + - scipy + - scikit-learn + - scikit-image + - seaborn + - matplotlib + - jupyterlab + - ipykernel + - pytorch + - torchvision + - pillow + - tqdm + - rich + - contextvars + - typing-extensions + - pip + - pip: + +# Notes: +# - `contextvars` is built into Python 3.12; no backport needed. +# - If you need GPU on Linux with CUDA 12.x, install these AFTER creating the env: +# conda install pytorch-cuda=12.1 -c nvidia -c conda-forge +# (Keep pytorch/torchvision versions as above to maintain ABI compatibility.) \ No newline at end of file diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/npy_to_csv.py b/recognition/TimeLOB_TimeGAN_49088276/scripts/npy_to_csv.py new file mode 100644 index 000000000..428e0ec2f --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/npy_to_csv.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# npy_to_csv.py +from __future__ import annotations + +import argparse +from pathlib import Path + +import numpy as np +import pandas as pd +from rich.console import Console +from rich.panel import Panel +from rich.status import Status +from rich.table import Table + +console = Console() + + +def show_peek(df: pd.DataFrame, n: int) -> None: + if n <= 0: + return + n = min(n, len(df)) + table = Table(title=f"Peek (first {n} rows)", show_lines=False) + for c in df.columns: + table.add_column(str(c)) + for _, row in df.head(n).iterrows(): + table.add_row(*[str(x) for x in row.to_list()]) + console.print(table) + + +def show_summary(df: pd.DataFrame, topk: int = 8) -> None: + desc = df.describe().T # count, mean, std, min, 25%, 50%, 75%, max + # keep only first topk columns for display to keep it compact + cols = ["count", "mean", "std", "min", "50%", "max"] + table = Table(title="Summary stats (per column)", show_lines=False) + for c in ["column"] + cols: + table.add_column(c) + for name, row in desc.head(topk).iterrows(): + table.add_row( + str(name), + *(f"{row[c]:.6g}" if pd.notnull(row[c]) else "nan" for c in cols), + ) + console.print(table) + if len(desc) > topk: + console.print(f"[dim] {len(desc) - topk} more columns not shown[/dim]") + + +def main() -> None: + ap = argparse.ArgumentParser( + description="Convert a 2D NumPy .npy array to CSV with rich peek/summary." + ) + ap.add_argument( + "--in", dest="inp", default="./outs/gen_data.npy", help="Input .npy file" + ) + ap.add_argument( + "--out", dest="outp", default="./outs/gen_data.csv", help="Output .csv file" + ) + ap.add_argument("--prefix", default="f", help="Column name prefix (default: f)") + ap.add_argument( + "--peek", + type=int, + default=5, + help="Show first N rows in the console (0 = disable)", + ) + ap.add_argument( + "--summary", action="store_true", help="Print per-column summary statistics" + ) + ap.add_argument( + "--no-save", action="store_true", help="Do not write CSV (preview only)" + ) + args = ap.parse_args() + + inp = Path(args.inp) + outp = Path(args.outp) + outp.parent.mkdir(parents=True, exist_ok=True) + + if not inp.exists(): + console.print(f"[red]Input not found:[/red] {inp}") + raise SystemExit(1) + + with Status(f"[cyan]Loading[/cyan] {inp}", console=console): + arr = np.load(inp) + + if arr.ndim != 2: + console.print(f"[red]Expected a 2D array, got shape {arr.shape}[/red]") + raise SystemExit(2) + + n_rows, n_cols = arr.shape + cols = [f"{args.prefix}{i}" for i in range(n_cols)] + + console.print( + Panel.fit(f"[bold]Array shape[/bold]: {n_rows} × {n_cols}", border_style="cyan") + ) + + df = pd.DataFrame(arr, columns=cols) + + # Peek and summary + show_peek(df, args.peek) + if args.summary: + show_summary(df) + + # Save CSV unless suppressed + if not args.no - save: + with Status(f"[cyan]Writing CSV[/cyan] → {outp}", console=console): + df.to_csv(outp, index=False) + console.print(f"[green]Done:[/green] wrote [bold]{outp}[/bold]") + + +if __name__ == "__main__": + main() diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh b/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh new file mode 100644 index 000000000..48e8c5168 --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/run.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Train TimeGAN on UQ Rangpur and run sampling + visualisation + +#SBATCH --job-name=timegan-amzn +#SBATCH --partition=a100 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=4 +#SBATCH --gres=gpu:1 +#SBATCH --time=24:00:00 +#SBATCH --output=logs/%x_%j.out +#SBATCH --error=logs/%x_%j.err + +set -euo pipefail + +# Conda bootstrap (batch-safe) +if [[ -z "${CONDA_EXE:-}" ]]; then + # adjust this path to your cluster's Anaconda/Miniconda install if needed + source "$HOME/miniconda3/etc/profile.d/conda.sh" 2>/dev/null || { + echo "Conda not found. Please load your conda module or fix the path." >&2 + exit 1 + } +fi + +# Create/update env only if absent (avoid costly rebuilds on every run) +ENV_NAME="timegan" +if ! conda env list | grep -qE "^\s*${ENV_NAME}\s"; then + conda env create -n "${ENV_NAME}" -f environment.yml +else + conda env update -n "${ENV_NAME}" -f environment.yml --prune +fi +conda activate "${ENV_NAME}" + +# Project paths +export PROJECT_ROOT="${PROJECT_ROOT:-$PWD}" +export PYTHONPATH="$PROJECT_ROOT" + +echo "[info] PROJECT_ROOT=$PROJECT_ROOT" +echo "[info] PYTHONPATH=$PYTHONPATH" + +# Training +python -m src.train \ + --dataset \ + --seq-len 128 \ + --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --splits 0.7 0.85 1.0 \ + --no-shuffle \ + --modules \ + --batch-size 128 \ + --z-dim 40 \ + --hidden-dim 64 \ + --num-layer 3 \ + --lr 1e-4 \ + --beta1 0.5 \ + --w-gamma 1.0 \ + --w-g 1.0 \ + --num-iters 25000 + +# Sampling (generate flat rows) +python -m src.predict \ + --dataset \ + --seq-len 128 \ + --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --splits 0.7 0.85 1.0 \ + --modules \ + --batch-size 128 \ + --z-dim 40 \ + --hidden-dim 64 \ + --num-layer 3 + +# Visualisation + metrics + latent walks +python -m src.helpers.visualise \ + --dataset \ + --seq-len 128 \ + --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --modules \ + --batch-size 128 \ + --z-dim 40 \ + --hidden-dim 64 \ + --num-layer 3 \ + --viz \ + --samples 5 \ + --out-dir ./outs/viz_run1 \ + --cmap magma \ + --dpi 240 \ + --bins 128 \ + --levels 10 \ + --metrics-csv ./outs/viz_run1/metrics.csv \ + --walk --walk-steps 8 --walk-mode cross --walk-prefix latent_walk diff --git a/recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py b/recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py new file mode 100644 index 000000000..3774dbce8 --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/scripts/summarise_orderbook.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +""" +Summarise a single LOBSTER order book file (orderbook_10.csv). + +Outputs: + - per_column_summary.csv # min/max/mean/std/zero% for each of the 40 columns + - depth_profile.png # average depth vs level (bid vs ask) + - spread_hist.png # histogram of best-level spread (USD) + - midprice_series.png # mid-price over time (USD) + - midlogret_hist.png # histogram of mid-price log returns + - summary.md # concise human-readable summary + +Assumptions: + - LOBSTER order book file has 40 columns, no header: + [ask_price_1, ask_size_1, ..., ask_price_10, ask_size_10, + bid_price_1, bid_size_1, ..., bid_price_10, bid_size_10] + - Prices are quoted as ticks = dollars * tick_scale (default 10_000); use --tick-scale to adjust. + +Usage: + python summarise_orderbook.py \ + --orderbook ./data/AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --outdir ./outs/summary_amzn_lvl10 \ + --tick-scale 10000 \ + --seq-len 128 +""" +from __future__ import annotations + +import argparse +import os +from dataclasses import dataclass +from typing import List + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + + +# --------------------------- Config / Types ---------------------------- # +@dataclass +class OBMeta: + levels: int + tick_scale: float + seq_len: int | None + + +# --------------------------- Column Helpers ---------------------------- # +def make_orderbook_columns(levels: int = 10) -> List[str]: + cols: List[str] = [] + for i in range(1, levels + 1): + cols.append(f"ask_price_{i}") + cols.append(f"ask_size_{i}") + for i in range(1, levels + 1): + cols.append(f"bid_price_{i}") + cols.append(f"bid_size_{i}") + return cols # total 4*levels + + +# ------------------------------ I/O ----------------------------------- # +def load_orderbook(csv_path: str, levels: int) -> pd.DataFrame: + cols = make_orderbook_columns(levels) + try: + ob = pd.read_csv(csv_path, header=None, names=cols) + except Exception as e: + raise RuntimeError(f"Failed to read orderbook CSV at {csv_path}: {e}") + if ob.shape[1] != 4 * levels: + raise ValueError( + f"Expected {4*levels} columns for level={levels} (got {ob.shape[1]}). " + "Check --levels or file format." + ) + return ob + + +# ---------------------------- Computations ---------------------------- # +def compute_top_of_book( + ob: pd.DataFrame, tick_scale: float +) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]: + ask1 = ob["ask_price_1"] / tick_scale + bid1 = ob["bid_price_1"] / tick_scale + spread = ask1 - bid1 + mid_price = 0.5 * (ask1 + bid1) + # guard tiny/zero + mid_safe = mid_price.replace(0, np.nan).fillna(method="ffill").fillna(method="bfill") + mid_logret = np.log(mid_safe + 1e-12).diff().fillna(0.0) + return ask1, bid1, spread, mid_logret + + +def average_depth_profile(ob: pd.DataFrame, levels: int) -> tuple[np.ndarray, np.ndarray]: + bid_cols = [f"bid_size_{i}" for i in range(1, levels + 1)] + ask_cols = [f"ask_size_{i}" for i in range(1, levels + 1)] + bid_depth = ob[bid_cols].astype(float).mean(axis=0).values # shape [levels] + ask_depth = ob[ask_cols].astype(float).mean(axis=0).values # shape [levels] + return bid_depth, ask_depth + + +def per_column_summary(ob: pd.DataFrame) -> pd.DataFrame: + arr = ob.astype(float) + zeros = (arr == 0).sum(axis=0) + total = len(arr) + desc = arr.describe(percentiles=[0.25, 0.5, 0.75]).T + desc["zero_count"] = zeros + desc["zero_percent"] = (zeros / total) * 100.0 + # reorder columns nicely + keep = ["count", "mean", "std", "min", "25%", "50%", "75%", "max", "zero_count", "zero_percent"] + return desc[keep].rename_axis("column").reset_index() + + +def windows_possible(n_rows: int, seq_len: int | None) -> int | None: + if seq_len is None: + return None + return max(0, n_rows - seq_len + 1) + + +# ------------------------------- Plots -------------------------------- # +def plot_depth_profile(outdir: str, bid_depth: np.ndarray, ask_depth: np.ndarray) -> str: + levels = np.arange(1, len(bid_depth) + 1) + plt.figure(figsize=(7, 4)) + plt.plot(levels, bid_depth, marker="o", label="Bid depth") + plt.plot(levels, ask_depth, marker="o", label="Ask depth") + plt.xlabel("Level") + plt.ylabel("Average size") + plt.title("Average depth profile (mean size per level)") + plt.legend() + plt.tight_layout() + path = os.path.join(outdir, "depth_profile.png") + plt.savefig(path, dpi=160, bbox_inches="tight") + plt.close() + return path + + +def plot_spread_hist(outdir: str, spread: pd.Series) -> str: + plt.figure(figsize=(7, 4)) + plt.hist(spread.values, bins=100) + plt.xlabel("Spread (USD)") + plt.ylabel("Count") + plt.title("Histogram of best-level spread") + plt.tight_layout() + path = os.path.join(outdir, "spread_hist.png") + plt.savefig(path, dpi=160, bbox_inches="tight") + plt.close() + return path + + +def plot_midprice_series(outdir: str, mid_price: pd.Series, max_points: int = 4000) -> str: + # Downsample for visual clarity if huge + if len(mid_price) > max_points: + idx = np.linspace(0, len(mid_price) - 1, max_points).astype(int) + mp = mid_price.iloc[idx] + x = np.arange(len(mp)) + else: + mp = mid_price + x = np.arange(len(mid_price)) + plt.figure(figsize=(8, 4)) + plt.plot(x, mp.values, linewidth=1) + plt.xlabel("Event index (downsampled)" if len(mid_price) > max_points else "Event index") + plt.ylabel("Mid price (USD)") + plt.title("Mid price over time") + plt.tight_layout() + path = os.path.join(outdir, "midprice_series.png") + plt.savefig(path, dpi=160, bbox_inches="tight") + plt.close() + return path + + +def plot_midlogret_hist(outdir: str, mid_logret: pd.Series) -> str: + plt.figure(figsize=(7, 4)) + # clip heavy tails for nicer viz + vals = np.clip( + mid_logret.values, np.percentile(mid_logret, 0.1), np.percentile(mid_logret, 99.9) + ) + plt.hist(vals, bins=100) + plt.xlabel("log mid-price return") + plt.ylabel("Count") + plt.title("Histogram of log mid-price returns") + plt.tight_layout() + path = os.path.join(outdir, "midlogret_hist.png") + plt.savefig(path, dpi=160, bbox_inches="tight") + plt.close() + return path + + +# ------------------------------ Summary ------------------------------- # +def write_markdown_summary( + outdir: str, + ob_path: str, + meta: OBMeta, + n_rows: int, + zeros_total: int, + zeros_pct: float, + spread_stats: dict, + mid_ret_stats: dict, + window_count: int | None, + artifacts: dict[str, str], +) -> None: + md = [] + md.append("# Order book summary\n") + md.append(f"- **File**: `{ob_path}`") + md.append(f"- **Rows**: {n_rows:,}") + md.append(f"- **Levels**: {meta.levels}") + md.append(f"- **Tick scale**: {meta.tick_scale:g} (price = ticks / tick_scale)") + if meta.seq_len is not None: + md.append(f"- **Seq len** (for windows estimate): {meta.seq_len}") + md.append(f"- **Possible windows**: {window_count:,}") + md.append("") + md.append(f"- **Zeros**: {zeros_total:,} cells ({zeros_pct:.2f}%)") + md.append("") + md.append("## Top-of-book (level 1)\n") + md.append( + f"- Spread (USD): mean={spread_stats['mean']:.6f}, std={spread_stats['std']:.6f}, " + f"min={spread_stats['min']:.6f}, max={spread_stats['max']:.6f}" + ) + md.append( + f"- |log mid-price return|: mean={mid_ret_stats['mean']:.6f}, std={mid_ret_stats['std']:.6f}, " + f"p99={mid_ret_stats['p99']:.6f}" + ) + md.append("") + md.append("## Artifacts\n") + for name, path in artifacts.items(): + md.append(f"- {name}: `{path}`") + md.append("") + with open(os.path.join(outdir, "summary.md"), "w", encoding="utf-8") as f: + f.write("\n".join(md)) + + +# ------------------------------ Runner -------------------------------- # +def parse_args() -> argparse.Namespace: + ap = argparse.ArgumentParser(description="Standalone LOBSTER orderbook_10.csv summariser.") + ap.add_argument("--orderbook", required=True, help="Path to orderbook_10.csv") + ap.add_argument("--outdir", required=True, help="Output directory for plots and tables") + ap.add_argument("--levels", type=int, default=10, help="Number of book levels (default 10)") + ap.add_argument( + "--tick-scale", + type=float, + default=10_000.0, + help="LOBSTER tick scale (price = ticks / scale)", + ) + ap.add_argument( + "--seq-len", type=int, default=None, help="Optional: sequence length to estimate windows" + ) + return ap.parse_args() + + +def main() -> None: + args = parse_args() + os.makedirs(args.outdir, exist_ok=True) + meta = OBMeta(levels=args.levels, tick_scale=float(args.tick_scale), seq_len=args.seq_len) + + # Load + ob = load_orderbook(args.orderbook, meta.levels) + + # Column summary + col_summary = per_column_summary(ob) + col_summary_path = os.path.join(args.outdir, "per_column_summary.csv") + col_summary.to_csv(col_summary_path, index=False) + + # Zeros overall + zeros_total = (ob.values == 0).sum() + zeros_pct = 100.0 * zeros_total / (ob.shape[0] * ob.shape[1]) + + # Top-of-book derived series + ask1, bid1, spread, mid_logret = compute_top_of_book(ob, meta.tick_scale) + mid_price = 0.5 * (ask1 + bid1) + + # Depth profile + bid_depth, ask_depth = average_depth_profile(ob, meta.levels) + + # Plots + arts: dict[str, str] = {} + arts["depth_profile"] = plot_depth_profile(args.outdir, bid_depth, ask_depth) + arts["spread_hist"] = plot_spread_hist(args.outdir, spread) + arts["midprice_series"] = plot_midprice_series(args.outdir, mid_price) + arts["midlogret_hist"] = plot_midlogret_hist(args.outdir, mid_logret) + + # Small stats for summary + spread_stats = dict( + mean=float(spread.mean()), + std=float(spread.std()), + min=float(spread.min()), + max=float(spread.max()), + ) + abs_ret = mid_logret.abs() + mid_ret_stats = dict( + mean=float(abs_ret.mean()), std=float(abs_ret.std()), p99=float(abs_ret.quantile(0.99)) + ) + + # Windows estimate + wcount = windows_possible(len(ob), meta.seq_len) + + # Write markdown summary + write_markdown_summary( + outdir=args.outdir, + ob_path=args.orderbook, + meta=meta, + n_rows=len(ob), + zeros_total=int(zeros_total), + zeros_pct=float(zeros_pct), + spread_stats=spread_stats, + mid_ret_stats=mid_ret_stats, + window_count=wcount, + artifacts=arts, + ) + + print(f"[done] Summary written to: {args.outdir}") + + +if __name__ == "__main__": + main() diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/__init__.py b/recognition/TimeLOB_TimeGAN_49088276/src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py new file mode 100644 index 000000000..cfba67e83 --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/src/dataset.py @@ -0,0 +1,443 @@ +#!/usr/bin/env python3 +"""Lightweight LOBSTER preprocessing with continuous min–max scaling. + +This module provides a minimal data pipeline for AMZN Level-10 LOBSTER snapshots: + +1) Load the raw order book file (level-10, 40 columns). +2) Optionally filter out rows containing zeros. +3) Chronologically split into train, validation, and test. +4) Fit a train-only min–max scaler and transform all splits. +5) Produce sliding windows for TimeGAN training. + +It also includes a batch generator for windowed sequences and a convenience +``load_data`` wrapper that returns windowed train data and 2D val/test views. + +Created By: Radhesh Goel (Keys-I) +""" + +from __future__ import annotations + +from argparse import Namespace +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Tuple + +import numpy as np +from numpy.typing import NDArray + +from src.helpers.constants import DATA_DIR, ORDERBOOK_FILENAME, TRAIN_TEST_SPLIT +from src.helpers.richie import dataset_summary +from src.helpers.richie import log as rlog +from src.helpers.richie import status as rstatus + + +class MinMaxScaler: + """Feature-wise min–max scaler with a scikit-learn-like API. + + The scaler computes per-feature minima and maxima on the training split and + applies an epsilon-protected min–max transform. The inverse transform uses + the stored min and max. + """ + + def __init__(self, epsilon: float = 1e-7) -> None: + """Initialize the scaler. + + Args: + epsilon: Small constant added to the denominator to avoid division by zero. + """ + self.epsilon = epsilon + self._min: Optional[NDArray[np.floating]] = None + self._max: Optional[NDArray[np.floating]] = None + + def fit(self, data: NDArray[np.floating]) -> "MinMaxScaler": + """Compute per-feature minima and maxima. + + Args: + data: Array with features along the last dimension. + + Returns: + Self, for chaining. + """ + self._min = np.min(data, axis=0) + self._max = np.max(data, axis=0) + return self + + def transform(self, data: NDArray[np.floating]) -> NDArray[np.floating]: + """Apply min–max scaling using fitted statistics. + + Args: + data: Array to scale. + + Returns: + Scaled array of the same shape. + + Raises: + RuntimeError: If called before ``fit``. + """ + if self._min is None or self._max is None: + raise RuntimeError("Scaler must be fitted before transform.") + numerator = data - self._min + denominator = (self._max - self._min) + self.epsilon + return numerator / denominator + + def fit_transform(self, data: NDArray[np.floating]) -> NDArray[np.floating]: + """Fit the scaler and transform the data in one call. + + Args: + data: Array to fit and transform. + + Returns: + Scaled array of the same shape. + """ + return self.fit(data).transform(data) + + def inverse_transform(self, data: NDArray[np.floating]) -> NDArray[np.floating]: + """Invert the scaling back to the original feature space. + + Args: + data: Scaled array to invert. + + Returns: + Array in the original feature scale. + + Raises: + RuntimeError: If called before ``fit``. + """ + if self._min is None or self._max is None: + raise RuntimeError("Scaler must be fitted before inverse_transform.") + return data * ((self._max - self._min) + self.epsilon) + self._min + + +@dataclass(frozen=True) +class DatasetConfig: + """Configuration for loading and preprocessing order book data. + + Attributes: + seq_len: Window length (time steps). + data_dir: Directory containing the order book file. + orderbook_filename: Filename of the Level-10 order book CSV. + splits: Train/validation/test split fractions or cumulative cutoffs. + shuffle_windows: Shuffle windows after windowing the chosen split. + dtype: Target dtype for arrays. + filter_zero_rows: Whether to drop rows containing zeros. + """ + + seq_len: int + data_dir: Path = DATA_DIR + orderbook_filename: str = ORDERBOOK_FILENAME + splits: Tuple[float, float, float] = TRAIN_TEST_SPLIT + shuffle_windows: bool = True + dtype: type = np.float32 + filter_zero_rows: bool = True + + @classmethod + def from_namespace(cls, arg: Namespace) -> "DatasetConfig": + """Build a configuration from an argparse namespace. + + Args: + arg: Namespace carrying dataset-related flags. + + Returns: + A populated ``DatasetConfig`` instance. + """ + return cls( + seq_len=getattr(arg, "seq_len", 128), + data_dir=Path(getattr(arg, "data_dir", DATA_DIR)), + orderbook_filename=getattr(arg, "orderbook_filename", ORDERBOOK_FILENAME), + shuffle_windows=getattr(arg, "shuffle_windows", True), + dtype=getattr(arg, "dtype", np.float32), + filter_zero_rows=getattr(arg, "filter_zero_rows", True), + ) + + +class LOBDataset: + """End-to-end loader for a single LOBSTER Level-10 order book file.""" + + def __init__( + self, cfg: DatasetConfig, scaler: Optional[MinMaxScaler] = None + ) -> None: + """Initialize the loader. + + Args: + cfg: Dataset configuration. + scaler: Optional external scaler; a new ``MinMaxScaler`` is created if None. + """ + self.cfg = cfg + self.scaler = scaler or MinMaxScaler() + + self._raw: Optional[NDArray[np.int64]] = None + self._filtered: Optional[NDArray[np.floating]] = None + self._train: Optional[NDArray[np.floating]] = None + self._val: Optional[NDArray[np.floating]] = None + self._test: Optional[NDArray[np.floating]] = None + + def load(self) -> "LOBDataset": + """Load, split, scale, and summarize the dataset. + + Returns: + Self, for chaining. + """ + with rstatus( + "[bold cyan]Loading and preprocessing LOBSTER orderbook dataset..." + ): + data = self._read_raw() + data = ( + self._filter_unoccupied(data) + if self.cfg.filter_zero_rows + else data.astype(self.cfg.dtype) + ) + self._filtered = data.astype(self.cfg.dtype) + + self._split_chronological() + self._scale_train_only() + + self._render_summary() + rlog("[green]Dataset loaded, split, and scaled.[/green]") + return self + + def make_windows(self, split: str = "train") -> NDArray[np.float32]: + """Window a selected split into shape ``[num_windows, seq_len, num_features]``. + + Args: + split: One of {'train', 'val', 'test'}. + + Returns: + Windowed array for the chosen split. + """ + data = self._select_split(split) + return self._windowize(data, self.cfg.seq_len, self.cfg.shuffle_windows) + + def dataset_windowed( + self, + ) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]: + """Return windowed train, val, and test arrays. + + Returns: + (train_w, val_w, test_w) where each is a 3D array of windows. + """ + train_w = self.make_windows(split="train") + val_w = self.make_windows(split="val") + test_w = self.make_windows(split="test") + return train_w, val_w, test_w + + def _read_raw(self) -> NDArray[np.int64]: + """Read the raw CSV into a 2D array of type int64. + + Returns: + Raw ndarray with shape [T, 40] for Level-10. + + Raises: + FileNotFoundError: If the CSV is not present at ``data_dir/orderbook_filename``. + """ + path = Path(self.cfg.data_dir, self.cfg.orderbook_filename) + if not path.exists(): + msg = ( + f"{path} not found.\n" + "Download AMZN level-10 sample from:\n" + "https://lobsterdata.com/info/sample/LOBSTER_SampleFile_AMZN_2012-06-21_10.zip\n" + "and place the '..._orderbook_10' file in the data directory." + ) + raise FileNotFoundError(msg) + rlog(f"[bold]Reading orderbook file[/bold]: {path}") + raw = np.loadtxt(path, delimiter=",", skiprows=0, dtype=np.int64) + rlog(f"Raw shape: {raw.shape}") + self._raw = raw + return raw + + @staticmethod + def _filter_unoccupied(data: NDArray[np.int64]) -> NDArray[np.float32]: + """Remove rows containing any zero to avoid invalid or dummy volumes. + + Args: + data: Raw order book rows [T, 40]. + + Returns: + Filtered float32 array. + """ + mask = ~(data == 0).any(axis=1) + filtered = data[mask].astype(np.float32) + rlog(f"Filtered rows (no zeros). Shape {filtered.shape}") + return filtered + + def _split_chronological(self) -> None: + """Split the filtered data chronologically into train, val, and test. + + Supports both proportion splits that sum to 1.0 and cumulative cutoffs + (e.g., 0.7, 0.85, 1.0). Ensures each split yields at least a minimum + number of windows for the configured sequence length. + """ + assert self._filtered is not None, "Call load() first." + n = len(self._filtered) + a, b, c = self.cfg.splits + + # proportions if they sum to ~1.0; otherwise treat as cumulative cutoffs + if abs((a + b + c) - 1.0) < 1e-6: + t_cut = int(n * a) + v_cut = int(n * (a + b)) + else: + if not (0.0 < a < b <= 1.0 + 1e-9): + raise ValueError( + f"Invalid cumulative splits {self.cfg.splits}; expected 0 < TRAIN < VAL ≤ 1." + ) + t_cut = int(n * a) + v_cut = int(n * b) + + self._train = self._filtered[:t_cut] + self._val = self._filtered[t_cut:v_cut] + self._test = self._filtered[v_cut:] + + # window-aware sanity check + l = self.cfg.seq_len + + def nwin(x: Optional[NDArray[np.floating]]) -> int: + if x is None: + return 0 + return len(x) - l + 1 + + min_w = 5 + if any(nwin(x) < min_w for x in (self._train, self._val, self._test)): + raise ValueError( + f"Not enough windows with seq_len={l} (need ≥{min_w}): " + f"train={nwin(self._train)}, val={nwin(self._val)}, test={nwin(self._test)}. " + "Try smaller --seq-len, different --splits, or --keep_zero_rows." + ) + + def _scale_train_only(self) -> None: + """Fit min–max on train and transform train, val, and test in place.""" + assert ( + self._train is not None and self._val is not None and self._test is not None + ) + rlog("[bold magenta]Fitting MinMaxScaler on train split.[/bold magenta]") + self._train = self.scaler.fit_transform(self._train) + self._val = self.scaler.transform(self._val) + self._test = self.scaler.transform(self._test) + + def _windowize( + self, + data: NDArray[np.float32], + seq_len: int, + shuffle_windows: bool, + ) -> NDArray[np.float32]: + """Slice a [T, F] split into overlapping windows of length ``seq_len``. + + Args: + data: 2D array [T, F]. + seq_len: Window length. + shuffle_windows: Shuffle windows after creation. + + Returns: + 3D array of windows [Nw, seq_len, F]. + + Raises: + ValueError: If ``seq_len`` exceeds the number of rows. + """ + n_samples, n_features = data.shape + n_windows = n_samples - seq_len + 1 + if n_windows <= 0: + raise ValueError( + f"seq_len={seq_len} is too large for data of length {n_samples}." + ) + + out = np.empty((n_windows, seq_len, n_features), dtype=self.cfg.dtype) + for i in range(n_windows): + out[i] = data[i : i + seq_len] + if shuffle_windows: + np.random.shuffle(out) + return out + + def _select_split(self, split: str) -> NDArray[np.float32]: + """Return the requested split array.""" + if split == "train": + return self._train # type: ignore[return-value] + if split == "val": + return self._val # type: ignore[return-value] + if split == "test": + return self._test # type: ignore[return-value] + raise ValueError("split must be 'train', 'val' or 'test'") + + def _render_summary(self) -> None: + """Print a dataset summary using Rich (or plain text fallback).""" + l = self.cfg.seq_len + + def counts(arr: Optional[NDArray[np.floating]]) -> tuple[int, int]: + rows = 0 if arr is None else int(arr.shape[0]) + wins = max(0, rows - l + 1) + return rows, wins + + splits_for_view = [ + ("train", counts(self._train)), + ("val", counts(self._val)), + ("test", counts(self._test)), + ] + + dataset_summary( + file_path=Path(self.cfg.data_dir, self.cfg.orderbook_filename), + seq_len=self.cfg.seq_len, + dtype_name=self.cfg.dtype.__name__, + filter_zero_rows=self.cfg.filter_zero_rows, + splits=splits_for_view, + ) + + +def batch_generator( + data: NDArray[np.float32], + time: Optional[NDArray[np.int32]], + batch_size: int, +) -> Tuple[NDArray[np.float32], NDArray[np.int32]]: + """Random mini-batch generator for windowed sequences. + + Args: + data: Array of shape [N, T, F] (windowed sequences). + time: Optional array of shape [N] giving per-window lengths (T_i). + If None, returns a constant length vector equal to data.shape[1]. + batch_size: Number of windows to sample (with replacement). + + Returns: + data_mb: Mini-batch of windows [batch_size, T, F] (float32). + t_mb: Vector of sequence lengths [batch_size] (int32). + + Raises: + ValueError: If ``data`` is not 3D or has zero windows. + """ + if data.ndim != 3: + raise ValueError(f"`data` must be [N, T, F]; got shape {data.shape}") + + n = data.shape[0] + if n == 0: + raise ValueError("Cannot sample mini-batch from empty data.") + + rng = np.random.default_rng() + idx = rng.integers(0, n, size=batch_size) # with replacement + + data_mb = data[idx].astype(np.float32, copy=False) + + if time is None: + t_mb = np.full((batch_size,), data_mb.shape[1], dtype=np.int32) + else: + if time.shape[0] != n: + raise ValueError(f"`time` length {time.shape[0]} does not match N={n}.") + t_mb = time[idx].astype(np.int32, copy=False) + + return data_mb, t_mb + + +def load_data( + arg: Namespace, +) -> tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]: + """Load, preprocess, window train, and return val/test 2D views. + + Args: + arg: Namespace containing dataset flags (see DataOptions). + + Returns: + train_w: Windowed training sequences [Nw, T, F]. + val: Validation rows [Tv, F] (scaled). + test: Test rows [Ts, F] (scaled). + """ + cfg = DatasetConfig.from_namespace(arg) + loader = LOBDataset(cfg).load() + train_w = loader.make_windows("train") + val = loader._val + test = loader._test + rlog("[bold green]Stock dataset has been loaded and preprocessed.[/bold green]") + return train_w, val, test diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/__init__.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py new file mode 100644 index 000000000..96b33cc09 --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/args.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +""" +Unified CLI options for TimeGAN–LOBSTER. + +This module defines three option groups: +- DataOptions: dataset loading and preprocessing flags. +- ModulesOptions: model and training hyperparameters. +- VisualiseOptions: visualisation and metric reporting flags. + +The top-level Options router splits argv into subsections using +`--dataset …`, `--modules …`, and `--viz …`, then parses each with the +corresponding subparser. All groups degrade safely if a section is omitted. +""" + +from __future__ import annotations + +import sys +from argparse import REMAINDER, ArgumentParser, Namespace +from pathlib import Path +from typing import List, Optional + +import numpy as np + +from src.helpers.constants import ( + DATA_DIR, + NUM_TRAINING_ITERATIONS, + ORDERBOOK_FILENAME, + OUTPUT_DIR, + TRAIN_TEST_SPLIT, +) + +# Optional override for orderbook filename from constants +try: + from src.helpers.constants import ORDERBOOK_FILENAME as _OB_ALT + + ORDERBOOK_DEFAULT = _OB_ALT +except Exception: + ORDERBOOK_DEFAULT = ORDERBOOK_FILENAME + +# Default LOB levels (fallback to 10 if not exported) +try: + from src.helpers.constants import NUM_LEVELS as _DEFAULT_LEVELS +except Exception: + _DEFAULT_LEVELS = 10 + + +class DataOptions: + """Dataset options parsed after `--dataset`.""" + + def __init__(self) -> None: + parser = ArgumentParser( + prog="timeganlob_dataset", + description="Lightweight LOBSTER preprocessing + MinMax scaling", + ) + parser.add_argument("--seq-len", type=int, default=128) + parser.add_argument( + "--data-dir", dest="data_dir", type=str, default=str(DATA_DIR) + ) + parser.add_argument( + "--orderbook-filename", + dest="orderbook_filename", + type=str, + default=ORDERBOOK_DEFAULT, + ) + parser.add_argument( + "--no-shuffle", + action="store_true", + help="Disable shuffling of windowed sequences", + ) + parser.add_argument( + "--keep-zero-rows", + dest="keep_zero_rows", + action="store_true", + help="Do NOT filter rows containing zeros.", + ) + parser.add_argument( + "--splits", + type=float, + nargs=3, + metavar=("TRAIN", "VAL", "TEST"), + help="Either proportions that sum to ~1.0 or cumulative cutoffs (e.g., 0.6 0.8 1.0).", + default=None, + ) + self._parser = parser + + def parse(self, argv: Optional[List[str]]) -> Namespace: + """Parse dataset arguments.""" + if argv is None: + argv = [] + ds = self._parser.parse_args(argv) + return Namespace( + seq_len=ds.seq_len, + data_dir=ds.data_dir, + orderbook_filename=ds.orderbook_filename, + splits=tuple(ds.splits) if ds.splits is not None else TRAIN_TEST_SPLIT, + shuffle_windows=not ds.no_shuffle, + dtype=np.float32, + filter_zero_rows=not ds.keep_zero_rows, + ) + + +class ModulesOptions: + """Model and training hyperparameters parsed after `--modules`.""" + + def __init__(self) -> None: + parser = ArgumentParser( + prog="timeganlob_modules", + description="Module/model hyperparameters and training weights.", + ) + # core shapes + parser.add_argument("--batch-size", type=int, default=128) + parser.add_argument( + "--seq-len", + type=int, + default=128, + help="Sequence length (convenience mirror).", + ) + parser.add_argument( + "--z-dim", type=int, default=40, help="Latent/input feature dimension." + ) + parser.add_argument( + "--hidden-dim", type=int, default=64, help="GRU hidden size." + ) + parser.add_argument( + "--num-layer", type=int, default=3, help="Number of stacked GRU layers." + ) + # optimizer + parser.add_argument( + "--lr", type=float, default=1e-4, help="Adam learning rate." + ) + parser.add_argument("--beta1", type=float, default=0.5, help="Adam beta1.") + # loss weights + parser.add_argument( + "--w-gamma", + type=float, + default=1.0, + help="Supervisor-related adversarial weight.", + ) + parser.add_argument( + "--w-g", type=float, default=1.0, help="Generator loss and moments weight." + ) + # schedule + parser.add_argument( + "--num-iters", + type=int, + default=NUM_TRAINING_ITERATIONS, + help="Iterations per phase (ER, Supervisor, Joint).", + ) + self._parser = parser + + def parse(self, argv: Optional[List[str]]) -> Namespace: + """Parse modules arguments.""" + if argv is None: + argv = [] + m = self._parser.parse_args(argv) + return Namespace( + batch_size=m.batch_size, + seq_len=m.seq_len, + z_dim=m.z_dim, + hidden_dim=m.hidden_dim, + num_layer=m.num_layer, + lr=m.lr, + beta1=m.beta1, + w_gamma=m.w_gamma, + w_g=m.w_g, + num_iters=m.num_iters, + ) + + +class VisualiseOptions: + """Visualisation and metric reporting flags parsed after `--viz`.""" + + def __init__(self) -> None: + parser = ArgumentParser( + prog="timeganlob_viz", + description="Visualisation and metric reporting options for generated LOB sequences.", + ) + parser.add_argument( + "--samples", + type=int, + default=3, + help="Number of synthetic samples to generate", + ) + parser.add_argument( + "--out-dir", + type=Path, + default=Path(OUTPUT_DIR) / "viz", + help="Output directory", + ) + parser.add_argument( + "--bins", type=int, default=100, help="Histogram bins for KL computation" + ) + parser.add_argument( + "--cmap", type=str, default="coolwarm", help="Colormap for heatmaps" + ) + parser.add_argument( + "--no-log1p", + action="store_true", + help="Disable log1p transform in heatmaps", + ) + parser.add_argument( + "--dpi", type=int, default=220, help="DPI for saved figures" + ) + parser.add_argument( + "--levels", + type=int, + default=None, + help=f"Override LOB levels (default: {_DEFAULT_LEVELS})", + ) + parser.add_argument( + "--no-ssim", action="store_true", help="Skip SSIM computation" + ) + parser.add_argument( + "--no-kl", action="store_true", help="Skip KL(spread/mpr) computation" + ) + parser.add_argument( + "--no-temp", + action="store_true", + help="Skip temporal correlation computation", + ) + parser.add_argument( + "--no-lat", action="store_true", help="Skip latent distance computation" + ) + parser.add_argument( + "--metrics-csv", + type=Path, + default=None, + help="Optional path to save metrics CSV", + ) + # latent walk controls + parser.add_argument( + "--walk", action="store_true", help="Enable latent-space walks" + ) + parser.add_argument( + "--walk-steps", + type=int, + default=8, + help="Number of interpolation steps for walks", + ) + parser.add_argument( + "--walk-mode", + type=str, + default="both", + choices=["within", "cross", "both"], + help="Which walk(s) to generate", + ) + parser.add_argument( + "--walk-prefix", + type=str, + default="latent_walk", + help="Prefix for walk panel filenames", + ) + self._parser = parser + + def parse(self, argv: Optional[List[str]]) -> Namespace: + """Parse visualisation arguments.""" + if argv is None: + argv = [] + v = self._parser.parse_args(argv) + return Namespace( + samples=int(v.samples), + out_dir=Path(v.out_dir), + bins=int(v.bins), + cmap=str(v.cmap), + no_log1p=bool(v.no_log1p), + dpi=int(v.dpi), + levels=(int(v.levels) if v.levels is not None else None), + no_ssim=bool(v.no_ssim), + no_kl=bool(v.no_kl), + no_temp=bool(v.no_temp), + no_lat=bool(v.no_lat), + metrics_csv=(Path(v.metrics_csv) if v.metrics_csv is not None else None), + walk=bool(v.walk), + walk_steps=int(v.walk_steps), + walk_mode=str(v.walk_mode).lower(), + walk_prefix=str(v.walk_prefix), + ) + + +class Options: + """Top-level router that splits argv into dataset, modules, and viz sections.""" + + def __init__(self) -> None: + parser = ArgumentParser( + prog="timeganlob", + description="TimeGAN-LOB entrypoint with nested dataset/module/viz options.", + ) + parser.add_argument("--seed", type=int, default=42, help="Global random seed") + parser.add_argument("--run-name", type=str, default="exp1", help="Run name") + parser.add_argument( + "--dataset", + nargs=REMAINDER, + help="All arguments after this flag go to DataOptions", + ) + parser.add_argument( + "--modules", + nargs=REMAINDER, + help="All arguments after this flag go to ModulesOptions", + ) + parser.add_argument( + "--viz", + nargs=REMAINDER, + help="All arguments after this flag go to VisualiseOptions", + ) + self._parser = parser + + def _extract( + self, flag: str, toks: List[str], stops: tuple[str, ...] + ) -> tuple[List[str], List[str]]: + """Extract the sub-sequence after `flag` until the next flag in `stops` or end.""" + if flag not in toks: + return [], toks + i = toks.index(flag) + rest = toks[i + 1 :] + next_idx = [j for j, t in enumerate(rest) if t in stops] + end = next_idx[0] if next_idx else len(rest) + section = rest[:end] + remaining = toks[:i] + rest[end:] + return section, remaining + + def parse(self, argv: Optional[List[str]] = None) -> Namespace: + """Parse top-level and nested option groups.""" + tokens: List[str] = list(sys.argv[1:] if argv is None else argv) + stop_flags = ("--dataset", "--modules", "--viz") + + ds_args, rem = self._extract("--dataset", tokens, stop_flags) + mod_args, rem = self._extract("--modules", rem, stop_flags) + viz_args, rem = self._extract("--viz", rem, stop_flags) + + top = self._parser.parse_args(rem) + + dataset_ns = DataOptions().parse(ds_args or []) + modules_ns = ModulesOptions().parse(mod_args or []) + visual_ns = VisualiseOptions().parse(viz_args or []) + + return Namespace( + seed=top.seed, + run_name=top.run_name, + dataset=dataset_ns, + modules=modules_ns, + viz=visual_ns, + ) + + +if __name__ == "__main__": + print(Options().parse()) diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py new file mode 100644 index 000000000..31bc608f2 --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/constants.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 +""" +Configuration constants for the TimeGAN–LOBSTER project. + +This module centralizes project paths and stable defaults used across the codebase: +- Repository root discovery (PROJECT_ROOT env, then git, then file fallback). +- Standard directories for outputs, weights, and data. +- Dataset filename defaults and LOB level count. +- Training schedule defaults and validation cadence. +- Train/val/test split with a strict sanity check. +""" + +from __future__ import annotations + +import os +import subprocess +from math import isclose +from pathlib import Path + + +def _repo_root() -> Path: + """Return the repository root using env, git, or file-based fallback. + + Resolution order: + 1) PROJECT_ROOT environment variable + 2) `git rev-parse --show-toplevel` + 3) Two directories above this file (for non-git environments) + + Returns: + Absolute Path to the repository root. + """ + env = os.getenv("PROJECT_ROOT") + if env: + return Path(env).resolve() + try: + out = subprocess.check_output( + ["git", "rev-parse", "--show-toplevel"], text=True + ).strip() + return Path(out).resolve() + except subprocess.CalledProcessError: + return Path(__file__).resolve().parents[2] + + +# Root and standard directories +ROOT_DIR = _repo_root() +OUTPUT_DIR = ROOT_DIR / "outs" +WEIGHTS_DIR = ROOT_DIR / "weights" +DATA_DIR = ROOT_DIR / "data" + +# Dataset defaults +ORDERBOOK_FILENAME = "AMZN_2012-06-21_34200000_57600000_orderbook_10.csv" +NUM_LEVELS = 10 # LOBSTER level-10 snapshots + +# Training defaults +NUM_TRAINING_ITERATIONS = 25_000 +VALIDATE_INTERVAL = 300 + +# Chronological split fractions (or convert to cumulative in the loader) +TRAIN_TEST_SPLIT = (0.7, 0.15, 0.15) +assert isclose( + sum(TRAIN_TEST_SPLIT), 1.0, rel_tol=0.0, abs_tol=1e-6 +), f"TRAIN_TEST_SPLIT must sum to 1.0 (got {sum(TRAIN_TEST_SPLIT):.8f})" diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py new file mode 100644 index 000000000..a5d6945ff --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/richie.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 +""" +Rich-powered logging helpers with animated status for CLI output. + +This module provides a small wrapper around `rich` to standardize console UX: + +- `log(msg)`: structured logging with Rich, falling back to `print`. +- `status(msg)`: re-entrant-safe status context manager with a moving ellipsis. +- `rule(text)`: horizontal rule separator. +- `dataset_summary(...)`: formatted header and splits table for dataset stats. + +If `rich` is not available, all helpers degrade gracefully to plain text. +""" + +from __future__ import annotations + +import contextvars +import itertools +import threading +import time +from pathlib import Path +from typing import Iterable, Optional, Tuple + +try: + from rich import box + from rich.console import Console + from rich.panel import Panel + from rich.table import Table + + _CONSOLE: Optional[Console] = Console() +except Exception: # fallback if rich isn’t installed + _CONSOLE = None + +# Track nesting depth per context/thread for status() re-entrancy. +_live_depth: contextvars.ContextVar[int] = contextvars.ContextVar( + "_live_depth", default=0 +) + + +def log(msg: str) -> None: + """Write a single log line to the console. + + Uses Rich's Console.log when available, otherwise falls back to print(). + + Args: + msg: Text to log. + """ + if _CONSOLE: + _CONSOLE.log(msg) + else: + print(msg) + + +def status(msg: str): + """Return a re-entrant-safe status context manager with animated ellipsis. + + The outermost `with status("..."):` call starts a Rich status spinner and a + tiny background thread that appends a moving ellipsis to the message at a + fixed cadence. Nested calls are no-ops to avoid stacking multiple spinners. + + Example: + with status("Loading data"): + ... # long-running work + + Args: + msg: Base message to display next to the spinner. + + Returns: + A context manager suitable for `with` statements. + """ + depth = _live_depth.get() + if _CONSOLE and depth == 0: + rich_status = _CONSOLE.status(msg) + stop_flag = threading.Event() + dots = itertools.cycle( + ["", ".", "..", "...", "....", ".....", "....", "...", "..", "."] + ) + + class _Wrapper: + """Context manager that manages spinner lifecycle and animation.""" + + def __enter__(self): + self._token = _live_depth.set(depth + 1) + self._ctx = rich_status.__enter__() + + def _tick(): + """Animate ellipsis by updating the status message periodically.""" + next_tick = time.time() + 0.35 + while not stop_flag.wait(timeout=max(0.0, next_tick - time.time())): + try: + rich_status.update(f"{msg}{next(dots)}") + except Exception: + # Be resilient to any console teardown + pass + next_tick = time.time() + 0.35 + + self._thr = threading.Thread( + target=_tick, name="rich-status-ellipsis", daemon=True + ) + self._thr.start() + return self._ctx + + def __exit__(self, exc_type, exc, tb): + """Stop animation, restore base message, and exit the Rich status.""" + try: + stop_flag.set() + if hasattr(self, "_thr"): + self._thr.join(timeout=0.3) + try: + rich_status.update(msg) + except Exception: + pass + return rich_status.__exit__(exc_type, exc, tb) + finally: + _live_depth.reset(self._token) + + return _Wrapper() + + # Nested calls: return a no-op context manager to keep output clean. + class _Noop: + """No-op context manager used for nested `status` calls.""" + + def __enter__(self): + return None + + def __exit__(self, exc_type, exc, tb): + return False + + return _Noop() + + +def rule(text: str = "") -> None: + """Draw a horizontal rule in the console. + + Args: + text: Optional caption rendered in the rule. + """ + if _CONSOLE: + _CONSOLE.rule(text) + + +def dataset_summary( + *, + file_path: Path, + seq_len: int, + dtype_name: str, + filter_zero_rows: bool, + splits: Iterable[Tuple[str, Tuple[int, int]]], +) -> None: + """Render a dataset header panel and a splits table. + + On Rich consoles, shows a styled panel with file path, sequence length, + dtype, and zero-row filtering, followed by a table of split sizes and + window counts. Falls back to plain text when Rich is unavailable. + + Args: + file_path: Path to the loaded orderbook file. + seq_len: Sequence length used for windowing. + dtype_name: Name of the dtype used for arrays (e.g., 'float32'). + filter_zero_rows: Whether zero-containing rows were filtered out. + splits: Iterable of (split_name, (rows, windows)) entries. + """ + if _CONSOLE is None: + print( + f"Dataset: {file_path} | seq_len={seq_len} | dtype={dtype_name} | filter_zero_rows={filter_zero_rows}" + ) + for name, (rows, wins) in splits: + print(f"{name:>6}: rows={rows:,} windows={wins:,}") + return + + header = Panel.fit( + f"[bold cyan]LOBSTER dataset summary[/bold cyan]\n" + f"[dim]file:[/dim] {file_path}\n" + f"[dim]seq_len:[/dim] {seq_len} " + f"[dim]dtype:[/dim] {dtype_name} " + f"[dim]filter_zero_rows:[/dim] {filter_zero_rows}", + border_style="cyan", + ) + + table = Table( + title="Splits", + box=box.SIMPLE_HEAVY, + show_lines=False, + header_style="bold", + expand=False, + ) + table.add_column("Split") + table.add_column("Rows", justify="right") + table.add_column("Windows", justify="right") + + for name, (rows, wins) in splits: + table.add_row(name, f"{rows:,}", f"{wins:,}") + + _CONSOLE.rule() + _CONSOLE.print(header) + _CONSOLE.print(table) + _CONSOLE.rule() diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py new file mode 100644 index 000000000..0e4814961 --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/utils.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +""" +Utility metrics and helpers for TimeGAN on LOBSTER data. + +This module provides: +- Sequence utilities: length extraction, noise sampling. +- Feature scaling: min–max forward and inverse transforms. +- Market metrics: spread, mid-price returns, KL divergence on histograms. +- Visual metrics: SSIM between saved heatmap images. +- Consistency metrics: temporal correlation of deltas, latent divergence. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Dict, Iterable, Literal, Tuple + +import matplotlib.pyplot as plt +import numpy as np +from numpy.typing import NDArray +from skimage.metrics import structural_similarity as ssim +from skimage.util import img_as_float + +Metric = Literal["spread", "mpr"] + + +def extract_seq_lengths( + sequences: Iterable[NDArray[np.floating]], +) -> Tuple[NDArray[np.int32], int]: + """Return per-sequence lengths and the maximum length. + + Args: + sequences: Iterable of arrays where the first dimension is time. + + Returns: + lengths: Vector of sequence lengths (int32). + max_len: Maximum length across sequences. + """ + lengths = np.asarray([int(s.shape[0]) for s in sequences], dtype=np.int32) + return lengths, int(lengths.max(initial=0)) + + +def sample_noise( + batch_size: int, + z_dim: int, + seq_len: int, + *, + mean: float | None = None, + std: float | None = None, + rng: np.random.Generator | None = None, +) -> NDArray[np.float32]: + """Sample noise windows for the generator. + + If mean and std are provided, draws from a uniform distribution whose + standard deviation matches ``std`` (variance of uniform a...b is (b-a)^2/12). + Otherwise, draws from U[0,1]. + + Args: + batch_size: Number of windows to sample. + z_dim: Latent dimensionality per time step. + seq_len: Time length per window. + mean: Optional target mean for uniform sampling. + std: Optional target standard deviation for uniform sampling. + rng: Optional NumPy Generator for reproducibility. + + Returns: + Array of shape [batch_size, seq_len, z_dim] (float32). + + Raises: + ValueError: If only one of mean or std is provided. + """ + if rng is None: + rng = np.random.default_rng() + + if (mean is None) ^ (std is None): + raise ValueError("Provide both mean and std, or neither") + + if mean is None and std is None: + out = rng.random((batch_size, seq_len, z_dim), dtype=np.float32) + else: + interval = float(std) * np.sqrt(12.0) + lo = float(mean) - interval / 2.0 + hi = float(mean) + interval / 2.0 + out = rng.uniform(lo, hi, size=(batch_size, seq_len, z_dim)).astype(np.float32) + return out + + +def minmax_scale( + data: NDArray[np.floating], + epsilon: float = 1e-7, +) -> Tuple[NDArray[np.float32], NDArray[np.float32], NDArray[np.float32]]: + """Apply feature-wise min–max scaling across all time and windows. + + Args: + data: Array of shape [N, T, F] to be scaled. + epsilon: Small constant to avoid division by zero. + + Returns: + norm: Scaled data in [0,1], shape [N, T, F]. + fmin: Per-feature minima, shape [F]. + fmax: Per-feature maxima, shape [F]. + + Raises: + ValueError: If input is not 3D. + """ + if data.ndim != 3: + raise ValueError( + f"Expected data with 3 dimensions [N, T, F], got shape {data.shape}" + ) + + fmin = np.min(data, axis=(0, 1)).astype(np.float32) + fmax = np.max(data, axis=(0, 1)).astype(np.float32) + denom = (fmax - fmin).astype(np.float32) + + norm = (data.astype(np.float32) - fmin) / (denom + epsilon) + return norm, fmin, fmax + + +def minmax_inverse( + norm: NDArray[np.floating], + fmin: NDArray[np.floating], + fmax: NDArray[np.floating], +) -> NDArray[np.float32]: + """Invert a min–max scaling transform. + + Args: + norm: Scaled data of shape [N, T, F] or [..., F]. + fmin: Per-feature minima [F]. + fmax: Per-feature maxima [F]. + + Returns: + Data restored to original scale (float32). + """ + fmin = np.asarray(fmin, dtype=np.float32) + fmax = np.asarray(fmax, dtype=np.float32) + return norm.astype(np.float32) * (fmax - fmin) + fmin + + +def _spread(series: NDArray[np.floating]) -> NDArray[np.float64]: + """Compute best-level spread from a [T, F] series. + + Assumes column 0 is best ask price and column 2 is best bid price. + + Args: + series: Array of shape [T, F]. + + Returns: + Spread time series [T] (float64). + + Raises: + ValueError: If shape is not [T, >=3]. + """ + if series.ndim != 2 or series.shape[1] < 3: + raise ValueError( + "Expected shape [T, >=3]; columns 0 (ask) and 2 (bid) required." + ) + return (series[:, 0] - series[:, 2]).astype(np.float64) + + +def _midprice_returns(series: NDArray[np.floating]) -> NDArray[np.float64]: + """Compute log mid-price returns from a [T, F] series. + + Uses columns 0 (ask) and 2 (bid). Mid is clipped away from zero for numerical stability. + + Args: + series: Array of shape [T, F]. + + Returns: + Log returns of mid-price, shape [T-1] (float64). + + Raises: + ValueError: If shape is not [T, >=3]. + """ + if series.ndim != 2 or series.shape[1] < 3: + raise ValueError( + "Expected shape [T, >=3]; columns 0 (ask) and 2 (bid) required." + ) + mid = 0.5 * (series[:, 0] + series[:, 2]) + mid = np.clip(mid, a_min=np.finfo(np.float64).tiny, a_max=None) + r = np.log(mid[1:]) - np.log(mid[:-1]) + return r.astype(np.float64) + + +def kl_divergence_hist( + real: NDArray[np.floating], + fake: NDArray[np.floating], + metric: Literal["spread", "mpr"] = "spread", + *, + bins: int = 100, + show_plot: bool = False, + epsilon: float = 1e-12, +) -> float: + """Estimate KL divergence between real and fake distributions via histograms. + + Args: + real: Real series [T, F]. + fake: Synthetic series [T, F]. + metric: Which 1D series to compare: 'spread' or 'mpr' (mid-price returns). + bins: Number of histogram bins. + show_plot: If True, display the normalized histograms. + epsilon: Smoothing mass added to each bin to avoid zeros. + + Returns: + Non-negative KL(real || fake) as a float. + + Raises: + ValueError: If inputs are not [T, F] or if metric is invalid. + """ + if real.ndim != 2 or fake.ndim != 2: + raise ValueError("Inputs must be 2D arrays [T, F].") + + if metric == "spread": + r_series = _spread(real) + f_series = _spread(fake) + elif metric == "mpr": + r_series = _midprice_returns(real) + f_series = _midprice_returns(fake) + else: + raise ValueError("metric must be 'spread' or 'mpr'.") + + lo = float(min(r_series.min(initial=0.0), f_series.min(initial=0.0))) + hi = float(max(r_series.max(initial=0.0), f_series.max(initial=0.0))) + if not np.isfinite(lo) or not np.isfinite(hi) or hi <= lo: + hi = lo + 1e-6 # avoid zero-width range + + r_hist, edges = np.histogram(r_series, bins=bins, range=(lo, hi), density=False) + f_hist, _ = np.histogram(f_series, bins=edges, density=False) + + r_p = r_hist.astype(np.float64) + epsilon + f_p = f_hist.astype(np.float64) + epsilon + r_p /= r_p.sum() + f_p /= f_p.sum() + + kl = float(np.sum(r_p * (np.log(r_p) - np.log(f_p)))) + if show_plot: + centers = 0.5 * (edges[:-1] + edges[1:]) + plt.plot(centers, r_p, label="real") + plt.plot(centers, f_p, label="fake") + plt.title(f"Histogram ({metric}); KL={kl:.4g}") + plt.legend() + plt.show() + return max(kl, 0.0) + + +def get_ssim(img1_path: Path | str, img2_path: Path | str) -> float: + """Compute SSIM between two image files read via matplotlib.""" + img1 = img_as_float(plt.imread(str(img1_path))) + img2 = img_as_float(plt.imread(str(img2_path))) + if img1.ndim == 2: + img1 = img1[..., None] + if img2.ndim == 2: + img2 = img2[..., None] + return float(ssim(img1, img2, channel_axis=2, data_range=1.0)) + + +def get_kl_metrics( + real_2d: NDArray, fake_2d: NDArray, bins: int = 100 +) -> Dict[str, float]: + """Compute KL divergences for spread and mid-price returns. + + Args: + real_2d: Real series [T, F]. + fake_2d: Synthetic series [T, F]. + bins: Number of histogram bins. + + Returns: + Dict with keys 'spread' and 'midprice_returns'. + """ + t = min(len(real_2d), len(fake_2d)) + real, fake = real_2d[:t], fake_2d[:t] + kl_spread = kl_divergence_hist(real, fake, metric="spread", bins=bins) + kl_mpr = kl_divergence_hist(real, fake, metric="mpr", bins=bins) + return {"spread": float(kl_spread), "midprice_returns": float(kl_mpr)} + + +def temporal_consistency(real: NDArray, fake: NDArray) -> float: + """Measure correlation between successive deltas in real and synthetic series. + + Computes first differences versus time for each feature, then averages the + Pearson correlation across overlapping features. + + Args: + real: Real matrix [T, F]. + fake: Synthetic matrix [T, F]. + + Returns: + Mean correlation of deltas over features (float). + """ + + def deltas(x: NDArray) -> NDArray: + return np.diff(x, axis=0) + + real_d, fake_d = deltas(real), deltas(fake) + f = min(real_d.shape[1], fake_d.shape[1]) + if f == 0: + return 0.0 + corrs = [] + for i in range(f): + c = np.corrcoef(real_d[:, i], fake_d[:, i])[0, 1] + corrs.append(c) + return float(np.nan_to_num(np.mean(corrs))) + + +def latent_divergence(real: NDArray, fake: NDArray) -> float: + """Compute a simple Frobenius distance between aligned real and fake matrices. + + Args: + real: Real matrix [T, F] or [T, d]. + fake: Synthetic matrix [T, F] or [T, d]. + + Returns: + Normalized Frobenius norm divided by the number of aligned rows. + """ + t = min(len(real), len(fake)) + if t == 0: + return 0.0 + diff = real[:t] - fake[:t] + return float(np.linalg.norm(diff, ord="fro") / t) diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py new file mode 100644 index 000000000..e42d2017a --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/src/helpers/visualise.py @@ -0,0 +1,571 @@ +#!/usr/bin/env python3 +""" +Visualize TimeGAN results on LOBSTER data and compute evaluation metrics. + +This script renders depth heatmaps for real and synthetic limit order book (LOB) +sequences and reports quantitative metrics: +- SSIM between real and synthetic heatmaps +- KL divergence on spread and mid-price returns +- Temporal consistency (correlation of deltas) +- Latent divergence (normalized Frobenius distance) + +It also supports latent-space walks decoded via Encoder→Recovery: + • within-regime (tight spread → tight spread) + • cross-regime (tight spread → widespread) + +Usage example: + python -m src.viz.visualise \ + --viz --samples 5 --out-dir ./outs/viz_run1 --cmap magma --dpi 240 --bins 128 --levels 10 \ + --walk --walk-steps 8 --walk-mode cross --walk-prefix latent_walk \ + --dataset --seq-len 128 --data-dir ./data \ + --orderbook-filename AMZN_2012-06-21_34200000_57600000_orderbook_10.csv \ + --modules --batch-size 128 --z-dim 40 --hidden-dim 64 --num-layer 3 --num-iters 25000 +""" + +from __future__ import annotations + +from pathlib import Path +from typing import List, Tuple + +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +import torch +from numpy.typing import NDArray + +from src.dataset import load_data +from src.helpers.args import Options +from src.helpers.constants import NUM_LEVELS as DEFAULT_LEVELS +from src.helpers.constants import OUTPUT_DIR +from src.helpers.richie import log as rlog +from src.helpers.richie import rule as rrule +from src.helpers.richie import status as rstatus +from src.helpers.utils import ( + get_kl_metrics, + get_ssim, + latent_divergence, + temporal_consistency, +) +from src.modules import TimeGAN + +# Optional pretty tables (graceful fallback if rich unavailable) +try: + from rich import box + from rich.console import Console + from rich.table import Table + + _HAS_RICH = True +except ImportError: + _HAS_RICH = False + + +def _pick_cmap(name: str): + """Return a matplotlib colormap by name, falling back to 'coolwarm' if unknown. + + Args: + name: Colormap name recognized by matplotlib. + + Returns: + A matplotlib colormap object. + """ + try: + return plt.get_cmap(name) + except Exception: + rlog(f"[warn] unknown colormap '{name}', falling back to 'coolwarm'") + return plt.get_cmap("coolwarm") + + +def _compute_spread_1(data_2d: NDArray) -> NDArray: + """Compute best-level spread from a 2D feature matrix. + + Assumes feature blocks per level as [ask_price, ask_size, bid_price, bid_size]. + Uses level-1 prices (columns 0 and 2). + + Args: + data_2d: Array of shape [T, F]. + + Returns: + Spread series of shape [T] (float64). + """ + ask1 = data_2d[:, 0] + bid1 = data_2d[:, 2] + return ask1 - bid1 + + +def _find_window_by_spread(data_2d: NDArray, win_len: int, mode: str = "tight") -> int: + """Locate a window start index by mean spread criterion. + + Args: + data_2d: Feature matrix [T, F]. + win_len: Desired window length. + mode: 'tight' to minimize mean spread, 'wide' to maximize mean spread. + + Returns: + Start index (int) of the selected window. + """ + spreads = _compute_spread_1(data_2d) + t = len(spreads) + if t <= win_len: + return 0 + csum = np.cumsum(np.insert(spreads, 0, 0.0)) + means = (csum[win_len:] - csum[:-win_len]) / win_len # length T - win_len + 1 + return int(np.argmin(means) if mode == "tight" else np.argmax(means)) + + +def plot_heatmap( + data_2d: NDArray, + *, + lvls: int, + cmap_: str, + use_log1p_: bool, + title: str, + save_path: Path, + dpi_: int, +) -> None: + """Render a single depth heatmap with asks stacked above bids. + + Rows represent 2*levels (top half = ask levels, bottom half = bid levels); + columns represent time. + + Args: + data_2d: Feature matrix [T, F]. + lvls: Number of LOB levels to visualize. + cmap_: Colormap name. + use_log1p_: Whether to apply log1p to volumes for dynamic-range compression. + title: Figure title. + save_path: Output PNG path. + dpi_: Figure DPI. + """ + asks = np.stack([data_2d[:, 4 * L + 1] for L in range(lvls)], axis=1) # [T, L] + bids = np.stack([data_2d[:, 4 * L + 3] for L in range(lvls)], axis=1) # [T, L] + + mat = np.concatenate([asks.T, bids.T], axis=0) # [2L, T] + if use_log1p_: + mat = np.log1p(mat) + + sns.set_theme(style="whitegrid") + fig, ax = plt.subplots(figsize=(9, 6)) + sns.heatmap(mat, ax=ax, cmap=_pick_cmap(cmap_), cbar=True) + ax.set_title(title) + ax.set_ylabel("Levels (Top = Ask, Bottom = Bid)") + ax.set_xlabel("Time") + fig.tight_layout() + + save_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(save_path, dpi=dpi_, bbox_inches="tight") + plt.close(fig) + rlog(f"[save] heatmap → {save_path}") + + +def plot_walk_panel( + steps: List[NDArray], + *, + lvls: int, + cmap_: str, + use_log1p_: bool, + title: str, + save_path: Path, + dpi_: int, +) -> None: + """Render a latent-space walk as a row of small heatmaps, left to right. + + Each panel shows asks stacked over bids for a decoded intermediate window. + + Args: + steps: List of decoded windows [T, F]. + lvls: Number of LOB levels to visualize. + cmap_: Colormap name. + use_log1p_: Whether to apply log1p to volumes. + title: Subtitle for the panel figure. + save_path: Output PNG path. + dpi_: Figure DPI. + """ + n = len(steps) + sns.set_theme(style="whitegrid") + fig, axes = plt.subplots(1, n, figsize=(3 * n, 4), squeeze=False) + for k, step in enumerate(steps): + asks = np.stack([step[:, 4 * L + 1] for L in range(lvls)], axis=1) + bids = np.stack([step[:, 4 * L + 3] for L in range(lvls)], axis=1) + mat = np.concatenate([asks.T, bids.T], axis=0) + if use_log1p_: + mat = np.log1p(mat) + ax = axes[0, k] + sns.heatmap(mat, ax=ax, cmap=_pick_cmap(cmap_), cbar=False) + ax.set_title(f"Step {k}") + ax.set_xticks([]) + ax.set_yticks([]) + fig.suptitle(title) + fig.tight_layout() + save_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(save_path, dpi=dpi_, bbox_inches="tight") + plt.close(fig) + rlog(f"[save] latent walk → {save_path}") + + +def _print_metrics_table( + ssim_rows_: List[Tuple[str, float]], + kl_rows_: List[Tuple[str, float, float]], + tmp_rows: List[Tuple[str, float]], + lat_rows_: List[Tuple[str, float]], +) -> None: + """Pretty-print SSIM, KL(spread), KL(mpr), temporal-corr, and latent-distance. + + Falls back to plain logs when Rich is not available. + + Args: + ssim_rows_: List of (sample, ssim). + kl_rows_: List of (sample, kl_spread, kl_mpr). + tmp_rows: List of (sample, temporal_corr). + lat_rows_: List of (sample, latent_distance). + """ + if not ssim_rows_: + rlog("[info] no metrics to display") + return + + if not _HAS_RICH: + for j in range(len(ssim_rows_)): + rlog( + f"{ssim_rows_[j][0]} | " + f"SSIM={ssim_rows_[j][1]:.4f} | " + f"KL(sp)={kl_rows_[j][1]:.4f} | KL(mpr)={kl_rows_[j][2]:.4f} | " + f"TempCorr={tmp_rows[j][1]:.4f} | LatDist={lat_rows_[j][1]:.4f}" + ) + return + + table = Table( + title="Quantitative Metrics", + header_style="bold cyan", + box=box.SIMPLE_HEAVY, + show_lines=False, + ) + table.add_column("Sample") + table.add_column("SSIM", justify="right") + table.add_column("KL(spread)", justify="right") + table.add_column("KL(mpr)", justify="right") + table.add_column("TempCorr", justify="right") + table.add_column("LatDist", justify="right") + + for j in range(len(ssim_rows_)): + table.add_row( + ssim_rows_[j][0], + f"{ssim_rows_[j][1]:.4f}", + f"{kl_rows_[j][1]:.4f}", + f"{kl_rows_[j][2]:.4f}", + f"{tmp_rows[j][1]:.4f}", + f"{lat_rows_[j][1]:.4f}", + ) + + rrule() + try: + Console().print(table) + except Exception: + for j in range(len(ssim_rows_)): + rlog( + f"{ssim_rows_[j][0]} | " + f"SSIM={ssim_rows_[j][1]:.4f} | " + f"KL(sp)={kl_rows_[j][1]:.4f} | KL(mpr)={kl_rows_[j][2]:.4f} | " + f"TempCorr={tmp_rows[j][1]:.4f} | LatDist={lat_rows_[j][1]:.4f}" + ) + rrule() + + +def _maybe_write_metrics_csv( + out_csv: Path | None, + ssim_rows_: List[Tuple[str, float]], + kl_rows_: List[Tuple[str, float, float]], + tmp_rows: List[Tuple[str, float]], + late_rows: List[Tuple[str, float]], +) -> None: + """Optionally write a CSV with all computed metrics. + + Args: + out_csv: Output CSV path or None to skip writing. + ssim_rows_: List of (sample, ssim). + kl_rows_: List of (sample, kl_spread, kl_mpr). + tmp_rows: List of (sample, temporal_corr). + late_rows: List of (sample, latent_distance). + """ + if not out_csv: + return + import csv + + out_csv.parent.mkdir(parents=True, exist_ok=True) + with out_csv.open("w", newline="") as f: + w = csv.writer(f) + w.writerow(["sample", "ssim", "kl_spread", "kl_mpr", "temp_corr", "lat_dist"]) + for j in range(len(ssim_rows_)): + w.writerow( + [ + ssim_rows_[j][0], + f"{ssim_rows_[j][1]:.6f}", + f"{kl_rows_[j][1]:.6f}", + f"{kl_rows_[j][2]:.6f}", + f"{tmp_rows[j][1]:.6f}", + f"{late_rows[j][1]:.6f}", + ] + ) + rlog(f"[save] metrics CSV → {out_csv}") + + +# -------------------------------------------------------------------------------------- +# Latent walks (modular) +# -------------------------------------------------------------------------------------- +def generate_latent_walks( + model_: TimeGAN, + test_: NDArray, + *, + seq_len: int, + lvls: int, + c_map: str, + used_log1p: bool, + dpi_: int, + steps: int, + out_dire: Path, + prefix: str = "latent_walk", +) -> None: + """Generate latent-space walk panels from encoded real windows. + + Produces two panels: + 1) within tight-spread regime (tight → tight2) + 2) cross regime (tight → wide) + + Args: + model_: Trained TimeGAN instance. + test_: Held-out 2D sequence matrix [T, F]. + seq_len: Window length for encoding/decoding. + lvls: Number of LOB levels to visualize. + c_map: Colormap name for heatmaps. + used_log1p: Whether to apply log1p to volumes. + dpi_: Figure DPI. + steps: Number of interpolation steps per panel. + out_dire: Directory to write panel images. + prefix: Filename prefix for saved panels. + """ + + def _encode_window(start_idx: int) -> NDArray: + """Encode a real window [seq_len, F] into latent [seq_len, d].""" + sl = slice(start_idx, start_idx + seq_len) + x_win = test_[sl][None, ...] # [1, T, F] + x_t = torch.as_tensor(x_win, dtype=torch.float32, device=model_.device) + with torch.no_grad(): + h = model_.netE(x_t) + return h[0].cpu().numpy() + + def _decode_latent(h_seq: NDArray) -> NDArray: + """Decode latent [seq_len, d] back to original scale [seq_len, F].""" + h_t = torch.as_tensor( + h_seq[None, ...], dtype=torch.float32, device=model_.device + ) + with torch.no_grad(): + x_tilde = model_.netR(h_t) # scaled space + x_np = x_tilde[0].cpu().numpy().astype(np.float32) + # inverse scale + x_np = model_.fmin.astype(np.float32) + x_np * ( + model_.fmax.astype(np.float32) - model_.fmin.astype(np.float32) + ) + return x_np + + def _walk_and_plot(h0: NDArray, h1: NDArray, save_path: Path, title: str) -> None: + """Linearly interpolate between two latent paths and render the panel.""" + alphas = np.linspace(0.0, 1.0, max(2, steps)) + decoded_windows: List[NDArray] = [] + with rstatus(f"[cyan]Interpolating latent space ({len(alphas)} steps)"): + for idx, a in enumerate(alphas): + hs = (1.0 - a) * h0 + a * h1 + xs = _decode_latent(hs) + decoded_windows.append(xs) + rlog(f"[walk] step {idx + 1}/{len(alphas)}") + plot_walk_panel( + decoded_windows, + lvls=lvls, + cmap_=c_map, + use_log1p_=used_log1p, + title=title, + save_path=save_path, + dpi_=dpi_, + ) + + # Endpoints + tight_idx = _find_window_by_spread(test_, seq_len, mode="tight") + wide_idx = _find_window_by_spread(test_, seq_len, mode="wide") + tight2_idx = max(0, min(len(test_) - seq_len, tight_idx + seq_len * 5)) + rlog(f"[walk] indices: tight={tight_idx}, tight2={tight2_idx}, wide={wide_idx}") + + # Encode endpoints + h_tight = _encode_window(tight_idx) + h_wide = _encode_window(wide_idx) + h_tight2 = _encode_window(tight2_idx) + + out_dire.mkdir(parents=True, exist_ok=True) + + # Cross regime: tight → wide + cross_png = out_dire / f"{prefix}_tight_to_wide.png" + _walk_and_plot( + h_tight, + h_wide, + cross_png, + "Latent walk from tight-spread to wide-spread regime: widening best-level separation and reshaping of depth", + ) + + # Within tight regime: tight → tight2 + within_png = out_dire / f"{prefix}_tight_only.png" + _walk_and_plot( + h_tight, + h_tight2, + within_png, + "Latent walk within a tight-spread regime: intermediate windows decoded to LOB depth heatmaps", + ) + + +# -------------------------------------------------------------------------------------- +# Main +# -------------------------------------------------------------------------------------- +if __name__ == "__main__": + rrule("[bold cyan]Heatmaps, SSIM, KL, Temporal Corr, Latent Distance[/bold cyan]") + + # Parse unified options: top-level + nested {dataset, modules, viz} + opts = Options().parse() + ds = opts.dataset + mod = opts.modules + viz = getattr(opts, "viz", None) + + # Visualization defaults if section was omitted + samples: int = max(0, getattr(viz, "samples", 3) if viz else 3) + out_dir: Path = ( + getattr(viz, "out_dir", Path(OUTPUT_DIR) / "viz") + if viz + else (Path(OUTPUT_DIR) / "viz") + ) + bins: int = getattr(viz, "bins", 100) if viz else 100 + cmap: str = getattr(viz, "cmap", "coolwarm") if viz else "coolwarm" + use_log1p: bool = not getattr(viz, "no_log1p", False) if viz else True + dpi: int = getattr(viz, "dpi", 220) if viz else 220 + levels: int = ( + int(getattr(viz, "levels")) + if (viz and getattr(viz, "levels", None) is not None) + else int(DEFAULT_LEVELS) + ) + do_ssim: bool = not getattr(viz, "no_ssim", False) if viz else True + do_kl: bool = not getattr(viz, "no_kl", False) if viz else True + do_temp: bool = not getattr(viz, "no_temp", False) if viz else True + do_lat: bool = not getattr(viz, "no_lat", False) if viz else True + metrics_csv: Path | None = getattr(viz, "metrics_csv", None) if viz else None + + # Latent-walk flags + do_walk: bool = bool(getattr(viz, "walk", False)) if viz else False + walk_steps: int = int(getattr(viz, "walk_steps", 8)) if viz else 8 + walk_mode: str = ( + str(getattr(viz, "walk_mode", "both")).lower() if viz else "both" + ) # within|cross|both + walk_prefix: str = ( + str(getattr(viz, "walk_prefix", "latent_walk")) if viz else "latent_walk" + ) + + # Load data; flatten val/test for image/kl metrics + with rstatus("[cyan]Loading data"): + train, val, test = load_data(ds) + if getattr(val, "ndim", None) == 3: + val = val.reshape(-1, val.shape[-1]) + if getattr(test, "ndim", None) == 3: + test = test.reshape(-1, test.shape[-1]) + + rlog( + f"[shapes] train={getattr(train, 'shape', None)} | val={getattr(val, 'shape', None)} | test={getattr(test, 'shape', None)}" + ) + + # Restore model + with rstatus("[cyan]Restoring checkpoint"): + model = TimeGAN(mod, train, val, test, load_weights=True) + + # Output directory + out_dir.mkdir(parents=True, exist_ok=True) + rlog(f"[init] output directory → {out_dir}") + rlog(f"[plan] will generate {samples} synthetic sample(s)") + + # Real heatmap once + real_png = out_dir / "real_heatmap.png" + with rstatus("[cyan]Rendering real heatmap"): + plot_heatmap( + test, + lvls=levels, + cmap_=cmap, + use_log1p_=use_log1p, + title="Real LOB Depth", + save_path=real_png, + dpi_=dpi, + ) + + # Accumulate metrics + ssim_rows: List[Tuple[str, float]] = [] + kl_rows: List[Tuple[str, float, float]] = [] + temp_rows: List[Tuple[str, float]] = [] + lat_rows: List[Tuple[str, float]] = [] + + # Generate and evaluate synthetic samples + for i in range(samples): + tag = f"synthetic_{i:03d}" + + with rstatus(f"[cyan]Generating {tag}"): + synth: NDArray = model.generate(num_rows=int(test.shape[0])) + + synth_png = out_dir / f"{tag}.png" + plot_heatmap( + synth, + lvls=levels, + cmap_=cmap, + use_log1p_=use_log1p, + title=f"Synthetic LOB Depth #{i:03d}", + save_path=synth_png, + dpi_=dpi, + ) + + # Metrics (respect toggles) + ssim_val = float("nan") + if do_ssim: + ssim_val = get_ssim(real_png, synth_png) + ssim_rows.append((tag, ssim_val)) + + kl_sp, kl_mpr = float("nan"), float("nan") + if do_kl: + kl_dict = get_kl_metrics(test, synth, bins=bins) + kl_sp, kl_mpr = float(kl_dict["spread"]), float(kl_dict["midprice_returns"]) + kl_rows.append((tag, kl_sp, kl_mpr)) + + temp_val = float("nan") + if do_temp: + temp_val = temporal_consistency(test, synth) + temp_rows.append((tag, temp_val)) + + lat_val = float("nan") + if do_lat: + lat_val = latent_divergence(test, synth) + lat_rows.append((tag, lat_val)) + + rlog( + f"[metrics] {tag} SSIM={ssim_val:.4f} | KL(sp)={kl_sp:.4f} | KL(mpr)={kl_mpr:.4f} | TempCorr={temp_val:.4f} | LatDist={lat_val:.4f}" + ) + + # Present + optional CSV + _print_metrics_table(ssim_rows, kl_rows, temp_rows, lat_rows) + _maybe_write_metrics_csv(metrics_csv, ssim_rows, kl_rows, temp_rows, lat_rows) + + # Latent walks + if do_walk: + rlog(f"[walk] mode={walk_mode} steps={walk_steps} prefix={walk_prefix}") + if walk_mode in ("both", "cross", "within"): + generate_latent_walks( + model, + test, + seq_len=mod.seq_len, + lvls=levels, + c_map=cmap, + used_log1p=use_log1p, + dpi_=dpi, + steps=walk_steps, + out_dire=out_dir, + prefix=walk_prefix, + ) + else: + rlog(f"[warn] unknown walk_mode '{walk_mode}', skipping walks") + + rrule("[bold green]Done[/bold green]") diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/modules.py b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py new file mode 100644 index 000000000..5d9d1fb03 --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/src/modules.py @@ -0,0 +1,680 @@ +#!/usr/bin/env python3 +""" +TimeGAN with LOB-aware enhancements. + +This module defines the five canonical components (Encoder, Recovery, Generator, +Supervisor, Discriminator) and a thin training wrapper for AMZN LOBSTER L10. +It supports three-phase training, periodic validation (KL on spread), history +plotting, checkpointing, and simple generation utilities. + +Inputs are sequences of shape ``[batch, seq_len, feature_dim]`` and all +component outputs mirror that shape where appropriate. + +Exports: + - Encoder + - Recovery + - Generator + - Supervisor + - Discriminator + - TimeGAN + - TemporalBackboneConfig (placeholder for future use) + +Created By: Radhesh Goel (Keys-I) +ID: s49088276 +""" + +from __future__ import annotations + +import math +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, List, Optional, Protocol, Tuple, cast, runtime_checkable + +import matplotlib.pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +from numpy.typing import NDArray +from torch import Tensor +from tqdm.auto import tqdm # pretty progress bars + +from src.dataset import batch_generator +from src.helpers.constants import ( + NUM_TRAINING_ITERATIONS, + OUTPUT_DIR, + VALIDATE_INTERVAL, + WEIGHTS_DIR, +) + +# richie: centralized pretty CLI helpers (safe fallbacks inside) +from src.helpers.richie import log as rlog +from src.helpers.richie import rule as rrule +from src.helpers.richie import status as rstatus +from src.helpers.utils import ( + kl_divergence_hist, + minmax_inverse, + minmax_scale, + sample_noise, +) + + +def get_device() -> torch.device: + """Return CUDA, MPS, or CPU device in that order of preference.""" + if torch.cuda.is_available(): + return torch.device("cuda") + if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available(): + return torch.device("mps") + return torch.device("cpu") + + +def set_seed(seed: Optional[int]) -> None: + """Set Python-side RNG seeds (NumPy, Torch, CUDA) if seed is non-negative.""" + if seed is None or seed < 0: + return + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.use_deterministic_algorithms(False) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def xavier_gru_init(m: nn.Module) -> None: + """Initialize GRU and Linear layers with Xavier/orthogonal schemes.""" + if isinstance(m, nn.GRU): + for name, p in m.named_parameters(): + t = cast(Tensor, p) + if "weight_ih" in name: + nn.init.xavier_uniform_(t) + elif "weight_hh" in name: + nn.init.orthogonal_(t) + elif "bias" in name: + nn.init.zeros_(t) + elif isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + nn.init.zeros_(m.bias) + + +class Encoder(nn.Module): + """Embedding network: original feature space → latent space.""" + + def __init__(self, input_dim: int, hidden_dim: int, num_layers: int) -> None: + super().__init__() + self.rnn = nn.GRU( + input_size=input_dim, + hidden_size=hidden_dim, + num_layers=num_layers, + batch_first=True, + ) + self.proj = nn.Linear(hidden_dim, hidden_dim) + self.act = nn.Sigmoid() + self.apply(xavier_gru_init) + + def forward(self, x: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor: + """Return latent sequence H given X.""" + h, _ = self.rnn(x) + h = self.proj(h) + return self.act(h) if apply_sigmoid else h + + +class Recovery(nn.Module): + """Recovery network: latent space → original space.""" + + def __init__(self, hidden_dim: int, output_dim: int, num_layers: int) -> None: + super().__init__() + self.rnn = nn.GRU( + input_size=hidden_dim, + hidden_size=output_dim, + num_layers=num_layers, + batch_first=True, + ) + self.proj = nn.Linear(output_dim, output_dim) + self.act = nn.Sigmoid() + self.apply(xavier_gru_init) + + def forward(self, h: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor: + """Return reconstructed X̃ given H.""" + x_tilde, _ = self.rnn(h) + x_tilde = self.proj(x_tilde) + return self.act(x_tilde) if apply_sigmoid else x_tilde + + +class Generator(nn.Module): + """Generator: random noise Z → latent sequence E.""" + + def __init__(self, z_dim: int, hidden_dim: int, num_layers: int) -> None: + super().__init__() + self.rnn = nn.GRU( + input_size=z_dim, + hidden_size=hidden_dim, + num_layers=num_layers, + batch_first=True, + ) + self.proj = nn.Linear(hidden_dim, hidden_dim) + self.act = nn.Sigmoid() + self.apply(xavier_gru_init) + + def forward(self, z: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor: + """Return latent path E given Z.""" + g, _ = self.rnn(z) + g = self.proj(g) + return self.act(g) if apply_sigmoid else g + + +class Supervisor(nn.Module): + """Supervisor: next-step latent supervision H_t → H_{t+1}.""" + + def __init__(self, hidden_dim: int, num_layers: int) -> None: + super().__init__() + self.rnn = nn.GRU( + input_size=hidden_dim, + hidden_size=hidden_dim, + num_layers=num_layers, + batch_first=True, + ) + self.proj = nn.Linear(hidden_dim, hidden_dim) + self.act = nn.Sigmoid() + self.apply(xavier_gru_init) + + def forward(self, h: torch.Tensor, apply_sigmoid: bool = True) -> torch.Tensor: + """Return supervised latent S(H).""" + s, _ = self.rnn(h) + s = self.proj(s) + return self.act(s) if apply_sigmoid else s + + +class Discriminator(nn.Module): + """Discriminator over latent sequences; outputs per-timestep logits.""" + + def __init__(self, hidden_dim: int, num_layers: int) -> None: + super().__init__() + self.rnn = nn.GRU( + input_size=hidden_dim, + hidden_size=hidden_dim, + num_layers=num_layers, + batch_first=True, + ) + self.proj = nn.Linear(hidden_dim, 1) + self.apply(xavier_gru_init) + + def forward(self, h: torch.Tensor) -> torch.Tensor: + """Return logits for each time step.""" + d, _ = self.rnn(h) + return self.proj(d) + + +@dataclass +class TrainingHistory: + """Buffer for tracking losses and validation metrics over iterations.""" + + er_iters: List[int] = field(default_factory=list) + er_vals: List[float] = field(default_factory=list) + + s_iters: List[int] = field(default_factory=list) + s_vals: List[float] = field(default_factory=list) + + g_iters: List[int] = field(default_factory=list) + g_vals: List[float] = field(default_factory=list) + + d_iters: List[int] = field(default_factory=list) + d_vals: List[float] = field(default_factory=list) + + kl_iters: List[int] = field(default_factory=list) + kl_vals: List[float] = field(default_factory=list) + + def add_er(self, it: int, v: float) -> None: + self.er_iters.append(it) + self.er_vals.append(v) + + def add_s(self, it: int, v: float) -> None: + self.s_iters.append(it) + self.s_vals.append(v) + + def add_g(self, it: int, v: float) -> None: + self.g_iters.append(it) + self.g_vals.append(v) + + def add_d(self, it: int, v: float) -> None: + self.d_iters.append(it) + self.d_vals.append(v) + + def add_kl(self, it: int, v: float) -> None: + self.kl_iters.append(it) + self.kl_vals.append(v) + + def save_plots(self, out_dir: Path, total_iters: int) -> Dict[str, Path]: + out_dir.mkdir(parents=True, exist_ok=True) + saved: Dict[str, Path] = {} + + # Training losses + fig, ax = plt.subplots(figsize=(9, 5)) + if self.er_iters: + ax.plot(self.er_iters, self.er_vals, label="Recon (E,R)") + if self.s_iters: + ax.plot(self.s_iters, self.s_vals, label="Supervisor (S)") + if self.g_iters: + ax.plot(self.g_iters, self.g_vals, label="Generator (G)") + if self.d_iters: + ax.plot(self.d_iters, self.d_vals, label="Discriminator (D)") + ax.set_title("Training Losses vs Iteration") + ax.set_xlabel("Iteration") + ax.set_ylabel("Loss") + ax.set_xlim( + 1, + max( + [ + total_iters, + *self.er_iters, + *self.s_iters, + *self.g_iters, + *self.d_iters, + ] + or [total_iters] + ), + ) + ax.legend(loc="best") + fig.tight_layout() + p1 = out_dir / "training_curves.png" + fig.savefig(p1, dpi=150, bbox_inches="tight") + plt.close(fig) + saved["training_curves"] = p1 + + # KL(spread) + if self.kl_iters: + fig, ax = plt.subplots(figsize=(9, 3.5)) + ax.plot(self.kl_iters, self.kl_vals, marker="o", linewidth=1) + ax.set_title("Validation KL(spread) vs Iteration") + ax.set_xlabel("Iteration") + ax.set_ylabel("KL(spread)") + ax.set_xlim(1, max(self.kl_iters)) + fig.tight_layout() + p2 = out_dir / "kl_spread_curve.png" + fig.savefig(p2, dpi=150, bbox_inches="tight") + plt.close(fig) + saved["kl_spread_curve"] = p2 + + return saved + + +@dataclass +class TimeGANHandles: + """Container for the five component modules (for external wiring if needed).""" + + encoder: Encoder + recovery: Recovery + generator: Generator + supervisor: Supervisor + discriminator: Discriminator + + +@runtime_checkable +class OptLike(Protocol): + """Structural type for options needed by TimeGAN.""" + + batch_size: int + seq_len: int + z_dim: int + hidden_dim: int + num_layer: int + lr: float + beta1: float + w_gamma: float + w_g: float + + +class TimeGAN: + """ + End-to-end TimeGAN wrapper with training & generation utilities. + """ + + def __init__( + self, + opt: OptLike, + train_data: NDArray[np.float32], + val_data: NDArray[np.float32], + test_data: NDArray[np.float32], + load_weights: bool = False, + ) -> None: + # set seed & device + set_seed(getattr(opt, "manualseed", getattr(opt, "seed", None))) + self.device = get_device() + + # options + self.opt = opt + self.batch_size: int = opt.batch_size + self.seq_len: int = opt.seq_len + self.z_dim: int = opt.z_dim + self.h_dim: int = opt.hidden_dim + self.n_layers: int = opt.num_layer + + # schedule + self.num_iterations = int(getattr(opt, "num_iters", NUM_TRAINING_ITERATIONS)) + self.validate_interval = VALIDATE_INTERVAL + + # scale train only; keep stats for inverse + self.train_norm, self.fmin, self.fmax = minmax_scale(train_data) + self.val = val_data + self.test = test_data + + # build modules (E/R operate on feature dimension) + feat_dim = int(self.train_norm.shape[-1]) + self.netE = Encoder(feat_dim, self.h_dim, self.n_layers).to(self.device) + self.netR = Recovery(self.h_dim, feat_dim, self.n_layers).to(self.device) + self.netG = Generator(self.z_dim, self.h_dim, self.n_layers).to(self.device) + self.netS = Supervisor(self.h_dim, self.n_layers).to(self.device) + self.netD = Discriminator(self.h_dim, self.n_layers).to(self.device) + + # losses + self.mse = nn.MSELoss() + self.l1 = nn.L1Loss() + self.bce_logits = nn.BCEWithLogitsLoss() + + # optimizers + self.optE = optim.Adam( + self.netE.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999) + ) + self.optR = optim.Adam( + self.netR.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999) + ) + self.optG = optim.Adam( + self.netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999) + ) + self.optS = optim.Adam( + self.netS.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999) + ) + self.optD = optim.Adam( + self.netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999) + ) + + self.history = TrainingHistory() + # load + if load_weights: + self._maybe_load() + + # initial banner + rrule("[bold cyan]TimeGAN • init[/bold cyan]") + rlog( + f"device={self.device} " + f"batch_size={self.batch_size} seq_len={self.seq_len} z_dim={self.z_dim} " + f"h_dim={self.h_dim} n_layers={self.n_layers} num_iters={self.num_iterations}" + ) + rlog( + f"train_norm={self.train_norm.shape} val={self.val.shape} test={self.test.shape}" + ) + + # small utility for smooth progress readouts + @staticmethod + def _ema(prev: Optional[float], x: float, alpha: float = 0.1) -> float: + return x if prev is None else (1 - alpha) * prev + alpha * x + + @staticmethod + def _ckpt_path() -> Path: + # NOTE: these are Paths from constants; ensure they are Path objects + out = OUTPUT_DIR / WEIGHTS_DIR + out.mkdir(parents=True, exist_ok=True) + return out / "timegan_ckpt.pt" + + def _maybe_load(self) -> None: + path = self._ckpt_path() + if not path.exists(): + rlog("[yellow]Checkpoint not found; starting fresh.[/yellow]") + return + with rstatus("[cyan]Loading checkpoint"): + state = torch.load(path, map_location=self.device) + self.netE.load_state_dict(state["netE"]) + self.netR.load_state_dict(state["netR"]) + self.netG.load_state_dict(state["netG"]) + self.netS.load_state_dict(state["netS"]) + self.netD.load_state_dict(state["netD"]) + self.optE.load_state_dict(state["optE"]) + self.optR.load_state_dict(state["optR"]) + self.optG.load_state_dict(state["optG"]) + self.optS.load_state_dict(state["optS"]) + self.optD.load_state_dict(state["optD"]) + rlog("[green]Checkpoint loaded.[/green]") + + def _save(self, *, with_history: bool = False) -> None: + with rstatus("[cyan]Saving checkpoint"): + torch.save( + { + "netE": self.netE.state_dict(), + "netR": self.netR.state_dict(), + "netG": self.netG.state_dict(), + "netS": self.netS.state_dict(), + "netD": self.netD.state_dict(), + "optE": self.optE.state_dict(), + "optR": self.optR.state_dict(), + "optG": self.optG.state_dict(), + "optS": self.optS.state_dict(), + "optD": self.optD.state_dict(), + }, + self._ckpt_path(), + ) + + if with_history and hasattr(self, "history") and self.history is not None: + # save plots + paths = self.history.save_plots( + OUTPUT_DIR, total_iters=self.num_iterations + ) + for k, p in paths.items(): + rlog(f"[green]Saved {k} → {p}[/green]") + + rlog("[green]Checkpoint saved.[/green]") + + def _to_device(self, *t: torch.Tensor) -> Tuple[torch.Tensor, ...]: + return tuple(x.to(self.device, non_blocking=True) for x in t) + + def _pretrain_er_step(self, x: torch.Tensor) -> float: + # E,R reconstruction loss + h = self.netE(x) + x_tilde = self.netR(h) + loss = self.mse(x_tilde, x) + self.optE.zero_grad() + self.optR.zero_grad() + loss.backward() + self.optE.step() + self.optR.step() + return float(loss.detach().cpu()) + + def _supervised_step(self, x: torch.Tensor) -> float: + # next-step supervision on latent H + h = self.netE(x) + s = self.netS(h) + loss = self.mse(h[:, 1:, :], s[:, :-1, :]) + self.optS.zero_grad() + loss.backward() + self.optS.step() + return float(loss.detach().cpu()) + + def _generator_step(self, x: torch.Tensor, z: torch.Tensor) -> float: + # build graph + h_real = self.netE(x) + s_real = self.netS(h_real) + e_hat = self.netG(z) + h_hat = self.netS(e_hat) + x_hat = self.netR(h_hat) + + # adversarial losses (on logits) + y_fake = self.netD(h_hat) + y_fake_e = self.netD(e_hat) + adv = self.bce_logits(y_fake, torch.ones_like(y_fake)) + adv_e = self.bce_logits(y_fake_e, torch.ones_like(y_fake_e)) + + # moment losses (match mean/std on reconstructions) + x_std = torch.std(x, dim=(0, 1), unbiased=False) + xh_std = torch.std(x_hat, dim=(0, 1), unbiased=False) + v1 = torch.mean(torch.abs(torch.sqrt(xh_std + 1e-6) - torch.sqrt(x_std + 1e-6))) + v2 = torch.mean( + torch.abs(torch.mean(x_hat, dim=(0, 1)) - torch.mean(x, dim=(0, 1))) + ) + + # supervised latent loss + sup = self.mse(s_real[:, :-1, :], h_real[:, 1:, :]) + + loss = ( + adv + + self.opt.w_gamma * adv_e + + self.opt.w_g * (v1 + v2) + + torch.sqrt(sup + 1e-12) + ) + self.optG.zero_grad() + self.optS.zero_grad() + loss.backward() + self.optG.step() + self.optS.step() + return float(loss.detach().cpu()) + + def _discriminator_step(self, x: torch.Tensor, z: torch.Tensor) -> float: + with torch.no_grad(): + e_hat = self.netG(z) + h_hat = self.netS(e_hat) + h_real = self.netE(x) + y_real = self.netD(h_real) + y_fake = self.netD(h_hat) + y_fake_e = self.netD(e_hat) + loss = ( + self.bce_logits(y_real, torch.ones_like(y_real)) + + self.bce_logits(y_fake, torch.zeros_like(y_fake)) + + self.opt.w_gamma * self.bce_logits(y_fake_e, torch.zeros_like(y_fake_e)) + ) + # optional hinge to avoid overshooting + if loss.item() > 0.15: + self.optD.zero_grad() + loss.backward() + self.optD.step() + return float(loss.detach().cpu()) + + def train_model(self) -> None: + rrule("[bold magenta]TimeGAN • training[/bold magenta]") + + # phase 1: encoder-recovery pretrain + er_ema: Optional[float] = None + for it in tqdm( + range(self.num_iterations), desc="Phase 1 • Pretrain (E,R)", unit="it" + ): + x, _T = batch_generator(self.train_norm, None, self.batch_size) # T unused + x = torch.as_tensor(x, dtype=torch.float32) + (x,) = self._to_device(x) + er = self._pretrain_er_step(x) + self.history.add_er(it + 1, er) + + er_ema = self._ema(er, er) + er_ema = self._ema(er_ema, er) + if (it + 1) % 10 == 0: + rlog( + f"[Pretrain] it={it + 1:,} recon={er:.4f} recon_ema={er_ema:.4f}" + ) + + # phase 2: supervisor + sup_ema: Optional[float] = None + for it in tqdm( + range(self.num_iterations), desc="Phase 2 • Supervisor (S)", unit="it" + ): + x, _T = batch_generator(self.train_norm, None, self.batch_size) + x = torch.as_tensor(x, dtype=torch.float32) + (x,) = self._to_device(x) + s = self._supervised_step(x) + self.history.add_s(it + 1, s) + + sup_ema = self._ema(sup_ema, s) + if (it + 1) % 10 == 0: + rlog(f"[Supervised] it={it + 1:,} s_loss={s:.4f} s_ema={sup_ema:.4f}") + + # phase 3: joint training + g_ema: Optional[float] = None + d_ema: Optional[float] = None + for it in tqdm( + range(self.num_iterations), desc="Phase 3 • Joint (G/S/D)", unit="it" + ): + x, _T = batch_generator(self.train_norm, None, self.batch_size) + z = sample_noise(self.batch_size, self.z_dim, self.seq_len) + x = torch.as_tensor(x, dtype=torch.float32) + z = torch.as_tensor(z, dtype=torch.float32) + x, z = self._to_device(x, z) + + # 2× G/ER per 1× D + for _ in range(2): + g_loss = self._generator_step(x, z) + self.history.add_g(it + 1, g_loss) + + g_ema = self._ema(g_ema, g_loss) + # light ER refine pass + self._pretrain_er_step(x) + d_loss = self._discriminator_step(x, z) + self.history.add_d(it + 1, d_loss) + + d_ema = self._ema(d_ema, d_loss) + + if (it + 1) % self.validate_interval == 0: + # quick KL check on a small synthetic sample (optional) + try: + fake = self.generate( + num_rows=min(len(self.val), 4096), mean=0.0, std=1.0 + ) + if self.val.shape[1] >= 3 and fake.shape[1] >= 3: + kl = kl_divergence_hist( + self.val[: len(fake)], fake, metric="spread" + ) + else: + kl = float("nan") + except Exception: + kl = float("nan") + self.history.add_kl(it + 1, kl) + self._save() + rlog( + f"[Joint] it={it + 1:,} G={g_loss:.4f} (ema={g_ema:.4f}) " + f"D={d_loss:.4f} (ema={d_ema:.4f}) KL(spread)={kl:.5g}" + ) + + # final save + self._save(with_history=True) + rrule("[bold green]TimeGAN • training complete[/bold green]") + + @torch.no_grad() + def generate( + self, + num_rows: int, + *, + mean: float = 0.0, + std: float = 1.0, + ) -> NDArray[np.float32]: + """Generate exactly `num_rows` rows of synthetic data (2D array). + + Steps: sample enough [B,T,F] windows → pass through G→S→R → + inverse-scale with train min/max → flatten to [num_rows, F]. + """ + assert num_rows > 0 + windows_needed = math.ceil(num_rows / self.seq_len) + z = sample_noise( + windows_needed, + self.z_dim, + self.seq_len, + mean=mean, + std=std, + ) + z = torch.as_tensor(z, dtype=torch.float32, device=self.device) + e_hat = self.netG(z) + h_hat = self.netS(e_hat) + x_hat = self.netR(h_hat) + x_hat_np = x_hat.detach().cpu().numpy() # [B, T, F] + x_hat_np = x_hat_np.reshape(-1, x_hat_np.shape[-1]) # [B*T, F] + x_hat_np = x_hat_np[:num_rows] + # inverse scale to original feature space + x_hat_np = minmax_inverse(x_hat_np, self.fmin, self.fmax) + return x_hat_np.astype(np.float32, copy=False) + + def print_parameter_count(self) -> None: + rrule("[bold cyan]Parameter counts[/bold cyan]") + sub = { + "Encoder": self.netE, + "Recovery": self.netR, + "Generator": self.netG, + "Supervisor": self.netS, + "Discriminator": self.netD, + } + for name, m in sub.items(): + total = sum(p.numel() for p in m.parameters()) + train = sum(p.numel() for p in m.parameters() if p.requires_grad) + rlog(f"[white]{name:<13}[/white] total={total:,} trainable={train:,}") diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/predict.py b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py new file mode 100644 index 000000000..09e6dab58 --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/src/predict.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +""" +Generate synthetic LOB sequences with a trained TimeGAN and save results. + +This script loads a trained TimeGAN checkpoint, generates a flat 2D array of +synthetic limit order book (LOB) rows that matches the length of the held-out +test split, and writes the array to ``gen_data.npy`` under ``OUTPUT_DIR``. + +It consumes command-line options via the unified ``Options`` router: +dataset-related flags are parsed by ``DataOptions`` and model/training flags by +``ModulesOptions``. Only the dataset and modules sections are used here; no +visualization is performed. + +Created By: Radhesh Goel (Keys-I) +ID: s49088276 +""" + +from __future__ import annotations + +import numpy as np + +from src.dataset import load_data +from src.helpers.args import Options +from src.helpers.constants import OUTPUT_DIR +from src.modules import TimeGAN + + +def predict() -> None: + """Load data and model, generate synthetic rows, and save to disk. + + This function: + 1. Parses command-line arguments via ``Options``. + 2. Loads the dataset using only the dataset options. + 3. Builds a TimeGAN model using only the modules/training options and + restores weights if available. + 4. Generates a flat array of synthetic rows with the same length as the + test split (flattening windows if needed). + 5. Saves the result to ``OUTPUT_DIR / "gen_data.npy"`` and prints the + output path and array shape. + + Returns: + None + """ + # Parse CLI args (top-level) + top = Options().parse() + + # Load data using ONLY dataset options + train_data, val_data, test_data = load_data(top.dataset) + + # Build model using ONLY modules/training options + model = TimeGAN(top.modules, train_data, val_data, test_data, load_weights=True) + + # Inference: generate exactly len(test_data) rows (2D array) + if getattr(test_data, "ndim", None) == 3: + num_rows = int(test_data.shape[0] * test_data.shape[1]) + else: + num_rows = int(test_data.shape[0]) + synth = model.generate(num_rows=num_rows, mean=0.0, std=1.0) + + # Save + out_dir = OUTPUT_DIR + out_dir.mkdir(parents=True, exist_ok=True) + out_path = out_dir / "gen_data.npy" + np.save(out_path, synth) + print(f"Saved synthetic data to: {out_path} | shape={synth.shape}") + + +if __name__ == "__main__": + predict() diff --git a/recognition/TimeLOB_TimeGAN_49088276/src/train.py b/recognition/TimeLOB_TimeGAN_49088276/src/train.py new file mode 100644 index 000000000..10e5c190d --- /dev/null +++ b/recognition/TimeLOB_TimeGAN_49088276/src/train.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +""" +Train, validate, and test TimeGAN on AMZN LOBSTER level-10 sequences. + +This entrypoint wires the unified CLI options, loads data, prepares validation +views, constructs a TimeGAN model, and runs the canonical three-phase schedule: +(1) Encoder–Recovery pretrain, (2) Supervisor pretrain, and (3) joint +adversarial training. It also enables periodic evaluation hooks inside the model +for KL divergence on spread/returns and SSIM on depth heatmaps, with checkpoints +and plots saved by the model. + +The model components are defined in ``src.modules`` and the data loader in +``src.dataset``. + +Created By: Radhesh Goel (Keys-I) +ID: s49088276 +""" + +from __future__ import annotations + +from src.dataset import load_data +from src.helpers.args import Options +from src.modules import TimeGAN + + +def train() -> None: + """Parse options, load data, build the model, and run training. + + Steps: + 1. Parse top-level CLI options using ``Options``. + 2. Load train/val/test splits via dataset options. + 3. Flatten validation and test windows to 2D views when needed for + metrics that expect shape ``[T', F]``. + 4. Construct ``TimeGAN`` with module/training options and run the + three-phase training routine. + + Returns: + None + """ + # Parse CLI options + opt = Options().parse() + + # Load data using dataset options + train_data, val_data, test_data = load_data(opt.dataset) + + # If val/test are windowed [N, T, F], flatten to [T', F] for metric views + if getattr(val_data, "ndim", None) == 3: + val_data = val_data.reshape(-1, val_data.shape[-1]) + if getattr(test_data, "ndim", None) == 3: + test_data = test_data.reshape(-1, test_data.shape[-1]) + + # Build model from module options and train + model = TimeGAN(opt.modules, train_data, val_data, test_data, load_weights=False) + model.train_model() + + +if __name__ == "__main__": + train()