Merged
70 commits
8b7a781
Data processing and shadow models
fatemetkl Aug 21, 2025
71178c0
Merged main
fatemetkl Aug 21, 2025
5e4e3d9
mypy fixes
fatemetkl Aug 21, 2025
a47fe95
Merged main
fatemetkl Sep 16, 2025
5ab4ade
Removed an old code
fatemetkl Sep 16, 2025
6166728
A working example
fatemetkl Sep 23, 2025
dbea030
Merged main
fatemetkl Sep 23, 2025
e5a3263
small fixes
fatemetkl Sep 23, 2025
3ce5786
Merged main
fatemetkl Sep 25, 2025
d7977b3
Improved the shadow training pipeline
fatemetkl Sep 26, 2025
7f7d120
Small fixes
fatemetkl Sep 26, 2025
5ac0172
Merge branch 'main' into ft/shadow_models
fatemetkl Sep 26, 2025
822e0a3
small change
fatemetkl Sep 29, 2025
8e410b6
Merge branch 'main' into ft/shadow_models
fatemetkl Sep 29, 2025
c2a5115
Small improvements
fatemetkl Sep 30, 2025
260f1cb
Added tests, fixed mypy and ruff errors
fatemetkl Oct 2, 2025
85c43c3
Merged main into branch, and addressed conflicts, refactors
fatemetkl Oct 3, 2025
3de5e07
Fixed mypy errors
fatemetkl Oct 3, 2025
2772609
Small fixes
fatemetkl Oct 3, 2025
0bd7849
Added more to the docstrings and comments
fatemetkl Oct 3, 2025
3d95d9f
Merged main, addressed conflicts and some fixes
fatemetkl Oct 9, 2025
1afc8e4
Small fix
fatemetkl Oct 9, 2025
fc2a772
Addressed Marcelo's comments part 1
fatemetkl Oct 9, 2025
65234a7
Sync shadow model data with blendingplusplus
fatemetkl Oct 10, 2025
3f22c21
Unified attack's load_multi_table, and the one currently in our codebase
fatemetkl Oct 10, 2025
a0b5122
Addressed Marcelo's comments part 2
fatemetkl Oct 10, 2025
945658e
Fixed segmentation fault error due to dependencies by seperating impo…
fatemetkl Oct 14, 2025
b70e818
Initial RMIA structure
sarakodeiri Oct 14, 2025
c5d9a0a
Merge branch 'main' into ft/shadow_models
fatemetkl Oct 14, 2025
85873c5
Seperated metaclassifier and shadow pipeline scripts to fix segmentat…
fatemetkl Oct 14, 2025
f2de6dc
Manually add .pkl files
sarakodeiri Oct 14, 2025
ef7c6ee
Small fixes
fatemetkl Oct 14, 2025
812a54e
Directory naming fix
fatemetkl Oct 14, 2025
1151622
Final set of Marcelo's comments
fatemetkl Oct 15, 2025
fc5a6aa
Implement most of RMIA
sarakodeiri Oct 15, 2025
8e67448
Move all RMIA code
sarakodeiri Oct 15, 2025
2211a00
Merge branch 'ft/shadow_models' into sk/rmia
sarakodeiri Oct 15, 2025
cc49d04
Finalize RMIA and cleanup
sarakodeiri Oct 16, 2025
7c6981a
Merge branch 'main' into sk/rmia
sarakodeiri Oct 16, 2025
7b8f2f5
Resolve conflicts
sarakodeiri Oct 16, 2025
79c26b0
Small fix
sarakodeiri Oct 16, 2025
a17bfd4
Add tests
sarakodeiri Oct 16, 2025
91df09d
Ruff fix
sarakodeiri Oct 16, 2025
895dda0
Merge branch 'main' into sk/rmia
sarakodeiri Oct 16, 2025
9798a0f
Merge branch 'main' into sk/rmia
emersodb Oct 17, 2025
0c35030
First draft of target model training
sarakodeiri Oct 17, 2025
ca49fdc
Merge branch 'sk/rmia' into target_model
sarakodeiri Oct 17, 2025
fa0fb01
Address David's comments + minor changes
sarakodeiri Oct 22, 2025
972f4ee
Merge branch 'main' into sk/rmia
sarakodeiri Oct 22, 2025
ee1fb7a
Merge branch 'sk/rmia' into target_model
sarakodeiri Oct 22, 2025
8fa101c
Merge branch 'main' into sk/rmia
emersodb Oct 22, 2025
507a452
Merge branch 'main' into sk/rmia
emersodb Oct 22, 2025
77fa27e
Finalize proper target model train
sarakodeiri Oct 22, 2025
396b9c6
Merge branch 'target_model' into sk/rmia
sarakodeiri Oct 22, 2025
8f7a6dc
Minor ruff fix
sarakodeiri Oct 22, 2025
6cb82f0
Second round of David's comments
sarakodeiri Oct 24, 2025
c51af3c
Minor fixes
sarakodeiri Oct 24, 2025
0f8c851
Merge branch 'main' into sk/rmia
sarakodeiri Oct 24, 2025
90b09fe
Merge branch 'main' into sk/rmia
lotif Oct 27, 2025
b00d22b
Merge branch 'main' into sk/rmia
sarakodeiri Oct 29, 2025
a9a72b4
Addressed Fatemeh's comments
sarakodeiri Nov 5, 2025
a4f542f
Added some tests
sarakodeiri Nov 5, 2025
9706404
Finalized tests + Minor fixes
sarakodeiri Nov 5, 2025
0d1cb64
mypy fixes
sarakodeiri Nov 5, 2025
4e406fe
Merge branch 'main' into sk/rmia
sarakodeiri Nov 5, 2025
0bf4fef
Fix in test
sarakodeiri Nov 6, 2025
7f6b5b2
Changed backticks to double backticks
sarakodeiri Nov 7, 2025
88a6d4b
Merge branch 'main' into sk/rmia
sarakodeiri Nov 7, 2025
6fc757a
Docstring fix
sarakodeiri Nov 7, 2025
5cec444
Merge branch 'sk/rmia' of https://github.com/VectorInstitute/midst-to…
sarakodeiri Nov 7, 2025
2 changes: 1 addition & 1 deletion .gitignore
@@ -29,7 +29,7 @@ wheels/
# Synthcity backups
**/workspace/*.bkp

# Dataset files
# Data files
examples/**/data/

# Trained metaclassifiers
25 changes: 19 additions & 6 deletions examples/ensemble_attack/config.yaml
@@ -11,14 +11,13 @@ data_paths:
attack_results_path: ${base_example_dir}/attack_results # Path where the attack results will be stored

model_paths:
shadow_models_path: ${base_example_dir}/shadow_models # Path where the shadow models are stored
metaclassifier_model_path: ${base_example_dir}/trained_models # Path where the trained metaclassifier model will be saved

# Pipeline control
pipeline:
run_data_processing: false # Set this to false if you have already saved the processed data
run_shadow_model_training: true
run_metaclassifier_training: false
run_shadow_model_training: false # Set this to false if shadow models are already trained and saved
run_metaclassifier_training: true


# Dataset specific information used for processing in this example
@@ -54,7 +53,18 @@ shadow_training:
tabddpm_training_config_path: ${base_example_dir}/data_configs/trans.json
# Model training artifacts are saved under shadow_models_data_path/workspace_name/exp_name
# Also, training configs for each shadow model are created under shadow_models_data_path.
shadow_models_output_path: ${base_data_dir}/shadow_models_data
shadow_models_output_path: ${base_data_dir}/shadow_models_and_data
target_model_output_path: ${base_data_dir}/target_model_and_data
final_shadow_models_path: [
"${shadow_training.shadow_models_output_path}/initial_model_rmia_1/shadow_workspace/pre_trained_model/rmia_shadows.pkl",
"${shadow_training.shadow_models_output_path}/initial_model_rmia_2/shadow_workspace/pre_trained_model/rmia_shadows.pkl",
"${shadow_training.shadow_models_output_path}/shadow_model_rmia_third_set/shadow_workspace/trained_model/rmia_shadows_third_set.pkl",
] # Paths to final shadow models used for metaclassifier training (relative to shadow_models_output_path)
# These paths are a result of running the shadow model training pipeline, specifically the
# train_three_sets_of_shadow_models in shadow_model_training.py
# Each .pkl file contains the training data, trained model and training results for all shadow models in a list.
final_target_model_path: ${shadow_training.target_model_output_path}/target_model/shadow_workspace/trained_target_model/target_model.pkl
# Path to final target model (relative to target_model_output_path)
fine_tuning_config:
fine_tune_diffusion_iterations: 2
fine_tune_classifier_iterations: 2
@@ -66,10 +76,13 @@ metaclassifier:
# Data types json file is used for xgboost model training.
data_types_file_path: ${base_example_dir}/data_configs/data_types.json
model_type: "xgb"
use_gpu: true
# Model training parameters
num_optuna_trials: 10 # Original code: 100
num_kfolds: 5
use_gpu: false
# Temporary; the epochs parameter may be removed later.
epochs: 1


# General settings
random_seed: 42
random_seed: 42 # Set to null for no seed, or an integer for a fixed seed
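The comments in the config above describe where the shadow-model and target-model pickles end up. Below is a minimal sketch of resolving those interpolated paths outside the pipeline, assuming OmegaConf (which the example scripts already use via DictConfig); loading the YAML directly like this is an illustration, not part of the PR.

```python
# A minimal sketch (not part of the PR) of resolving the interpolated paths above with
# OmegaConf. Assumes config.yaml defines the base_* directories referenced by the
# interpolations shown in the diff.
from pathlib import Path

from omegaconf import OmegaConf

config = OmegaConf.load("examples/ensemble_attack/config.yaml")
# resolve=True expands interpolations such as ${shadow_training.shadow_models_output_path}
resolved = OmegaConf.to_container(config, resolve=True)

shadow_paths = [Path(p) for p in resolved["shadow_training"]["final_shadow_models_path"]]
target_path = Path(resolved["shadow_training"]["final_target_model_path"])
print(shadow_paths, target_path)
```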
3 changes: 2 additions & 1 deletion examples/ensemble_attack/data_configs/data_types.json
@@ -1,5 +1,6 @@
{
"numerical": ["trans_date", "amount", "balance", "account"],
"categorical": ["trans_type", "operation", "k_symbol", "bank"],
"variable_to_predict": "trans_type"
"variable_to_predict": "trans_type",
"id_column_name": "trans_id"
}
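data_types.json now also records the id column. The following is a hedged sketch of how such a file might be consumed when preparing metaclassifier features; split_columns is a hypothetical helper, and only the JSON keys come from the file above.

```python
# Hedged sketch (not from the PR) of consuming data_types.json; split_columns is a
# hypothetical helper, only the keys (numerical, categorical, id_column_name) come
# from the file above.
import json
from pathlib import Path

import pandas as pd


def split_columns(df: pd.DataFrame, data_types_path: Path) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return (numerical_features, categorical_features) with the id column dropped."""
    data_types = json.loads(data_types_path.read_text())
    id_column = data_types["id_column_name"]
    if id_column in df.columns:
        df = df.drop(columns=[id_column])
    return df[data_types["numerical"]], df[data_types["categorical"]]
```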
19 changes: 17 additions & 2 deletions examples/ensemble_attack/run_attack.py
@@ -59,10 +59,25 @@ def main(config: DictConfig) -> None:
# TODO: Investigate the source of error.
if config.pipeline.run_shadow_model_training:
shadow_pipeline = importlib.import_module("examples.ensemble_attack.run_shadow_model_training")
shadow_pipeline.run_shadow_model_training(config)
attack_data_paths = shadow_pipeline.run_shadow_model_training(config)
attack_data_paths = [Path(path) for path in attack_data_paths]

target_data_path = shadow_pipeline.run_target_model_training(config)
Collaborator:

This tightly couples shadow model training with target model training. Based on our discussion, the target model is the one we're attacking, right? Theoretically, this model may already exist and we just want to attack it, i.e. we may not always want to, or be able to, train it. Again, I may still be misunderstanding our vocabulary here.

Collaborator Author:

From my understanding, the target model is the model we're attacking in the simulated setting, and it is a shadow model. The main difference between the target model and the other shadow models is the data it is trained on. I might be wrong, but I think the target model is trained on the entire "real data" while the other shadow models are trained/fine-tuned on different combinations and subsets of the population data. The docstring on train_three_sets_of_shadow_models in midst_toolkit.attacks.ensemble.rmia.shadow_model_training explains it in more detail.
Our vocabulary and the original implementation's vocabulary aren't the simplest, and I'm still not sure if I've done the right thing here. I want to keep the PR open a bit longer to get more eyes on it and the concepts.

Collaborator:

Sure, sounds good. Like I said, I'm not as deeply integrated in the vocabulary, so this may be perfectly reasonable. Just wanted to ask the question.

target_data_path = Path(target_data_path)

if config.pipeline.run_metaclassifier_training:
if not config.pipeline.run_shadow_model_training:
# If shadow model training is skipped, we need to provide the previous shadow model and target model paths.

shadow_data_paths = [Path(path) for path in config.shadow_training.final_shadow_models_path]

target_data_path = Path(config.shadow_training.final_target_model_path)

assert len(shadow_data_paths) == 3, "The attack_data_paths list must contain exactly three elements."
assert target_data_path is not None, "The target_data_path must be provided for metaclassifier training."

meta_pipeline = importlib.import_module("examples.ensemble_attack.run_metaclassifier_training")
meta_pipeline.run_metaclassifier_training(config)
meta_pipeline.run_metaclassifier_training(config, shadow_data_paths, target_data_path)


if __name__ == "__main__":
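The attack driver imports the shadow-model and metaclassifier scripts lazily via importlib; per the commit history, this avoids a segmentation fault caused by conflicting dependencies when everything is imported up front. Below is a minimal sketch of that pattern; the run_stage helper is hypothetical and for illustration only.

```python
# Minimal sketch of the lazy-import pattern used in run_attack.py. The module path in
# the example call is real, but this standalone helper (run_stage) is hypothetical.
import importlib
from typing import Any


def run_stage(module_name: str, function_name: str, *args: Any) -> Any:
    """Import a pipeline stage only when its flag is enabled.

    Deferring the import keeps heavy (and potentially conflicting) dependencies out of
    the process unless the stage actually runs.
    """
    module = importlib.import_module(module_name)
    return getattr(module, function_name)(*args)


# Example (config would be the loaded DictConfig):
# paths = run_stage("examples.ensemble_attack.run_shadow_model_training",
#                   "run_shadow_model_training", config)
```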
87 changes: 65 additions & 22 deletions examples/ensemble_attack/run_metaclassifier_training.py
@@ -11,19 +11,30 @@
from midst_toolkit.common.logger import log


def run_metaclassifier_training(config: DictConfig) -> None:
def run_metaclassifier_training(
config: DictConfig,
shadow_data_paths: list[Path],
target_data_path: Path,
) -> None:
"""
Function to run the metaclassifier training and evaluation.

Args:
config: Configuration object set in config.yaml.
shadow_data_paths: List of paths to the trained shadow models and all their attributes and synthetic data.
The list should contain three paths, one for each set of shadow models.
target_data_path: Path to the target model and all its attributes and synthetic data.
"""
log(INFO, "Running metaclassifier training...")

# Load the processed data splits.
df_meta_train = load_dataframe(
Path(config.data_paths.processed_attack_data_path),
"master_challenge_train.csv",
)

# y_meta_train consists of binary labels (0s and 1s) indicating whether each row in df_meta_train
# belongs to the target model's training set.
y_meta_train = np.load(
Path(config.data_paths.processed_attack_data_path) / "master_challenge_train_labels.npy",
)
@@ -35,69 +46,101 @@ def run_metaclassifier_training(config: DictConfig) -> None:
Path(config.data_paths.processed_attack_data_path) / "master_challenge_test_labels.npy",
)

# Synthetic data borrowed from the attack implementation repository.
# From (https://github.com/CRCHUM-CITADEL/ensemble-mia/tree/main/input/tabddpm_black_box/meta_classifier)
# TODO: Change this file path to the path where the synthetic data is stored.
df_synthetic = load_dataframe(
Path(config.data_paths.processed_attack_data_path),
"synth.csv",
# Three sets of shadow models are trained separately and their paths are provided here.

assert len(shadow_data_paths) == 3, (
"At this point of development, the shadow_data_paths list must contain exactly three elements."
)

shadow_data_collection = []

for model_path in shadow_data_paths:
assert model_path.exists(), (
f"No file found at {model_path}. Make sure the path is correct, or run shadow model training first."
)

with open(model_path, "rb") as f:
shadow_data_and_result = pickle.load(f)
shadow_data_collection.append(shadow_data_and_result)

assert target_data_path.exists(), (
f"No file found at {target_data_path}. Make sure the path is correct and that you have trained the target model."
)

with open(target_data_path, "rb") as f:
target_data_and_result = pickle.load(f)

target_synthetic = target_data_and_result["trained_results"][0].synthetic_data
assert target_synthetic is not None, "Target model pickle missing synthetic_data."
target_synthetic = target_synthetic.copy()

df_reference = load_dataframe(
Path(config.data_paths.population_path),
"population_all_with_challenge_no_id.csv",
)
# We should drop the id column from master metaclassifier train data.
if "trans_id" in df_meta_train.columns:
df_meta_train = df_meta_train.drop(columns=["trans_id", "account_id"])
if "trans_id" in df_meta_test.columns:
df_meta_test = df_meta_test.drop(columns=["trans_id", "account_id"])

# Extract trans_id from both train and test dataframes
assert "trans_id" in df_meta_train.columns, "Meta train data must have trans_id column"
train_trans_ids = df_meta_train["trans_id"]

assert "trans_id" in df_meta_test.columns, "Meta test data must have trans_id column"
test_trans_ids = df_meta_test["trans_id"]

df_meta_train = df_meta_train.drop(columns=["trans_id", "account_id"])
df_meta_test = df_meta_test.drop(columns=["trans_id", "account_id"])

# Fit the metaclassifier.
meta_classifier_enum = MetaClassifierType(config.metaclassifier.model_type)

# 1. Initialize the attacker
blending_attacker = BlendingPlusPlus(
config=config,
shadow_data_collection=shadow_data_collection,
target_data=target_data_and_result,
meta_classifier_type=meta_classifier_enum,
random_seed=config.random_seed,
)
log(
INFO,
f"{meta_classifier_enum} created with random seed {config.random_seed}, starting training...",
)

log(INFO, f"{meta_classifier_enum} created with random seed {config.random_seed}.")

# 2. Train the attacker on the meta-train set

blending_attacker.fit(
df_train=df_meta_train,
y_train=y_meta_train,
df_synthetic=df_synthetic,
df_target_synthetic=target_synthetic,
df_reference=df_reference,
id_column_data=train_trans_ids,
use_gpu=config.metaclassifier.use_gpu,
epochs=config.metaclassifier.epochs,
)

log(INFO, "Metaclassifier training finished.")

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# TODO: Create the directory if it does not exist.
model_filename = f"{timestamp}_{config.metaclassifier.model_type}_trained_metaclassifier.pkl"
with open(Path(config.model_paths.metaclassifier_model_path) / model_filename, "wb") as f:
pickle.dump(blending_attacker.trained_model, f)

log(INFO, "Metaclassifier model saved, starting evaluation...")

# Get the synthetic data provided by the challenge for evaluation
# TODO: Check if the file is the correct one.
df_synthetic_original = load_dataframe(
Path(config.data_paths.processed_attack_data_path),
"synth.csv",
)
Comment on lines +125 to +130

⚠️ Potential issue | 🔴 Critical

Address or remove the TODO comment about file correctness.

The TODO suggests uncertainty about whether synth.csv is the correct synthetic data for evaluation. Since training uses df_synthetic from the target model (line 72), while evaluation uses df_synthetic_original from this CSV, please verify this is the intended behavior. If different synthetic datasets are required for training vs. evaluation, the comment should clarify the rationale; otherwise, the inconsistency should be resolved.

Do you want me to help clarify the intended usage pattern for these two synthetic datasets, or would you like me to open an issue to track resolving this TODO?

🧩 Analysis chain

[Verification scripts were run against the repository: searching the Python, YAML, and Markdown files for references to synth.csv and the two synthetic dataframes; inspecting run_metaclassifier_training.py, run_attack.py, config.yaml, the README, and process_split_data.py; and checking which data files the pipeline generates.]

The TODO comment identifies a genuine issue: synth.csv is not created by the data processing pipeline and has no source in the codebase.

The file is loaded from processed_attack_data_path, but the process_split_data.py pipeline only generates real_train.csv, real_val.csv, real_test.csv, and the master_challenge_*.csv files. Since synth.csv is never created by the codebase, either:

  1. The evaluation should use df_synthetic from the target model (same as training) instead of loading a separate file, or
  2. synth.csv should be generated by the data processing pipeline but is missing, or
  3. synth.csv is expected to be externally provided without documentation.

Clarify the intended behavior and either fix the code path or add the missing data generation step.

🤖 Prompt for AI Agents
In examples/ensemble_attack/run_metaclassifier_training.py around lines 122 to
127, the code attempts to load "synth.csv", which is not produced by the data
processing pipeline; update the code or pipeline to match the intended flow:
either (A) stop loading synth.csv and instead use the synthetic dataframe
already created/returned by the target model (replace the load call with the
in-memory df_synthetic source used during training/evaluation), or (B) add
generation of synth.csv in the data processing pipeline (process_split_data.py)
so that it is created under processed_attack_data_path and document its schema,
or (C) if synth.csv is an external input, add explicit validation and clear
documentation and config to require the external file; choose one of these
options and implement the corresponding change and tests so the file path
lookup no longer fails.
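A sketch of option (A) above, assuming the target_synthetic variable already defined earlier in run_metaclassifier_training.py; this is only one possible resolution of the TODO, not the decided fix.

```python
# Option (A), sketched: reuse the synthetic data already carried in the target model
# pickle (the same source used for training) instead of loading synth.csv from disk.
# Assumes the target_synthetic variable defined earlier in this script; illustrative only.
df_synthetic_original = target_synthetic.copy()
```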


# 3. Get predictions on the test set
probabilities, pred_score = blending_attacker.predict(
df_test=df_meta_test,
df_synthetic=df_synthetic,
df_original_synthetic=df_synthetic_original,
df_reference=df_reference,
id_column_data=test_trans_ids,
y_test=y_meta_test,
)

# Save the prediction probabilities
# TODO: Create the attack results directory folder if it does not exist.
attack_results_path = Path(config.data_paths.attack_results_path)
attack_results_path.mkdir(parents=True, exist_ok=True)
np.save(
Path(config.data_paths.attack_results_path)
/ f"{timestamp}_{config.metaclassifier.model_type}_test_pred_proba.npy",
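run_metaclassifier_training.py saves the test-set prediction probabilities as a timestamped .npy file. The following is a hedged sketch of scoring such a file offline against the saved membership labels; the concrete paths, the example filename, and the use of scikit-learn are assumptions, not part of the PR.

```python
# Hedged sketch: score a saved prediction file against the saved test labels. The
# directory layout and timestamped filename are examples only, the array is assumed
# to be a 1-D vector of membership scores, and scikit-learn is not used by the PR.
from pathlib import Path

import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve

attack_results_path = Path("examples/ensemble_attack/attack_results")         # config.data_paths.attack_results_path
processed_data_path = Path("examples/ensemble_attack/attack_data/processed")  # config.data_paths.processed_attack_data_path (hypothetical)

y_test = np.load(processed_data_path / "master_challenge_test_labels.npy")
scores = np.load(attack_results_path / "20251107_120000_xgb_test_pred_proba.npy")  # example filename

auc = roc_auc_score(y_test, scores)
fpr, tpr, _ = roc_curve(y_test, scores)
tpr_at_low_fpr = tpr[fpr <= 0.1].max()  # a common MIA summary: TPR at 10% FPR
print(f"AUC={auc:.3f}, TPR@FPR<=0.1={tpr_at_low_fpr:.3f}")
```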
88 changes: 86 additions & 2 deletions examples/ensemble_attack/run_shadow_model_training.py
@@ -1,21 +1,103 @@
import pickle
import shutil
from logging import INFO
from pathlib import Path
from typing import Any

from omegaconf import DictConfig

from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import (
train_three_sets_of_shadow_models,
)
from midst_toolkit.attacks.ensemble.shadow_model_utils import (
save_additional_tabddpm_config,
train_tabddpm_and_synthesize,
)
from midst_toolkit.common.logger import log


def run_shadow_model_training(config: DictConfig) -> None:
def run_target_model_training(config: DictConfig) -> Path:
Collaborator:

Since training the target model follows steps very similar to shadow model training and is part of the attack design, we could consider creating a dedicated function (one suggestion for the name is train_target_model_and_synthesize) in the existing shadow_model_training.py module. This way, similar code stays together, and if we later improve the shadow model training pipeline, it will be easier to update the target model training as well. This makes sense since the target model is essentially a specific type of shadow model. We should also update the README to clarify this for readers.

"""
Function to run the target model training for RMIA attack.

Args:
config: Configuration object set in config.yaml.

Returns:
Path to the saved target model results.
"""
log(INFO, "Running target model training...")

# Load the required dataframe for target model training.
df_real_data = load_dataframe(
Path(config.data_paths.processed_attack_data_path),
"real_train.csv",
)

# TODO: Test when pipeline is complete to make sure real_data is correct.

target_model_output_path = Path(config.shadow_training.target_model_output_path)
target_training_json_config_paths = config.shadow_training.training_json_config_paths

# TODO: Add this to config or .json files
table_name = "trans"
id_column_name = "trans_id"
Collaborator:

Perhaps not for this PR, but these could be configuration parameters, yes? If we don't do it here, maybe just note it as a TODO?


target_folder = target_model_output_path / "target_model"

target_folder.mkdir(parents=True, exist_ok=True)
shutil.copyfile(
target_training_json_config_paths.table_domain_file_path,
target_folder / f"{table_name}_domain.json",
)
shutil.copyfile(
target_training_json_config_paths.dataset_meta_file_path,
target_folder / "dataset_meta.json",
)
configs, save_dir = save_additional_tabddpm_config(
data_dir=target_folder,
training_config_json_path=Path(target_training_json_config_paths.tabddpm_training_config_path),
final_config_json_path=target_folder / f"{table_name}.json", # Path to the new json
experiment_name="trained_target_model",
)

train_result = train_tabddpm_and_synthesize(
train_set=df_real_data,
configs=configs,
save_dir=save_dir,
synthesize=True,
)

# TODO: Check: Selected_id_lists should be of form [[]]
selected_id_lists = [df_real_data[id_column_name].tolist()]

attack_data: dict[str, Any] = {
"selected_sets": selected_id_lists,
"trained_results": [],
}

attack_data["trained_results"].append(train_result)

# Pickle dump the results
result_path = Path(save_dir, "target_model.pkl")
with open(result_path, "wb") as file:
pickle.dump(attack_data, file)

return result_path


def run_shadow_model_training(config: DictConfig) -> list[Path]:
"""
Function to run the shadow model training for RMIA attack.

Args:
config: Configuration object set in config.yaml.

Returns:
Paths to the saved shadow model results for the three sets of shadow models. For more details,
see the documentation and return value of `train_three_sets_of_shadow_models`
at src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py.
"""
log(INFO, "Running shadow model training...")
# Load the required dataframes for shadow model training.
@@ -55,5 +137,7 @@ def run_shadow_model_training(config: DictConfig) -> None:
)
log(
INFO,
f"Shadow model training finished and saved at 1) {first_set_result_path}, 2) {second_set_result_path}, 3) {third_set_result_path}",
f"Shadow model training finished and saved at \n1) {first_set_result_path} \n2) {second_set_result_path} \n3) {third_set_result_path}",
)

return [first_set_result_path, second_set_result_path, third_set_result_path]
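The target and shadow pipelines each pickle a dictionary with "selected_sets" and "trained_results" entries. Below is a hedged sketch of loading and inspecting one of those pickles; the path is illustrative, and the synthetic_data attribute is assumed from its use in run_metaclassifier_training.py.

```python
# Hedged sketch: load and inspect the pickle written by run_target_model_training.
# The dict keys ("selected_sets", "trained_results") come from this PR; the path is
# illustrative and synthetic_data is assumed from its use elsewhere in the example.
import pickle
from pathlib import Path

result_path = Path(
    "target_model_and_data/target_model/shadow_workspace/trained_target_model/target_model.pkl"
)

with open(result_path, "rb") as f:
    attack_data = pickle.load(f)

print(len(attack_data["selected_sets"][0]), "trans_ids in the target model's training set")
first_result = attack_data["trained_results"][0]
if first_result.synthetic_data is not None:
    print(first_result.synthetic_data.head())
```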