RMIA attack #64
@@ -1,5 +1,6 @@
 {
   "numerical": ["trans_date", "amount", "balance", "account"],
   "categorical": ["trans_type", "operation", "k_symbol", "bank"],
-  "variable_to_predict": "trans_type"
+  "variable_to_predict": "trans_type",
+  "id_column_name": "trans_id"
 }
@@ -59,10 +59,25 @@ def main(config: DictConfig) -> None:
     # TODO: Investigate the source of error.
     if config.pipeline.run_shadow_model_training:
         shadow_pipeline = importlib.import_module("examples.ensemble_attack.run_shadow_model_training")
-        shadow_pipeline.run_shadow_model_training(config)
+        attack_data_paths = shadow_pipeline.run_shadow_model_training(config)
+        attack_data_paths = [Path(path) for path in attack_data_paths]
+
+        target_data_path = shadow_pipeline.run_target_model_training(config)

Collaborator: This tightly couples shadow model training with target model training. Based on our discussion, the target model is the one we're attacking, right? Theoretically, this model may already exist and we just want to attack it; that is, we may not always want to, or be able to, train it. Again, I may still be misunderstanding our vocabulary here.

Collaborator (Author): From my understanding, the target model is the model we're attacking in the simulated setting, and it is a shadow model. The main difference between the target model and the other shadow models is the data it is trained on. I might be wrong, but I think the target model is trained on the entire "real data", while the other shadow models are trained/fine-tuned on different combinations and subsets of the population data. The docstring on …

Collaborator: Sure, sounds good. Like I said, I'm not as deeply immersed in the vocabulary, so this may be perfectly reasonable. Just wanted to ask the question.
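If that decoupling is wanted later, one minimal sketch (not part of this PR) would gate target-model training behind its own pipeline flag and otherwise reuse an existing target model. The `config.pipeline.run_target_model_training` flag and the `get` default below are assumptions; `final_target_model_path` is the config key introduced in this PR.

```python
# Sketch only: decouple target-model training from shadow-model training.
# `config.pipeline.run_target_model_training` is a hypothetical flag, not in this PR.
import importlib
from pathlib import Path

from omegaconf import DictConfig


def resolve_target_data_path(config: DictConfig) -> Path:
    """Train the target model only when requested; otherwise reuse a pre-trained one."""
    if config.pipeline.get("run_target_model_training", False):
        shadow_pipeline = importlib.import_module("examples.ensemble_attack.run_shadow_model_training")
        return Path(shadow_pipeline.run_target_model_training(config))
    # Attack an existing target model whose artifacts were saved earlier.
    return Path(config.shadow_training.final_target_model_path)
```

The PR as committed trains the target model whenever shadow training runs, as the diff continues below.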
+        target_data_path = Path(target_data_path)
+
     if config.pipeline.run_metaclassifier_training:
+        if not config.pipeline.run_shadow_model_training:
+            # If shadow model training is skipped, we need to provide the previous shadow model and target model paths.
+            shadow_data_paths = [Path(path) for path in config.shadow_training.final_shadow_models_path]
+            target_data_path = Path(config.shadow_training.final_target_model_path)
+
+        assert len(shadow_data_paths) == 3, "The attack_data_paths list must contain exactly three elements."
+        assert target_data_path is not None, "The target_data_path must be provided for metaclassifier training."
+
         meta_pipeline = importlib.import_module("examples.ensemble_attack.run_metaclassifier_training")
-        meta_pipeline.run_metaclassifier_training(config)
+        meta_pipeline.run_metaclassifier_training(config, shadow_data_paths, target_data_path)


 if __name__ == "__main__":
@@ -11,19 +11,30 @@
 from midst_toolkit.common.logger import log


-def run_metaclassifier_training(config: DictConfig) -> None:
+def run_metaclassifier_training(
+    config: DictConfig,
+    shadow_data_paths: list[Path],
+    target_data_path: Path,
+) -> None:
     """
     Function to run the metaclassifier training and evaluation.

     Args:
         config: Configuration object set in config.yaml.
+        shadow_data_paths: List of paths to the trained shadow models and all their attributes and synthetic data.
+            The list should contain three paths, one for each set of shadow models.
+        target_data_path: Path to the target model and all its attributes and synthetic data.
     """
     log(INFO, "Running metaclassifier training...")

     # Load the processed data splits.
     df_meta_train = load_dataframe(
         Path(config.data_paths.processed_attack_data_path),
         "master_challenge_train.csv",
     )

     # y_meta_train consists of binary labels (0s and 1s) indicating whether each row in df_meta_train
     # belongs to the target model's training set.
     y_meta_train = np.load(
         Path(config.data_paths.processed_attack_data_path) / "master_challenge_train_labels.npy",
     )
@@ -35,69 +46,101 @@ def run_metaclassifier_training(config: DictConfig) -> None:
         Path(config.data_paths.processed_attack_data_path) / "master_challenge_test_labels.npy",
     )

-    # Synthetic data borrowed from the attack implementation repository.
-    # From (https://github.com/CRCHUM-CITADEL/ensemble-mia/tree/main/input/tabddpm_black_box/meta_classifier)
-    # TODO: Change this file path to the path where the synthetic data is stored.
-    df_synthetic = load_dataframe(
-        Path(config.data_paths.processed_attack_data_path),
-        "synth.csv",
-    )
+    # Three sets of shadow models are trained separately and their paths are provided here.
+    assert len(shadow_data_paths) == 3, (
+        "At this point of development, the shadow_data_paths list must contain exactly three elements."
+    )
+
+    shadow_data_collection = []
+
+    for model_path in shadow_data_paths:
+        assert model_path.exists(), (
+            f"No file found at {model_path}. Make sure the path is correct, or run shadow model training first."
+        )
+
+        with open(model_path, "rb") as f:
+            shadow_data_and_result = pickle.load(f)
+        shadow_data_collection.append(shadow_data_and_result)
+
+    assert target_data_path.exists(), (
+        f"No file found at {target_data_path}. Make sure the path is correct and that you have trained the target model."
+    )
+
+    with open(target_data_path, "rb") as f:
+        target_data_and_result = pickle.load(f)
+
+    target_synthetic = target_data_and_result["trained_results"][0].synthetic_data
+    assert target_synthetic is not None, "Target model pickle missing synthetic_data."
+    target_synthetic = target_synthetic.copy()

     df_reference = load_dataframe(
         Path(config.data_paths.population_path),
         "population_all_with_challenge_no_id.csv",
     )
-    # We should drop the id column from master metaclassifier train data.
-    if "trans_id" in df_meta_train.columns:
-        df_meta_train = df_meta_train.drop(columns=["trans_id", "account_id"])
-    if "trans_id" in df_meta_test.columns:
-        df_meta_test = df_meta_test.drop(columns=["trans_id", "account_id"])
+    # Extract trans_id from both train and test dataframes.
+    assert "trans_id" in df_meta_train.columns, "Meta train data must have trans_id column"
+    train_trans_ids = df_meta_train["trans_id"]
+
+    assert "trans_id" in df_meta_test.columns, "Meta test data must have trans_id column"
+    test_trans_ids = df_meta_test["trans_id"]
+
+    df_meta_train = df_meta_train.drop(columns=["trans_id", "account_id"])
+    df_meta_test = df_meta_test.drop(columns=["trans_id", "account_id"])

     # Fit the metaclassifier.
     meta_classifier_enum = MetaClassifierType(config.metaclassifier.model_type)

     # 1. Initialize the attacker
     blending_attacker = BlendingPlusPlus(
         config=config,
+        shadow_data_collection=shadow_data_collection,
+        target_data=target_data_and_result,
         meta_classifier_type=meta_classifier_enum,
         random_seed=config.random_seed,
     )
-    log(
-        INFO,
-        f"{meta_classifier_enum} created with random seed {config.random_seed}, starting training...",
-    )
+    log(INFO, f"{meta_classifier_enum} created with random seed {config.random_seed}.")

     # 2. Train the attacker on the meta-train set
     blending_attacker.fit(
         df_train=df_meta_train,
         y_train=y_meta_train,
-        df_synthetic=df_synthetic,
+        df_target_synthetic=target_synthetic,
         df_reference=df_reference,
+        id_column_data=train_trans_ids,
         use_gpu=config.metaclassifier.use_gpu,
         epochs=config.metaclassifier.epochs,
     )

     log(INFO, "Metaclassifier training finished.")

     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     # TODO: Create the directory folder if it does not exist.
     model_filename = f"{timestamp}_{config.metaclassifier.model_type}_trained_metaclassifier.pkl"
     with open(Path(config.model_paths.metaclassifier_model_path) / model_filename, "wb") as f:
         pickle.dump(blending_attacker.trained_model, f)

     log(INFO, "Metaclassifier model saved, starting evaluation...")

+    # Get the synthetic data provided by the challenge for evaluation
+    # TODO: Check if the file is the correct one.
+    df_synthetic_original = load_dataframe(
+        Path(config.data_paths.processed_attack_data_path),
+        "synth.csv",
+    )
Comment on lines +125 to +130 (automated review): Address or remove the TODO comment about file correctness. The TODO expresses uncertainty about whether the loaded file is the correct one for evaluation: `df_synthetic_original` is read from `config.data_paths.processed_attack_data_path / "synth.csv"`, while the target model's own synthetic sample is taken from the target-model pickle earlier in this function. The relationship between these two synthetic datasets is a genuine issue. Clarify the intended behavior and either fix the code path or add the missing data generation step.
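If the intended behavior is for the evaluation to use the attacked target model's own synthetic sample, the missing data-generation step might look like the sketch below. The helper name `export_target_synthetic` is hypothetical, and it assumes `synthetic_data` is a pandas DataFrame, as elsewhere in this function.

```python
# Hypothetical resolution of the TODO (not part of this PR): persist the target
# model's synthetic sample to the location the evaluation step loads from, so
# "synth.csv" is guaranteed to exist and to come from the attacked model.
import pickle
from pathlib import Path

from omegaconf import DictConfig


def export_target_synthetic(config: DictConfig, target_data_path: Path) -> Path:
    with open(target_data_path, "rb") as f:
        target_data_and_result = pickle.load(f)

    # Assumes synthetic_data is a pandas DataFrame.
    synthetic = target_data_and_result["trained_results"][0].synthetic_data
    assert synthetic is not None, "Target model pickle is missing synthetic_data."

    out_path = Path(config.data_paths.processed_attack_data_path) / "synth.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    synthetic.to_csv(out_path, index=False)
    return out_path
```

As committed, the prediction step below simply loads whatever synth.csv already exists under `processed_attack_data_path`.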
     # 3. Get predictions on the test set
     probabilities, pred_score = blending_attacker.predict(
         df_test=df_meta_test,
-        df_synthetic=df_synthetic,
+        df_original_synthetic=df_synthetic_original,
         df_reference=df_reference,
+        id_column_data=test_trans_ids,
         y_test=y_meta_test,
     )

     # Save the prediction probabilities
-    # TODO: Create the attack results directory folder if it does not exist.
+    attack_results_path = Path(config.data_paths.attack_results_path)
+    attack_results_path.mkdir(parents=True, exist_ok=True)
     np.save(
         Path(config.data_paths.attack_results_path)
         / f"{timestamp}_{config.metaclassifier.model_type}_test_pred_proba.npy",
@@ -1,21 +1,103 @@
 import pickle
 import shutil
 from logging import INFO
 from pathlib import Path
 from typing import Any

 from omegaconf import DictConfig

 from midst_toolkit.attacks.ensemble.data_utils import load_dataframe
 from midst_toolkit.attacks.ensemble.rmia.shadow_model_training import (
     train_three_sets_of_shadow_models,
 )
 from midst_toolkit.attacks.ensemble.shadow_model_utils import (
     save_additional_tabddpm_config,
     train_tabddpm_and_synthesize,
 )
 from midst_toolkit.common.logger import log


-def run_shadow_model_training(config: DictConfig) -> None:
+def run_target_model_training(config: DictConfig) -> Path:

Collaborator: Since training the target model follows steps very similar to shadow model training and is part of the attack design, we could consider creating a dedicated function (one suggestion for the name is …).
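One possible shape for that refactor, sketched only to illustrate the suggestion: a shared routine that both the target-model and shadow-model paths could call to prepare the TabDDPM config files, train, synthesize, and pickle the results. The name `train_single_tabddpm_model` and its signature are hypothetical and not part of this PR; the calls to `save_additional_tabddpm_config` and `train_tabddpm_and_synthesize` mirror how the PR uses them below.

```python
# Hypothetical shared helper illustrating the reviewer's suggestion; the name
# and exact signature are placeholders, not part of this PR.
import pickle
import shutil
from pathlib import Path
from typing import Any

import pandas as pd

from midst_toolkit.attacks.ensemble.shadow_model_utils import (
    save_additional_tabddpm_config,
    train_tabddpm_and_synthesize,
)


def train_single_tabddpm_model(
    train_set: pd.DataFrame,
    output_folder: Path,
    table_domain_file_path: Path,
    dataset_meta_file_path: Path,
    tabddpm_training_config_path: Path,
    experiment_name: str,
    table_name: str = "trans",
    id_column_name: str = "trans_id",
) -> Path:
    """Prepare config files, train a TabDDPM model, synthesize data, and pickle the results."""
    output_folder.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(table_domain_file_path, output_folder / f"{table_name}_domain.json")
    shutil.copyfile(dataset_meta_file_path, output_folder / "dataset_meta.json")

    configs, save_dir = save_additional_tabddpm_config(
        data_dir=output_folder,
        training_config_json_path=tabddpm_training_config_path,
        final_config_json_path=output_folder / f"{table_name}.json",
        experiment_name=experiment_name,
    )
    train_result = train_tabddpm_and_synthesize(
        train_set=train_set, configs=configs, save_dir=save_dir, synthesize=True
    )

    attack_data: dict[str, Any] = {
        "selected_sets": [train_set[id_column_name].tolist()],
        "trained_results": [train_result],
    }
    result_path = Path(save_dir, f"{experiment_name}.pkl")
    with open(result_path, "wb") as file:
        pickle.dump(attack_data, file)
    return result_path
```

The PR as-is keeps these steps inline in `run_target_model_training`, shown below.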
+    """
+    Function to run the target model training for the RMIA attack.
+
+    Args:
+        config: Configuration object set in config.yaml.
+
+    Returns:
+        Path to the saved target model results.
+    """
+    log(INFO, "Running target model training...")
+
+    # Load the required dataframe for target model training.
+    df_real_data = load_dataframe(
+        Path(config.data_paths.processed_attack_data_path),
+        "real_train.csv",
+    )
+
+    # TODO: Test when pipeline is complete to make sure real_data is correct.
+
+    target_model_output_path = Path(config.shadow_training.target_model_output_path)
+    target_training_json_config_paths = config.shadow_training.training_json_config_paths
+
+    # TODO: Add this to config or .json files
+    table_name = "trans"
+    id_column_name = "trans_id"

Collaborator: Perhaps not for this PR, but these could be configuration parameters, yes? If not doing it here, maybe just leave it as a TODO?
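A minimal sketch of that idea, assuming hypothetical `table_name` and `id_column_name` keys under `config.shadow_training` (neither is defined in this PR), with the current hard-coded values as fallbacks:

```python
# Hypothetical config keys; fall back to the currently hard-coded values.
table_name = config.shadow_training.get("table_name", "trans")
id_column_name = config.shadow_training.get("id_column_name", "trans_id")
```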
+    target_folder = target_model_output_path / "target_model"
+
+    target_folder.mkdir(parents=True, exist_ok=True)
+    shutil.copyfile(
+        target_training_json_config_paths.table_domain_file_path,
+        target_folder / f"{table_name}_domain.json",
+    )
+    shutil.copyfile(
+        target_training_json_config_paths.dataset_meta_file_path,
+        target_folder / "dataset_meta.json",
+    )
+    configs, save_dir = save_additional_tabddpm_config(
+        data_dir=target_folder,
+        training_config_json_path=Path(target_training_json_config_paths.tabddpm_training_config_path),
+        final_config_json_path=target_folder / f"{table_name}.json",  # Path to the new json
+        experiment_name="trained_target_model",
+    )
+
+    train_result = train_tabddpm_and_synthesize(
+        train_set=df_real_data,
+        configs=configs,
+        save_dir=save_dir,
+        synthesize=True,
+    )
+
+    # TODO: Check: selected_id_lists should be of form [[]]
+    selected_id_lists = [df_real_data[id_column_name].tolist()]
+
+    attack_data: dict[str, Any] = {
+        "selected_sets": selected_id_lists,
+        "trained_results": [],
+    }
+
+    attack_data["trained_results"].append(train_result)
+
+    # Pickle dump the results.
+    result_path = Path(save_dir, "target_model.pkl")
+    with open(result_path, "wb") as file:
+        pickle.dump(attack_data, file)
+
+    return result_path
+
+
+def run_shadow_model_training(config: DictConfig) -> list[Path]:
     """
     Function to run the shadow model training for the RMIA attack.

     Args:
         config: Configuration object set in config.yaml.

     Returns:
         Paths to the saved shadow model results for the three sets of shadow models. For more details,
         see the documentation and return value of `train_three_sets_of_shadow_models`
         at src/midst_toolkit/attacks/ensemble/rmia/shadow_model_training.py.
     """
     log(INFO, "Running shadow model training...")
     # Load the required dataframes for shadow model training.

@@ -55,5 +137,7 @@ def run_shadow_model_training(config: DictConfig) -> None:
     )
     log(
         INFO,
-        f"Shadow model training finished and saved at 1) {first_set_result_path}, 2) {second_set_result_path}, 3) {third_set_result_path}",
+        f"Shadow model training finished and saved at \n1) {first_set_result_path} \n2) {second_set_result_path} \n3) {third_set_result_path}",
     )
+
+    return [first_set_result_path, second_set_result_path, third_set_result_path]