diff --git a/.gitignore b/.gitignore
index 508ed74c..eb0fdd16 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ __pycache__/
 # Ansible artifacts
 .ansible/
 .task/
+*AnsiballZ*
 
 # Build artifacts
 dreadgoad
diff --git a/README.md b/README.md
index cd845d5e..a33aaf59 100644
--- a/README.md
+++ b/README.md
@@ -90,6 +90,7 @@ See [tools/variant_generator/](tools/variant_generator/) for details.
 - [Vulnerability catalog](docs/GOAD-vulnerabilities-comprehensive.md) -- all 50+ vulnerabilities with exploitation techniques
 - [Validation guide](docs/validation.md) -- automated vulnerability validation
 - [Provider guides](docs/mkdocs/docs/providers/) -- VirtualBox, VMware, Proxmox, AWS, Azure, Ludus
+- [AWS AMI build & deploy workflow](docs/mkdocs/docs/providers/aws-ami-workflow.md) -- end-to-end warpgate + Terragrunt + Ansible
 - [Extension guides](docs/mkdocs/docs/extensions/) -- ELK, Exchange, Wazuh, hardened workstation
 - [Architecture diagram](docs/architecture.svg)
 - [Upstream GOAD docs](https://orange-cyberdefense.github.io/GOAD/) -- original project documentation
diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg
index 1b1ac9d6..803c0fe3 100644
--- a/ansible/ansible.cfg
+++ b/ansible/ansible.cfg
@@ -43,6 +43,7 @@ vars_plugins_enabled = host_group_vars,dreadnode.goad.lab_config
 # Note: fact_caching_connection is set via ANSIBLE_CACHE_PLUGIN_CONNECTION env var in Taskfile
 gathering = smart
 fact_caching = jsonfile
+fact_caching_connection = /tmp/ansible_facts_cache
 fact_caching_timeout = 86400
 
 # Diff output
diff --git a/ansible/playbooks/ad-trusts.yml b/ansible/playbooks/ad-trusts.yml
index 5cbf0e6e..baca9ad0 100644
--- a/ansible/playbooks/ad-trusts.yml
+++ b/ansible/playbooks/ad-trusts.yml
@@ -49,7 +49,6 @@
     domain_password: "{{ lab.domains[domain].domain_password }}"
     parent_domain: "{{ '.'.join(domain.split('.')[1:]) | default('') }}"
     trust: "{{ lab.domains[domain].trust | default('') }}"
-    lab: "{{ lab }}"
     domains: "{{ lab.domains.keys() }}"
     replication: forest
     dc_hostname_to_ip: "{{ hostvars[groups['dc'][0]]['dc_hostname_to_ip'] | default({}) }}"
diff --git a/cli/cmd/inventory.go b/cli/cmd/inventory.go
index dcb8fcb7..4ef9cbff 100644
--- a/cli/cmd/inventory.go
+++ b/cli/cmd/inventory.go
@@ -208,18 +208,25 @@ func runInventoryShow(cmd *cobra.Command, args []string) error {
 }
 
 func runInventoryMapping(cmd *cobra.Command, args []string) error {
+	outputPath, _ := cmd.Flags().GetString("output") // lookup error deliberately ignored; an empty path makes generateInstanceMapping pick its default location
+	return generateInstanceMapping(context.Background(), outputPath)
+}
+
+// generateInstanceMapping queries AWS for instance private IPs and writes the
+// mapping to a JSON file that Ansible's network_discovery role uses to avoid
+// slow runtime detection over SSM. If outputPath is empty, it defaults to
+// /tmp/aws_instance_mapping_<env>.json.
+func generateInstanceMapping(ctx context.Context, outputPath string) error {
 	cfg, err := config.Get()
 	if err != nil {
 		return err
 	}
-	ctx := context.Background()
 
 	parsed, err := inv.Parse(cfg.InventoryPath())
 	if err != nil {
 		return err
 	}
 
-	outputPath, _ := cmd.Flags().GetString("output")
 	if outputPath == "" {
 		outputPath = filepath.Join(os.TempDir(), fmt.Sprintf("aws_instance_mapping_%s.json", cfg.Env))
 	}
diff --git a/cli/cmd/provision.go b/cli/cmd/provision.go
index bc27d44c..860a0fed 100644
--- a/cli/cmd/provision.go
+++ b/cli/cmd/provision.go
@@ -9,6 +9,8 @@ import (
 	"strings"
 	"time"
 
+	"slices"
+
 	"github.com/dreadnode/dreadgoad/internal/ansible"
 	"github.com/dreadnode/dreadgoad/internal/config"
 	"github.com/dreadnode/dreadgoad/internal/doctor"
@@ -55,7 +57,6 @@ func init() {
 	provisionCmd.Flags().Int("max-retries", 0, "Max retry attempts (default: from config)")
 	provisionCmd.Flags().Int("retry-delay", 0, "Delay between retries in seconds (default: from config)")
 
-	// ad-users inherits provision flags
 	adUsersCmd.Flags().String("plays", "ad-data.yml", "Playbooks to run")
 	adUsersCmd.Flags().String("limit", "", "Limit execution to specific hosts")
 	adUsersCmd.Flags().Int("max-retries", 0, "Max retry attempts")
@@ -141,6 +142,12 @@ func runProvision(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
+	// Generate instance-to-IP mapping so Ansible can resolve host IPs
+	// without slow runtime network detection over SSM.
+	if err := generateInstanceMapping(ctx, ""); err != nil {
+		slog.Warn("instance mapping generation failed, playbooks will use runtime detection", "error", err)
+	}
+
 	fmt.Println("===============================================")
 	fmt.Printf("DreadGOAD provisioning started at %s\n", time.Now().Format(time.RFC3339))
 	fmt.Printf("Environment: %s\n", cfg.Env)
@@ -155,7 +162,13 @@ func runProvision(cmd *cobra.Command, args []string) error {
 	}
 	fmt.Println("-----------------------------------------------")
 
-	for _, playbook := range playbooks {
+	// Clean up stale SSM sessions before starting provisioning to prevent
+	// connection saturation from orphaned sessions of previous runs.
+	log := slog.Default()
+	log.Info("cleaning up stale SSM sessions before provisioning")
+	ansible.CleanupSSMSessions(ctx, cfg.Env, log)
+
+	for i, playbook := range playbooks {
 		opts := ansible.RetryOptions{
 			Playbook: playbook,
 			Env:      cfg.Env,
@@ -173,6 +186,18 @@ func runProvision(cmd *cobra.Command, args []string) error {
 		if err := ansible.RunPlaybookWithRetry(ctx, opts); err != nil {
 			return fmt.Errorf("provisioning failed at %s: %w", playbook, err)
 		}
+
+		// Between playbooks: clean up accumulated SSM sessions and wait
+		// after reboot-inducing playbooks for SSM agents to reconnect.
+		if i < len(playbooks)-1 {
+			ansible.CleanupSSMSessions(ctx, cfg.Env, log)
+
+			if slices.Contains(config.RebootPlaybooks, playbook) {
+				log.Info("playbook may have caused reboots, waiting for SSM reconnection",
+					"playbook", playbook, "delay", "120s")
+				time.Sleep(120 * time.Second)
+			}
+		}
 	}
 
 	fmt.Println("===============================================")
diff --git a/cli/internal/ansible/retry.go b/cli/internal/ansible/retry.go
index bd0410d9..6d1c2080 100644
--- a/cli/internal/ansible/retry.go
+++ b/cli/internal/ansible/retry.go
@@ -76,7 +76,7 @@ func RunPlaybookWithRetry(ctx context.Context, opts RetryOptions) error {
 
 		if result.TimedOut {
 			log.Error("playbook timed out (idle timeout)", "playbook", opts.Playbook)
-			cleanupSSMSessions(ctx, opts.Env, log)
+			CleanupSSMSessions(ctx, opts.Env, log)
 			continue
 		}
 
@@ -136,7 +136,7 @@ func retryWithErrorStrategy(ctx context.Context, opts RetryOptions, failResult *
 
 	case ErrSSMTransfer:
 		log.Info("SSM transfer error - fixing ssm-user accounts")
-		cleanupSSMSessions(ctx, opts.Env, log)
+		CleanupSSMSessions(ctx, opts.Env, log)
 		fixSSMUsers(ctx, opts.Env, failResult.FailedHosts, log)
 		log.Info("waiting for SSM Agent to stabilize", "delay", "30s")
 		time.Sleep(30 * time.Second)
@@ -154,7 +154,7 @@ func retryWithErrorStrategy(ctx context.Context, opts RetryOptions, failResult *
 
 	case ErrSSMReconnection:
 		log.Info("SSM reconnection needed - waiting for systems to reboot")
-		cleanupSSMSessions(ctx, opts.Env, log)
+		CleanupSSMSessions(ctx, opts.Env, log)
 		log.Info("waiting for Windows reboot and SSM reconnection", "delay", "120s")
 		time.Sleep(120 * time.Second)
 
@@ -224,7 +224,8 @@ func buildRetryLimit(userLimit, failedHosts string) string {
 	}
 }
 
-func cleanupSSMSessions(ctx context.Context, env string, log *slog.Logger) {
+// CleanupSSMSessions terminates stale SSM sessions to prevent connection saturation.
+func CleanupSSMSessions(ctx context.Context, env string, log *slog.Logger) {
 	cfg, err := config.Get()
 	if err != nil {
 		log.Warn("could not get config for SSM cleanup", "error", err)
@@ -322,5 +323,4 @@ func rebootFailedHosts(ctx context.Context, opts RetryOptions, log *slog.Logger)
 	}
 }
 
-// execCommand is a variable for testability.
 var execCommand = exec.CommandContext
diff --git a/dev-inventory b/dev-inventory
index ddac8f98..c7ed675b 100644
--- a/dev-inventory
+++ b/dev-inventory
@@ -21,6 +21,9 @@ ansible_remote_tmp=C:\Windows\Temp
 ; miscellaneous
 data_path="{{ playbook_dir }}/../../ad/GOAD-variant-1/data"
 
+; AWS instances have a single network adapter (no NAT adapter)
+two_adapters=false
+
 ; global settings inventory default value
 keyboard_layouts=["en-US", "da-DK", "fr-FR"]
 
@@ -46,17 +49,17 @@ dns_server_forwarder=1.1.1.1
 ; ------------------------------------------------
 ; sevenkingdoms.local
 ; ------------------------------------------------
-dc01 ansible_host=i-0e428dfc02f5007dd dict_key=dc01 dns_domain=dc01 ansible_user=ansible
+dc01 ansible_host=i-0e428dfc02f5007dd dict_key=dc01 dns_domain=dc01 ansible_user=ansible dc_ipv4=10.0.4.105 host_ipv4=10.0.4.105
 ; ------------------------------------------------
 ; north.sevenkingdoms.local
 ; ------------------------------------------------
-dc02 ansible_host=i-003cb089e4bffd044 dict_key=dc02 dns_domain=dc01 ansible_user=ansible
-srv02 ansible_host=i-0e5e90aa3674b019f dict_key=srv02 dns_domain=dc02 ansible_user=ansible
+dc02 ansible_host=i-003cb089e4bffd044 dict_key=dc02 dns_domain=dc01 ansible_user=ansible dc_ipv4=10.0.4.40 host_ipv4=10.0.4.40
+srv02 ansible_host=i-0e5e90aa3674b019f dict_key=srv02 dns_domain=dc02 ansible_user=ansible dc_ipv4=10.0.4.10 host_ipv4=10.0.4.10
 ; ------------------------------------------------
 ; essos.local
 ; ------------------------------------------------
-dc03 ansible_host=i-09be8150780ec08e6 dict_key=dc03 dns_domain=dc03 ansible_user=ansible
-srv03 ansible_host=i-00efbd3b68f5483a8 dict_key=srv03 dns_domain=dc03 ansible_user=ansible
+dc03 ansible_host=i-09be8150780ec08e6 dict_key=dc03 dns_domain=dc03 ansible_user=ansible dc_ipv4=10.0.4.83 host_ipv4=10.0.4.83
+srv03 ansible_host=i-00efbd3b68f5483a8 dict_key=srv03 dns_domain=dc03 ansible_user=ansible dc_ipv4=10.0.4.53 host_ipv4=10.0.4.53
 
 ; LAB SCENARIO CONFIGURATION -----------------------------
 
diff --git a/docs/GOAD-vulnerabilities-comprehensive.md b/docs/GOAD-vulnerabilities-comprehensive.md
index d7a07d2a..e9c02bd9 100644
--- a/docs/GOAD-vulnerabilities-comprehensive.md
+++ b/docs/GOAD-vulnerabilities-comprehensive.md
@@ -1,6 +1,6 @@
-# Comprehensive GOAD (Game of Active Directory) Vulnerabilities Writeup
+# GOAD Vulnerability Catalog
 
-**GOAD** is a vulnerable Active Directory penetration testing lab environment created by Mayfly (Orange Cyberdefense) designed to help security professionals practice realistic Active Directory attack techniques in a safe, controlled environment.
+**GOAD** is a vulnerable Active Directory penetration testing lab by Mayfly (Orange Cyberdefense). This document catalogs all known vulnerabilities and attack paths in the lab.
 
 **Lab Architecture:**
 
@@ -1344,7 +1344,7 @@ Tywin
 
 **Vulnerability:** Database trust relationships span forest boundaries
 
-- **Attack:** Leverage linked servers to execute commands across forests
+- **Attack:** Use linked servers to execute commands across forests
 - **Impact:** Cross-forest pivoting and command execution
 
 ---
@@ -1419,9 +1419,9 @@ Tywin
 
 ### Token Impersonation
 
-**Vulnerability:** Available tokens on compromised systems can be leveraged
+**Vulnerability:** Available tokens on compromised systems can be stolen
 
-- **Method:** Leverage user tokens to execute commands as other users without credentials
+- **Method:** Use stolen tokens to execute commands as other users without credentials
 - **Token Types:**
   - **Delegation tokens:** Created for interactive logins (RDP, console)
   - **Impersonation tokens:** Created for non-interactive sessions
@@ -1583,7 +1583,7 @@ Tywin
 
 ### ADCS Attacks
 
-- **Certipy** - Comprehensive ADCS exploitation
+- **Certipy** - ADCS enumeration and exploitation
 - **Certify** - Certificate template enumeration
 - **Coercer** - Authentication coercion
 - **Pywhisker / Whisker** - Shadow credentials
@@ -1677,7 +1677,7 @@ Based on the vulnerabilities in GOAD, here are key defensive measures:
 - **Official Documentation:** https://orange-cyberdefense.github.io/GOAD/
 - **Creator's Blog (Mayfly):** https://mayfly277.github.io/
 
-### Comprehensive Walkthrough Series (Mayfly)
+### Walkthrough Series (Mayfly)
 
 1. Part 1 - Reconnaissance and scan: https://mayfly277.github.io/posts/GOADv2-pwning_part1/
 2. Part 2 - Find users: https://mayfly277.github.io/posts/GOADv2-pwning-part2/
@@ -1708,25 +1708,9 @@ Based on the vulnerabilities in GOAD, here are key defensive measures:
 
 ---
 
-## Conclusion
+## Coverage
 
-GOAD (Game of Active Directory) is an exceptionally comprehensive vulnerable Active Directory lab that covers virtually all major Active Directory attack vectors, from initial reconnaissance through complete domain and forest compromise. It includes:
-
-- **50+ distinct vulnerabilities and attack techniques**
-- **15+ CVEs and exploitation methods**
-- **All major ADCS attacks (ESC1-15)**
-- **Complete Kerberos attack surface**
-- **ACL abuse chains**
-- **Delegation exploitation**
-- **Cross-domain and cross-forest attacks**
-- **Privilege escalation techniques**
-- **Lateral movement methods**
-
-The lab is actively maintained and updated with new attack techniques as they are discovered. It provides an excellent training environment for security professionals to practice Active Directory penetration testing in a safe, legal, and comprehensive manner.
-
-This document represents the most thorough compilation of GOAD vulnerabilities available, synthesized from official writeups (Parts 1-14 by Mayfly277), community contributions, and detailed exploitation guides.
-
-**Coverage Summary:**
+Compiled from Mayfly277's official writeups (Parts 1-14) and community contributions.
 
 - Part 1: Reconnaissance and scanning
 - Part 2: User discovery (ASREPRoast, password spraying)
@@ -1742,5 +1726,3 @@ This document represents the most thorough compilation of GOAD vulnerabilities a
 - Part 12: Trust exploitation (child-to-parent, forest trusts, golden ticket + ExtraSid)
 - Part 13: Post-exploitation (token impersonation, RDP hijacking, file coercion)
 - Part 14: Advanced ADCS (ESC5/7/9/10/11/13/14/15)
-
-**Last Updated:** March 2026
diff --git a/docs/mkdocs/docs/providers/aws-ami-workflow.md b/docs/mkdocs/docs/providers/aws-ami-workflow.md
new file mode 100644
index 00000000..31e2e264
--- /dev/null
+++ b/docs/mkdocs/docs/providers/aws-ami-workflow.md
@@ -0,0 +1,362 @@
+# AWS AMI Build & Deploy Workflow
+
+This guide covers the end-to-end workflow for deploying DreadGOAD on AWS: building pre-baked AMIs with warpgate, configuring Terragrunt, deploying infrastructure, and provisioning the lab with Ansible.
+
+## Overview
+
+```text
+warpgate build (golden AMIs)
+        |
+        v
+terragrunt apply (AWS infrastructure)
+        |
+        v
+ansible provisioning (AD configuration)
+```
+
+Building pre-baked AMIs saves approximately **170 minutes** per deployment by pre-installing Windows Updates, AD DS roles, MSSQL, and other dependencies that would otherwise install at runtime.
+
+## Prerequisites
+
+- [warpgate](https://github.com/dreadnode/warpgate) CLI installed
+- [Terraform](https://www.terraform.io/downloads.html) >= 1.7
+- [Terragrunt](https://terragrunt.gruntwork.io/) installed
+- [AWS CLI](https://aws.amazon.com/cli/) configured with appropriate credentials
+- [Ansible](https://docs.ansible.com/) >= 2.15
+- Go 1.21+ (for the `dreadgoad` CLI)
+
+## Environment and Region
+
+The `--env` and `--region` flags thread through the entire stack -- they determine which Terragrunt directory tree is used and which Ansible inventory the CLI targets. Understanding this mapping is important before you start.
+
+### How env and region map to infrastructure
+
+The `dreadgoad` CLI uses `--env` and `--region` to locate your Terragrunt configuration and Ansible inventory:
+
+```text
+infra/goad-deployment/{env}/{region}/
+                       │       │
+                       │       └── region.hcl + network/ + goad/{dc01,dc02,...}
+                       └── env.hcl (account ID, VPC CIDR, deployment name)
+```
+
+For example, `--env staging --region us-west-1` maps to `infra/goad-deployment/staging/us-west-1/`. The Ansible inventory is resolved as `{env}-inventory` (e.g., `staging-inventory`).
+
+!!! warning "Keep these consistent"
+    The `--env` and `--region` you pass to `dreadgoad` CLI commands must match the Terragrunt directory structure you deployed into. If you ran `terragrunt apply` under `staging/us-west-1/`, then use `--env staging --region us-west-1` for provisioning and health checks.
+
+### Setting env and region
+
+You have three options (highest priority wins):
+
+| Method | Example | Notes |
+|--------|---------|-------|
+| CLI flags | `dreadgoad provision --env staging --region us-west-1` | Highest priority, overrides everything |
+| Environment variables | `export DREADGOAD_ENV=staging` | Useful for CI or shell sessions |
+| Config file | `dreadgoad config set env staging` | Persistent defaults at `~/.config/dreadgoad/dreadgoad.yaml` |
+
+The config file is **optional** -- the CLI works with just flags or environment variables. If nothing is set, `env` defaults to `staging` and `region` is resolved from your Ansible inventory.
+
+To initialize a config file with defaults:
+
+```bash
+dreadgoad config init    # Creates ~/.config/dreadgoad/dreadgoad.yaml
+dreadgoad config show    # View the effective configuration
+```
+
+For full details on all config options, see [CLI configuration](../../../cli.md).
+
+### Choosing an environment
+
+The repo ships with a `staging` directory tree. To use a different environment (e.g., `dev`), duplicate the directory structure:
+
+```bash
+cp -r infra/goad-deployment/staging infra/goad-deployment/dev
+```
+
+Then edit `dev/env.hcl` to set `env = "dev"` and adjust the account ID, VPC CIDR, or other settings as needed. Each environment gets its own Terraform state, so you can run multiple labs in parallel.
+
+Throughout this guide, examples use `staging` and `us-west-1` to match the defaults. Replace with your chosen env and region as needed.
+
+## Step 1: Build Golden AMIs with Warpgate
+
+DreadGOAD provides three warpgate templates under `warpgate-templates/`:
+
+| Template | Target Hosts | OS | Saves |
+|----------|-------------|-----|-------|
+| `goad-dc-base` | DC01, DC02 | Windows Server 2019 | ~25 min/host |
+| `goad-dc-base-2016` | DC03 | Windows Server 2016 | ~25 min/host |
+| `goad-mssql-base` | SRV02 | Windows Server 2019 | ~48 min/host |
+
+### Build all AMIs
+
+```bash
+# Domain Controllers (Windows 2019)
+warpgate build goad-dc-base --target ami
+
+# Domain Controller (Windows 2016, for DC03/meereen)
+warpgate build goad-dc-base-2016 --target ami
+
+# Member Server with MSSQL (Windows 2019, for SRV02/castelblack)
+warpgate build goad-mssql-base --target ami
+```
+
+To build for a specific region:
+
+```bash
+warpgate build goad-dc-base --target ami --vars aws_region=us-west-1
+```
+
+### Record the AMI IDs
+
+Each build outputs an AMI ID (e.g., `ami-0abc1234def567890`). Record these -- you'll need them in the next step:
+
+| Template | AMI ID | Used By |
+|----------|--------|---------|
+| `goad-dc-base` | `ami-xxxxxxxxx` | DC01 (kingslanding), DC02 (winterfell) |
+| `goad-dc-base-2016` | `ami-xxxxxxxxx` | DC03 (meereen) |
+| `goad-mssql-base` | `ami-xxxxxxxxx` | SRV02 (castelblack) |
+
+!!! note "SRV03 (braavos)"
+    SRV03 runs Windows Server 2016 as a member server. If you don't have a dedicated `goad-member-base-2016` AMI, you can use `goad-dc-base-2016` (the extra AD DS role won't interfere) or a vanilla Windows Server 2016 AMI.
+
+### What's pre-baked vs. runtime
+
+Pre-baked AMIs install roles and software but do **not** perform domain-specific configuration. This split keeps AMIs reusable across deployments:
+
+| Pre-baked (in AMI) | Runtime (Ansible) |
+|---|---|
+| Windows Updates | AD domain promotion |
+| AD DS role (unpromoted) | User/group creation |
+| DNS, RSAT tools | Trust relationships |
+| MSSQL Express (mssql-base) | GPO configuration |
+| IIS/WebDAV (mssql-base) | LAPS, ADCS |
+| PowerShell DSC modules | Vulnerability injection |
+| SSM agent configuration | Domain joins |
+
+## Step 2: Configure Terragrunt
+
+### Set your AWS account
+
+Edit `infra/goad-deployment/staging/env.hcl`:
+
+```hcl
+locals {
+  deployment_name = "goad"
+  aws_account_id  = "123456789012"  # Your AWS account ID
+  env             = "staging"
+  vpc_cidr        = "10.1.0.0/16"
+}
+```
+
+### Set your region
+
+Edit `infra/goad-deployment/staging/us-west-1/region.hcl` (or create a new region directory):
+
+```hcl
+locals {
+  aws_region = "us-west-1"
+}
+```
+
+### Insert AMI IDs into host configurations
+
+Each host has a `terragrunt.hcl` under `infra/goad-deployment/staging/us-west-1/goad/`. Update the `additional_windows_ami_filters` block in each:
+
+**DC01 and DC02** (`dc01/terragrunt.hcl`, `dc02/terragrunt.hcl`) -- use `goad-dc-base` AMI:
+
+```hcl
+additional_windows_ami_filters = [
+  {
+    name   = "image-id"
+    values = ["ami-xxxxxxxxx"]  # goad-dc-base AMI ID
+  }
+]
+
+windows_os         = "Windows_Server"
+windows_os_version = "2019-English-Full-Base"
+windows_ami_owners = ["self"]
+```
+
+**DC03** (`dc03/terragrunt.hcl`) -- use `goad-dc-base-2016` AMI:
+
+```hcl
+additional_windows_ami_filters = [
+  {
+    name   = "image-id"
+    values = ["ami-xxxxxxxxx"]  # goad-dc-base-2016 AMI ID
+  }
+]
+
+windows_os         = "Windows_Server"
+windows_os_version = "2016-English-Full-Base"
+windows_ami_owners = ["self"]
+```
+
+**SRV02** (`srv02/terragrunt.hcl`) -- use `goad-mssql-base` AMI:
+
+```hcl
+additional_windows_ami_filters = [
+  {
+    name   = "image-id"
+    values = ["ami-xxxxxxxxx"]  # goad-mssql-base AMI ID
+  }
+]
+
+windows_os         = "Windows_Server"
+windows_os_version = "2019-English-Full-Base"
+windows_ami_owners = ["self"]
+```
+
+### Set admin passwords
+
+Set per-host passwords via environment variables:
+
+```bash
+export TF_VAR_goad_dc01_password="YourSecurePassword1"
+export TF_VAR_goad_dc02_password="YourSecurePassword2"
+export TF_VAR_goad_dc03_password="YourSecurePassword3"
+export TF_VAR_goad_srv02_password="YourSecurePassword4"
+export TF_VAR_goad_srv03_password="YourSecurePassword5"
+```
+
+## Step 3: Deploy Infrastructure with Terragrunt
+
+### Initialize and apply
+
+```bash
+cd infra/goad-deployment/staging/us-west-1
+
+# Deploy networking first
+cd network
+terragrunt init
+terragrunt apply
+cd ..
+
+# Deploy all GOAD hosts
+cd goad
+terragrunt run-all init
+terragrunt run-all apply
+```
+
+!!! tip
+    `terragrunt run-all` deploys DC01-DC03, SRV02, and SRV03 in parallel. The dependency on the network module is resolved automatically.
+
+### Verify instances
+
+All instances use SSM for management -- no SSH keys or open ports required:
+
+```bash
+# Check instance status via AWS CLI
+aws ec2 describe-instances \
+  --filters "Name=tag:Project,Values=DreadGOAD" \
+  --query "Reservations[].Instances[].[Tags[?Key=='Name'].Value|[0],State.Name,InstanceId]" \
+  --output table
+
+# Connect to an instance via SSM
+aws ssm start-session --target <instance-id>
+```
+
+Or use the DreadGOAD CLI:
+
+```bash
+dreadgoad health-check --env staging --region us-west-1
+```
+
+## Step 4: Provision with Ansible
+
+Once all instances are running, provision the Active Directory environment:
+
+```bash
+# Full provisioning (flags shown explicitly; omit them to fall back to env vars or config defaults)
+dreadgoad provision --env staging --region us-west-1
+
+# Resume from a specific playbook (useful after a failure)
+dreadgoad provision --env staging --region us-west-1 --from ad-data.yml
+
+# Run only specific playbooks
+dreadgoad provision --env staging --plays build.yml,ad-servers.yml
+
+# Limit to specific hosts
+dreadgoad provision --env staging --plays ad-data.yml --limit dc01
+```
+
+!!! tip
+    If you set defaults via config file (`dreadgoad config set env staging`), you can omit the flags: `dreadgoad provision`
+
+Or run Ansible directly for more control:
+
+```bash
+cd ansible
+ansible-playbook -i ../ad/GOAD/data/inventory -i ../ad/GOAD/providers/aws/inventory main.yml
+```
+
+For step-by-step provisioning (useful for debugging):
+
+```bash
+ANSIBLE_CMD="ansible-playbook -i ../ad/GOAD/data/inventory -i ../ad/GOAD/providers/aws/inventory"
+$ANSIBLE_CMD build.yml            # Prerequisites and VM prep
+$ANSIBLE_CMD ad-servers.yml       # Create domains, enroll servers
+$ANSIBLE_CMD ad-parent_domain.yml # Parent domain setup
+$ANSIBLE_CMD ad-child_domain.yml  # Child domain setup
+sleep 5m                          # Allow replication
+$ANSIBLE_CMD ad-members.yml       # Domain member enrollment
+$ANSIBLE_CMD ad-trusts.yml        # Trust relationships
+$ANSIBLE_CMD ad-data.yml          # Users, groups, OUs
+$ANSIBLE_CMD ad-gmsa.yml          # Group Managed Service Accounts
+$ANSIBLE_CMD laps.yml             # LAPS configuration
+$ANSIBLE_CMD ad-relations.yml     # ACE/ACL relationships
+$ANSIBLE_CMD adcs.yml             # AD Certificate Services
+$ANSIBLE_CMD ad-acl.yml           # ACL attack paths
+$ANSIBLE_CMD servers.yml          # IIS and MSSQL config
+$ANSIBLE_CMD security.yml         # Defender and security settings
+$ANSIBLE_CMD vulnerabilities.yml  # Intentional vulnerabilities
+$ANSIBLE_CMD reboot.yml           # Final reboot
+```
+
+## Step 5: Validate
+
+```bash
+# Quick validation of key vulnerabilities
+dreadgoad validate --quick --env staging --region us-west-1
+
+# Full validation
+dreadgoad validate --env staging --region us-west-1
+```
+
+## Host Mapping Reference
+
+| Host | Computer Name | GOAD ID | Domain | OS | AMI Template |
+|------|--------------|---------|--------|----|-------------|
+| kingslanding | DC01 | dc01 | sevenkingdoms.local | 2019 | goad-dc-base |
+| winterfell | DC02 | dc02 | north.sevenkingdoms.local | 2019 | goad-dc-base |
+| meereen | DC03 | dc03 | essos.local | 2016 | goad-dc-base-2016 |
+| castelblack | SRV02 | srv02 | north.sevenkingdoms.local | 2019 | goad-mssql-base |
+| braavos | SRV03 | srv03 | essos.local | 2016 | (see the SRV03 note in Step 1) |
+
+## Rebuilding AMIs
+
+When you need to update the golden AMIs (e.g., for new Windows patches):
+
+1. Rebuild with warpgate: `warpgate build goad-dc-base --target ami`
+2. Update the AMI IDs in the relevant `terragrunt.hcl` files
+3. Redeploy affected instances: `terragrunt apply` in each host directory
+4. Re-run Ansible provisioning for the replaced instances
+
+## Troubleshooting
+
+**AMI not found**: Ensure `windows_ami_owners = ["self"]` is set and you built the AMI in the same region and AWS account.
+
+**SSM connection fails**: Check that VPC endpoints for `ssm`, `ssmmessages`, and `ec2messages` are configured (the network module handles this automatically).
+
+**Ansible timeouts**: Windows instances can take 5-10 minutes to fully boot and initialize SSM. If provisioning fails on first attempt, wait and retry.
+
+**Terragrunt dependency errors**: Always deploy the `network` module before host modules. Use `terragrunt run-all` from the `goad/` directory to handle ordering automatically.
+
+**Provisioning fails mid-run**: This is normal — stop with `Ctrl+C`, fix the issue (inventory, playbook, etc.), and resume with `--from`:
+
+```bash
+dreadgoad provision --env staging --region us-west-1 --from ad-trusts.yml
+```
+
+The CLI re-reads all configuration on each run, so your fixes are picked up immediately. See the [Stopping, Fixing, and Resuming](../provisioning.md#stopping-fixing-and-resuming-provisioning) section for the full workflow.
diff --git a/docs/mkdocs/docs/provisioning.md b/docs/mkdocs/docs/provisioning.md
index b78a7caf..9f6e5bc0 100644
--- a/docs/mkdocs/docs/provisioning.md
+++ b/docs/mkdocs/docs/provisioning.md
@@ -114,6 +114,78 @@ $ANSIBLE_CMD reboot.yml           # Reboot all VMs
 !!! tip
     If a playbook fails, you can usually just re-run it. Most transient failures are caused by Windows latency during installation. Wait a few minutes and retry.
 
+## Stopping, Fixing, and Resuming Provisioning
+
+Provisioning rarely succeeds on the first try without intervention. You'll often need to stop the process, fix an issue (inventory typo, playbook bug, missing variable), and resume from where you left off. This is a normal workflow — fixes to inventory or playbooks take effect on the next run and do not require rebuilding the CLI.
+
+### The workflow
+
+1. **Stop provisioning** — press `Ctrl+C`. The CLI handles the signal gracefully and terminates the running Ansible process.
+
+2. **Make your fix** — edit inventory files, playbooks, `config.json`, or whatever caused the failure. Changes are picked up on the next run since the CLI re-reads configuration each time.
+
+3. **Resume with `--from`** — restart provisioning from the playbook that failed (or the one after the last that succeeded):
+
+    ```bash
+    dreadgoad provision --from ad-trusts.yml
+    ```
+
+    This runs `ad-trusts.yml` and everything after it, skipping playbooks that already completed.
+
+### Choosing where to resume
+
+Use `--from` with the name of any playbook in the sequence. The CLI runs that playbook and all subsequent ones:
+
+```bash
+# Failed during ad-members.yml — fix and resume from there
+dreadgoad provision --from ad-members.yml
+
+# Or re-run just a single playbook to test a fix
+dreadgoad provision --plays ad-trusts.yml
+
+# Re-run a single playbook against a specific host
+dreadgoad provision --plays ad-data.yml --limit dc01
+```
+
+`--from` and `--plays` are mutually exclusive. Use `--from` to resume a sequence, `--plays` to cherry-pick specific playbooks.
+
+### What the CLI handles automatically
+
+You don't need to manually retry most transient failures. The CLI has built-in retry logic (default: 3 attempts) with error-specific strategies:
+
+| Error Type | Automatic Fix |
+|-----------|--------------|
+| Fact gathering timeout | Reduces forks to 1, extends timeout |
+| SSM transfer errors | Cleans up stale sessions, recreates ssm-user accounts |
+| SSM reconnection | Waits for Windows reboot (2 min), then reconnects |
+| PowerShell errors | Forces PowerShell interactive mode |
+| MSI installer errors | Reboots failed hosts before retry |
+| Network adapter issues | Applies adapter workaround flags |
+
+Configure retry behavior with:
+
+```bash
+dreadgoad provision --max-retries 5 --retry-delay 60
+```
+
+### When to stop and fix manually
+
+Stop and fix manually when:
+
+- **The same error repeats across retries** — the automatic strategy isn't resolving it. Check the playbook logic or inventory.
+- **You spot a configuration mistake** — wrong IP, missing host, typo in a variable. Fix it and resume with `--from`.
+- **A playbook needs code changes** — e.g., a role has a bug. Fix the role, then resume from the affected playbook.
+
+### Logs
+
+Each provisioning run writes a timestamped log to the logs directory:
+
+```text
+logs/<env>-dreadgoad-<timestamp>.log
+```
+
+Check the log to identify which playbook failed and why before deciding where to resume from.
+
 ## Vagrant VM Management
 
 Common Vagrant commands for managing lab VMs:
diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
index 7ddb4cb8..be5d8d96 100644
--- a/docs/mkdocs/mkdocs.yml
+++ b/docs/mkdocs/mkdocs.yml
@@ -15,6 +15,7 @@ nav:
     - Vmware on Windows: providers/vmware_windows.md
     - Vmware Esxi: providers/vmware_esxi.md
     - Aws: providers/aws.md
+    - AWS AMI Workflow: providers/aws-ami-workflow.md
     - Azure: providers/azure.md
     - Proxmox: providers/proxmox.md
     - Ludus: providers/ludus.md
diff --git a/docs/validation.md b/docs/validation.md
index ce497758..7c9ede68 100644
--- a/docs/validation.md
+++ b/docs/validation.md
@@ -369,32 +369,9 @@ Get-ADObject $dn -Properties ms-DS-MachineAccountQuota |
   Select-Object -ExpandProperty ms-DS-MachineAccountQuota
 ```
 
-## Next Steps
-
-After validation:
-
-1. **Fix Failed Checks**: Use Ansible to reconfigure any failed vulnerabilities
-2. **Document Findings**: Update deployment notes with validation results
-3. **Test Exploitation**: Verify vulnerabilities are exploitable with actual attack tools
-4. **Regular Validation**: Run validation after any infrastructure changes
-
 ## Related Documentation
 
 - [`GOAD-vulnerabilities-comprehensive.md`](./GOAD-vulnerabilities-comprehensive.md) - Complete vulnerability catalog
 - [`cli.md`](./cli.md) - CLI usage and configuration reference
 - [GOAD Official Docs](https://github.com/Orange-Cyberdefense/GOAD) - Upstream documentation
 - [Mayfly's Walkthrough Series](https://mayfly277.github.io/categories/goad/) - Attack technique guides
-
-## Support
-
-For issues with validation:
-
-1. Check the validation script logs
-2. Verify AWS credentials and permissions
-3. Ensure all instances are running
-4. Review Ansible provisioning logs
-5. Check the comprehensive vulnerability documentation
-
----
-
-**Last Updated**: April 2026
diff --git a/playbooks.yml b/playbooks.yml
index 7420607c..b1422a45 100644
--- a/playbooks.yml
+++ b/playbooks.yml
@@ -57,6 +57,7 @@ GOAD-Mini:
   - vulnerabilities.yml
 
 default:
+  - network_setup.yml
   - build.yml
   - ad-servers.yml
   - ad-parent_domain.yml