diff --git a/gcp/.gitignore b/gcp/.gitignore new file mode 100644 index 00000000..96e296ac --- /dev/null +++ b/gcp/.gitignore @@ -0,0 +1,41 @@ +# Terraform +**/.terraform/ +**/.terraform.lock.hcl +*.tfstate +*.tfstate.backup +*.tfstate.*.backup +*.tfplan +*.tfvars.backup +**/.terraform.tfstate.lock.info + +# Sensitive files (never commit!) +secrets.yaml +secrets.json +*secret*.txt +*password*.txt +credentials.json + +# Local development +.env +.envrc +.direnv/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log +logs/ + +# Temporary files +tmp/ +temp/ +*.tmp diff --git a/gcp/README.md b/gcp/README.md new file mode 100644 index 00000000..ca528ea1 --- /dev/null +++ b/gcp/README.md @@ -0,0 +1,342 @@ +# Pierre MCP Server - GCP Deployment Architecture + +## Executive Summary + +This document provides a comprehensive Google Cloud Platform (GCP) deployment architecture for the Pierre MCP Server, designed by SRE best practices for production workloads. + +## Architecture Decision: Cloud Run + Cloud SQL + +### Why Cloud Run? + +**Recommended: Cloud Run** ✅ + +**Rationale:** +1. **Serverless Benefits**: Auto-scaling from 0 to N instances based on traffic +2. **Cost Efficiency**: Pay only for actual usage (billed per 100ms of CPU time) +3. **Container Native**: Direct Docker deployment, no Kubernetes complexity +4. **Built-in HTTPS**: Automatic SSL certificates and load balancing +5. **Global Distribution**: Deploy to multiple regions easily +6. **Fast Cold Starts**: Rust binary cold starts in <500ms +7. **Perfect Fit**: HTTP API workload with variable traffic patterns + +**Cloud Run vs Alternatives:** +- **GKE (Google Kubernetes Engine)**: Overkill for single container, higher operational overhead +- **Compute Engine VMs**: Requires manual scaling, patching, and load balancing +- **App Engine**: Less flexible, Cloud Run is the modern replacement + +### Why Cloud SQL for PostgreSQL? 
+ +**Recommended: Cloud SQL (PostgreSQL 16)** ✅ + +**Rationale:** +1. **Managed Service**: Automatic backups, patching, high availability +2. **Production Ready**: Point-in-time recovery, automated failover +3. **Performance**: SSD storage, read replicas, connection pooling +4. **Security**: Encrypted at rest and in transit, IAM authentication +5. **Monitoring**: Built-in metrics and logging integration +6. **Cost Effective**: Starts at ~$10/month for db-f1-micro + +**Cloud SQL vs Alternatives:** +- **Cloud Spanner**: Overkill and expensive ($65/node/month) for this workload +- **Self-managed PostgreSQL on GCE**: High operational burden +- **AlloyDB**: More expensive, better for >10TB databases +- **SQLite on persistent disk**: Not recommended for production multi-tenant + +## Infrastructure Components + +### 1. Compute: Cloud Run Service +- **Service Name**: `pierre-mcp-server` +- **Container**: Custom Rust binary (~40MB) +- **CPU**: 1 vCPU (can burst to 2) +- **Memory**: 512Mi (can scale to 1Gi) +- **Concurrency**: 80 requests per instance +- **Min Instances**: 1 (avoid cold starts for critical path) +- **Max Instances**: 100 (adjust based on load testing) +- **Request Timeout**: 300s (5 minutes for long-running MCP operations) +- **Region**: `us-central1` (default, change as needed) + +### 2. Database: Cloud SQL for PostgreSQL +- **Instance Name**: `pierre-postgres` +- **Version**: PostgreSQL 16 +- **Tier**: `db-f1-micro` (dev/staging) or `db-custom-2-8192` (production) +- **Storage**: 20GB SSD (auto-increase enabled) +- **Backups**: Daily automated backups, 7-day retention +- **High Availability**: Regional HA configuration for production +- **Private IP**: VPC-native for security +- **Connection**: Via Cloud SQL Proxy or Private Service Connect + +### 3. 
Networking +- **VPC**: Custom VPC with private subnets +- **Serverless VPC Connector**: Bridge Cloud Run to Cloud SQL private IP +- **Cloud NAT**: Outbound connectivity for external API calls (Strava, Garmin, etc.) +- **Cloud Armor**: WAF and DDoS protection (optional, for production) +- **Cloud CDN**: Not needed currently (no static assets) + +### 4. Security & Secrets +- **Secret Manager**: Store sensitive credentials + - `PIERRE_MASTER_ENCRYPTION_KEY` + - `STRAVA_CLIENT_SECRET` + - `GARMIN_CLIENT_SECRET` + - `FITBIT_CLIENT_SECRET` + - `OPENWEATHER_API_KEY` + - Database connection strings +- **IAM Service Account**: Least-privilege access for Cloud Run +- **Workload Identity**: Secure authentication to GCP services + +### 5. Monitoring & Observability +- **Cloud Logging**: Structured JSON logs from application +- **Cloud Monitoring**: Custom metrics, dashboards, alerts +- **Cloud Trace**: Distributed tracing with OpenTelemetry +- **Uptime Checks**: Monitor `/health` endpoint +- **Alerting**: PagerDuty/Slack integration for critical issues + +### 6. CI/CD +- **Cloud Build**: Automated Docker builds and deployments +- **Artifact Registry**: Private container registry +- **GitHub Actions**: Trigger Cloud Build on push to main +- **Terraform Cloud**: Infrastructure state management (optional) + +### 7. 
External API Access + +The application requires outbound HTTPS access to: +- **Strava API**: `https://www.strava.com/api/v3/` +- **Garmin Connect**: `https://connectapi.garmin.com/` +- **Fitbit API**: `https://api.fitbit.com/` +- **OpenWeatherMap**: `https://api.openweathermap.org/` +- **USDA FoodData**: `https://api.nal.usda.gov/fdc/v1/` + +**Network Configuration:** +- Cloud Run → Cloud NAT → Internet (outbound) +- Whitelisting: Not required (APIs use OAuth 2.0 tokens) + +## Cost Estimation (Monthly) + +### Development/Staging +- Cloud Run: $5-20 (low traffic) +- Cloud SQL (db-f1-micro): $10 +- VPC Connector: $9 +- Cloud NAT: $45 +- Storage/Logs: $5 +- **Total: ~$75-90/month** + +### Production (Medium Scale) +- Cloud Run: $100-300 (moderate traffic, min instances) +- Cloud SQL (db-custom-2-8192): $150 +- Cloud SQL HA: +$150 +- VPC Connector: $9 (per connector) +- Cloud NAT: $45 +- Storage/Logs: $20 +- Cloud Monitoring: $10 +- **Total: ~$485-685/month** + +### Production (High Scale) +- Cloud Run: $500-1000 (high traffic) +- Cloud SQL (db-custom-4-16384): $300 +- Read Replicas: +$300 +- Cloud Armor: $5-50 +- All other services: $100 +- **Total: ~$1200-1750/month** + +## Why Terraform (IaC Approach) + +### Terraform vs Alternatives + +**Recommended: Terraform** ✅ + +**Why Terraform:** +1. **Industry Standard**: Most popular IaC tool (40%+ market share) +2. **Multi-Cloud**: Works across GCP, AWS, Azure (future flexibility) +3. **Mature Ecosystem**: 3000+ providers, extensive community +4. **State Management**: Built-in state locking and remote backends +5. **Plan/Apply Workflow**: Preview changes before applying +6. **Module System**: Reusable components for consistency +7. 
**GitOps Ready**: Version control, code review, CI/CD integration + +**Alternatives Considered:** + +❌ **Google Cloud Deployment Manager** +- GCP-only, deprecated in favor of Terraform +- YAML/Jinja2 templates less expressive than HCL +- Limited community support + +❌ **Pulumi** +- Uses real programming languages (Go, Python, TypeScript) +- Smaller community, less mature +- Overkill for this use case +- Requires developer expertise in specific language + +❌ **gcloud CLI Scripts** +- Imperative, not declarative +- No state management +- Difficult to maintain +- No drift detection +- **Use case**: Quick prototypes only + +❌ **Cloud Console (Manual Clicks)** +- Not reproducible +- No audit trail +- Human error prone +- Impossible to version control +- **Never use for production** + +### Terraform Structure + +``` +gcp/terraform/ +├── main.tf # Primary infrastructure definitions +├── variables.tf # Input variables +├── outputs.tf # Output values +├── versions.tf # Provider versions +├── backend.tf # Remote state configuration +├── modules/ +│ ├── cloud-run/ # Cloud Run service module +│ ├── cloud-sql/ # Cloud SQL database module +│ ├── networking/ # VPC, subnets, NAT module +│ └── secrets/ # Secret Manager module +├── environments/ +│ ├── dev/ +│ │ └── terraform.tfvars +│ ├── staging/ +│ │ └── terraform.tfvars +│ └── production/ +│ └── terraform.tfvars +└── README.md +``` + +## Deployment Strategy + +### Initial Deployment (One-Time Setup) +1. **Enable GCP APIs** (via Terraform or gcloud) +2. **Create Service Accounts** (least privilege) +3. **Create Terraform State Bucket** (GCS backend) +4. **Deploy Networking** (VPC, subnets, NAT) +5. **Deploy Cloud SQL** (database initialization) +6. **Store Secrets** (Secret Manager) +7. **Build Container** (Cloud Build) +8. **Deploy Cloud Run** (initial release) +9. **Run Database Migrations** (Cloud Run job) +10. **Verify Health Checks** (smoke tests) + +### Continuous Deployment (Every Commit) +1. 
**GitHub Actions** triggers on push to main +2. **Run Tests** (cargo test, linting) +3. **Build Docker Image** (Cloud Build) +4. **Push to Artifact Registry** (tagged with git SHA) +5. **Deploy to Staging** (auto-deploy) +6. **Run E2E Tests** (smoke tests against staging) +7. **Manual Approval** (for production) +8. **Deploy to Production** (blue-green deployment) +9. **Health Check** (automatic rollback on failure) + +### Database Migration Strategy +- **SQLx Migrations**: Embedded in application binary +- **Init Container**: Run migrations before app starts +- **Cloud Run Jobs**: Separate job for migrations +- **Rollback Plan**: Keep previous revision ready + +## High Availability & Disaster Recovery + +### High Availability +- **Cloud Run**: Multi-zone by default (no config needed) +- **Cloud SQL**: Regional HA with automatic failover +- **Read Replicas**: For read-heavy workloads +- **Health Checks**: Automatic instance replacement + +### Disaster Recovery +- **RTO (Recovery Time Objective)**: 15 minutes +- **RPO (Recovery Point Objective)**: 5 minutes +- **Backup Strategy**: + - Automated daily backups (Cloud SQL) + - Point-in-time recovery (7 days) + - Cross-region backup replication + - Export to Cloud Storage (weekly) + +### Monitoring & Alerting +- **Uptime SLI**: 99.9% availability +- **Latency SLI**: p95 < 500ms +- **Error Rate SLI**: < 0.1% +- **Alerts**: + - Service down (critical) + - Error rate spike (critical) + - Database connections exhausted (warning) + - Memory/CPU high (warning) + +## Security Best Practices + +### Network Security +- ✅ Private Cloud SQL (no public IP) +- ✅ VPC Service Controls (optional, for compliance) +- ✅ Cloud Armor for DDoS protection +- ✅ HTTPS only (Cloud Run enforces) + +### Identity & Access +- ✅ Service accounts with least privilege +- ✅ Workload Identity Federation +- ✅ Secret Manager for credentials +- ✅ IAM audit logging + +### Application Security +- ✅ JWT with RS256 signing +- ✅ Rate limiting (application-level) 
+- ✅ Input validation +- ✅ OWASP Top 10 compliance + +### Compliance +- ✅ Encryption at rest (Cloud SQL, Secret Manager) +- ✅ Encryption in transit (TLS 1.3) +- ✅ Audit logs (Cloud Audit Logs) +- ✅ PII redaction middleware (already implemented) + +## Scalability Plan + +### Vertical Scaling +- **Cloud Run**: Increase CPU/memory per instance +- **Cloud SQL**: Upgrade tier (db-custom-X-YYYY) + +### Horizontal Scaling +- **Cloud Run**: Increase max instances (auto-scaling) +- **Cloud SQL**: Add read replicas + +### Performance Optimization +- **Database Connection Pooling**: SQLx pool (already implemented) +- **Caching**: Redis (Cloud Memorystore) for session cache +- **CDN**: Cloud CDN for MCP SDK static files (future) + +### Load Testing +- **Tool**: k6 or Locust +- **Scenarios**: + - Baseline: 100 RPS sustained + - Peak: 1000 RPS burst + - Soak: 200 RPS for 6 hours +- **Metrics**: Latency, error rate, resource utilization + +## Migration from Current Setup + +### From SQLite (Local Dev) +1. Export SQLite data: `.dump` command +2. Convert to PostgreSQL: `pgloader` tool +3. Import to Cloud SQL: `psql` command +4. Verify data integrity: checksums + +### From Self-Managed PostgreSQL +1. Use `pg_dump` for full backup +2. Restore to Cloud SQL: `pg_restore` +3. Set up replication (optional): logical replication +4. Cutover: DNS/load balancer switch + +## Next Steps + +1. **Review Architecture**: Team approval +2. **Create GCP Project**: Separate projects for dev/staging/prod +3. **Set Up Terraform**: Initialize backend, write modules +4. **Deploy to Dev**: Test infrastructure code +5. **Deploy to Staging**: Full E2E testing +6. **Deploy to Production**: Gradual rollout +7. 
**Document Runbooks**: Incident response procedures + +## References + +- [Cloud Run Documentation](https://cloud.google.com/run/docs) +- [Cloud SQL for PostgreSQL](https://cloud.google.com/sql/docs/postgres) +- [Terraform GCP Provider](https://registry.terraform.io/providers/hashicorp/google/latest/docs) +- [GCP Best Practices](https://cloud.google.com/docs/enterprise/best-practices-for-enterprise-organizations) diff --git a/gcp/cloudbuild/cloudbuild-production.yaml b/gcp/cloudbuild/cloudbuild-production.yaml new file mode 100644 index 00000000..4ff4d248 --- /dev/null +++ b/gcp/cloudbuild/cloudbuild-production.yaml @@ -0,0 +1,179 @@ +# Cloud Build Configuration for Production Deployments +# Purpose: Manual-approval production deployments with additional safety checks +# Trigger: Manual execution or via GitHub release tags (v*.*.*) + +timeout: 3600s + +substitutions: + _REGION: us-central1 + _SERVICE_NAME: pierre-mcp-server + _ENVIRONMENT: production + _ARTIFACT_REGISTRY_REPO: pierre-mcp + _GCP_PROJECT_ID: ${PROJECT_ID} + +steps: + # Step 0: Pre-deployment validation + - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' + id: 'pre-deploy-validation' + entrypoint: bash + args: + - '-c' + - | + echo "🔍 Running pre-deployment checks..." + + # Check if staging deployment exists and is healthy + STAGING_URL=$(gcloud run services describe ${_SERVICE_NAME} \ + --region=${_REGION} \ + --format='value(status.url)' || echo "") + + if [ -z "$STAGING_URL" ]; then + echo "⚠️ Warning: Staging deployment not found" + else + echo "✅ Staging deployment exists: $STAGING_URL" + fi + + # Check for pending database migrations + echo "📋 Database migration check:" + echo " Ensure all migrations are tested in staging" + + # Verify git tag format (v1.0.0) + if [[ ! "$TAG_NAME" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "❌ Invalid tag format. 
Expected: v1.0.0" + exit 1 + fi + + echo "✅ Pre-deployment validation passed" + + # Step 1: Build production image (same as staging, but tagged differently) + - name: 'gcr.io/cloud-builders/docker' + id: 'build-image' + args: + - 'build' + - '--tag=${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:${TAG_NAME}' + - '--tag=${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:production' + - '--file=Dockerfile' + - '.' + timeout: 2400s + waitFor: + - 'pre-deploy-validation' + + # Step 2: Push to Artifact Registry + - name: 'gcr.io/cloud-builders/docker' + id: 'push-image' + args: + - 'push' + - '--all-tags' + - '${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}' + waitFor: + - 'build-image' + + # Step 3: Deploy with gradual rollout (10% traffic to new revision) + - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' + id: 'deploy-canary' + entrypoint: bash + args: + - '-c' + - | + echo "🚀 Deploying canary release (10% traffic)..." + + gcloud run deploy ${_SERVICE_NAME} \ + --image=${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:${TAG_NAME} \ + --region=${_REGION} \ + --platform=managed \ + --no-traffic \ + --tag=canary-${TAG_NAME} + + # Get the new revision name + NEW_REVISION=$(gcloud run revisions list \ + --service=${_SERVICE_NAME} \ + --region=${_REGION} \ + --format='value(metadata.name)' \ + --limit=1) + + echo "📦 New revision: $NEW_REVISION" + + # Split traffic: 90% to current, 10% to new + gcloud run services update-traffic ${_SERVICE_NAME} \ + --region=${_REGION} \ + --to-revisions=$NEW_REVISION=10 + + echo "✅ Canary deployment complete (10% traffic)" + waitFor: + - 'push-image' + + # Step 4: Monitor canary for 5 minutes + - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' + id: 'monitor-canary' + entrypoint: bash + args: + - '-c' + - | + echo "📊 Monitoring canary for 5 minutes..." 
+ echo "Check for errors in Cloud Logging and metrics in Cloud Monitoring" + + SERVICE_URL=$(gcloud run services describe ${_SERVICE_NAME} \ + --region=${_REGION} \ + --format='value(status.url)') + + # Health check canary endpoint + for i in {1..30}; do + if curl -f -s "$SERVICE_URL/health" | grep -q "ok"; then + echo "✅ Canary health check $i/30 passed" + else + echo "❌ Canary health check failed at iteration $i" + exit 1 + fi + sleep 10 + done + + echo "✅ Canary monitoring complete - no issues detected" + waitFor: + - 'deploy-canary' + + # Step 5: Gradual rollout to 100% (requires manual approval in console) + # This step is informational - actual rollout done via gcloud command + - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' + id: 'rollout-instructions' + entrypoint: bash + args: + - '-c' + - | + echo "================================================" + echo "🎯 CANARY DEPLOYMENT SUCCESSFUL" + echo "================================================" + echo "" + echo "Current state: 10% traffic to new revision" + echo "" + echo "To complete rollout to 100%:" + echo "" + echo " gcloud run services update-traffic ${_SERVICE_NAME} \\" + echo " --region=${_REGION} \\" + echo " --to-latest" + echo "" + echo "To rollback to previous revision:" + echo "" + echo " gcloud run services update-traffic ${_SERVICE_NAME} \\" + echo " --region=${_REGION} \\" + echo " --to-revisions=PREVIOUS_REVISION=100" + echo "" + echo "================================================" + waitFor: + - 'monitor-canary' + +images: + - '${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:${TAG_NAME}' + - '${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:production' + +options: + machineType: 'E2_HIGHCPU_8' + diskSizeGb: 100 + logging: CLOUD_LOGGING_ONLY + logStreamingOption: STREAM_ON + +# Production deployment checklist: +# [ ] Staging deployment tested and validated +# [ ] Load testing completed +# [ ] Database 
migrations tested +# [ ] Rollback plan documented +# [ ] On-call engineer notified +# [ ] Monitoring dashboards reviewed diff --git a/gcp/cloudbuild/cloudbuild.yaml b/gcp/cloudbuild/cloudbuild.yaml new file mode 100644 index 00000000..46aa6806 --- /dev/null +++ b/gcp/cloudbuild/cloudbuild.yaml @@ -0,0 +1,116 @@ +# Cloud Build Configuration for Pierre MCP Server +# Triggers on: git push to main branch (via GitHub integration) +# Actions: Build Docker image → Push to Artifact Registry → Deploy to Cloud Run + +# Build timeout (max 60 minutes for Rust compilation) +timeout: 3600s + +# Substitution variables (passed from trigger or command line) +substitutions: + _REGION: us-central1 + _SERVICE_NAME: pierre-mcp-server + _ENVIRONMENT: staging # Override with --substitutions=_ENVIRONMENT=production + _ARTIFACT_REGISTRY_REPO: pierre-mcp + _GCP_PROJECT_ID: ${PROJECT_ID} + +# Build steps +steps: + # Step 1: Build the Docker image + - name: 'gcr.io/cloud-builders/docker' + id: 'build-image' + args: + - 'build' + - '--tag=${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:${SHORT_SHA}' + - '--tag=${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:${_ENVIRONMENT}' + - '--tag=${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:latest' + - '--cache-from=${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:latest' + - '--build-arg=BUILDKIT_INLINE_CACHE=1' + - '--file=Dockerfile' + - '.' 
+ timeout: 2400s # 40 minutes for Rust build + + # Step 2: Push the image to Artifact Registry + - name: 'gcr.io/cloud-builders/docker' + id: 'push-image' + args: + - 'push' + - '--all-tags' + - '${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}' + waitFor: + - 'build-image' + + # Step 3: Deploy to Cloud Run (staging auto-deploy, production requires approval) + - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' + id: 'deploy-cloud-run' + entrypoint: bash + args: + - '-c' + - | + gcloud run deploy ${_SERVICE_NAME} \ + --image=${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:${SHORT_SHA} \ + --region=${_REGION} \ + --platform=managed \ + --allow-unauthenticated \ + --tag=${_ENVIRONMENT}-${SHORT_SHA} + waitFor: + - 'push-image' + + # Step 4: Run database migrations (Cloud Run Job) + - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk' + id: 'run-migrations' + entrypoint: bash + args: + - '-c' + - | + echo "Database migrations are handled automatically by the application on startup" + echo "SQLx migrations run on first container boot via src/database/mod.rs" + waitFor: + - 'deploy-cloud-run' + + # Step 5: Smoke test the deployed service + - name: 'gcr.io/cloud-builders/curl' + id: 'smoke-test' + entrypoint: bash + args: + - '-c' + - | + # Get the Cloud Run service URL + SERVICE_URL=$(gcloud run services describe ${_SERVICE_NAME} \ + --region=${_REGION} \ + --format='value(status.url)') + + echo "Testing health endpoint: $SERVICE_URL/health" + + # Wait for service to be ready (max 60 seconds) + for i in {1..12}; do + if curl -f -s "$SERVICE_URL/health" | grep -q "ok"; then + echo "✅ Health check passed!" + exit 0 + fi + echo "⏳ Waiting for service to be ready... 
($i/12)" + sleep 5 + done + + echo "❌ Health check failed after 60 seconds" + exit 1 + waitFor: + - 'run-migrations' + +# Images to store in Artifact Registry +images: + - '${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:${SHORT_SHA}' + - '${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:${_ENVIRONMENT}' + - '${_REGION}-docker.pkg.dev/${_GCP_PROJECT_ID}/${_ARTIFACT_REGISTRY_REPO}/${_SERVICE_NAME}:latest' + +# Cloud Build options +options: + machineType: 'E2_HIGHCPU_8' # Use high-CPU machine for faster Rust builds + diskSizeGb: 100 + logging: CLOUD_LOGGING_ONLY + logStreamingOption: STREAM_ON + +# IAM permissions required for Cloud Build service account: +# - roles/run.admin (deploy Cloud Run services) +# - roles/iam.serviceAccountUser (act as Cloud Run service account) +# - roles/artifactregistry.writer (push images) +# - roles/cloudbuild.builds.builder (default) diff --git a/gcp/docs/ARCHITECTURE.md b/gcp/docs/ARCHITECTURE.md new file mode 100644 index 00000000..5481ab4e --- /dev/null +++ b/gcp/docs/ARCHITECTURE.md @@ -0,0 +1,593 @@ +# Pierre MCP Server - GCP Architecture + +## Architecture Overview + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ USERS / MCP CLIENTS │ +│ (Claude Desktop, ChatGPT, Custom Agents) │ +└────────────────────────────┬────────────────────────────────────────────┘ + │ HTTPS + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ CLOUD LOAD BALANCER │ +│ (Automatic with Cloud Run) │ +│ SSL/TLS Termination, DDoS Protection │ +└────────────────────────────┬────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ CLOUD RUN SERVICE │ +│ (pierre-mcp-server container) │ +│ ┌───────────────────────────────────────────────────────────────────┐ │ +│ │ Instance 1 (Min: 1-2, Max: 100, Auto-scaling) │ │ +│ │ 
┌─────────────────────────────────────────────────────────────┐ │ │ +│ │ │ Pierre MCP Server (Rust Binary) │ │ │ +│ │ │ - HTTP API (Port 8081) │ │ │ +│ │ │ - MCP Protocol Handler │ │ │ +│ │ │ - OAuth 2.0 Server │ │ │ +│ │ │ - JWT Authentication (RS256) │ │ │ +│ │ │ - Multi-tenant Logic │ │ │ +│ │ │ - Intelligence Engine │ │ │ +│ │ └─────────────────────────────────────────────────────────────┘ │ │ +│ └───────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ┌────────┴────────┐ │ +│ │ │ │ +└────────────────────┼─────────────────┼──────────────────────────────────┘ + │ │ + ┌───────────▼────────┐ └──────────────┐ + │ Serverless VPC │ │ + │ Connector │ │ + │ (Private Network) │ │ + └───────────┬────────┘ │ + │ │ + ▼ ▼ + ┌────────────────────────────────┐ ┌──────────────────────────┐ + │ CLOUD SQL POSTGRES │ │ SECRET MANAGER │ + │ (PostgreSQL 16, HA) │ │ (OAuth Secrets, Keys) │ + │ ┌──────────────────────────┐ │ │ ┌────────────────────┐ │ + │ │ Database: pierre_mcp │ │ │ │ Strava Secret │ │ + │ │ Tables: 26+ │ │ │ │ Garmin Secret │ │ + │ │ Users, Tenants, │ │ │ │ Fitbit Secret │ │ + │ │ Activities, Goals │ │ │ │ OpenWeather Key │ │ + │ │ OAuth Tokens (enc) │ │ │ │ Encryption Key │ │ + │ │ API Keys │ │ │ │ DB Password │ │ + │ └──────────────────────────┘ │ │ └────────────────────┘ │ + │ │ └──────────────────────────┘ + │ Private IP: 10.0.0.x │ + │ Automated Backups (Daily) │ + │ Point-in-Time Recovery │ + │ Regional HA (Production) │ + └────────────────────────────────┘ + │ + │ (Backups) + ▼ + ┌──────────────────────────┐ + │ CLOUD STORAGE │ + │ (Backup Retention) │ + └──────────────────────────┘ + + ▲ + │ (Outbound to External APIs) + │ + ┌────────────────┴───────────────┐ + │ CLOUD NAT GATEWAY │ + │ (Outbound Internet Access) │ + └────────────────┬───────────────┘ + │ + ┌───────────┴───────────┐ + │ │ + ▼ ▼ +┌─────────────────┐ ┌───────────────────┐ +│ EXTERNAL APIs │ │ MONITORING & │ +│ │ │ OBSERVABILITY │ +│ • Strava API │ │ │ +│ • Garmin API 
│ │ • Cloud Logging │ +│ • Fitbit API │ │ • Cloud Monitoring│ +│ • OpenWeather │ │ • Cloud Trace │ +│ • USDA FoodData │ │ • Uptime Checks │ +└─────────────────┘ │ • Alerts/PagerDuty│ + └───────────────────┘ +``` + +## Network Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ VPC NETWORK │ +│ (pierre-vpc-{env}) │ +│ │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ Subnet: 10.0.0.0/24 (Regional) │ │ +│ │ │ │ +│ │ ┌──────────────────────────────────────────────────┐ │ │ +│ │ │ Cloud SQL Private IP Pool │ │ │ +│ │ │ Reserved: 10.0.0.0/16 (VPC Peering) │ │ │ +│ │ └──────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ │ ┌──────────────────────────────────────────────────┐ │ │ +│ │ │ Serverless VPC Connector │ │ │ +│ │ │ IP Range: 10.8.0.0/28 │ │ │ +│ │ │ Connects Cloud Run ↔ Cloud SQL │ │ │ +│ │ └──────────────────────────────────────────────────┘ │ │ +│ │ │ │ +│ └────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌────────────────────────────────────────────────────────┐ │ +│ │ Cloud Router + Cloud NAT │ │ +│ │ (Outbound connectivity for Cloud Run) │ │ +│ │ • External API calls (Strava, Garmin, etc.) │ │ +│ │ • Static public IP for egress │ │ +│ └────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Traffic Flow + +### 1. User Request Flow + +``` +User/MCP Client + │ + │ 1. HTTPS Request (GET /health, POST /mcp, etc.) + ▼ +Cloud Run Service (Internet-facing) + │ + │ 2. Authenticate (JWT Bearer Token from OAuth 2.0) + ▼ +Application Logic (Rust) + │ + ├── 3a. Database Query ──┐ + │ (via Serverless VPC │ + │ Connector) │ + │ ▼ + │ Cloud SQL (Private IP) + │ │ + │ ┌─────────────────────┘ + │ │ 4. Query Result + │ ▼ + ├── 3b. Secret Fetch ────┐ + │ (IAM Authentication) │ + │ ▼ + │ Secret Manager + │ │ + │ ┌─────────────────────┘ + │ │ 5. Secret Value + │ ▼ + └── 3c. 
External API ────┐ + (via Cloud NAT) │ + ▼ + Strava/Garmin/Fitbit API + │ + ┌────────────────────┘ + │ 6. Provider Data + ▼ +Response JSON (with logging to Cloud Logging) + │ + │ 7. HTTP Response + ▼ +User/MCP Client +``` + +### 2. OAuth Provider Flow + +``` +User → Cloud Run → OAuth Login Page + │ + │ User clicks "Connect Strava" + ▼ +Cloud Run → Redirect to Strava OAuth + │ + │ User authorizes + ▼ +Strava → Callback to Cloud Run (/api/oauth/callback/strava) + │ + │ Exchange code for token + ▼ +Cloud Run: + 1. Decrypt existing tokens (if any) + 2. Store new token (encrypted with AAD) + 3. Save to Cloud SQL + 4. Emit SSE notification + │ + ▼ +User receives success notification +``` + +### 3. Database Migration Flow + +``` +Cloud Run Service Starts + │ + │ 1. Read DATABASE_URL from env + ▼ +SQLx Migration System + │ + │ 2. Check applied migrations (sqlx table) + ├─── 3. Run pending migrations (CREATE TABLE IF NOT EXISTS) + │ - users + │ - user_oauth_tokens (encrypted columns) + │ - tenants + │ - api_keys + │ - [26+ tables] + │ + │ 4. Create indexes + │ 5. Set up foreign keys + ▼ +Application Ready + │ + └── Health check returns 200 OK +``` + +## Security Architecture + +### 1. 
Network Security + +``` +┌──────────────────────────────────────────────────────────┐ +│ SECURITY LAYERS │ +├──────────────────────────────────────────────────────────┤ +│ │ +│ Layer 1: Cloud Armor (WAF) [Production Only] │ +│ - DDoS protection │ +│ - Rate limiting │ +│ - Geo-fencing │ +│ - OWASP Top 10 rules │ +│ │ +├──────────────────────────────────────────────────────────┤ +│ │ +│ Layer 2: Cloud Run IAM │ +│ - JWT token validation (RS256) │ +│ - Service account authentication │ +│ - allUsers invoker (public API) │ +│ │ +├──────────────────────────────────────────────────────────┤ +│ │ +│ Layer 3: Application-Level Auth │ +│ - Bearer token required │ +│ - Tenant isolation (multi-tenancy) │ +│ - API key authentication │ +│ - Rate limiting (per user/tenant) │ +│ - PII redaction middleware │ +│ │ +├──────────────────────────────────────────────────────────┤ +│ │ +│ Layer 4: Database Security │ +│ - Private IP only (no public access) │ +│ - VPC Service Controls │ +│ - Encrypted at rest (AES-256) │ +│ - Encrypted in transit (TLS 1.3) │ +│ - OAuth tokens encrypted with AAD │ +│ │ +├──────────────────────────────────────────────────────────┤ +│ │ +│ Layer 5: Secret Management │ +│ - Secret Manager (no env vars) │ +│ - Automatic rotation (planned) │ +│ - IAM-based access control │ +│ - Audit logging │ +│ │ +└──────────────────────────────────────────────────────────┘ +``` + +### 2. 
IAM Permissions Model + +``` +Service Account: pierre-mcp-server-runner-{env}@PROJECT.iam.gserviceaccount.com + +Roles: +├── roles/cloudsql.client +│ └── Connect to Cloud SQL instances +│ +├── roles/secretmanager.secretAccessor +│ └── Read secrets (OAuth, encryption keys) +│ +├── roles/logging.logWriter +│ └── Write structured logs +│ +├── roles/monitoring.metricWriter +│ └── Export custom metrics +│ +└── roles/cloudtrace.agent + └── Send distributed traces +``` + +## Data Flow Architecture + +### Database Schema (26+ Tables) + +``` +┌────────────────────────────────────────────────────────────┐ +│ CORE TABLES │ +├────────────────────────────────────────────────────────────┤ +│ │ +│ users Multi-tenant user accounts │ +│ ├── id (UUID) │ +│ ├── email │ +│ ├── tenant_id (FK) │ +│ ├── tier (free/pro/enterprise) │ +│ └── is_admin │ +│ │ +│ user_oauth_tokens Encrypted OAuth credentials │ +│ ├── user_id + tenant_id + provider (PK) │ +│ ├── access_token (ENCRYPTED with AAD) │ +│ ├── refresh_token (ENCRYPTED with AAD) │ +│ └── expires_at │ +│ │ +│ tenants Multi-tenant isolation │ +│ ├── id (UUID) │ +│ ├── name │ +│ ├── slug │ +│ └── subscription_tier │ +│ │ +│ api_keys Programmatic access │ +│ ├── id (UUID) │ +│ ├── key_hash (bcrypt) │ +│ ├── user_id (FK) │ +│ └── rate_limit │ +│ │ +└────────────────────────────────────────────────────────────┘ + +┌────────────────────────────────────────────────────────────┐ +│ FITNESS DATA │ +├────────────────────────────────────────────────────────────┤ +│ goals User fitness goals │ +│ insights AI-generated insights │ +│ fitness_configurations Algorithm settings │ +└────────────────────────────────────────────────────────────┘ + +┌────────────────────────────────────────────────────────────┐ +│ OAUTH 2.0 SERVER │ +├────────────────────────────────────────────────────────────┤ +│ oauth2_clients Registered MCP clients │ +│ oauth2_auth_codes Authorization codes (PKCE) │ +│ oauth2_refresh_tokens Refresh tokens │ +│ rsa_keypairs JWT signing 
keys (RS256) │ +└────────────────────────────────────────────────────────────┘ + +┌────────────────────────────────────────────────────────────┐ +│ A2A PROTOCOL │ +├────────────────────────────────────────────────────────────┤ +│ a2a_clients Agent registrations │ +│ a2a_sessions Active sessions │ +│ a2a_tasks Task execution │ +└────────────────────────────────────────────────────────────┘ +``` + +## Monitoring & Observability + +### Metrics Collected + +``` +Cloud Run Metrics: +├── Request count (per minute) +├── Request latency (p50, p95, p99) +├── Error rate (4xx, 5xx) +├── Container CPU usage +├── Container memory usage +├── Instance count (current, min, max) +└── Cold start latency + +Cloud SQL Metrics: +├── Connection count +├── Query latency +├── Disk usage +├── CPU utilization +├── Memory utilization +└── Replication lag (if HA enabled) + +Application Metrics (Custom): +├── OAuth token refresh count +├── Provider API call latency +├── Cache hit/miss ratio +├── Database query performance +└── Multi-tenant request distribution +``` + +### Logging Strategy + +``` +Cloud Logging (Structured JSON): +├── HTTP Access Logs (Cloud Run automatic) +├── Application Logs (RUST_LOG) +│ ├── Level: ERROR, WARN, INFO, DEBUG +│ ├── Request ID correlation +│ └── PII redaction applied +├── SQL Query Logs (slow queries only) +├── OAuth Flow Logs (audit trail) +└── Security Events (failed auth, rate limits) + +Log Retention: +├── Development: 7 days +├── Staging: 30 days +└── Production: 90 days (compliance) +``` + +### Alerting Rules + +``` +Critical Alerts (PagerDuty): +├── Service down (health check failed) +├── Error rate > 5% (5min window) +├── Database connection exhausted +└── External API failure (Strava, Garmin down) + +Warning Alerts (Slack): +├── High latency (p95 > 1s) +├── Memory usage > 80% +├── Database disk > 85% +└── Unusual traffic spike (+50% baseline) +``` + +## Deployment Strategy + +### Environments + +``` 
+┌──────────────────────────────────────────────────────────┐ +│ Development (pierre-mcp-dev) │ +├──────────────────────────────────────────────────────────┤ +│ • db-f1-micro (0.6GB RAM) │ +│ • Min instances: 0 (scale to zero) │ +│ • Auto-deploy from main branch │ +│ • Cost: ~$75/month │ +└──────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────┐ +│ Staging (pierre-mcp-staging) │ +├──────────────────────────────────────────────────────────┤ +│ • db-custom-2-8192 (2 vCPU, 8GB) │ +│ • Min instances: 1 │ +│ • Auto-deploy from main branch │ +│ • Production parity │ +│ • Cost: ~$200/month │ +└──────────────────────────────────────────────────────────┘ + +┌──────────────────────────────────────────────────────────┐ +│ Production (pierre-mcp-prod) │ +├──────────────────────────────────────────────────────────┤ +│ • db-custom-4-16384 (4 vCPU, 16GB) │ +│ • High Availability (Regional HA) │ +│ • Min instances: 2 │ +│ • Manual approval required │ +│ • Canary deployments (10% → 100%) │ +│ • Cost: ~$500-1500/month │ +└──────────────────────────────────────────────────────────┘ +``` + +### CI/CD Pipeline + +``` +GitHub Push (main branch) + │ + ▼ +GitHub Actions / Cloud Build Trigger + │ + ├── 1. Run Tests (cargo test) + ├── 2. Lint (cargo clippy) + ├── 3. Security Scan (cargo deny) + ├── 4. Build Docker Image + │ └── Multi-stage: Rust build → Debian runtime + │ + ├── 5. Push to Artifact Registry + │ └── Tag: latest, {SHORT_SHA}, {ENV} + │ + ├── 6. Deploy to Cloud Run + │ └── Blue-green deployment (automatic) + │ + ├── 7. Run Database Migrations + │ └── SQLx embedded migrations + │ + └── 8. 
Smoke Test + └── curl /health (wait 60s for ready) + +Production Deployment (git tag v1.0.0) + │ + ├── Same steps as above + ├── Deploy canary (10% traffic) + ├── Monitor for 5 minutes + ├── Manual approval to promote + └── Rollback plan ready +``` + +## Disaster Recovery + +### RTO/RPO Targets + +``` +┌────────────────────────────────────────────────┐ +│ Recovery Objectives │ +├────────────────────────────────────────────────┤ +│ RTO (Recovery Time Objective): 15 minutes │ +│ RPO (Recovery Point Objective): 5 minutes │ +└────────────────────────────────────────────────┘ +``` + +### Backup Strategy + +``` +Cloud SQL Automated Backups: +├── Daily backups at 3 AM UTC +├── Point-in-time recovery (7 days) +├── Transaction log retention (7 days) +└── Export to Cloud Storage (weekly) + +Cloud Run Revisions: +├── Last 10 revisions retained +├── Instant rollback capability +└── Tagged releases (v1.0.0) kept indefinitely + +Infrastructure as Code: +├── Terraform state in GCS +├── State versioning enabled +└── Git repository (version control) +``` + +## Scalability + +### Auto-Scaling Configuration + +``` +Cloud Run: +├── Min Instances: 0 (dev), 1 (staging), 2 (prod) +├── Max Instances: 10 (dev), 50 (staging), 100 (prod) +├── Concurrency: 80 requests per instance +├── CPU Throttling: After request completion +└── Scale-to-zero: Enabled for dev only + +Cloud SQL: +├── Vertical scaling: Change tier (manual) +├── Read replicas: Add up to 10 (manual) +├── Connection pooling: SQLx (10 connections) +└── Auto-increase storage: Enabled +``` + +### Expected Performance + +``` +Single Instance Capacity: +├── Throughput: ~500 RPS (simple GET) +├── Throughput: ~200 RPS (database queries) +├── Latency: p50=50ms, p95=200ms, p99=500ms +└── Cold start: <500ms (Rust binary) + +100 Instances (Max Scale): +├── Throughput: 20,000+ RPS +├── Peak Load: 100,000+ daily active users +└── Database: Read replicas required +``` + +## Cost Breakdown (Production) + +``` +Monthly Estimate: +├── Cloud Run: 
$100-300 +│ ├── CPU time (vCPU-seconds) +│ ├── Memory (GB-seconds) +│ └── Requests (per million) +│ +├── Cloud SQL: $300-400 +│ ├── db-custom-4-16384 tier +│ ├── HA configuration (+100%) +│ ├── Storage (100GB SSD) +│ └── Automated backups +│ +├── Networking: $60 +│ ├── VPC connector ($9) +│ ├── Cloud NAT ($45) +│ └── Egress traffic ($5-10) +│ +├── Secret Manager: $5 +│ └── Secret access operations +│ +├── Logging & Monitoring: $20-50 +│ ├── Log ingestion +│ ├── Metrics +│ └── Traces +│ +└── TOTAL: $485-825/month + +Scale to 10x traffic: ~$2000-3000/month +``` diff --git a/gcp/docs/DEPLOYMENT_GUIDE.md b/gcp/docs/DEPLOYMENT_GUIDE.md new file mode 100644 index 00000000..2b5fb9eb --- /dev/null +++ b/gcp/docs/DEPLOYMENT_GUIDE.md @@ -0,0 +1,567 @@ +# Pierre MCP Server - GCP Deployment Guide + +## Table of Contents + +1. [Prerequisites](#prerequisites) +2. [Initial Setup](#initial-setup) +3. [Development Deployment](#development-deployment) +4. [Staging Deployment](#staging-deployment) +5. [Production Deployment](#production-deployment) +6. [Continuous Deployment](#continuous-deployment) +7. [Troubleshooting](#troubleshooting) +8. [Operations Runbook](#operations-runbook) + +## Prerequisites + +### Required Tools + +```bash +# Install gcloud SDK +curl https://sdk.cloud.google.com | bash +exec -l $SHELL +gcloud init + +# Install Terraform (v1.6+) +brew install terraform # macOS +# or +wget https://releases.hashicorp.com/terraform/1.6.0/terraform_1.6.0_linux_amd64.zip +unzip terraform_1.6.0_linux_amd64.zip +sudo mv terraform /usr/local/bin/ + +# Install jq (for JSON parsing) +brew install jq # macOS +sudo apt-get install jq # Ubuntu/Debian + +# Verify installations +gcloud version +terraform version +jq --version +``` + +### GCP Account Setup + +1. 
**Create GCP Projects** (one per environment): + ```bash + # Development + gcloud projects create pierre-mcp-dev --name="Pierre MCP Dev" + + # Staging + gcloud projects create pierre-mcp-staging --name="Pierre MCP Staging" + + # Production + gcloud projects create pierre-mcp-prod --name="Pierre MCP Production" + ``` + +2. **Link Billing Account**: + ```bash + # List billing accounts + gcloud billing accounts list + + # Link to projects + gcloud billing projects link pierre-mcp-dev \ + --billing-account=BILLING_ACCOUNT_ID + ``` + +3. **Configure OAuth Applications**: + - **Strava**: https://www.strava.com/settings/api + - **Garmin**: https://developer.garmin.com/ + - **Fitbit**: https://dev.fitbit.com/apps + - **OpenWeatherMap**: https://openweathermap.org/api + +## Initial Setup + +### Step 1: Run GCP Project Setup Script + +```bash +cd gcp/scripts + +# Development environment +./setup-gcp-project.sh pierre-mcp-dev dev us-central1 + +# Staging environment +./setup-gcp-project.sh pierre-mcp-staging staging us-central1 + +# Production environment +./setup-gcp-project.sh pierre-mcp-prod production us-central1 +``` + +This script: +- ✅ Enables required GCP APIs +- ✅ Creates service accounts +- ✅ Configures IAM permissions +- ✅ Creates Artifact Registry +- ✅ Creates Terraform state bucket +- ✅ Generates master encryption key + +### Step 2: Store OAuth Secrets + +```bash +# Set active project +gcloud config set project pierre-mcp-dev + +# Store Strava credentials +echo "your-strava-client-secret" | \ + gcloud secrets create pierre-mcp-server-strava-client-secret-dev --data-file=- + +# Store Garmin credentials +echo "your-garmin-client-secret" | \ + gcloud secrets create pierre-mcp-server-garmin-client-secret-dev --data-file=- + +# Store Fitbit credentials +echo "your-fitbit-client-secret" | \ + gcloud secrets create pierre-mcp-server-fitbit-client-secret-dev --data-file=- + +# Store OpenWeather API key +echo "your-openweather-api-key" | \ + gcloud secrets create 
pierre-mcp-server-openweather-api-key-dev --data-file=- + +# Verify secrets +gcloud secrets list +``` + +### Step 3: Configure Terraform Variables + +Edit the environment-specific tfvars files: + +```bash +cd gcp/terraform/environments + +# Edit dev environment +vim dev/terraform.tfvars +# Update: +# - project_id +# - container_image URL +# - OAuth client IDs +# - alert_email + +# Repeat for staging and production +vim staging/terraform.tfvars +vim production/terraform.tfvars +``` + +### Step 4: Build Initial Container Image + +```bash +# Build and push to Artifact Registry +cd /path/to/pierre_mcp_server + +# Configure Docker authentication +gcloud auth configure-docker us-central1-docker.pkg.dev + +# Build image +docker build -t us-central1-docker.pkg.dev/pierre-mcp-dev/pierre-mcp/pierre-mcp-server:v0.1.0 . + +# Push image +docker push us-central1-docker.pkg.dev/pierre-mcp-dev/pierre-mcp/pierre-mcp-server:v0.1.0 +``` + +## Development Deployment + +### Deploy with Terraform + +```bash +cd gcp/scripts + +# Generate plan +./deploy-terraform.sh dev plan + +# Review the plan, then apply +./deploy-terraform.sh dev apply +``` + +### Verify Deployment + +```bash +# Get service URL +SERVICE_URL=$(cd ../terraform && terraform output -raw cloud_run_service_url) + +# Test health endpoint +curl $SERVICE_URL/health + +# Expected response: {"status":"ok"} + +# View logs +gcloud logging read \ + "resource.type=cloud_run_revision AND resource.labels.service_name=pierre-mcp-server" \ + --limit 20 \ + --format json +``` + +### Deploy New Version + +```bash +# Build new image +docker build -t us-central1-docker.pkg.dev/pierre-mcp-dev/pierre-mcp/pierre-mcp-server:v0.2.0 . 
+docker push us-central1-docker.pkg.dev/pierre-mcp-dev/pierre-mcp/pierre-mcp-server:v0.2.0 + +# Update Cloud Run +gcloud run deploy pierre-mcp-server \ + --image us-central1-docker.pkg.dev/pierre-mcp-dev/pierre-mcp/pierre-mcp-server:v0.2.0 \ + --region us-central1 +``` + +## Staging Deployment + +Staging mirrors production configuration for realistic testing. + +```bash +# Deploy infrastructure +cd gcp/scripts +./deploy-terraform.sh staging plan +./deploy-terraform.sh staging apply + +# Set up Cloud Build trigger (one-time) +gcloud builds triggers create github \ + --repo-name=pierre_mcp_server \ + --repo-owner=Async-IO \ + --branch-pattern="^main$" \ + --build-config=gcp/cloudbuild/cloudbuild.yaml \ + --substitutions=_ENVIRONMENT=staging + +# Trigger manual build +gcloud builds submit \ + --config gcp/cloudbuild/cloudbuild.yaml \ + --substitutions=_ENVIRONMENT=staging +``` + +## Production Deployment + +⚠️ **Production deployments require extra caution and approvals.** + +### Pre-Deployment Checklist + +- [ ] Code reviewed and approved +- [ ] Staging deployment successful +- [ ] Load testing completed +- [ ] Database migrations tested +- [ ] Rollback plan documented +- [ ] On-call engineer notified +- [ ] Monitoring dashboards reviewed + +### Deploy Infrastructure (First Time) + +```bash +cd gcp/scripts + +# Review plan carefully +./deploy-terraform.sh production plan + +# Get approval from team lead +# Then apply +./deploy-terraform.sh production apply +``` + +### Deploy Application (Canary Release) + +```bash +# Tag release in git +git tag -a v1.0.0 -m "Production release v1.0.0" +git push origin v1.0.0 + +# Trigger production build with canary deployment +gcloud builds submit \ + --config gcp/cloudbuild/cloudbuild-production.yaml \ + --substitutions=TAG_NAME=v1.0.0,_ENVIRONMENT=production +``` + +The canary deployment: +1. Deploys new revision with 10% traffic +2. Monitors for 5 minutes +3. 
Provides commands for full rollout or rollback + +### Complete Rollout (After Canary Success) + +```bash +# Promote canary to 100% traffic +gcloud run services update-traffic pierre-mcp-server \ + --region us-central1 \ + --to-latest + +# Monitor for 30 minutes +# Check error rates, latency, logs +``` + +### Rollback (If Issues Detected) + +```bash +# List revisions +gcloud run revisions list \ + --service pierre-mcp-server \ + --region us-central1 + +# Rollback to previous revision +PREVIOUS_REVISION="pierre-mcp-server-00042-abc" # Replace with actual +gcloud run services update-traffic pierre-mcp-server \ + --region us-central1 \ + --to-revisions=$PREVIOUS_REVISION=100 +``` + +## Continuous Deployment + +### GitHub Actions Integration + +Create `.github/workflows/deploy-gcp.yml`: + +```yaml +name: Deploy to GCP + +on: + push: + branches: + - main + release: + types: [published] + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - id: auth + uses: google-github-actions/auth@v2 + with: + credentials_json: ${{ secrets.GCP_CREDENTIALS }} + + - name: Build and Deploy + run: | + ENV="staging" + if [[ "${{ github.event_name }}" == "release" ]]; then + ENV="production" + fi + + gcloud builds submit \ + --config gcp/cloudbuild/cloudbuild.yaml \ + --substitutions=_ENVIRONMENT=$ENV +``` + +### Cloud Build Trigger Setup + +```bash +# Connect GitHub repository +gcloud builds triggers create github \ + --name=pierre-mcp-staging-deploy \ + --repo-name=pierre_mcp_server \ + --repo-owner=Async-IO \ + --branch-pattern="^main$" \ + --build-config=gcp/cloudbuild/cloudbuild.yaml + +# Production trigger (manual only) +gcloud builds triggers create github \ + --name=pierre-mcp-production-deploy \ + --repo-name=pierre_mcp_server \ + --repo-owner=Async-IO \ + --tag-pattern="^v[0-9]+\.[0-9]+\.[0-9]+$" \ + --build-config=gcp/cloudbuild/cloudbuild-production.yaml \ + --require-approval +``` + +## Troubleshooting + +### Service Won't Start + +```bash +# 
Check Cloud Run logs +gcloud logging read \ + "resource.type=cloud_run_revision" \ + --limit 50 \ + --format json | jq -r '.[] | .textPayload' + +# Common issues: +# 1. Database connection failed +# → Check Cloud SQL instance is running +# → Verify VPC connector is attached + +# 2. Secret access denied +# → Check service account has secretAccessor role + +# 3. Port mismatch +# → Ensure container listens on port 8081 +``` + +### Database Connection Issues + +```bash +# Test Cloud SQL connectivity +gcloud sql instances describe pierre-mcp-server-postgres-dev \ + --format="value(connectionName)" + +# Check VPC connector +gcloud compute networks vpc-access connectors describe \ + serverless-connector-dev \ + --region us-central1 + +# Verify private IP +gcloud sql instances describe pierre-mcp-server-postgres-dev \ + --format="value(ipAddresses[0].ipAddress)" +``` + +### Secret Manager Issues + +```bash +# List secrets +gcloud secrets list + +# Check secret value +gcloud secrets versions access latest \ + --secret=pierre-mcp-server-db-password-dev + +# Verify IAM permissions +gcloud secrets get-iam-policy \ + pierre-mcp-server-db-password-dev +``` + +### Terraform State Issues + +```bash +# Unlock stuck state +terraform force-unlock LOCK_ID + +# Import existing resource +terraform import google_cloud_run_service.pierre_mcp_server \ + projects/PROJECT_ID/locations/REGION/services/SERVICE_NAME + +# Refresh state +cd gcp/scripts +./deploy-terraform.sh dev refresh +``` + +## Operations Runbook + +### Daily Operations + +**Monitoring Dashboard**: +```bash +# Open Cloud Console monitoring +gcloud monitoring dashboards list +``` + +**Check Service Health**: +```bash +curl https://pierre-mcp-server-xxxxx.run.app/health +``` + +**View Recent Logs**: +```bash +gcloud logging read \ + "resource.type=cloud_run_revision" \ + --limit 100 \ + --format json | jq -r '.[] | "\(.timestamp) \(.textPayload)"' +``` + +### Incident Response + +#### Service Down + +1. 
**Check uptime**: + ```bash + gcloud monitoring uptime-checks list + ``` + +2. **View error logs**: + ```bash + gcloud logging read \ + "resource.type=cloud_run_revision AND severity>=ERROR" \ + --limit 50 + ``` + +3. **Rollback if needed** (see rollback section above) + +#### High Error Rate + +1. **Check error distribution**: + ```bash + gcloud logging read \ + "resource.type=cloud_run_revision AND httpRequest.status>=400" \ + --limit 100 \ + --format json | jq -r '.[] | .httpRequest.status' | sort | uniq -c + ``` + +2. **Identify failing endpoint**: + ```bash + gcloud logging read \ + "resource.type=cloud_run_revision AND httpRequest.status>=500" \ + --limit 20 \ + --format json | jq -r '.[] | .httpRequest.requestUrl' + ``` + +#### Database Performance Issues + +1. **Check Cloud SQL metrics**: + ```bash + gcloud sql operations list \ + --instance pierre-mcp-server-postgres-prod + ``` + +2. **View slow queries**: + ```bash + gcloud logging read \ + "resource.type=cloudsql_database AND log_name=~postgres.log" \ + --limit 50 + ``` + +3. 
**Scale database if needed**:
+   ```bash
+   gcloud sql instances patch pierre-mcp-server-postgres-prod \
+     --tier db-custom-8-32768
+   ```

+### Scaling Operations
+
+**Increase Cloud Run instances**:
+```bash
+gcloud run services update pierre-mcp-server \
+  --region us-central1 \
+  --max-instances 200 \
+  --min-instances 5
+```
+
+**Scale database**:
+```bash
+gcloud sql instances patch INSTANCE_NAME \
+  --tier db-custom-4-16384
+```
+
+### Backup and Recovery
+
+**Manual backup**:
+```bash
+gcloud sql backups create \
+  --instance pierre-mcp-server-postgres-prod \
+  --description "Manual backup before migration"
+```
+
+**Restore from backup**:
+```bash
+# List backups
+gcloud sql backups list --instance INSTANCE_NAME
+
+# Restore (BACKUP_ID comes from the list command above)
+gcloud sql backups restore BACKUP_ID \
+  --restore-instance TARGET_INSTANCE \
+  --backup-instance SOURCE_INSTANCE
+# Restores the backup onto TARGET_INSTANCE (long-running operation)
+```
+
+### Cost Optimization
+
+**View linked billing account** (cost reports live in Cloud Console → Billing):
+```bash
+gcloud billing projects describe PROJECT_ID --format="value(billingAccountName)"
+```
+
+**Optimize Cloud Run**:
+- Scale to zero when not in use (dev/staging)
+- Reduce min instances
+- Right-size CPU/memory
+
+**Optimize Cloud SQL**:
+- Use smaller tier for dev/staging
+- Disable HA for non-production
+- Clean up old backups
+
+## Support
+
+- **Documentation**: `gcp/docs/`
+- **GitHub Issues**: https://github.com/Async-IO/pierre_mcp_server/issues
+- **On-Call**: platform-oncall@example.com
diff --git a/gcp/docs/QUICKSTART.md b/gcp/docs/QUICKSTART.md
new file mode 100644
index 00000000..646cb41c
--- /dev/null
+++ b/gcp/docs/QUICKSTART.md
@@ -0,0 +1,325 @@
+# Pierre MCP Server - GCP Quick Start
+
+Get your Pierre MCP Server running on Google Cloud Platform in 30 minutes.
+ +## Prerequisites + +- GCP account with billing enabled +- `gcloud` CLI installed and authenticated +- Terraform 1.6+ installed +- Docker installed (for local builds) +- OAuth credentials from Strava/Garmin/Fitbit + +## Quick Start (Development Environment) + +### Step 1: Clone Repository + +```bash +git clone https://github.com/Async-IO/pierre_mcp_server.git +cd pierre_mcp_server +``` + +### Step 2: Create GCP Project + +```bash +# Create project +gcloud projects create pierre-mcp-dev --name="Pierre MCP Dev" + +# Set as active +gcloud config set project pierre-mcp-dev + +# Link billing (replace with your billing account) +gcloud billing projects link pierre-mcp-dev \ + --billing-account=XXXXXX-XXXXXX-XXXXXX +``` + +### Step 3: Run Setup Script + +```bash +cd gcp/scripts +./setup-gcp-project.sh pierre-mcp-dev dev us-central1 +``` + +This script will: +- ✅ Enable 12 required GCP APIs (2-3 minutes) +- ✅ Create Artifact Registry for Docker images +- ✅ Create service account with IAM roles +- ✅ Create GCS bucket for Terraform state +- ✅ Generate master encryption key + +### Step 4: Store OAuth Secrets + +```bash +# Strava (get from https://www.strava.com/settings/api) +echo "YOUR_STRAVA_CLIENT_SECRET" | \ + gcloud secrets create pierre-mcp-server-strava-client-secret-dev \ + --data-file=- + +# Garmin (optional) +echo "YOUR_GARMIN_CLIENT_SECRET" | \ + gcloud secrets create pierre-mcp-server-garmin-client-secret-dev \ + --data-file=- + +# Fitbit (optional) +echo "YOUR_FITBIT_CLIENT_SECRET" | \ + gcloud secrets create pierre-mcp-server-fitbit-client-secret-dev \ + --data-file=- + +# OpenWeather (optional, get from https://openweathermap.org/api) +echo "YOUR_OPENWEATHER_API_KEY" | \ + gcloud secrets create pierre-mcp-server-openweather-api-key-dev \ + --data-file=- +``` + +### Step 5: Build and Push Docker Image + +```bash +cd /path/to/pierre_mcp_server + +# Configure Docker auth +gcloud auth configure-docker us-central1-docker.pkg.dev + +# Build image +docker build \ + 
-t us-central1-docker.pkg.dev/pierre-mcp-dev/pierre-mcp/pierre-mcp-server:v0.1.0 \ + -f Dockerfile \ + . + +# Push to Artifact Registry +docker push us-central1-docker.pkg.dev/pierre-mcp-dev/pierre-mcp/pierre-mcp-server:v0.1.0 +``` + +### Step 6: Configure Terraform + +```bash +cd gcp/terraform/environments/dev + +# Edit terraform.tfvars +vim terraform.tfvars +``` + +Update these values: +```hcl +project_id = "pierre-mcp-dev" +container_image = "us-central1-docker.pkg.dev/pierre-mcp-dev/pierre-mcp/pierre-mcp-server:v0.1.0" + +# OAuth client IDs (get from provider developer portals) +strava_client_id = "YOUR_STRAVA_CLIENT_ID" +garmin_client_id = "YOUR_GARMIN_CLIENT_ID" +fitbit_client_id = "YOUR_FITBIT_CLIENT_ID" + +# Alert email +alert_email = "your-email@example.com" + +# Secrets (managed via Secret Manager above) +secrets = { + strava_client_secret = "" # Already in Secret Manager + garmin_client_secret = "" + fitbit_client_secret = "" + openweather_api_key = "" +} +``` + +### Step 7: Deploy with Terraform + +```bash +cd gcp/scripts + +# Initialize Terraform (first time only) +cd ../terraform +terraform init -backend-config="bucket=pierre-mcp-dev-terraform-state" + +# Back to scripts directory +cd ../scripts + +# Generate plan +./deploy-terraform.sh dev plan + +# Review the plan output, then apply +./deploy-terraform.sh dev apply +``` + +### Step 8: Verify Deployment + +```bash +# Get service URL +cd ../terraform +SERVICE_URL=$(terraform output -raw cloud_run_service_url) +echo "Service URL: $SERVICE_URL" + +# Test health endpoint +curl $SERVICE_URL/health +# Expected: {"status":"ok"} + +# Test MCP tools endpoint +curl -H "Authorization: Bearer YOUR_JWT_TOKEN" $SERVICE_URL/mcp/tools +``` + +### Step 9: View Logs + +```bash +# Real-time logs +gcloud logging tail "resource.type=cloud_run_revision" + +# Last 50 logs +gcloud logging read \ + "resource.type=cloud_run_revision AND resource.labels.service_name=pierre-mcp-server" \ + --limit 50 \ + --format json +``` + 
+## Next Steps + +### 1. Register MCP Client (Claude Desktop) + +Add to `~/Library/Application Support/Claude/claude_desktop_config.json`: + +```json +{ + "mcpServers": { + "pierre-fitness": { + "command": "npx", + "args": [ + "-y", + "pierre-mcp-client@next", + "--server", + "YOUR_CLOUD_RUN_URL" + ] + } + } +} +``` + +### 2. Create Admin User + +```bash +SERVICE_URL=$(cd ../terraform && terraform output -raw cloud_run_service_url) + +curl -X POST $SERVICE_URL/admin/setup \ + -H "Content-Type: application/json" \ + -d '{ + "email": "admin@example.com", + "password": "SecurePass123!", + "display_name": "Admin User" + }' +``` + +### 3. Test OAuth Flow + +```bash +# Navigate to OAuth initiation endpoint +open "$SERVICE_URL/api/oauth/auth/strava/YOUR_USER_ID" + +# Complete authorization in browser +# Check status +curl "$SERVICE_URL/api/oauth/status" \ + -H "Authorization: Bearer YOUR_JWT_TOKEN" +``` + +### 4. Set Up Continuous Deployment + +```bash +# Create Cloud Build trigger +gcloud builds triggers create github \ + --name=pierre-mcp-dev-deploy \ + --repo-name=pierre_mcp_server \ + --repo-owner=Async-IO \ + --branch-pattern="^main$" \ + --build-config=gcp/cloudbuild/cloudbuild.yaml \ + --substitutions=_ENVIRONMENT=dev +``` + +Now every push to `main` branch will automatically deploy to dev! + +## Troubleshooting + +### Container fails to start + +```bash +# Check logs +gcloud logging read \ + "resource.type=cloud_run_revision AND severity>=ERROR" \ + --limit 20 + +# Common issues: +# 1. Database connection failed → Check VPC connector +# 2. Secret access denied → Verify IAM permissions +# 3. 
Port mismatch → Ensure HTTP_PORT=8081 +``` + +### Database connection issues + +```bash +# Check Cloud SQL status +gcloud sql instances describe pierre-mcp-server-postgres-dev + +# Check VPC connector +gcloud compute networks vpc-access connectors describe \ + serverless-connector-dev \ + --region us-central1 +``` + +### Terraform errors + +```bash +# Refresh state +cd gcp/scripts +./deploy-terraform.sh dev refresh + +# Force unlock (if stuck) +cd ../terraform +terraform force-unlock LOCK_ID + +# Destroy and recreate (dev only!) +cd ../scripts +./deploy-terraform.sh dev destroy +./deploy-terraform.sh dev apply +``` + +## Clean Up + +To avoid charges, destroy resources when not needed: + +```bash +cd gcp/scripts +./deploy-terraform.sh dev destroy +``` + +This will: +- Delete Cloud Run service +- Delete Cloud SQL instance +- Delete VPC and networking +- Keep Terraform state and Artifact Registry + +To delete everything: +```bash +# Delete project (CAUTION: Irreversible!) +gcloud projects delete pierre-mcp-dev +``` + +## Cost Monitoring + +```bash +# View current month costs +gcloud billing projects describe pierre-mcp-dev \ + --format="value(billingAccountName)" + +# Set budget alert (via console) +# Navigate to: Billing → Budgets & alerts +# Set alert at $50/month for dev +``` + +## Support + +- **Documentation**: [Full Deployment Guide](./DEPLOYMENT_GUIDE.md) +- **Architecture**: [Architecture Overview](./ARCHITECTURE.md) +- **Issues**: https://github.com/Async-IO/pierre_mcp_server/issues +- **Discussions**: https://github.com/Async-IO/pierre_mcp_server/discussions + +## What's Next? + +1. **Staging Environment**: Repeat steps for `pierre-mcp-staging` +2. **Custom Domain**: Configure Cloud Run custom domain +3. **Monitoring Dashboards**: Set up Cloud Monitoring dashboards +4. **Load Testing**: Test with k6 or Locust +5. 
**Production Deployment**: Follow [Production Deployment Guide](./DEPLOYMENT_GUIDE.md#production-deployment) diff --git a/gcp/scripts/deploy-terraform.sh b/gcp/scripts/deploy-terraform.sh new file mode 100755 index 00000000..3927ce19 --- /dev/null +++ b/gcp/scripts/deploy-terraform.sh @@ -0,0 +1,198 @@ +#!/usr/bin/env bash +# Deploy Infrastructure with Terraform +# Purpose: Simplified Terraform deployment wrapper with safety checks +# Usage: ./deploy-terraform.sh + +set -euo pipefail + +# Colors +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# Script arguments +ENVIRONMENT="${1:-}" +ACTION="${2:-plan}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +TERRAFORM_DIR="$(cd "$SCRIPT_DIR/../terraform" && pwd)" + +# Validate arguments +if [ -z "$ENVIRONMENT" ]; then + echo -e "${RED}❌ Error: Environment required${NC}" + echo "Usage: $0 [action]" + echo "" + echo "Environments: dev, staging, production" + echo "Actions: plan, apply, destroy, output" + echo "" + echo "Examples:" + echo " $0 dev plan # Preview changes for dev" + echo " $0 staging apply # Apply changes to staging" + echo " $0 production output # Show production outputs" + exit 1 +fi + +if [[ ! "$ENVIRONMENT" =~ ^(dev|staging|production)$ ]]; then + echo -e "${RED}❌ Error: Invalid environment '$ENVIRONMENT'${NC}" + echo "Valid environments: dev, staging, production" + exit 1 +fi + +if [[ ! "$ACTION" =~ ^(plan|apply|destroy|output|refresh)$ ]]; then + echo -e "${RED}❌ Error: Invalid action '$ACTION'${NC}" + echo "Valid actions: plan, apply, destroy, output, refresh" + exit 1 +fi + +TFVARS_FILE="$TERRAFORM_DIR/environments/$ENVIRONMENT/terraform.tfvars" + +if [ ! 
-f "$TFVARS_FILE" ]; then + echo -e "${RED}❌ Error: Terraform variables file not found${NC}" + echo "Expected: $TFVARS_FILE" + exit 1 +fi + +echo -e "${BLUE}========================================${NC}" +echo -e "${BLUE}Pierre MCP Server - Terraform Deployment${NC}" +echo -e "${BLUE}========================================${NC}" +echo "" +echo "Environment: $ENVIRONMENT" +echo "Action: $ACTION" +echo "Variables: $TFVARS_FILE" +echo "Directory: $TERRAFORM_DIR" +echo "" + +# Change to Terraform directory +cd "$TERRAFORM_DIR" + +# Check Terraform version +TERRAFORM_VERSION=$(terraform version -json | jq -r '.terraform_version') +echo -e "${YELLOW}📦 Terraform version: $TERRAFORM_VERSION${NC}" + +# Initialize Terraform if needed +if [ ! -d ".terraform" ]; then + echo -e "${YELLOW}🔧 Initializing Terraform...${NC}" + terraform init +fi + +# Extract project ID from tfvars +PROJECT_ID=$(grep '^project_id' "$TFVARS_FILE" | sed 's/.*=\s*"\(.*\)"/\1/') +echo -e "${YELLOW}📋 GCP Project: $PROJECT_ID${NC}" + +# Set active GCP project +gcloud config set project "$PROJECT_ID" --quiet + +# Validate Terraform configuration +echo -e "${YELLOW}✅ Validating Terraform configuration...${NC}" +terraform validate + +# Execute Terraform action +case "$ACTION" in + plan) + echo -e "${YELLOW}📊 Generating execution plan...${NC}" + terraform plan \ + -var-file="$TFVARS_FILE" \ + -out="terraform-$ENVIRONMENT.tfplan" + + echo "" + echo -e "${GREEN}✅ Plan generated successfully!${NC}" + echo "" + echo "To apply this plan:" + echo " $0 $ENVIRONMENT apply" + ;; + + apply) + # Safety check for production + if [ "$ENVIRONMENT" == "production" ]; then + echo -e "${RED}⚠️ WARNING: You are about to modify PRODUCTION infrastructure!${NC}" + read -p "Type 'production' to confirm: " confirm + if [ "$confirm" != "production" ]; then + echo "Deployment cancelled" + exit 1 + fi + fi + + echo -e "${YELLOW}🚀 Applying infrastructure changes...${NC}" + + # Check if plan file exists + if [ -f 
"terraform-$ENVIRONMENT.tfplan" ]; then + echo "Using existing plan file..." + terraform apply "terraform-$ENVIRONMENT.tfplan" + rm -f "terraform-$ENVIRONMENT.tfplan" + else + echo "No plan file found, running apply with auto-approve..." + terraform apply \ + -var-file="$TFVARS_FILE" \ + -auto-approve + fi + + echo "" + echo -e "${GREEN}========================================${NC}" + echo -e "${GREEN}✅ Infrastructure deployed successfully!${NC}" + echo -e "${GREEN}========================================${NC}" + echo "" + + # Show important outputs + echo -e "${YELLOW}📋 Deployment Outputs:${NC}" + terraform output -json | jq -r 'to_entries[] | "\(.key) = \(.value.value)"' | grep -E '(service_url|database_connection|health_check)' || true + + echo "" + echo -e "${YELLOW}Next steps:${NC}" + echo "1. Test the deployment:" + echo " SERVICE_URL=\$(terraform output -raw cloud_run_service_url)" + echo " curl \$SERVICE_URL/health" + echo "" + echo "2. View logs:" + echo " gcloud logging read \"resource.type=cloud_run_revision\" --limit 50" + echo "" + echo "3. Deploy new version:" + echo " gcloud builds submit --config=../cloudbuild/cloudbuild.yaml" + echo "" + ;; + + destroy) + echo -e "${RED}⚠️ WARNING: You are about to DESTROY infrastructure!${NC}" + echo "Environment: $ENVIRONMENT" + echo "" + + if [ "$ENVIRONMENT" == "production" ]; then + echo -e "${RED}🚨 PRODUCTION DESTRUCTION BLOCKED${NC}" + echo "Destroying production requires manual intervention." + echo "If you really need to destroy production:" + echo "1. Remove deletion_protection from terraform.tfvars" + echo "2. 
Run: terraform destroy -var-file=$TFVARS_FILE" + exit 1 + fi + + read -p "Type '$ENVIRONMENT' to confirm destruction: " confirm + if [ "$confirm" != "$ENVIRONMENT" ]; then + echo "Destruction cancelled" + exit 1 + fi + + echo -e "${YELLOW}💥 Destroying infrastructure...${NC}" + terraform destroy \ + -var-file="$TFVARS_FILE" \ + -auto-approve + + echo "" + echo -e "${GREEN}✅ Infrastructure destroyed${NC}" + ;; + + output) + echo -e "${YELLOW}📋 Infrastructure Outputs:${NC}" + terraform output + ;; + + refresh) + echo -e "${YELLOW}🔄 Refreshing Terraform state...${NC}" + terraform refresh -var-file="$TFVARS_FILE" + echo -e "${GREEN}✅ State refreshed${NC}" + ;; +esac + +echo "" +echo -e "${BLUE}========================================${NC}" +echo -e "${BLUE}Deployment complete!${NC}" +echo -e "${BLUE}========================================${NC}" diff --git a/gcp/scripts/setup-gcp-project.sh b/gcp/scripts/setup-gcp-project.sh new file mode 100755 index 00000000..9beb7254 --- /dev/null +++ b/gcp/scripts/setup-gcp-project.sh @@ -0,0 +1,188 @@ +#!/usr/bin/env bash +# Setup GCP Project for Pierre MCP Server +# Purpose: One-time setup of GCP project, APIs, service accounts, and permissions +# Usage: ./setup-gcp-project.sh + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Script arguments +PROJECT_ID="${1:-}" +ENVIRONMENT="${2:-dev}" +REGION="${3:-us-central1}" + +if [ -z "$PROJECT_ID" ]; then + echo -e "${RED}❌ Error: Project ID required${NC}" + echo "Usage: $0 [environment] [region]" + echo "Example: $0 pierre-mcp-dev dev us-central1" + exit 1 +fi + +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}Pierre MCP Server - GCP Project Setup${NC}" +echo -e "${GREEN}========================================${NC}" +echo "" +echo "Project ID: $PROJECT_ID" +echo "Environment: $ENVIRONMENT" +echo "Region: $REGION" +echo "" + +# Confirm with user +read -p "Continue with 
setup? (y/n) " -n 1 -r +echo "" +if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Setup cancelled" + exit 1 +fi + +# Set active project +echo -e "${YELLOW}📋 Setting active GCP project...${NC}" +gcloud config set project "$PROJECT_ID" + +# Enable required APIs +echo -e "${YELLOW}🔌 Enabling required GCP APIs (this may take 2-3 minutes)...${NC}" +gcloud services enable \ + run.googleapis.com \ + sqladmin.googleapis.com \ + compute.googleapis.com \ + vpcaccess.googleapis.com \ + servicenetworking.googleapis.com \ + secretmanager.googleapis.com \ + cloudresourcemanager.googleapis.com \ + iam.googleapis.com \ + logging.googleapis.com \ + monitoring.googleapis.com \ + artifactregistry.googleapis.com \ + cloudbuild.googleapis.com + +echo -e "${GREEN}✅ APIs enabled${NC}" + +# Create Artifact Registry repository +echo -e "${YELLOW}📦 Creating Artifact Registry repository...${NC}" +if ! gcloud artifacts repositories describe pierre-mcp --location="$REGION" &>/dev/null; then + gcloud artifacts repositories create pierre-mcp \ + --repository-format=docker \ + --location="$REGION" \ + --description="Pierre MCP Server container images" + echo -e "${GREEN}✅ Artifact Registry repository created${NC}" +else + echo -e "${YELLOW}ℹ️ Artifact Registry repository already exists${NC}" +fi + +# Create service account for Cloud Run +echo -e "${YELLOW}👤 Creating Cloud Run service account...${NC}" +SA_NAME="pierre-mcp-server-runner-$ENVIRONMENT" +SA_EMAIL="$SA_NAME@$PROJECT_ID.iam.gserviceaccount.com" + +if ! 
gcloud iam service-accounts describe "$SA_EMAIL" &>/dev/null; then + gcloud iam service-accounts create "$SA_NAME" \ + --display-name="Cloud Run service account for Pierre MCP ($ENVIRONMENT)" \ + --description="Service account with least-privilege access" + echo -e "${GREEN}✅ Service account created: $SA_EMAIL${NC}" +else + echo -e "${YELLOW}ℹ️ Service account already exists${NC}" +fi + +# Grant IAM roles to service account +echo -e "${YELLOW}🔐 Granting IAM roles to service account...${NC}" +for role in \ + "roles/cloudsql.client" \ + "roles/secretmanager.secretAccessor" \ + "roles/logging.logWriter" \ + "roles/monitoring.metricWriter" \ + "roles/cloudtrace.agent"; do + + gcloud projects add-iam-policy-binding "$PROJECT_ID" \ + --member="serviceAccount:$SA_EMAIL" \ + --role="$role" \ + --condition=None \ + --quiet +done +echo -e "${GREEN}✅ IAM roles granted${NC}" + +# Configure Cloud Build service account permissions +echo -e "${YELLOW}🏗️ Configuring Cloud Build permissions...${NC}" +PROJECT_NUMBER=$(gcloud projects describe "$PROJECT_ID" --format="value(projectNumber)") +CLOUD_BUILD_SA="$PROJECT_NUMBER@cloudbuild.gserviceaccount.com" + +for role in \ + "roles/run.admin" \ + "roles/iam.serviceAccountUser"; do + + gcloud projects add-iam-policy-binding "$PROJECT_ID" \ + --member="serviceAccount:$CLOUD_BUILD_SA" \ + --role="$role" \ + --condition=None \ + --quiet +done +echo -e "${GREEN}✅ Cloud Build permissions configured${NC}" + +# Create GCS bucket for Terraform state +echo -e "${YELLOW}🪣 Creating GCS bucket for Terraform state...${NC}" +BUCKET_NAME="$PROJECT_ID-terraform-state" + +if ! 
gsutil ls -b "gs://$BUCKET_NAME" &>/dev/null; then + gsutil mb -p "$PROJECT_ID" -l "$REGION" "gs://$BUCKET_NAME" + gsutil versioning set on "gs://$BUCKET_NAME" + gsutil uniformbucketlevelaccess set on "gs://$BUCKET_NAME" + echo -e "${GREEN}✅ Terraform state bucket created: gs://$BUCKET_NAME${NC}" +else + echo -e "${YELLOW}ℹ️ Terraform state bucket already exists${NC}" +fi + +# Create initial secrets in Secret Manager +echo -e "${YELLOW}🔑 Creating Secret Manager secrets...${NC}" + +create_secret_if_not_exists() { + local secret_name=$1 + local secret_value=$2 + + if ! gcloud secrets describe "$secret_name" &>/dev/null; then + echo "$secret_value" | gcloud secrets create "$secret_name" \ + --data-file=- \ + --replication-policy="automatic" + echo -e "${GREEN}✅ Created secret: $secret_name${NC}" + else + echo -e "${YELLOW}ℹ️ Secret already exists: $secret_name${NC}" + fi +} + +# Generate master encryption key (base64-encoded 32-byte key) +MASTER_KEY=$(openssl rand -base64 32) +create_secret_if_not_exists "pierre-mcp-server-master-encryption-key-$ENVIRONMENT" "$MASTER_KEY" + +echo -e "${YELLOW}ℹ️ Note: OAuth secrets (Strava, Garmin, Fitbit, OpenWeather) should be added manually:${NC}" +echo "" +echo " gcloud secrets create pierre-mcp-server-strava-client-secret-$ENVIRONMENT --data-file=- <<< 'YOUR_SECRET'" +echo " gcloud secrets create pierre-mcp-server-garmin-client-secret-$ENVIRONMENT --data-file=- <<< 'YOUR_SECRET'" +echo " gcloud secrets create pierre-mcp-server-fitbit-client-secret-$ENVIRONMENT --data-file=- <<< 'YOUR_SECRET'" +echo " gcloud secrets create pierre-mcp-server-openweather-api-key-$ENVIRONMENT --data-file=- <<< 'YOUR_KEY'" +echo "" + +# Summary +echo "" +echo -e "${GREEN}========================================${NC}" +echo -e "${GREEN}✅ GCP Project Setup Complete!${NC}" +echo -e "${GREEN}========================================${NC}" +echo "" +echo "Project ID: $PROJECT_ID" +echo "Region: $REGION" +echo "Service Account: $SA_EMAIL" +echo "Artifact 
Registry: $REGION-docker.pkg.dev/$PROJECT_ID/pierre-mcp" +echo "Terraform Bucket: gs://$BUCKET_NAME" +echo "" +echo -e "${YELLOW}Next Steps:${NC}" +echo "1. Add OAuth secrets to Secret Manager (see commands above)" +echo "2. Configure Terraform backend:" +echo " cd gcp/terraform" +echo " terraform init -backend-config=\"bucket=$BUCKET_NAME\"" +echo "3. Review and update terraform.tfvars in gcp/terraform/environments/$ENVIRONMENT/" +echo "4. Run Terraform:" +echo " terraform plan -var-file=environments/$ENVIRONMENT/terraform.tfvars" +echo " terraform apply -var-file=environments/$ENVIRONMENT/terraform.tfvars" +echo "" diff --git a/gcp/terraform/README.md b/gcp/terraform/README.md new file mode 100644 index 00000000..0b14a8a3 --- /dev/null +++ b/gcp/terraform/README.md @@ -0,0 +1,460 @@ +# Pierre MCP Server - Terraform Infrastructure + +Infrastructure as Code (IaC) for deploying Pierre MCP Server on Google Cloud Platform. + +## Directory Structure + +``` +terraform/ +├── main.tf # Primary infrastructure resources +├── variables.tf # Input variable definitions +├── outputs.tf # Output value definitions +├── versions.tf # Terraform and provider versions +├── backend.tf # Remote state configuration +├── environments/ # Environment-specific configurations +│ ├── dev/ +│ │ └── terraform.tfvars +│ ├── staging/ +│ │ └── terraform.tfvars +│ └── production/ +│ └── terraform.tfvars +└── README.md # This file +``` + +## Prerequisites + +1. **Terraform CLI** (v1.6+): + ```bash + brew install terraform # macOS + # or download from https://www.terraform.io/downloads + ``` + +2. **Google Cloud SDK**: + ```bash + curl https://sdk.cloud.google.com | bash + gcloud auth application-default login + ``` + +3. **GCP Project Setup**: + ```bash + # Run the setup script first + ../scripts/setup-gcp-project.sh PROJECT_ID ENVIRONMENT REGION + ``` + +## Quick Start + +### 1. 
Initialize Terraform + +```bash +cd terraform + +# Initialize with remote state backend +terraform init -backend-config="bucket=YOUR_PROJECT_ID-terraform-state" +``` + +### 2. Configure Variables + +Edit the appropriate environment file: + +```bash +vim environments/dev/terraform.tfvars +``` + +Required variables: +- `project_id`: GCP project ID +- `container_image`: Docker image URL from Artifact Registry +- `strava_client_id`, `garmin_client_id`, `fitbit_client_id`: OAuth app IDs + +### 3. Plan Deployment + +```bash +terraform plan -var-file=environments/dev/terraform.tfvars -out=dev.tfplan +``` + +Review the plan output carefully! + +### 4. Apply Changes + +```bash +terraform apply dev.tfplan +``` + +Or use the deployment script: + +```bash +cd ../scripts +./deploy-terraform.sh dev apply +``` + +## Resources Created + +### Networking +- **VPC Network**: Custom VPC with private subnets +- **Subnet**: Regional subnet (10.0.0.0/24) +- **Serverless VPC Connector**: Bridge Cloud Run ↔ Cloud SQL +- **Cloud Router + NAT**: Outbound connectivity for external APIs +- **Private VPC Connection**: For Cloud SQL private IP + +### Compute +- **Cloud Run Service**: Serverless container deployment + - Auto-scaling (0-100 instances) + - CPU: 1-2 vCPU + - Memory: 512Mi-2Gi + - Concurrency: 80 requests/instance + - Health checks: `/health` endpoint + +### Database +- **Cloud SQL Instance**: PostgreSQL 16 + - Tier: db-f1-micro (dev) to db-custom-4-16384 (prod) + - Private IP only + - Automated backups (daily at 3 AM UTC) + - Point-in-time recovery + - High availability (production only) + +### Security +- **Service Account**: `pierre-mcp-server-runner-{env}` + - IAM roles for Cloud SQL, Secret Manager, Logging +- **Secret Manager Secrets**: + - Database password (auto-generated) + - Master encryption key + - OAuth client secrets + - External API keys + +### Monitoring +- **Uptime Check**: `/health` endpoint monitoring +- **Alert Policy**: Service downtime notifications +- 
**Notification Channel**: Email alerts + +## Variables Reference + +### Required Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `project_id` | GCP project ID | `pierre-mcp-dev` | +| `container_image` | Docker image URL | `gcr.io/PROJECT/image:tag` | +| `environment` | Environment name | `dev`, `staging`, `production` | + +### Optional Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `region` | `us-central1` | GCP region | +| `cloud_run_cpu` | `"1"` | CPU allocation | +| `cloud_run_memory` | `"512Mi"` | Memory allocation | +| `cloud_run_min_instances` | `0` | Minimum instances | +| `cloud_run_max_instances` | `100` | Maximum instances | +| `database_tier` | `db-f1-micro` | Cloud SQL tier | +| `database_high_availability` | `false` | Enable HA | +| `enable_uptime_checks` | `true` | Enable monitoring | +| `alert_email` | `""` | Alert recipient | + +See `variables.tf` for complete list. + +## Outputs + +After successful deployment: + +```bash +terraform output +``` + +Key outputs: +- `cloud_run_service_url`: Public URL for the service +- `database_connection_name`: Cloud SQL connection string +- `health_check_url`: Health endpoint URL +- `deployment_instructions`: Quick reference guide + +## Managing Secrets + +### Add Secrets via Terraform + +Edit `environments/{env}/terraform.tfvars`: + +```hcl +secrets = { + strava_client_secret = "your-secret" + garmin_client_secret = "your-secret" + fitbit_client_secret = "your-secret" + openweather_api_key = "your-key" +} +``` + +**Warning**: Never commit secrets to git! Use `.gitignore` or environment variables. + +### Add Secrets via gcloud CLI (Recommended) + +```bash +echo "your-secret" | \ + gcloud secrets create pierre-mcp-server-strava-client-secret-dev \ + --data-file=- +``` + +Terraform will automatically detect and use existing secrets. 
+ +## State Management + +### Remote State Backend + +Terraform state is stored in Google Cloud Storage: + +```hcl +# backend.tf +terraform { + backend "gcs" { + bucket = "PROJECT_ID-terraform-state" + prefix = "pierre-mcp-server" + } +} +``` + +### State Locking + +GCS backend provides automatic state locking to prevent concurrent modifications. + +### Viewing State + +```bash +# List all resources +terraform state list + +# Show specific resource +terraform state show google_cloud_run_service.pierre_mcp_server + +# Pull state locally (read-only) +terraform state pull > current-state.json +``` + +### Importing Existing Resources + +If resources were created manually: + +```bash +terraform import \ + google_cloud_run_service.pierre_mcp_server \ + projects/PROJECT_ID/locations/REGION/services/SERVICE_NAME +``` + +## Multi-Environment Management + +### Workspace Strategy (Not Recommended) + +We use separate state files per environment instead of Terraform workspaces. + +### Environment Isolation + +Each environment has: +- ✅ Separate GCP project +- ✅ Separate state file (via `terraform.tfvars`) +- ✅ Separate resource naming (`{resource}-{env}`) +- ✅ Separate IAM permissions + +### Deploying Multiple Environments + +```bash +# Development +terraform apply -var-file=environments/dev/terraform.tfvars + +# Staging +terraform apply -var-file=environments/staging/terraform.tfvars + +# Production +terraform apply -var-file=environments/production/terraform.tfvars +``` + +## Common Operations + +### Update Container Image + +```bash +# Option 1: Update tfvars and re-apply +vim environments/dev/terraform.tfvars +# Change: container_image = "...new-tag" +terraform apply -var-file=environments/dev/terraform.tfvars + +# Option 2: Use gcloud directly (faster) +gcloud run deploy pierre-mcp-server \ + --image=NEW_IMAGE_URL \ + --region=us-central1 +``` + +### Scale Cloud Run + +```bash +# Via Terraform: Edit tfvars +cloud_run_min_instances = 2 +cloud_run_max_instances = 200 + +# 
Via gcloud (immediate) +gcloud run services update pierre-mcp-server \ + --region=us-central1 \ + --min-instances=2 \ + --max-instances=200 +``` + +### Upgrade Database Tier + +```bash +# Edit tfvars +database_tier = "db-custom-4-16384" + +# Apply (will cause brief downtime) +terraform apply -var-file=environments/production/terraform.tfvars +``` + +### Add New Secret + +```bash +# Create secret +gcloud secrets create pierre-mcp-server-new-secret-dev \ + --data-file=- + +# Grant access +gcloud secrets add-iam-policy-binding \ + pierre-mcp-server-new-secret-dev \ + --member="serviceAccount:SERVICE_ACCOUNT@PROJECT.iam.gserviceaccount.com" \ + --role="roles/secretmanager.secretAccessor" +``` + +## Troubleshooting + +### Plan Fails with API Errors + +```bash +# Enable required APIs +gcloud services enable run.googleapis.com sqladmin.googleapis.com ... + +# Or run setup script +../scripts/setup-gcp-project.sh PROJECT_ID ENV REGION +``` + +### State Lock Stuck + +```bash +terraform force-unlock LOCK_ID +``` + +### Resource Already Exists + +```bash +# Import existing resource +terraform import RESOURCE_TYPE.NAME RESOURCE_ID + +# Or delete manually +gcloud run services delete SERVICE_NAME --region=REGION +``` + +### Database Won't Delete (Protection) + +For production, `deletion_protection = true` prevents accidental deletion. + +To delete: +```bash +# Option 1: Disable protection +database_deletion_protection = false +terraform apply + +# Option 2: Delete manually +gcloud sql instances delete INSTANCE_NAME +``` + +## Best Practices + +### 1. Never Commit Secrets + +Add to `.gitignore`: +``` +*.tfvars +!environments/*/terraform.tfvars.example +*.tfstate +*.tfstate.backup +.terraform/ +``` + +### 2. Use Separate Projects per Environment + +``` +pierre-mcp-dev +pierre-mcp-staging +pierre-mcp-prod +``` + +### 3. Tag Resources + +```hcl +labels = { + environment = "production" + managed_by = "terraform" + team = "platform" + cost_center = "engineering" +} +``` + +### 4. 
Enable Deletion Protection + +```hcl +deletion_protection = var.environment == "production" ? true : false +``` + +### 5. Use Variables for Everything + +Never hardcode values in `main.tf`. + +### 6. Document Changes + +```bash +git commit -m "infra: increase Cloud Run max instances to 200" +``` + +## Cost Estimation + +Before applying: + +```bash +# Install cost estimation tool +terraform plan -out=plan.tfplan +terraform show -json plan.tfplan | infracost breakdown --path - +``` + +## Terraform Modules (Future) + +For reusability across projects, consider extracting into modules: + +``` +modules/ +├── cloud-run/ +├── cloud-sql/ +├── networking/ +└── monitoring/ +``` + +## CI/CD Integration + +### GitHub Actions Example + +```yaml +- name: Terraform Apply + run: | + cd gcp/terraform + terraform init -backend-config="bucket=${{ secrets.TF_STATE_BUCKET }}" + terraform apply -var-file=environments/${{ matrix.env }}/terraform.tfvars -auto-approve +``` + +### Cloud Build Integration + +```yaml +steps: + - name: 'hashicorp/terraform' + args: ['init', '-backend-config=bucket=$_TF_STATE_BUCKET'] + dir: 'gcp/terraform' + + - name: 'hashicorp/terraform' + args: ['apply', '-var-file=environments/$_ENVIRONMENT/terraform.tfvars', '-auto-approve'] + dir: 'gcp/terraform' +``` + +## Support + +- **Terraform Registry**: https://registry.terraform.io/providers/hashicorp/google/latest/docs +- **GCP Terraform Examples**: https://github.com/terraform-google-modules +- **Project Issues**: https://github.com/Async-IO/pierre_mcp_server/issues diff --git a/gcp/terraform/backend.tf b/gcp/terraform/backend.tf new file mode 100644 index 00000000..7aebe8d2 --- /dev/null +++ b/gcp/terraform/backend.tf @@ -0,0 +1,23 @@ +# Terraform remote state configuration +# Stores state in GCS bucket for team collaboration and state locking +# +# IMPORTANT: Create the bucket manually before running terraform init: +# gsutil mb -p PROJECT_ID -l REGION gs://PROJECT_ID-terraform-state +# gsutil versioning set 
on gs://PROJECT_ID-terraform-state
+#
+# Then initialize with:
+#   terraform init -backend-config="bucket=PROJECT_ID-terraform-state"
+
+terraform {
+  backend "gcs" {
+    # bucket = "REPLACE_WITH_YOUR_PROJECT_ID-terraform-state" # Set via -backend-config
+    prefix = "pierre-mcp-server"
+  }
+}
+
+# Alternative: Local backend for testing (NOT for production)
+# terraform {
+#   backend "local" {
+#     path = "terraform.tfstate"
+#   }
+# }
diff --git a/gcp/terraform/environments/dev/terraform.tfvars b/gcp/terraform/environments/dev/terraform.tfvars
new file mode 100644
index 00000000..5783bf51
--- /dev/null
+++ b/gcp/terraform/environments/dev/terraform.tfvars
@@ -0,0 +1,70 @@
+# Development Environment Configuration
+# Purpose: Local development testing, rapid iteration
+# Cost: ~$75-90/month
+
+# GCP Project Configuration
+project_id = "pierre-mcp-dev" # REPLACE with your actual GCP project ID
+region = "us-central1"
+zone = "us-central1-a"
+environment = "dev"
+
+# Cloud Run Configuration (Minimal resources for dev)
+service_name = "pierre-mcp-server"
+container_image = "us-central1-docker.pkg.dev/pierre-mcp-dev/pierre-mcp/pierre-mcp-server:latest" # REPLACE — Artifact Registry path; gcr.io (Container Registry) is deprecated and setup-gcp-project.sh creates the pierre-mcp repo here
+cloud_run_cpu = "1"
+cloud_run_memory = "512Mi"
+cloud_run_min_instances = 0 # Scale to zero when not in use
+cloud_run_max_instances = 10
+cloud_run_concurrency = 80
+cloud_run_timeout = 300
+
+# Cloud SQL Configuration (Smallest tier for dev)
+database_name = "pierre_mcp_server"
+database_user = "pierre"
+database_tier = "db-f1-micro" # Shared CPU, 0.6GB RAM
+database_disk_size = 10 # 10GB minimum
+database_disk_type = "PD_SSD"
+database_backup_enabled = true
+database_backup_retention_days = 3 # Keep 3 days of backups
+database_high_availability = false # No HA for dev
+database_private_network = true
+
+# Networking Configuration
+vpc_name = "pierre-vpc"
+subnet_cidr = "10.0.0.0/24"
+serverless_connector_cidr = "10.8.0.0/28"
+
+# OAuth Provider Configuration (Use test credentials)
+strava_client_id = "your-dev-strava-client-id"
+strava_redirect_uri = "" # Will auto-generate from Cloud Run URL
+
+garmin_client_id = ""
+garmin_redirect_uri = ""
+
+fitbit_client_id = ""
+fitbit_redirect_uri = ""
+
+# Secrets (Store in Secret Manager via terraform apply)
+# secrets = {
+#   strava_client_secret = "your-strava-secret"
+#   garmin_client_secret = "your-garmin-secret"
+#   fitbit_client_secret = "your-fitbit-secret"
+#   openweather_api_key = "your-openweather-key"
+# }
+
+# Monitoring & Alerting
+enable_uptime_checks = true
+alert_email = "devteam@example.com" # REPLACE
+
+# Security
+enable_cloud_armor = false # Not needed for dev
+allowed_ingress_cidrs = ["0.0.0.0/0"] # Open to internet for testing
+
+# Resource Labels
+labels = {
+  environment = "dev"
+  managed_by = "terraform"
+  application = "pierre-mcp-server"
+  team = "platform"
+  cost_center = "engineering"
+}
diff --git a/gcp/terraform/environments/production/terraform.tfvars b/gcp/terraform/environments/production/terraform.tfvars
new file mode 100644
index 00000000..9f37aa6f
--- /dev/null
+++ b/gcp/terraform/environments/production/terraform.tfvars
@@ -0,0 +1,92 @@
+# Production Environment Configuration
+# Purpose: Live production workload serving real users
+# Cost: ~$500-1500/month (scales with usage)
+
+# GCP Project Configuration
+project_id = "pierre-mcp-prod" # REPLACE with your actual GCP project ID
+region = "us-central1"
+zone = "us-central1-a"
+environment = "production"
+
+# Cloud Run Configuration (High availability, auto-scaling)
+service_name = "pierre-mcp-server"
+container_image = "us-central1-docker.pkg.dev/pierre-mcp-prod/pierre-mcp/pierre-mcp-server:v1.0.0" # REPLACE with tagged version — Artifact Registry path (gcr.io is deprecated; matches repo created by setup-gcp-project.sh)
+cloud_run_cpu = "2"
+cloud_run_memory = "2Gi"
+cloud_run_min_instances = 2 # Always have 2 instances for redundancy
+cloud_run_max_instances = 100
+cloud_run_concurrency = 80
+cloud_run_timeout = 300
+
+# Cloud SQL Configuration (Production-grade)
+database_name = "pierre_mcp_server"
+database_user = "pierre"
+database_tier = "db-custom-4-16384" # 4 vCPU, 16GB RAM
+database_disk_size = 100 # 100GB with auto-resize +database_disk_type = "PD_SSD" +database_backup_enabled = true +database_backup_retention_days = 30 # 30 days for compliance +database_high_availability = true # Regional HA with automatic failover +database_private_network = true + +# Networking Configuration +vpc_name = "pierre-vpc" +subnet_cidr = "10.0.0.0/24" +serverless_connector_cidr = "10.8.0.0/28" + +# OAuth Provider Configuration (Production OAuth apps) +strava_client_id = "your-production-strava-client-id" +strava_redirect_uri = "https://api.pierre-fitness.com/api/oauth/callback/strava" # REPLACE with your domain + +garmin_client_id = "your-production-garmin-client-id" +garmin_redirect_uri = "https://api.pierre-fitness.com/api/oauth/callback/garmin" + +fitbit_client_id = "your-production-fitbit-client-id" +fitbit_redirect_uri = "https://api.pierre-fitness.com/api/oauth/callback/fitbit" + +# Secrets (NEVER commit these! Manage via Secret Manager CLI or console) +# secrets = { +# strava_client_secret = "" # Set via: gcloud secrets versions add ... 
--data-file=- +# garmin_client_secret = "" +# fitbit_client_secret = "" +# openweather_api_key = "" +# } + +# Monitoring & Alerting (Critical for production) +enable_uptime_checks = true +alert_email = "platform-oncall@example.com" # REPLACE with PagerDuty/OpsGenie email + +# Security (Production hardening) +enable_cloud_armor = true # Enable WAF and DDoS protection +allowed_ingress_cidrs = ["0.0.0.0/0"] # Public API, can restrict to known IPs if needed + +# Resource Labels (For cost tracking and governance) +labels = { + environment = "production" + managed_by = "terraform" + application = "pierre-mcp-server" + team = "platform" + cost_center = "product" + compliance = "gdpr-compliant" + sla = "99.9" +} + +# ============================================================================ +# PRODUCTION DEPLOYMENT CHECKLIST +# ============================================================================ +# Before deploying to production: +# +# [ ] Domain configured and DNS pointed to Cloud Run URL +# [ ] SSL certificate provisioned (automatic with Cloud Run custom domains) +# [ ] OAuth apps registered with production callback URLs +# [ ] Secrets stored in Secret Manager (not in tfvars!) 
+# [ ] Database backups tested and verified +# [ ] Monitoring dashboards created +# [ ] Alert notification channels configured (PagerDuty, Slack) +# [ ] Runbooks documented for incident response +# [ ] Load testing completed (1000+ RPS sustained) +# [ ] Security scan passed (OWASP, dependency audit) +# [ ] GDPR/compliance requirements reviewed +# [ ] Disaster recovery plan documented +# [ ] Team trained on deployment procedures +# ============================================================================ diff --git a/gcp/terraform/environments/staging/terraform.tfvars b/gcp/terraform/environments/staging/terraform.tfvars new file mode 100644 index 00000000..77979c57 --- /dev/null +++ b/gcp/terraform/environments/staging/terraform.tfvars @@ -0,0 +1,70 @@ +# Staging Environment Configuration +# Purpose: Pre-production testing, integration testing, QA validation +# Cost: ~$200-300/month + +# GCP Project Configuration +project_id = "pierre-mcp-staging" # REPLACE with your actual GCP project ID +region = "us-central1" +zone = "us-central1-a" +environment = "staging" + +# Cloud Run Configuration (Production-like sizing) +service_name = "pierre-mcp-server" +container_image = "gcr.io/pierre-mcp-staging/pierre-mcp-server:latest" # REPLACE +cloud_run_cpu = "2" +cloud_run_memory = "1Gi" +cloud_run_min_instances = 1 # Always have 1 instance warm +cloud_run_max_instances = 50 +cloud_run_concurrency = 80 +cloud_run_timeout = 300 + +# Cloud SQL Configuration (Mid-tier for staging) +database_name = "pierre_mcp_server" +database_user = "pierre" +database_tier = "db-custom-2-8192" # 2 vCPU, 8GB RAM +database_disk_size = 20 +database_disk_type = "PD_SSD" +database_backup_enabled = true +database_backup_retention_days = 7 +database_high_availability = false # Single zone for staging +database_private_network = true + +# Networking Configuration +vpc_name = "pierre-vpc" +subnet_cidr = "10.0.0.0/24" +serverless_connector_cidr = "10.8.0.0/28" + +# OAuth Provider Configuration (Staging 
OAuth apps) +strava_client_id = "your-staging-strava-client-id" +strava_redirect_uri = "" # Will auto-generate + +garmin_client_id = "your-staging-garmin-client-id" +garmin_redirect_uri = "" + +fitbit_client_id = "your-staging-fitbit-client-id" +fitbit_redirect_uri = "" + +# Secrets (Managed separately via Secret Manager) +# secrets = { +# strava_client_secret = "staging-strava-secret" +# garmin_client_secret = "staging-garmin-secret" +# fitbit_client_secret = "staging-fitbit-secret" +# openweather_api_key = "staging-openweather-key" +# } + +# Monitoring & Alerting +enable_uptime_checks = true +alert_email = "platform-staging-alerts@example.com" # REPLACE + +# Security +enable_cloud_armor = false # Can enable if testing WAF rules +allowed_ingress_cidrs = ["0.0.0.0/0"] # Open for QA team testing + +# Resource Labels +labels = { + environment = "staging" + managed_by = "terraform" + application = "pierre-mcp-server" + team = "platform" + cost_center = "engineering" +} diff --git a/gcp/terraform/main.tf b/gcp/terraform/main.tf new file mode 100644 index 00000000..d35c1793 --- /dev/null +++ b/gcp/terraform/main.tf @@ -0,0 +1,694 @@ +# Pierre MCP Server - Main Terraform Configuration +# Deploys Cloud Run + Cloud SQL + Networking infrastructure on GCP + +# Enable required GCP APIs +resource "google_project_service" "required_apis" { + for_each = toset([ + "run.googleapis.com", # Cloud Run + "sqladmin.googleapis.com", # Cloud SQL + "compute.googleapis.com", # Compute Engine (for VPC) + "vpcaccess.googleapis.com", # Serverless VPC Access + "servicenetworking.googleapis.com", # Service Networking (for private IP) + "secretmanager.googleapis.com", # Secret Manager + "cloudresourcemanager.googleapis.com", # Resource Manager + "iam.googleapis.com", # IAM + "logging.googleapis.com", # Cloud Logging + "monitoring.googleapis.com", # Cloud Monitoring + "artifactregistry.googleapis.com", # Artifact Registry + "cloudbuild.googleapis.com", # Cloud Build + ]) + + service = each.key + 
disable_on_destroy = false + + # Prevent accidental API disabling + lifecycle { + prevent_destroy = false + } +} + +# ============================================================================ +# NETWORKING +# ============================================================================ + +# VPC Network +resource "google_compute_network" "vpc" { + name = "${var.vpc_name}-${var.environment}" + auto_create_subnetworks = false + routing_mode = "REGIONAL" + + depends_on = [google_project_service.required_apis] +} + +# Subnet for Cloud SQL and other resources +resource "google_compute_subnetwork" "subnet" { + name = "${var.vpc_name}-subnet-${var.environment}" + ip_cidr_range = var.subnet_cidr + region = var.region + network = google_compute_network.vpc.id + + private_ip_google_access = true + + log_config { + aggregation_interval = "INTERVAL_10_MIN" + flow_sampling = 0.5 + metadata = "INCLUDE_ALL_METADATA" + } +} + +# Global address for Cloud SQL private IP +resource "google_compute_global_address" "private_ip_address" { + count = var.database_private_network ? 1 : 0 + name = "private-ip-address-${var.environment}" + purpose = "VPC_PEERING" + address_type = "INTERNAL" + prefix_length = 16 + network = google_compute_network.vpc.id +} + +# Private VPC connection for Cloud SQL +resource "google_service_networking_connection" "private_vpc_connection" { + count = var.database_private_network ? 
1 : 0 + network = google_compute_network.vpc.id + service = "servicenetworking.googleapis.com" + reserved_peering_ranges = [google_compute_global_address.private_ip_address[0].name] + + depends_on = [google_project_service.required_apis] +} + +# Serverless VPC Access Connector (Cloud Run → Cloud SQL) +resource "google_vpc_access_connector" "connector" { + name = "serverless-connector-${var.environment}" + region = var.region + network = google_compute_network.vpc.name + ip_cidr_range = var.serverless_connector_cidr + + min_instances = 2 + max_instances = 3 + machine_type = "e2-micro" + + depends_on = [ + google_compute_subnetwork.subnet, + google_project_service.required_apis + ] +} + +# Cloud Router for NAT +resource "google_compute_router" "router" { + name = "cloud-router-${var.environment}" + region = var.region + network = google_compute_network.vpc.id + + bgp { + asn = 64514 + } +} + +# Cloud NAT (for outbound connectivity to Strava, Garmin, Fitbit, etc.) +resource "google_compute_router_nat" "nat" { + name = "cloud-nat-${var.environment}" + router = google_compute_router.router.name + region = var.region + + nat_ip_allocate_option = "AUTO_ONLY" + source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES" + + log_config { + enable = true + filter = "ERRORS_ONLY" + } +} + +# ============================================================================ +# CLOUD SQL (PostgreSQL) +# ============================================================================ + +# Generate random database password +resource "random_password" "db_password" { + length = 32 + special = true +} + +# Cloud SQL Instance +resource "google_sql_database_instance" "postgres" { + name = "${var.service_name}-postgres-${var.environment}" + database_version = "POSTGRES_16" + region = var.region + + settings { + tier = var.database_tier + availability_type = var.database_high_availability ? 
"REGIONAL" : "ZONAL"
    disk_type       = var.database_disk_type
    disk_size       = var.database_disk_size
    disk_autoresize = true

    ip_configuration {
      ipv4_enabled                                  = !var.database_private_network
      private_network                               = var.database_private_network ? google_compute_network.vpc.id : null
      enable_private_path_for_google_cloud_services = var.database_private_network
    }

    backup_configuration {
      enabled                        = var.database_backup_enabled
      start_time                     = "03:00"
      point_in_time_recovery_enabled = true
      transaction_log_retention_days = 7
      backup_retention_settings {
        retained_backups = var.database_backup_retention_days
      }
    }

    maintenance_window {
      day          = 7 # Sunday
      hour         = 3 # 03:00
      update_track = "stable"
    }

    insights_config {
      query_insights_enabled  = true
      query_plans_per_minute  = 5
      query_string_length     = 1024
      record_application_tags = true
    }

    database_flags {
      name  = "max_connections"
      value = "100"
    }

    # Value is in 8 KB pages: 32768 pages == 256 MB, sized for db-custom-2-8192.
    # NOTE(review): this looks too large for db-f1-micro — confirm against
    # var.database_tier before applying in dev/staging.
    database_flags {
      name  = "shared_buffers"
      value = "32768"
    }

    database_flags {
      name  = "log_checkpoints"
      value = "on"
    }

    database_flags {
      name  = "log_connections"
      value = "on"
    }

    database_flags {
      name  = "log_disconnections"
      value = "on"
    }
  }

  deletion_protection = var.environment == "production" ?
true : false

  depends_on = [
    google_service_networking_connection.private_vpc_connection,
    google_project_service.required_apis
  ]
}

# Application database.
resource "google_sql_database" "pierre_db" {
  name     = var.database_name
  instance = google_sql_database_instance.postgres.name
}

# Application database user, credentialed with the generated password above.
resource "google_sql_user" "pierre_user" {
  name     = var.database_user
  instance = google_sql_database_instance.postgres.name
  password = random_password.db_password.result
}

# ============================================================================
# SECRET MANAGER
# ============================================================================

# Database password secret.
resource "google_secret_manager_secret" "db_password" {
  secret_id = "${var.service_name}-db-password-${var.environment}"

  replication {
    auto {}
  }

  labels = merge(var.labels, {
    environment = var.environment
    secret_type = "database"
  })

  depends_on = [google_project_service.required_apis]
}

resource "google_secret_manager_secret_version" "db_password_version" {
  secret      = google_secret_manager_secret.db_password.id
  secret_data = random_password.db_password.result
}

# FIX: independent random material for the master encryption key. Previously
# the key was base64encode(random_password.db_password.result), i.e. the
# database password reused as the encryption key — a single leaked credential
# compromised both. The key must be generated independently.
resource "random_password" "master_encryption_key" {
  length  = 32
  special = false
}

# Master encryption key secret.
resource "google_secret_manager_secret" "master_encryption_key" {
  secret_id = "${var.service_name}-master-encryption-key-${var.environment}"

  replication {
    auto {}
  }

  labels = merge(var.labels, {
    environment = var.environment
    secret_type = "encryption"
  })

  depends_on = [google_project_service.required_apis]
}

resource "google_secret_manager_secret_version" "master_encryption_key_version" {
  secret      = google_secret_manager_secret.master_encryption_key.id
  secret_data = base64encode(random_password.master_encryption_key.result)
}

# OAuth provider secrets (created only when a non-empty value is supplied).
# FIX: use lookup() with a default — direct indexing (var.secrets["..."])
# fails at plan time whenever a key is absent from the map (default is {}).
locals {
  oauth_secrets = {
    strava_client_secret = lookup(var.secrets, "strava_client_secret", "")
    garmin_client_secret =
lookup(var.secrets, "garmin_client_secret", "")
    # FIX: lookup() with a default instead of direct indexing — indexing a map
    # (var.secrets["..."]) fails at plan time when the key is absent.
    fitbit_client_secret = lookup(var.secrets, "fitbit_client_secret", "")
    openweather_api_key  = lookup(var.secrets, "openweather_api_key", "")
  }
}

# One Secret Manager secret per provider credential actually supplied.
resource "google_secret_manager_secret" "secrets" {
  for_each = { for k, v in local.oauth_secrets : k => v if v != "" && v != null }

  secret_id = "${var.service_name}-${each.key}-${var.environment}"

  replication {
    auto {}
  }

  labels = merge(var.labels, {
    environment = var.environment
    secret_type = "oauth"
  })

  depends_on = [google_project_service.required_apis]
}

resource "google_secret_manager_secret_version" "secret_versions" {
  for_each = google_secret_manager_secret.secrets

  secret      = each.value.id
  secret_data = local.oauth_secrets[each.key]
}

# ============================================================================
# IAM & SERVICE ACCOUNTS
# ============================================================================

# Least-privilege identity the Cloud Run revision runs as.
resource "google_service_account" "cloud_run_sa" {
  account_id   = "${var.service_name}-runner-${var.environment}"
  display_name = "Cloud Run service account for Pierre MCP Server (${var.environment})"
  description  = "Service account with least-privilege access for Cloud Run workload"
}

# Allows the service to open Cloud SQL connections.
resource "google_project_iam_member" "cloud_sql_client" {
  project = var.project_id
  role    = "roles/cloudsql.client"
  member  = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

# Read access to the database password secret.
resource "google_secret_manager_secret_iam_member" "db_password_access" {
  secret_id = google_secret_manager_secret.db_password.id
  role      = "roles/secretmanager.secretAccessor"
  member    = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

# Read access to the master encryption key secret.
resource "google_secret_manager_secret_iam_member" "master_key_access" {
  secret_id = google_secret_manager_secret.master_encryption_key.id
  role      =
"roles/secretmanager.secretAccessor"
  member    = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

# Read access to each provisioned OAuth secret.
resource "google_secret_manager_secret_iam_member" "oauth_secrets_access" {
  for_each = google_secret_manager_secret.secrets

  secret_id = each.value.id
  role      = "roles/secretmanager.secretAccessor"
  member    = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

# Observability permissions: logs, metrics, traces.
resource "google_project_iam_member" "log_writer" {
  project = var.project_id
  role    = "roles/logging.logWriter"
  member  = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

resource "google_project_iam_member" "monitoring_metric_writer" {
  project = var.project_id
  role    = "roles/monitoring.metricWriter"
  member  = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

resource "google_project_iam_member" "trace_agent" {
  project = var.project_id
  role    = "roles/cloudtrace.agent"
  member  = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

# ============================================================================
# CLOUD RUN SERVICE
# ============================================================================

resource "google_cloud_run_service" "pierre_mcp_server" {
  name     = var.service_name
  location = var.region

  template {
    spec {
      service_account_name = google_service_account.cloud_run_sa.email

      containers {
        image = var.container_image

        ports {
          name           = "http1"
          container_port = 8081
        }

        resources {
          limits = {
            cpu    = var.cloud_run_cpu
            memory = var.cloud_run_memory
          }
        }

        # Verbose logging outside production.
        env {
          name  = "RUST_LOG"
          value = var.environment == "production" ?
"info" : "debug"
        }

        env {
          name  = "HTTP_PORT"
          value = "8081"
        }

        # NOTE(review): Cloud Run does NOT expand $(DATABASE_PASSWORD) inside
        # env values — the application receives this string literally and must
        # substitute the DATABASE_PASSWORD env var itself. Confirm the server
        # performs that substitution; otherwise inject the full URL as a secret.
        env {
          name  = "DATABASE_URL"
          value = "postgresql://${google_sql_user.pierre_user.name}:$(DATABASE_PASSWORD)@${google_sql_database_instance.postgres.private_ip_address}:5432/${google_sql_database.pierre_db.name}?sslmode=require"
        }

        env {
          name = "DATABASE_PASSWORD"
          value_from {
            secret_key_ref {
              name = google_secret_manager_secret.db_password.secret_id
              key  = "latest"
            }
          }
        }

        env {
          name = "PIERRE_MASTER_ENCRYPTION_KEY"
          value_from {
            secret_key_ref {
              name = google_secret_manager_secret.master_encryption_key.secret_id
              key  = "latest"
            }
          }
        }

        env {
          name  = "PIERRE_RSA_KEY_SIZE"
          value = "4096"
        }

        env {
          name  = "JWT_EXPIRY_HOURS"
          value = "24"
        }

        # --- OAuth provider configuration ---------------------------------
        env {
          name  = "STRAVA_CLIENT_ID"
          value = var.strava_client_id
        }

        dynamic "env" {
          for_each = contains(keys(google_secret_manager_secret.secrets), "strava_client_secret") ? [1] : []
          content {
            name = "STRAVA_CLIENT_SECRET"
            value_from {
              secret_key_ref {
                name = google_secret_manager_secret.secrets["strava_client_secret"].secret_id
                key  = "latest"
              }
            }
          }
        }

        # FIX: the previous default interpolated
        # google_cloud_run_service.pierre_mcp_server.status[0].url inside this
        # same resource — a self-reference cycle Terraform rejects at plan
        # time. The redirect URI is now only injected when explicitly
        # configured; otherwise the application must derive its callback URL
        # from its own base URL at runtime.
        dynamic "env" {
          for_each = var.strava_redirect_uri != "" ? [1] : []
          content {
            name  = "STRAVA_REDIRECT_URI"
            value = var.strava_redirect_uri
          }
        }

        env {
          name  = "GARMIN_CLIENT_ID"
          value = var.garmin_client_id
        }

        dynamic "env" {
          for_each = contains(keys(google_secret_manager_secret.secrets), "garmin_client_secret") ? [1] : []
          content {
            name = "GARMIN_CLIENT_SECRET"
            value_from {
              secret_key_ref {
                name = google_secret_manager_secret.secrets["garmin_client_secret"].secret_id
                key  = "latest"
              }
            }
          }
        }

        # FIX: same self-reference cycle as STRAVA_REDIRECT_URI (see above).
        dynamic "env" {
          for_each = var.garmin_redirect_uri != "" ? [1] : []
          content {
            name  = "GARMIN_REDIRECT_URI"
            value = var.garmin_redirect_uri
          }
        }

        env {
          name  = "FITBIT_CLIENT_ID"
          value = var.fitbit_client_id
        }

        dynamic "env" {
          for_each = contains(keys(google_secret_manager_secret.secrets), "fitbit_client_secret") ? [1] : []
          content {
            name = "FITBIT_CLIENT_SECRET"
            value_from {
              secret_key_ref {
                name = google_secret_manager_secret.secrets["fitbit_client_secret"].secret_id
                key  = "latest"
              }
            }
          }
        }

        # FIX: same self-reference cycle as STRAVA_REDIRECT_URI (see above).
        dynamic "env" {
          for_each = var.fitbit_redirect_uri != "" ? [1] : []
          content {
            name  = "FITBIT_REDIRECT_URI"
            value = var.fitbit_redirect_uri
          }
        }

        # OpenWeather API key (optional).
        dynamic "env" {
          for_each = contains(keys(google_secret_manager_secret.secrets), "openweather_api_key") ? [1] : []
          content {
            name = "OPENWEATHER_API_KEY"
            value_from {
              secret_key_ref {
                name = google_secret_manager_secret.secrets["openweather_api_key"].secret_id
                key  = "latest"
              }
            }
          }
        }

        # Database connection pool settings.
        env {
          name  = "POSTGRES_MAX_CONNECTIONS"
          value = "10"
        }

        env {
          name  = "POSTGRES_MIN_CONNECTIONS"
          value = "2"
        }

        env {
          name  = "POSTGRES_ACQUIRE_TIMEOUT"
          value = "30"
        }

        # Health probes against the server's /health endpoint.
        startup_probe {
          http_get {
            path = "/health"
            port = 8081
          }
          initial_delay_seconds = 10
          timeout_seconds       = 3
          period_seconds        = 10
          failure_threshold     = 3
        }

        liveness_probe {
          http_get {
            path = "/health"
            port = 8081
          }
          initial_delay_seconds = 30
          timeout_seconds       = 3
          period_seconds        = 30
          failure_threshold     = 3
        }
      }

      container_concurrency = var.cloud_run_concurrency
      timeout_seconds       = var.cloud_run_timeout
    }

    metadata {
      annotations = {
        "autoscaling.knative.dev/minScale"        = tostring(var.cloud_run_min_instances)
        "autoscaling.knative.dev/maxScale"        = tostring(var.cloud_run_max_instances)
        "run.googleapis.com/vpc-access-connector" =
google_vpc_access_connector.connector.name
        "run.googleapis.com/vpc-access-egress"     = "private-ranges-only"
        "run.googleapis.com/startup-cpu-boost"     = "true"
        "run.googleapis.com/execution-environment" = "gen2"
      }

      labels = merge(var.labels, {
        environment = var.environment
      })
    }
  }

  traffic {
    percent         = 100
    latest_revision = true
  }

  autogenerate_revision_name = true

  depends_on = [
    google_vpc_access_connector.connector,
    google_sql_database_instance.postgres,
    google_project_service.required_apis
  ]

  # Ignore annotations that gcloud/Cloud Console rewrite on every deploy.
  lifecycle {
    ignore_changes = [
      template[0].metadata[0].annotations["client.knative.dev/user-image"],
      template[0].metadata[0].annotations["run.googleapis.com/client-name"],
      template[0].metadata[0].annotations["run.googleapis.com/client-version"],
    ]
  }
}

# Public invocation is granted only when 0.0.0.0/0 is explicitly allowed.
resource "google_cloud_run_service_iam_member" "public_access" {
  count = length(var.allowed_ingress_cidrs) > 0 && contains(var.allowed_ingress_cidrs, "0.0.0.0/0") ? 1 : 0

  service  = google_cloud_run_service.pierre_mcp_server.name
  location = google_cloud_run_service.pierre_mcp_server.location
  role     = "roles/run.invoker"
  member   = "allUsers"
}

# ============================================================================
# MONITORING & ALERTING
# ============================================================================

# Uptime check for the /health endpoint.
resource "google_monitoring_uptime_check_config" "health_check" {
  count = var.enable_uptime_checks ?
1 : 0

  display_name = "${var.service_name}-health-check-${var.environment}"
  timeout      = "10s"
  period       = "60s"

  http_check {
    path         = "/health"
    port         = "443"
    use_ssl      = true
    validate_ssl = true
  }

  monitored_resource {
    type = "uptime_url"
    labels = {
      project_id = var.project_id
      host       = replace(google_cloud_run_service.pierre_mcp_server.status[0].url, "https://", "")
    }
  }

  content_matchers {
    content = "ok"
    matcher = "CONTAINS_STRING"
  }
}

# Alert when the uptime check fails; requires both checks and an email target.
resource "google_monitoring_alert_policy" "service_down" {
  count = var.enable_uptime_checks && var.alert_email != "" ? 1 : 0

  display_name = "${var.service_name} Service Down (${var.environment})"
  combiner     = "OR"

  conditions {
    display_name = "Uptime check failed"

    condition_threshold {
      filter          = "metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" AND resource.type=\"uptime_url\" AND metric.label.check_id=\"${google_monitoring_uptime_check_config.health_check[0].uptime_check_id}\""
      duration        = "300s"
      comparison      = "COMPARISON_LT"
      threshold_value = 1

      aggregations {
        alignment_period   = "60s"
        per_series_aligner = "ALIGN_NEXT_OLDER"
      }
    }
  }

  notification_channels = [
    google_monitoring_notification_channel.email[0].id
  ]

  alert_strategy {
    auto_close = "1800s"
  }

  documentation {
    content = <<-EOT
      The Pierre MCP Server (${var.environment}) health check has failed.

      Runbook:
      1. Check Cloud Run logs: gcloud logging read "resource.type=cloud_run_revision AND resource.labels.service_name=${var.service_name}" --limit 50
      2. Check database connectivity: Verify Cloud SQL instance is running
      3. Check recent deployments: Review last Cloud Run revision
      4. Manual verification: curl ${google_cloud_run_service.pierre_mcp_server.status[0].url}/health
    EOT
  }
}

# Email notification channel used by the alert policy above.
resource "google_monitoring_notification_channel" "email" {
  count = var.alert_email != "" ?
1 : 0

  display_name = "Email - ${var.alert_email}"
  type         = "email"

  labels = {
    email_address = var.alert_email
  }
}

# ============================================================================
# file: gcp/terraform/outputs.tf
# ============================================================================

# Cloud Run outputs.
output "cloud_run_service_url" {
  description = "URL of the deployed Cloud Run service"
  value       = google_cloud_run_service.pierre_mcp_server.status[0].url
}

output "cloud_run_service_id" {
  description = "Cloud Run service ID"
  value       = google_cloud_run_service.pierre_mcp_server.id
}

output "cloud_run_service_name" {
  description = "Cloud Run service name"
  value       = google_cloud_run_service.pierre_mcp_server.name
}

# Cloud SQL outputs.
output "database_instance_name" {
  description = "Cloud SQL instance name"
  value       = google_sql_database_instance.postgres.name
}

output "database_connection_name" {
  description = "Cloud SQL instance connection name (for Cloud SQL Proxy)"
  value       = google_sql_database_instance.postgres.connection_name
}

output "database_private_ip" {
  description = "Cloud SQL private IP address"
  value       = google_sql_database_instance.postgres.private_ip_address
  sensitive   = true
}

output "database_public_ip" {
  description = "Cloud SQL public IP address (if enabled)"
  value       = length(google_sql_database_instance.postgres.ip_address) > 0 ?
google_sql_database_instance.postgres.ip_address[0].ip_address : "N/A"
}

output "database_name" {
  description = "PostgreSQL database name"
  value       = google_sql_database.pierre_db.name
}

output "database_user" {
  description = "PostgreSQL database user"
  value       = google_sql_user.pierre_user.name
}

# Networking outputs.
output "vpc_network_name" {
  description = "VPC network name"
  value       = google_compute_network.vpc.name
}

output "vpc_network_id" {
  description = "VPC network ID"
  value       = google_compute_network.vpc.id
}

output "subnet_name" {
  description = "Subnet name"
  value       = google_compute_subnetwork.subnet.name
}

output "serverless_vpc_connector_name" {
  description = "Serverless VPC Access connector name"
  value       = google_vpc_access_connector.connector.name
}

output "cloud_nat_name" {
  description = "Cloud NAT gateway name"
  value       = google_compute_router_nat.nat.name
}

# Secret Manager outputs.
output "secret_ids" {
  description = "Map of secret names to their Secret Manager IDs"
  value = {
    for k, v in google_secret_manager_secret.secrets : k => v.id
  }
  sensitive = true
}

# Service account outputs.
output "cloud_run_service_account_email" {
  description = "Email of the Cloud Run service account"
  value       = google_service_account.cloud_run_sa.email
}

output "cloud_run_service_account_name" {
  description = "Name of the Cloud Run service account"
  value       = google_service_account.cloud_run_sa.name
}

# Project configuration echoes.
output "project_id" {
  description = "GCP project ID"
  value       = var.project_id
}

output "region" {
  description = "GCP region"
  value       = var.region
}

output "environment" {
  description = "Environment name"
  value       = var.environment
}

# Database connection string (password redacted; fetch it from Secret Manager).
output "database_url" {
  description = "PostgreSQL connection URL (use with Cloud SQL Proxy or private IP)"
  value       =
"postgresql://${google_sql_user.pierre_user.name}:GENERATED_PASSWORD@${google_sql_database_instance.postgres.private_ip_address}:5432/${google_sql_database.pierre_db.name}"
  sensitive = true
}

# Health check endpoint.
output "health_check_url" {
  description = "URL for health check endpoint"
  value       = "${google_cloud_run_service.pierre_mcp_server.status[0].url}/health"
}

# Quick-start instructions printed after apply.
output "deployment_instructions" {
  description = "Quick start deployment instructions"
  value       = <<-EOT
    ===================================================================
    Pierre MCP Server Deployment Complete!
    ===================================================================

    Service URL: ${google_cloud_run_service.pierre_mcp_server.status[0].url}
    Health Check: ${google_cloud_run_service.pierre_mcp_server.status[0].url}/health

    Database Connection:
      Instance: ${google_sql_database_instance.postgres.connection_name}
      Private IP: ${google_sql_database_instance.postgres.private_ip_address}
      Database: ${google_sql_database.pierre_db.name}
      User: ${google_sql_user.pierre_user.name}

    Next Steps:
    1. Retrieve database password from Secret Manager:
       gcloud secrets versions access latest --secret="${var.service_name}-db-password-${var.environment}"

    2. Test the health endpoint:
       curl ${google_cloud_run_service.pierre_mcp_server.status[0].url}/health

    3. View logs:
       gcloud logging read "resource.type=cloud_run_revision AND resource.labels.service_name=${google_cloud_run_service.pierre_mcp_server.name}" --limit 50

    4. 
Deploy new version:
       gcloud run deploy ${google_cloud_run_service.pierre_mcp_server.name} \
         --image=NEW_IMAGE_URL \
         --region=${var.region}

    ===================================================================
  EOT
}

# ============================================================================
# file: gcp/terraform/variables.tf
# ============================================================================

# GCP project configuration.
variable "project_id" {
  description = "GCP project ID"
  type        = string
}

variable "region" {
  description = "GCP region for resources"
  type        = string
  default     = "us-central1"
}

variable "zone" {
  description = "GCP zone for zonal resources"
  type        = string
  default     = "us-central1-a"
}

# Environment configuration.
variable "environment" {
  description = "Environment name (dev, staging, production)"
  type        = string
  validation {
    condition     = contains(["dev", "staging", "production"], var.environment)
    error_message = "Environment must be dev, staging, or production"
  }
}

# Cloud Run configuration.
variable "service_name" {
  description = "Cloud Run service name"
  type        = string
  default     = "pierre-mcp-server"
}

variable "container_image" {
  description = "Container image URL (e.g., gcr.io/PROJECT/pierre-mcp-server:latest)"
  type        = string
}

variable "cloud_run_cpu" {
  description = "CPU allocation for Cloud Run (e.g., '1', '2', '4')"
  type        = string
  default     = "1"
}

variable "cloud_run_memory" {
  description = "Memory allocation for Cloud Run (e.g., '512Mi', '1Gi', '2Gi')"
  type        = string
  default     = "512Mi"
}

variable "cloud_run_min_instances" {
  description = "Minimum number of Cloud Run instances"
  type        = number
  default     = 0
}

variable "cloud_run_max_instances" {
  description = "Maximum number of Cloud Run instances"
  type        = number
  default     = 100
}

variable "cloud_run_concurrency" {
  description = "Maximum concurrent requests per instance"
  type        = number
  default     = 80
}
variable "cloud_run_timeout" {
  description = "Request timeout in seconds"
  type        = number
  default     = 300
}

# Cloud SQL configuration.
variable "database_name" {
  description = "Cloud SQL database name"
  type        = string
  default     = "pierre_mcp_server"
}

variable "database_user" {
  description = "Cloud SQL database user"
  type        = string
  default     = "pierre"
}

variable "database_tier" {
  description = "Cloud SQL tier (db-f1-micro, db-custom-2-8192, etc.)"
  type        = string
  default     = "db-f1-micro"
}

variable "database_disk_size" {
  description = "Cloud SQL disk size in GB"
  type        = number
  default     = 20
}

variable "database_disk_type" {
  description = "Cloud SQL disk type (PD_SSD or PD_HDD)"
  type        = string
  default     = "PD_SSD"
}

variable "database_backup_enabled" {
  description = "Enable automated backups"
  type        = bool
  default     = true
}

variable "database_backup_retention_days" {
  description = "Number of days to retain backups"
  type        = number
  default     = 7
}

variable "database_high_availability" {
  description = "Enable high availability (regional) configuration"
  type        = bool
  default     = false
}

variable "database_private_network" {
  description = "Enable private IP for Cloud SQL (recommended for production)"
  type        = bool
  default     = true
}

# Networking configuration.
variable "vpc_name" {
  description = "VPC network name"
  type        = string
  default     = "pierre-vpc"
}

variable "subnet_cidr" {
  description = "Subnet CIDR range"
  type        = string
  default     = "10.0.0.0/24"
}

variable "serverless_connector_cidr" {
  description = "CIDR range for Serverless VPC Access connector"
  type        = string
  default     = "10.8.0.0/28"
}

# Secret Manager inputs.
variable "secrets" {
  description = "Map of secret names to their values (will be stored in Secret Manager)"
  type        = map(string)
  sensitive   = true
  default     = {}
}

# External API configuration (OAuth providers).
variable "strava_client_id" {
  description =
"Strava OAuth client ID"
  type        = string
  default     = ""
}

variable "strava_redirect_uri" {
  description = "Strava OAuth redirect URI"
  type        = string
  default     = ""
}

variable "garmin_client_id" {
  description = "Garmin OAuth client ID"
  type        = string
  default     = ""
}

variable "garmin_redirect_uri" {
  description = "Garmin OAuth redirect URI"
  type        = string
  default     = ""
}

variable "fitbit_client_id" {
  description = "Fitbit OAuth client ID"
  type        = string
  default     = ""
}

variable "fitbit_redirect_uri" {
  description = "Fitbit OAuth redirect URI"
  type        = string
  default     = ""
}

# Monitoring & alerting.
variable "enable_uptime_checks" {
  description = "Enable Cloud Monitoring uptime checks"
  type        = bool
  default     = true
}

variable "alert_email" {
  description = "Email address for critical alerts"
  type        = string
  default     = ""
}

# Labels for cost tracking and organization.
variable "labels" {
  description = "Labels to apply to all resources"
  type        = map(string)
  default = {
    managed_by  = "terraform"
    application = "pierre-mcp-server"
  }
}

# Security.
variable "enable_cloud_armor" {
  description = "Enable Cloud Armor WAF protection"
  type        = bool
  default     = false
}

variable "allowed_ingress_cidrs" {
  description = "CIDR ranges allowed to access the service"
  type        = list(string)
  default     = ["0.0.0.0/0"] # Open to internet by default, restrict in production
}

# ============================================================================
# file: gcp/terraform/versions.tf
# ============================================================================

terraform {
  required_version = ">= 1.6.0"

  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "~> 5.0"
    }
    google-beta = {
      source  = "hashicorp/google-beta"
      version = "~> 5.0"
    }
    random = {
      source  = "hashicorp/random"
      version = "~> 3.5"
    }
  }
}

provider "google" {
  project = var.project_id
  region  = var.region
}

provider 
"google-beta" {
  # The beta provider mirrors the default project/region configuration.
  project = var.project_id
  region  = var.region
}