From 2f2ca28c70d3b2972137488a6c95cb1417beb450 Mon Sep 17 00:00:00 2001 From: achapin Date: Fri, 5 Jun 2026 01:18:08 -0400 Subject: [PATCH 1/2] feat(openstack): add OpenStack deployment assets and configs Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- openstack/.gitignore | 29 + openstack/QUICKSTART.md | 211 +++++++ openstack/README.md | 527 ++++++++++++++++++ openstack/TROUBLESHOOTING.md | 510 +++++++++++++++++ openstack/bootstrap-k8s.sh | 478 ++++++++++++++++ openstack/cinder-csi-storageclass.yaml | 8 + openstack/deploy-k8s-cluster.sh | 392 +++++++++++++ openstack/deploy-openstudio-cluster.sh | 501 +++++++++++++++++ openstack/deploy.sh | 436 +++++++++++++++ openstack/k8s-cloud-init.yaml | 321 +++++++++++ .../inventory/sample/group_vars/all.yml | 102 ++++ .../sample/group_vars/k8s_cluster.yml | 90 +++ openstack/main.tf | 315 +++++++++++ openstack/openstudio-large.tfvars | 21 + openstack/openstudio-micro.tfvars | 22 + openstack/openstudio-small.tfvars | 21 + openstack/outputs.tf | 86 +++ openstack/setup-kubectl.sh | 170 ++++++ openstack/storage-classes.yaml | 57 ++ openstack/tofu-with-env.sh | 47 ++ openstack/values-openstack-nfs-small.yaml | 180 ++++++ openstack/values-openstack-nfs.yaml | 180 ++++++ openstack/values-openstack.yaml | 182 ++++++ openstack/variables.tf | 138 +++++ openstack/versions.tf | 9 + 25 files changed, 5033 insertions(+) create mode 100644 openstack/.gitignore create mode 100644 openstack/QUICKSTART.md create mode 100644 openstack/README.md create mode 100644 openstack/TROUBLESHOOTING.md create mode 100755 openstack/bootstrap-k8s.sh create mode 100644 openstack/cinder-csi-storageclass.yaml create mode 100755 openstack/deploy-k8s-cluster.sh create mode 100755 openstack/deploy-openstudio-cluster.sh create mode 100755 openstack/deploy.sh create mode 100644 openstack/k8s-cloud-init.yaml create mode 100644 openstack/kubespray/inventory/sample/group_vars/all.yml create mode 100644 
openstack/kubespray/inventory/sample/group_vars/k8s_cluster.yml create mode 100644 openstack/main.tf create mode 100644 openstack/openstudio-large.tfvars create mode 100644 openstack/openstudio-micro.tfvars create mode 100644 openstack/openstudio-small.tfvars create mode 100644 openstack/outputs.tf create mode 100755 openstack/setup-kubectl.sh create mode 100644 openstack/storage-classes.yaml create mode 100755 openstack/tofu-with-env.sh create mode 100644 openstack/values-openstack-nfs-small.yaml create mode 100644 openstack/values-openstack-nfs.yaml create mode 100644 openstack/values-openstack.yaml create mode 100644 openstack/variables.tf create mode 100644 openstack/versions.tf diff --git a/openstack/.gitignore b/openstack/.gitignore new file mode 100644 index 0000000..9cae43e --- /dev/null +++ b/openstack/.gitignore @@ -0,0 +1,29 @@ +# Terraform/OpenTofu files +*.tfstate +*.tfstate.* +*.tfplan +*.tfplan.* +.terraform/ +.terraform.lock.hcl + +# Environment files with credentials +.env +.env.local +.env.production + +# IDE files +.vscode/ +.idea/ +*.swp +*.swo + +# OS files +.DS_Store +Thumbs.db + +# Backup files +*.backup +*.bak + +# Log files +*.log diff --git a/openstack/QUICKSTART.md b/openstack/QUICKSTART.md new file mode 100644 index 0000000..5a3e45c --- /dev/null +++ b/openstack/QUICKSTART.md @@ -0,0 +1,211 @@ +# OpenStudio Server on OpenStack - Quick Start + +This guide prioritizes the OpenStack-managed Kubernetes path (for example Azimuth). The self-managed Terraform/OpenTofu + Kubespray flow in this directory is legacy and may not work in all environments. + +> [!WARNING] +> Legacy self-managed scripts in `openstack/` are not actively tested. Use at your own risk. 
+ +## Recommended Prerequisites + +- Access to a Kubernetes cluster created/managed by your OpenStack platform team (for example via Azimuth) +- kubectl installed +- Helm installed + +## 🚀 Recommended Deployment (Managed Kubernetes) + +Once your cluster is created and kubeconfig is configured: + +```bash +cp ../openstudio-server/values_production.templateyaml ../openstudio-server/values.yaml +# edit ../openstudio-server/values.yaml (provider=openstack, resources, storage, and secret name) +kubectl -n openstudio-server create secret generic openstudio-app-secrets \ + --from-literal=db-username="openstudio" \ + --from-literal=db-password="replace-with-strong-password" \ + --from-literal=redis-password="replace-with-strong-password" \ + --from-literal=web-secret-key="replace-with-long-random-secret" +helm upgrade --install openstudio-server ../openstudio-server -f ../openstudio-server/values.yaml +``` + +`secrets.validateExistingSecret` is strict by default when using `secrets.existingSecret`. For offline render-only checks, use `--set secrets.validateExistingSecret=false`. + +All tracked values files in this repository are templates. Keep real credentials in a local untracked values file and Kubernetes Secret. 
+ +The production template sets OpenStack storage hardening defaults: + +- `nfs-server-provisioner.persistence.size: 1Ti` +- `db.persistence.storageClass: cinder-csi` +- `redis.persistence.storageClass: cinder-csi` +- `global.storageClasses.block: cinder-csi` (override this if your Cinder class name differs) + +## Legacy Self-Managed Deployment (Optional, Untested) + +If you explicitly choose to run self-managed cluster automation from this directory: + +```bash +# Required before deploy-openstudio-cluster.sh: +kubectl -n openstudio-server create secret generic openstudio-app-secrets \ + --from-literal=db-username="openstudio" \ + --from-literal=db-password="replace-with-strong-password" \ + --from-literal=redis-password="replace-with-strong-password" \ + --from-literal=web-secret-key="replace-with-long-random-secret" + +./deploy-openstudio-cluster.sh small +``` + +This flow provisions and bootstraps Kubernetes directly on OpenStack, but it is a legacy path and may require substantial environment-specific troubleshooting. +By default it applies `./values-openstack.yaml`, enforces `global.provider.name=openstack`, and requires `APP_SECRET_NAME` (default `openstudio-app-secrets`) to already exist. +It now also validates that `APP_SECRET_NAME` includes non-empty `db-username`, `db-password`, `redis-password`, and `web-secret-key` values before running Helm. + +## Expected Output + +``` +=================================== +OpenStudio Server K8s Deployment +=================================== + +[INFO] Checking prerequisites... +[SUCCESS] Prerequisites check passed +[INFO] Deploying OpenStack infrastructure with Terraform... +[SUCCESS] Infrastructure deployment completed +[INFO] Testing network connectivity to deployed instances... +[SUCCESS] SSH connectivity test successful +[INFO] Monitoring Kubernetes cluster bootstrap process... +[SUCCESS] All nodes are ready! +[INFO] Setting up kubectl configuration locally... 
+[SUCCESS] kubectl configuration completed +[SUCCESS] Cluster verification passed + +=================================== +DEPLOYMENT COMPLETED SUCCESSFULLY! +=================================== + +Cluster Information: + Master IP: 10.60.124.23 + Cluster Name: openstudio-server + Total Nodes: 3 + +kubectl is configured and ready to use: + kubectl get nodes + kubectl get pods --all-namespaces + +To deploy OpenStudio Helm chart: + cd ../helm + helm upgrade --install openstudio-server ./openstudio-server +``` + +## Deploy OpenStudio Helm Chart + +After the Kubernetes cluster is ready: + +```bash +cd ../helm +helm upgrade --install openstudio-server ./openstudio-server +``` + +## Verify Deployment + +Check cluster status: +```bash +kubectl get nodes +kubectl get pods --all-namespaces +``` + +## Troubleshooting + +If you encounter issues: + +```bash +# Show current status and troubleshooting info +./deploy.sh troubleshoot + +# Destroy and redeploy +./deploy.sh destroy +./deploy.sh +``` + +## Manual Steps (If Needed) + +If the automated deployment encounters connectivity issues: + +1. **Check network connectivity:** + ```bash + ping + ssh ubuntu@ + ``` + +2. **Monitor bootstrap process manually:** + ```bash + ./bootstrap-k8s.sh + ``` + +3. **Setup kubectl manually:** + ```bash + ./setup-kubectl.sh + ``` + + TLS defaults are CA-first. The setup scripts now keep TLS verification enabled and set `tls-server-name` (default `kubernetes`). 
+ Override behavior with: + ```bash + KUBE_TLS_SERVER_NAME=<api-server-cert-name> ./setup-kubectl.sh + OPENSTACK_ALLOW_INSECURE_KUBECTL=true ./setup-kubectl.sh # opt-in only + ``` + +## Common Issues + +### SSH Connection Timeouts +- **Cause:** Network policies blocking floating IP ranges (10.60.x.x) +- **Solution:** Check with your network administrator or try from a different network + +### DNS Resolution Issues +- **Cause:** Instances can't reach DNS servers (10.60.10.240, 10.20.49.97) +- **Solution:** Check OpenStack network configuration + +### Cloud-init Bootstrap Failures +- **Cause:** Package download failures, network issues +- **Solution:** Check console logs: + ```bash + openstack console log show openstudio-server-master + ``` + +## Architecture + +The automated deployment creates: + +- **Master Node (1x):** Kubernetes control plane + CSI driver +- **Worker Node (1x):** Kubernetes worker for general workloads +- **Web Node (1x):** Labeled for web frontend workloads +- **Storage:** Cinder CSI driver for persistent volumes +- **Networking:** Private network with router and floating IPs +- **Security:** Security groups with minimal required ports + +## Total Deployment Time + +- **Infrastructure:** ~2-3 minutes +- **Kubernetes Bootstrap:** ~8-12 minutes +- **Total:** ~10-15 minutes + +## Next Steps + +Once deployment is complete: + +1. Deploy the OpenStudio Helm chart +2. Access the web interface via LoadBalancer or NodePort +3. Run OpenStudio simulations +4. Scale workers as needed + +## Advanced Usage + +```bash +# Show all available commands +./deploy.sh help + +# Show detailed status +./deploy.sh status + +# Destroy infrastructure +./deploy.sh destroy +``` + +--- + +**Success Criteria:** After running `./deploy.sh`, you should have a working 3-node Kubernetes cluster with kubectl configured and ready to deploy the OpenStudio Helm chart. 
diff --git a/openstack/README.md b/openstack/README.md new file mode 100644 index 0000000..a6c6915 --- /dev/null +++ b/openstack/README.md @@ -0,0 +1,527 @@ +# OpenStudio Server on OpenStack (Legacy Self-Managed Cluster Path) + +This directory contains legacy automation for building a Kubernetes cluster directly on OpenStack (Terraform/OpenTofu + Kubespray), then deploying OpenStudio Server. + +> [!WARNING] +> This self-managed OpenStack cluster path is **not actively tested** and may not work in all environments. +> Prefer a Kubernetes cluster created and managed by your OpenStack platform team (for example via **Azimuth**), then use this repository only for Helm deployment. + +## Recommended Approach (Azimuth or Admin-Managed Kubernetes) + +1. Create a Kubernetes cluster using your OpenStack-managed platform (for example Azimuth). +2. Configure local kubeconfig access to that cluster. +3. Deploy OpenStudio Server using this repository's Helm chart and values templates. + +```bash +# Option A: start from the tracked baseline values.yaml +# Option B: copy production template to a local override file +cp ../openstudio-server/values_production.templateyaml ../openstudio-server/values.local.yaml +# Edit your values file (resources/provider=openstack/storage classes/secret name) +kubectl -n openstudio-server create secret generic openstudio-app-secrets \ + --from-literal=db-username="openstudio" \ + --from-literal=db-password="replace-with-strong-password" \ + --from-literal=redis-password="replace-with-strong-password" \ + --from-literal=web-secret-key="replace-with-long-random-secret" +helm upgrade --install openstudio-server ../openstudio-server \ + -f ../openstudio-server/values.yaml \ + -f ../openstudio-server/values.local.yaml +``` + +## Legacy Quick Start (Use at Your Own Risk) + +If you still need to self-manage Kubernetes on OpenStack with the scripts in this directory: + +```bash +# Pre-create app secret used by deploy-openstudio-cluster.sh +kubectl -n 
openstudio-server create secret generic openstudio-app-secrets \ + --from-literal=db-username="openstudio" \ + --from-literal=db-password="replace-with-strong-password" \ + --from-literal=redis-password="replace-with-strong-password" \ + --from-literal=web-secret-key="replace-with-long-random-secret" + +./deploy-openstudio-cluster.sh small +``` + +By default, `deploy-openstudio-cluster.sh` deploys with: + +- `HELM_VALUES_FILE=./values-openstack.yaml` +- `APP_SECRET_NAME=openstudio-app-secrets` +- `global.provider.name=openstack` + +## Legacy Path Features + +- **🔐 Corporate Firewall Support**: Automatic detection and workaround for certificate interception +- **⚡ One-Click Deployment**: Automated infrastructure, Kubernetes, and application deployment +- **📏 Multiple Cluster Sizes**: Small, large, and test configurations +- **🏷️ Node Workload Separation**: Dedicated web and worker node groups with proper tainting +- **💾 Storage Integration**: Cinder CSI driver with multiple storage classes +- **🌐 LoadBalancer Support**: Octavia integration for external service exposure +- **🔧 EKS Compatibility**: Matches AWS EKS configuration patterns for consistency + +## 📋 Prerequisites + +### Required Tools +- [OpenTofu](https://opentofu.org/) (Terraform alternative) +- [Ansible](https://ansible.com/) +- [kubectl](https://kubernetes.io/docs/tasks/tools/) +- [Helm](https://helm.sh/) +- SSH access with key-based authentication + +### OpenStack Environment Variables +```bash +export TF_VAR_openstack_user_name="your-username" +export TF_VAR_openstack_password="your-password" +export TF_VAR_openstack_auth_url="https://your-openstack-api:5000" +export TF_VAR_openstack_tenant_name="your-project" +export TF_VAR_openstack_user_domain_name="your-domain" +export TF_VAR_openstack_project_domain_id="your-project-domain-id" +export TF_VAR_openstack_project_id="your-project-id" +export TF_VAR_openstack_region="RegionOne" +export 
TF_VAR_key_pair="your-openstack-keypair-name" +export TF_VAR_public_key="$(cat ~/.ssh/id_rsa.pub)" + +# Optional hardening: narrow these CIDRs instead of permissive defaults. +export TF_VAR_admin_access_cidr="203.0.113.10/32" +export TF_VAR_k8s_api_access_cidr="203.0.113.10/32" +export TF_VAR_nodeport_access_cidr="0.0.0.0/0" +``` + +## 🏗️ Cluster Configurations + +### Small Cluster (Development/Testing) +- **Master**: 1x CS.Wee (8 vCPU, 32GB RAM) +- **Web Nodes**: 2x CS.Medium (16 vCPU, 64GB RAM each) +- **Worker Nodes**: 1x CM.Large (32 vCPU, 64GB RAM) +- **Storage**: 100GB per node + +```bash +./deploy-openstudio-cluster.sh small +``` + +### Large Cluster (Production) +- **Master**: 1x CS.Large (16 vCPU, 64GB RAM) +- **Web Nodes**: 1x CS.2XMedium (32 vCPU, 128GB RAM) +- **Worker Nodes**: 1x CM.2XLarge (64 vCPU, 256GB RAM) +- **Storage**: 550GB per node + +```bash +./deploy-openstudio-cluster.sh large +``` + +### Test Cluster (Single Node) +- **Master**: 1x CS.Wee (8 vCPU, 32GB RAM) +- **Storage**: 100GB + +```bash +./deploy-openstudio-cluster.sh test +``` + +## 🎯 Usage Examples + +### Basic Deployment +```bash +# Deploy small cluster +./deploy-openstudio-cluster.sh small + +# Deploy large cluster with cleanup +./deploy-openstudio-cluster.sh large --cleanup + +# Deploy test cluster without Helm (infrastructure only) +./deploy-openstudio-cluster.sh test --skip-helm + +# Use a custom values overlay and secret name +HELM_VALUES_FILE=./values-openstack-nfs.yaml APP_SECRET_NAME=openstudio-app-secrets \ + ./deploy-openstudio-cluster.sh small +``` + +### Advanced Options +```bash +# Skip Terraform (use existing infrastructure) +./deploy-openstudio-cluster.sh small --skip-terraform + +# Skip Kubespray (use existing Kubernetes) +./deploy-openstudio-cluster.sh small --skip-kubespray --skip-terraform + +# Verbose output for debugging +./deploy-openstudio-cluster.sh small --verbose +``` + +## 🔧 Manual Deployment Steps + +If you prefer manual control, you can run 
each step individually: + +### 1. Infrastructure Deployment +```bash +# Initialize Terraform +tofu init + +# Deploy infrastructure +tofu plan -var-file="openstudio-small.tfvars" -out=tfplan +tofu apply tfplan +``` + +### 2. Kubernetes Deployment +```bash +# Generate inventory (automatically detects Terraform outputs) +# Copy custom group_vars +cp -r kubespray/inventory/sample/group_vars inventory/ + +# Run Kubespray +cd $HOME/kubespray +ansible-playbook -i ../openstudio-server-helm/openstack/inventory/inventory.ini \ + --become --become-user=root cluster.yml +``` + +### 3. Configure Storage and Services +```bash +# Get kubeconfig +scp ubuntu@:/etc/kubernetes/admin.conf ./kubeconfig +export KUBECONFIG=$(pwd)/kubeconfig + +# Apply storage classes +kubectl apply -f storage-classes.yaml + +# Optional: create a local override values file from tracked template +cp ./openstudio-server/values_production.templateyaml ./openstudio-server/values.local.yaml + +# Recommended default for this environment: +# secrets.existingSecret: openstudio-app-secrets +# secrets.create: false +# (set these in your local values file, e.g. 
values.local.yaml) + +# Option A (recommended): create one Kubernetes Secret and reference it +kubectl -n openstudio-server create secret generic openstudio-app-secrets \ + --from-literal=db-username="openstudio" \ + --from-literal=db-password="replace-with-strong-password" \ + --from-literal=redis-password="replace-with-strong-password" \ + --from-literal=web-secret-key="replace-with-long-random-secret" + +# Option B: let chart create Secret from values at deploy time +export OS_DB_USERNAME="openstudio" +export OS_DB_PASSWORD="replace-with-strong-password" +export OS_REDIS_PASSWORD="replace-with-strong-password" +export OS_SECRET_KEY_BASE="replace-with-long-random-secret" + +# Deploy OpenStudio Server +helm upgrade --install openstudio-server ./openstudio-server \ + --namespace openstudio-server \ + --create-namespace \ + -f ./openstudio-server/values.yaml \ + -f ./openstudio-server/values.local.yaml \ + --timeout=20m \ + --wait +``` + +Preflight checks for Option A (existing secret): + +```bash +kubectl get secret -n openstudio-server openstudio-app-secrets +kubectl get secret -n openstudio-server openstudio-app-secrets -o jsonpath='{.data}' | jq 'keys' +./scripts/validate-app-secret.sh --namespace openstudio-server --secret-name openstudio-app-secrets +``` + +Expected keys: + +- `db-username` +- `db-password` +- `redis-password` +- `web-secret-key` + +Security hardening notes: + +- The chart no longer ships plaintext default credentials. +- App pods use `secretKeyRef` for DB/Redis/app secrets. +- If using chart-managed secrets (`secrets.create=true`), deploys fail fast unless `db.username`, `db.password`, `redis.password`, and `web.secret_key_value` are set. +- If using an externally managed secret (`secrets.existingSecret`), credentials only need to be entered once when creating that secret. +- `secrets.validateExistingSecret` is strict by default. For offline render-only checks, set `--set secrets.validateExistingSecret=false`. 
+- For normal upgrades in this environment, keep shared defaults in tracked `./openstudio-server/values.yaml` and put local overrides in `./openstudio-server/values.local.yaml`. +- If your cluster policy blocks Helm hook jobs, disable cleanup hook with `--set hooks.preDeleteCleanup.enabled=false`. +- If `secrets.existingSecret` is set, keep `secrets.create=false`; the chart now fails fast when both are enabled. +- `provider.name` is deprecated and disabled by default; use `global.provider.name`. For temporary migration-only fallback, set `global.provider.allowLegacyName=true`. + +### Upgrade migration for `--reuse-values` users + +Older installs that relied on chart-managed or plaintext values should migrate to an external Kubernetes Secret before upgrading: + +```bash +# 1) Create (or update) the app secret in the release namespace +kubectl -n openstudio-server create secret generic openstudio-app-secrets \ + --from-literal=db-username="openstudio" \ + --from-literal=db-password="replace-with-strong-password" \ + --from-literal=redis-password="replace-with-strong-password" \ + --from-literal=web-secret-key="replace-with-long-random-secret" \ + --dry-run=client -o yaml | kubectl apply -f - + +# 2) Ensure your local values file uses external-secret mode +# secrets: +# existingSecret: openstudio-app-secrets +# create: false + +# 3) Upgrade using explicit values file (preferred over pure --reuse-values) +helm upgrade --install openstudio-server ./openstudio-server \ + --namespace openstudio-server \ + -f ./openstudio-server/values.yaml +``` + +By default, this chart enables Cluster Autoscaler on AWS and disables it for other providers (including OpenStack). 
If you want autoscaling on OpenStack, set: + +- `autoscaler.enabled: true` +- `autoscaler.openstackNodeGroups` entries with `name`, `min`, and `max` +- either `autoscaler.openstack.cloudConfigSecretName` **or** a `--cloud-config=...` arg in `autoscaler.extraArgs` +- optional private-CA wiring via `autoscaler.openstack.caBundleSecretName` (with optional `caBundleSecretKey` and `caBundleMountPath`) + +`autoscaler.image.tag` defaults to `v<k8s-major>.<k8s-minor>.0` and is validated against cluster version to reduce Kubernetes/cluster-autoscaler version skew. + +When `autoscaler.enabled=true` on OpenStack, the chart performs a safety check and fails install/upgrade if a pre-existing `kube-system/cluster-autoscaler` deployment exists and is not owned by this Helm release. This prevents dual autoscaler configuration drift with platform-managed clusters (for example Azimuth). + +If your release-time RBAC cannot read `kube-system` deployments, set: + +```yaml +autoscaler: + openstack: + checkExistingDeploymentOwnership: false +``` + +Use this override only when required by RBAC constraints. + +Example: + +```yaml +autoscaler: + enabled: true + openstack: + cloudConfigSecretName: cloud-config + # Optional for private OpenStack API CAs: + caBundleSecretName: openstack-api-ca + # caBundleSecretKey: ca.crt + # caBundleMountPath: /etc/ssl/certs/openstack-ca.crt + openstackNodeGroups: + - name: web-group + min: 1 + max: 5 + - name: worker-group + min: 1 + max: 50 +``` + +When `caBundleSecretName` is set, the chart mounts the CA file and sets `SSL_CERT_FILE` in the autoscaler container. If you already bake private CA trust into node/runtime images, this is not required. + +The chart now reads provider from `global.provider.name` in your values file and applies provider-aware node affinity defaults automatically. 
For OpenStack, the default node label assumptions are: + +- Label key: `capi.stackhpc.com/node-group` +- Web node group value: `web` +- Worker node group value: `worker` + +OpenStack defaults to `global.nodeGroups.affinityMode: preferred` to avoid unschedulable pods when labels drift; set `required` to enforce strict placement. + +If your cluster uses different labels, set `global.nodeGroups.labelKey`, `global.nodeGroups.web`, and `global.nodeGroups.worker` in your values file. + +Additional OpenStack defaults are automatically applied when omitted in values: + +- `db.persistence.storageClass`: `nfs` +- `redis.persistence.storageClass`: `nfs` +- `load_balancer.externalTrafficPolicy`: `Cluster` + +For OpenStack block-backed PVCs, the chart now uses `global.storageClasses.block` (default `cinder-csi`) as the backing class for the NFS provisioner PVC. +`openstack/storage-classes.yaml` also includes a `csi-cinder` compatibility alias for older clusters/configs. + +For production hardening, the tracked `openstudio-server/values_production.templateyaml` explicitly sets: + +- `db.persistence.storageClass: cinder-csi` +- `redis.persistence.storageClass: cinder-csi` +- `nfs-server-provisioner.persistence.size: 1Ti` + +Preflight check before deploy/upgrade: + +```bash +kubectl get storageclass +kubectl get sc cinder-csi +``` + +If your cluster uses a different Cinder class name, set it explicitly in your values file: + +```yaml +global: + storageClasses: + block: <your-cinder-storageclass> +``` + +Render/lint matrix before deploy: + +```bash +./scripts/install-dry-run.sh +``` + +Kubeconfig helper scripts now default to TLS verification with `tls-server-name=kubernetes`. 
+If your API server certificate uses a different server name, set: + +```bash +KUBE_TLS_SERVER_NAME= ./setup-kubectl.sh +``` + +Only if required, you can opt into insecure mode: + +```bash +OPENSTACK_ALLOW_INSECURE_KUBECTL=true ./setup-kubectl.sh +``` + +## ๐Ÿญ Architecture + +### Network Architecture +``` + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ External โ”‚ + โ”‚ Network โ”‚ + โ”‚ (Floating IPs) โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ OpenStack โ”‚ + โ”‚ Router โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ โ”‚ โ”‚ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ Master Node โ”‚ โ”‚ Web Nodes โ”‚ โ”‚ Worker Nodes โ”‚ + โ”‚ (Control โ”‚ โ”‚ (Web UI/API) โ”‚ โ”‚ (Simulation) โ”‚ + โ”‚ Plane) โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +### Storage Architecture +- **Cinder CSI**: Dynamic persistent volume provisioning +- **SSD Storage Class**: Default for databases and critical data +- **NFS Storage Class**: Shared storage for multi-pod applications +- **Standard Storage Class**: Cost-effective storage for logs/temp data + +### Node Workload Separation +- **Web Nodes**: Handle HTTP requests, API calls, and user interface +- **Worker Nodes**: Execute compute-intensive OpenStudio simulations +- **Master Node**: Kubernetes control plane (can be made highly available) + +## ๐Ÿ” Corporate Firewall Support + +The deployment automatically detects and handles corporate firewall restrictions: + +### Automatic Detection +- Tests connectivity to major container registries +- Detects 
certificate interception (common in corporate environments) +- Creates firewall status file: `/etc/corporate-firewall-status` + +### Automatic Workarounds +- **Containerd Configuration**: Selective TLS verification bypassing +- **Download Settings**: Extended timeouts and retry mechanisms +- **Registry Mirrors**: Fallback registry configurations +- **CNI Configuration**: Pre-configured bridge CNI for reliability + +### Manual Override +```bash +# Check firewall detection on nodes +kubectl get nodes -o wide +ssh ubuntu@ "cat /etc/corporate-firewall-status" + +# View applied workarounds +ssh ubuntu@ "cat /etc/containerd/config.toml" +``` + +## ๐Ÿ“Š Monitoring and Troubleshooting + +### Check Deployment Status +```bash +# Export kubeconfig +export KUBECONFIG=$(pwd)/kubeconfig + +# Check cluster health +kubectl get nodes +kubectl get pods --all-namespaces + +# Check OpenStudio Server status +kubectl get pods -n openstudio-server +kubectl get services -n openstudio-server +``` + +### Access OpenStudio Server +```bash +# Port forward to access web interface +kubectl port-forward -n openstudio-server service/web 8080:80 + +# Visit: http://localhost:8080 +``` + +### Common Issues + +#### Corporate Firewall Issues +```bash +# Check firewall detection logs +ssh ubuntu@ "sudo journalctl -u corporate-firewall-detection" + +# Check containerd configuration +ssh ubuntu@ "sudo cat /etc/containerd/config.toml" +``` + +#### Storage Issues +```bash +# Check storage classes +kubectl get storageclasses + +# Check persistent volumes +kubectl get pv +kubectl get pvc -n openstudio-server +``` + +#### LoadBalancer Issues +```bash +# Check cloud provider configuration +kubectl get configmap -n kube-system cloud-config -o yaml + +# Check external cloud provider +kubectl get pods -n kube-system | grep cloud +``` + +## ๐Ÿ”„ Updates and Maintenance + +### Scaling Workers +```bash +# Scale worker replicas +kubectl scale deployment worker -n openstudio-server --replicas=5 + +# Add more worker 
nodes (modify tfvars and re-apply) +# Edit openstudio-small.tfvars: worker_count = 3 +tofu plan -var-file="openstudio-small.tfvars" -out=tfplan +tofu apply tfplan +``` + +### Upgrading Kubernetes +```bash +# Update Kubespray version in group_vars +# Re-run Kubespray upgrade playbook +cd $HOME/kubespray +ansible-playbook -i ../openstudio-server-helm/openstack/inventory/inventory.ini \ + --become --become-user=root upgrade-cluster.yml +``` + +## 🧹 Cleanup + +```bash +# Destroy entire cluster +./deploy-openstudio-cluster.sh small --cleanup + +# Or manually with Terraform +tofu destroy -var-file="openstudio-small.tfvars" -auto-approve +``` + +## 🤝 Contributing + +This solution bridges the gap between AWS EKS and OpenStack deployments, providing: +- Consistent deployment patterns +- Corporate environment compatibility +- Production-ready configurations +- Comprehensive automation + +## 📚 Additional Resources + +- [OpenStudio Server Documentation](https://github.com/NREL/OpenStudio-server) +- [Kubespray Documentation](https://github.com/kubernetes-sigs/kubespray) +- [OpenStack Cloud Provider](https://github.com/kubernetes/cloud-provider-openstack) +- [OpenTofu Documentation](https://opentofu.org/docs/) diff --git a/openstack/TROUBLESHOOTING.md b/openstack/TROUBLESHOOTING.md new file mode 100644 index 0000000..c55eb89 --- /dev/null +++ b/openstack/TROUBLESHOOTING.md @@ -0,0 +1,510 @@ +# Troubleshooting Guide: OpenStudio Server on OpenStack + +This guide covers common issues encountered when deploying OpenStudio Server on OpenStack with Kubernetes and their solutions. + +> [!WARNING] +> This troubleshooting content is for the legacy self-managed OpenStack Kubernetes path in `openstack/`. That path is not actively tested; prefer managed Kubernetes (for example Azimuth) when available. + +## 🚨 Critical Issues and Solutions + +### 1. 
Pod Network Isolation (Most Common Issue) + +**Symptoms:** +- Pods cannot reach the Kubernetes API server (10.96.0.1:443) +- CSI drivers fail to provision volumes +- NFS provisioner logs show "context deadline exceeded" or timeouts +- Services can't be reached from pods + +**Root Cause:** +OpenStack network configurations often isolate pod networks (10.244.0.0/16) from service networks (10.96.0.0/12) and external connectivity. + +**Solution:** +Apply `hostNetwork: true` to affected deployments: + +```bash +# NFS provisioner (automatically done by bootstrap script) +kubectl patch deployment nfs-subdir-external-provisioner -n kube-system \ + -p '{"spec":{"template":{"spec":{"hostNetwork":true}}}}' + +# CSI driver controller (if using Cinder CSI) +kubectl patch deployment csi-cinder-controllerplugin -n kube-system \ + -p '{"spec":{"template":{"spec":{"hostNetwork":true}}}}' + +# Any custom provisioner +kubectl patch deployment -n \ + -p '{"spec":{"template":{"spec":{"hostNetwork":true}}}}' +``` + +**Prevention:** +Ensure the bootstrap script applies these patches automatically. + +### 2. 
Storage Provisioning Failures + +**Symptoms:** +- PVCs stuck in "Pending" state +- Events show "Waiting for a volume to be created" +- Storage provisioner pods are failing or restarting + +**Diagnosis:** +```bash +# Check storage classes +kubectl get storageclass + +# Check PVC status +kubectl describe pvc + +# Check provisioner status +kubectl get pods -n kube-system -l app=nfs-subdir-external-provisioner + +# Check provisioner logs +kubectl logs -n kube-system -l app=nfs-subdir-external-provisioner +``` + +**Solutions:** + +#### For NFS Storage Issues: +```bash +# Verify NFS server on master +MASTER_IP=$(terraform output -raw master_floating_ip) +ssh ubuntu@$MASTER_IP "systemctl status nfs-server" +ssh ubuntu@$MASTER_IP "showmount -e localhost" + +# Test NFS connectivity from other nodes +WEB_IP=$(terraform output -json web_floating_ips | jq -r '.[0]') +MASTER_INTERNAL_IP=$(terraform output -raw master_ip) +ssh ubuntu@$WEB_IP "showmount -e $MASTER_INTERNAL_IP" + +# Restart NFS services if needed +ssh ubuntu@$MASTER_IP "sudo systemctl restart nfs-server && sudo exportfs -ra" + +# Check NFS provisioner has hostNetwork +kubectl get deployment nfs-subdir-external-provisioner -n kube-system -o yaml | grep -A5 -B5 hostNetwork +``` + +#### For Cinder CSI Issues (if still using): +```bash +# Check CSI driver pods +kubectl get pods -n kube-system | grep csi-cinder + +# Check OpenStack credentials +kubectl get secret cloud-config -n kube-system -o yaml + +# Test OpenStack connectivity from CSI pod +kubectl exec -n kube-system -- curl -k +``` + +#### For StorageClass Name Mismatch: + +If events show errors like: + +- `storageclass.storage.k8s.io "cinder-csi" not found` + +your chart values are likely using the wrong Cinder StorageClass name for this cluster. + +```bash +kubectl get storageclass +kubectl get pvc -A +``` + +Set the OpenStack block class explicitly in values and redeploy: + +```yaml +global: + storageClasses: + block: cinder-csi +``` + +### 3. 
Container Image Pull Failures + +**Symptoms:** +- Pods stuck in "ImagePullBackOff" or "ErrImagePull" +- Long delays downloading container images +- TLS handshake failures or certificate errors + +**Diagnosis:** +```bash +# Check pod events +kubectl describe pod + +# Check containerd configuration +ssh ubuntu@ "cat /etc/containerd/config.toml" + +# Check corporate firewall detection +ssh ubuntu@ "cat /etc/corporate-firewall-status" + +# Test registry connectivity +ssh ubuntu@ "curl -I https://registry-1.docker.io/v2/" +``` + +**Solutions:** + +#### Manual Registry Configuration: +```bash +# Update containerd config with TLS skip (if corporate firewall detected) +ssh ubuntu@ "sudo tee /etc/containerd/config.toml << 'EOF' +version = 2 + +[plugins] + [plugins.\"io.containerd.grpc.v1.cri\"] + [plugins.\"io.containerd.grpc.v1.cri\".registry] + [plugins.\"io.containerd.grpc.v1.cri\".registry.configs] + [plugins.\"io.containerd.grpc.v1.cri\".registry.configs.\"registry-1.docker.io\".tls] + insecure_skip_verify = true + [plugins.\"io.containerd.grpc.v1.cri\".registry.configs.\"quay.io\".tls] + insecure_skip_verify = true +EOF" + +# Restart containerd +ssh ubuntu@ "sudo systemctl restart containerd" +``` + +#### Pre-pull Critical Images: +```bash +# Pre-pull images on all nodes +NODES=$(kubectl get nodes -o jsonpath='{.items[*].status.addresses[?(@.type=="ExternalIP")].address}') +for node in $NODES; do + ssh ubuntu@$node "sudo crictl pull registry.k8s.io/pause:3.9" + ssh ubuntu@$node "sudo crictl pull nrel/openstudio-server:3.8.0-1" + ssh ubuntu@$node "sudo crictl pull mongo:6.0.7" + ssh ubuntu@$node "sudo crictl pull redis:6.0.9" +done +``` + +### 4. 
DNS Resolution Failures + +**Symptoms:** +- Pods can't resolve service names +- External DNS lookups fail from pods +- CoreDNS pods are failing or restarting + +**Diagnosis:** +```bash +# Check CoreDNS status +kubectl get pods -n kube-system -l k8s-app=kube-dns + +# Test DNS from a pod +kubectl run test-pod --image=busybox --rm -it -- nslookup kubernetes.default.svc.cluster.local + +# Check CoreDNS configuration +kubectl get configmap coredns -n kube-system -o yaml +``` + +**Solutions:** + +#### Update CoreDNS Configuration: +```bash +# Get internal DNS servers (usually from /etc/resolv.conf on nodes) +INTERNAL_DNS=$(ssh ubuntu@ "grep nameserver /etc/resolv.conf | head -1 | awk '{print \$2}'") + +# Update CoreDNS config +kubectl edit configmap coredns -n kube-system +# Add/update the forward section: +# forward . $INTERNAL_DNS + +# Restart CoreDNS +kubectl rollout restart deployment/coredns -n kube-system +``` + +### 5. Node Readiness Issues + +**Symptoms:** +- Nodes stuck in "NotReady" state +- Kubelet service is not running +- Pods won't schedule to nodes + +**Diagnosis:** +```bash +# Check node status +kubectl get nodes +kubectl describe node + +# Check kubelet status on node +ssh ubuntu@ "systemctl status kubelet" +ssh ubuntu@ "journalctl -u kubelet -f" + +# Check network plugin +kubectl get pods -n kube-system -l k8s-app=flannel +``` + +**Solutions:** + +#### Restart Node Services: +```bash +# Restart kubelet +ssh ubuntu@ "sudo systemctl restart kubelet" + +# Restart containerd +ssh ubuntu@ "sudo systemctl restart containerd" + +# Check for swap (must be disabled) +ssh ubuntu@ "sudo swapoff -a" +``` + +#### Fix Network Plugin Issues: +```bash +# Reinstall Flannel if needed +kubectl delete -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml +kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml + +# Or use a known working version +kubectl apply -f 
https://raw.githubusercontent.com/flannel-io/flannel/v0.20.2/Documentation/kube-flannel.yml +``` + +### 6. Helm Release Stuck in Failed State (Managed-Field Conflicts) + +**Symptoms:** +- `helm status` reports `STATUS: failed` +- Description contains conflict errors against fields such as: + - `Deployment/worker .spec.replicas` + - `HorizontalPodAutoscaler/worker .spec.maxReplicas` +- Runtime workloads still appear healthy and continue processing jobs + +**Diagnosis:** +```bash +helm status openstudio-server -n openstudio-server +helm history openstudio-server -n openstudio-server +kubectl -n openstudio-server get deploy worker -o wide +kubectl -n openstudio-server get hpa worker -o wide +``` + +**Guarded Reconcile Path (use only for conflict failures):** +```bash +# Prefer automated helper +./scripts/openstudio-reliability --mode reconcile-helm --apply --allow-chart-apply + +# Equivalent direct command +helm upgrade openstudio-server ./openstudio-server \ + -n openstudio-server \ + --reuse-values \ + --server-side=false \ + --description "Reconcile release after managed-field conflicts" +``` + +Do **not** run reconcile if the release failed for non-conflict reasons, or while workloads are unstable. +If reconcile causes regressions, roll back immediately: + +```bash +helm rollback openstudio-server -n openstudio-server +``` + +**Prevention:** +- Standardize on one operational reconcile path and avoid ad-hoc mixed apply methods. +- Use `./scripts/openstudio-reliability --mode check` before and after upgrades. +- Keep a queue/job snapshot before mutation: + - `./scripts/openstudio-reliability --mode snapshot --snapshot-dir ` + +### 7. Analyses Stuck in `started` (Queue/State Divergence) + +**Symptoms:** +- PAT/UI analyses remain in `started` for a long period. +- Redis queue depths are near zero, or `requeued` accumulates. +- Pods and Helm release appear healthy. 
+ +**Diagnosis:** +```bash +./scripts/openstudio-reliability --mode check --stale-minutes 70 +./scripts/openstudio-reliability --mode snapshot \ + --stale-minutes 70 \ + --snapshot-dir ./incident-snapshots/openstudio-server-$(date +%Y%m%d-%H%M%S) +``` + +**Guarded Recovery (apply-gated):** +```bash +./scripts/openstudio-reliability --mode recover-stuck --stale-minutes 70 --apply +``` + +What recovery does: +- Ensures worker queue subscriptions include `simulations,requeued`. +- Requeues stale started datapoints for stale started analyses. +- Finalizes stale started `batch_run` jobs only if all datapoints are terminal. +- Prints post-recovery queue and divergence checks. + +**Prevention:** +- Keep worker queues configured with `simulations,requeued`. +- Add alerts for stale started jobs/datapoints and non-zero `requeued` backlog. + +## 📊 Diagnostic Commands Reference + +### Cluster Health Check +```bash +# Overall cluster status +kubectl cluster-info +kubectl get nodes +kubectl get pods --all-namespaces + +# System pod health +kubectl get pods -n kube-system + +# Storage health +kubectl get storageclass +kubectl get pv +kubectl get pvc --all-namespaces +``` + +### Network Diagnostics +```bash +# Test pod-to-pod networking +kubectl run test1 --image=nginx +kubectl run test2 --image=nginx +kubectl exec -it test1 -- ping $(kubectl get pod test2 -o jsonpath='{.status.podIP}') + +# Test service discovery +kubectl exec -it test1 -- nslookup kubernetes.default.svc.cluster.local + +# Test external connectivity +kubectl exec -it test1 -- wget -qO- http://httpbin.org/ip +``` + +### Storage Diagnostics +```bash +# NFS server status +MASTER_IP=$(terraform output -raw master_floating_ip) +ssh ubuntu@$MASTER_IP "systemctl status nfs-server" +ssh ubuntu@$MASTER_IP "exportfs -v" + +# Test NFS mount from worker node +WORKER_IP=$(terraform output -json worker_floating_ips | jq -r '.[0]') +MASTER_INTERNAL=$(terraform output -raw master_ip) +ssh ubuntu@$WORKER_IP "sudo mkdir -p 
/tmp/nfs-test" +ssh ubuntu@$WORKER_IP "sudo mount -t nfs $MASTER_INTERNAL:/srv/nfs/k8s-storage /tmp/nfs-test" +ssh ubuntu@$WORKER_IP "sudo umount /tmp/nfs-test" +``` + +### OpenStudio Server Diagnostics +```bash +# Pod status +kubectl get pods -n openstudio-test + +# PVC status +kubectl get pvc -n openstudio-test + +# Service status +kubectl get svc -n openstudio-test + +# Events +kubectl get events -n openstudio-test --sort-by='.lastTimestamp' + +# Detailed pod inspection +kubectl describe pod <pod-name> -n openstudio-test +kubectl logs <pod-name> -n openstudio-test +``` + +## 🔧 Recovery Procedures + +### Complete Cluster Reset +If the cluster is in a bad state: + +```bash +# 1. Clean up Kubernetes +ssh ubuntu@<master-ip> "sudo kubeadm reset -f" + +# 2. Clean up on all nodes +for node_ip in <all-node-ips>; do + ssh ubuntu@$node_ip "sudo systemctl stop kubelet containerd" + ssh ubuntu@$node_ip "sudo rm -rf /etc/kubernetes /var/lib/kubelet /var/lib/etcd" + ssh ubuntu@$node_ip "sudo systemctl start containerd" +done + +# 3. Reinitialize cluster (run bootstrap script) +./bootstrap-k8s.sh +``` + +### Storage Reset +If NFS storage is corrupted: + +```bash +# 1. Delete all PVCs first +kubectl delete pvc --all --all-namespaces + +# 2. Clean NFS directory on master +MASTER_IP=$(terraform output -raw master_floating_ip) +ssh ubuntu@$MASTER_IP "sudo rm -rf /srv/nfs/k8s-storage/*" + +# 3. Reinstall NFS provisioner +helm uninstall nfs-subdir-external-provisioner -n kube-system +# Then run the NFS setup from bootstrap script +``` + +### Infrastructure Reset +If OpenStack infrastructure is corrupted: + +```bash +# 1. Destroy all resources +terraform destroy -auto-approve + +# 2. Clean up any orphaned resources manually +openstack server list +openstack volume list +openstack network list + +# 3. 
Redeploy +./deploy.sh +``` + +## 📝 Preventive Measures + +### Regular Health Checks +```bash +# Create a health check script +cat > health-check.sh << 'EOF' +#!/bin/bash +echo "=== Cluster Health Check ===" +kubectl get nodes +echo "=== Storage Classes ===" +kubectl get storageclass +echo "=== System Pods ===" +kubectl get pods -n kube-system +echo "=== OpenStudio Pods ===" +kubectl get pods -n openstudio-test +echo "=== PVCs ===" +kubectl get pvc -n openstudio-test +echo "=== NFS Server Status ===" +MASTER_IP=$(terraform output -raw master_floating_ip) +ssh ubuntu@$MASTER_IP "systemctl is-active nfs-server" +EOF + +chmod +x health-check.sh +./health-check.sh +``` + +### Monitoring Setup +```bash +# Set up basic monitoring for critical components +kubectl create namespace monitoring + +# Monitor NFS server disk usage +ssh ubuntu@<master-ip> "df -h /srv/nfs/k8s-storage" + +# Monitor provisioner health +kubectl get pods -n kube-system -l app=nfs-subdir-external-provisioner --watch +``` + +## 🆘 Emergency Contacts and Resources + +### Log Locations +- **System logs**: `journalctl -u <service> -f` +- **Kubernetes logs**: `/var/log/pods/` +- **Container logs**: `crictl logs <container-id>` +- **Cloud-init logs**: `/var/log/cloud-init-output.log` + +### Useful Resources +- [Kubernetes Troubleshooting](https://kubernetes.io/docs/tasks/debug-application-cluster/troubleshooting/) +- [OpenStack Documentation](https://docs.openstack.org/) +- [NFS Troubleshooting](https://linux.die.net/man/5/exports) + +Remember: When in doubt, check the logs first! Most issues can be diagnosed by examining pod events (`kubectl describe pod`) and system logs (`journalctl`). 
+ +## OpenStack/Calico Networking Requirements (New) + +When running Calico with IPIP encapsulation on OpenStack: + +- Allow IP-in-IP (protocol 4) ingress/egress between node subnet CIDR +- Allow BGP (TCP/179) ingress/egress between node subnet CIDR + +These are implemented in `openstack/additional-security-rules.tf` and reference `openstack_networking_subnet_v2.k8s_subnet.cidr` dynamically. Without them, you may see: + +- NodeLocalDNS timeouts to CoreDNS Service IP +- calico-kube-controllers failing to reach API via 10.233.0.1 +- Cross-node Service IPs intermittently unreachable + +Post-bootstrap, `bootstrap-k8s.sh` now runs a networking healthcheck that validates CoreDNS/NodeLocalDNS and restarts them if needed. diff --git a/openstack/bootstrap-k8s.sh b/openstack/bootstrap-k8s.sh new file mode 100755 index 0000000..18ec92d --- /dev/null +++ b/openstack/bootstrap-k8s.sh @@ -0,0 +1,478 @@ +#!/bin/bash + +# bootstrap-k8s.sh +# Post-deployment Kubernetes bootstrap script +# +# This script: +# 1. Monitors the Kubernetes cluster initialization progress +# 2. Waits for all nodes to be ready +# 3. Configures kubectl for local access +# 4. Sets up NFS storage with external provisioner +# 5. 
Verifies cluster readiness for OpenStudio Server + +set -e + +KUBE_TLS_SERVER_NAME="${KUBE_TLS_SERVER_NAME:-kubernetes}" +OPENSTACK_ALLOW_INSECURE_KUBECTL="${OPENSTACK_ALLOW_INSECURE_KUBECTL:-false}" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +PURPLE='\033[0;35m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +print_progress() { + echo -e "${PURPLE}[PROGRESS]${NC} $1" +} + +configure_kubectl_tls() { + if [[ "${OPENSTACK_ALLOW_INSECURE_KUBECTL}" == "true" ]]; then + kubectl config set-cluster kubernetes --insecure-skip-tls-verify=true >/dev/null + print_warning "TLS verification disabled (OPENSTACK_ALLOW_INSECURE_KUBECTL=true)" + return + fi + + kubectl config set-cluster kubernetes --insecure-skip-tls-verify=false >/dev/null + kubectl config set-cluster kubernetes --tls-server-name="${KUBE_TLS_SERVER_NAME}" >/dev/null + print_status "TLS verification enabled (tls-server-name=${KUBE_TLS_SERVER_NAME})" +} + +# Ensure default StorageClass is set to the desired class (defaults to nfs-client) +ensure_default_storageclass() { + local desired="${1:-nfs-client}" + + if ! 
kubectl get storageclass "$desired" >/dev/null 2>&1; then + print_error "StorageClass '$desired' not found" + kubectl get storageclass || true + return 1 + fi + + local current + current=$(kubectl get storageclass -o jsonpath='{range .items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")]}{.metadata.name}{"\n"}{end}') + if [[ "$current" != "$desired" ]]; then + print_status "Setting default StorageClass to '$desired'" + for sc in $(kubectl get storageclass -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do + if [[ "$sc" == "$desired" ]]; then + kubectl annotate storageclass "$sc" storageclass.kubernetes.io/is-default-class="true" --overwrite >/dev/null 2>&1 || true + else + kubectl annotate storageclass "$sc" storageclass.kubernetes.io/is-default-class="false" --overwrite >/dev/null 2>&1 || true + fi + done + fi + + kubectl get storageclass + print_success "Default StorageClass ensured: $desired" +} + +# Label nodes into web and worker groups; optionally taint worker nodes when WORKER_TAINT=true +label_and_taint_nodegroups() { + print_status "Labeling nodes into web and worker groups" + + for n in $(kubectl get nodes -o name); do + local name="${n#node/}" + if [[ "$name" == *"web"* ]]; then + kubectl label "$n" nodegroup=web --overwrite >/dev/null 2>&1 || true + elif [[ "$name" == *"worker"* || "$name" == *"wrk"* || "$name" == *"compute"* ]]; then + kubectl label "$n" nodegroup=worker --overwrite >/dev/null 2>&1 || true + fi + done + + if [[ "${WORKER_TAINT:=false}" == "true" ]]; then + print_status "Applying optional worker taint 'worker=true:NoSchedule'" + for n in $(kubectl get nodes -l nodegroup=worker -o name); do + kubectl taint "$n" worker=true:NoSchedule --overwrite >/dev/null 2>&1 || true + done + fi + + kubectl get nodes --show-labels | grep -E "nodegroup=(web|worker)" || true + print_success "Node labeling complete" +} + +# Ensure OpenStack cloud-config Secret exists in kube-system for CCM (Octavia) 
+ensure_openstack_cloud_secret() { + print_status "Ensuring OpenStack cloud-config Secret exists in kube-system" + + if kubectl -n kube-system get secret cloud-config >/dev/null 2>&1; then + print_success "cloud-config Secret already present" + return 0 + fi + + if [[ -f "openstack-cloud-config.yaml" ]]; then + kubectl apply -f openstack-cloud-config.yaml >/dev/null 2>&1 || { + print_error "Failed to apply openstack-cloud-config.yaml" + return 1 + } + print_success "Applied openstack-cloud-config.yaml (cloud-config Secret created)" + else + print_warning "openstack-cloud-config.yaml not found; CCM may not provision Octavia LoadBalancers" + return 1 + fi +} + +# Check prerequisites +check_prerequisites() { + print_status "Checking prerequisites..." + + if ! command -v tofu &> /dev/null; then + print_error "OpenTofu (tofu) is not installed or not in PATH" + exit 1 + fi + + if ! command -v kubectl &> /dev/null; then + print_error "kubectl is not installed or not in PATH" + exit 1 + fi + + if ! command -v ssh &> /dev/null; then + print_error "ssh is not installed or not in PATH" + exit 1 + fi + + if [[ ! -f "main.tf" ]]; then + print_error "main.tf not found. Please run this script from the openstack/ directory." + exit 1 + fi + + print_success "Prerequisites check passed" +} + +# Get cluster information from Terraform +get_cluster_info() { + print_status "Retrieving cluster information from Terraform..." 
+ + # Use the tofu-with-env.sh script if available + if [[ -f "./tofu-with-env.sh" ]]; then + TOFU_CMD="./tofu-with-env.sh" + else + TOFU_CMD="tofu" + print_warning "tofu-with-env.sh not found, using tofu directly" + fi + + # Get cluster endpoints + MASTER_FLOATING_IP=$(${TOFU_CMD} output -raw master_floating_ip 2>/dev/null | tail -n 1) + MASTER_PRIVATE_IP=$(${TOFU_CMD} output -raw master_ip 2>/dev/null | tail -n 1) + + if [[ -z "$MASTER_FLOATING_IP" ]]; then + print_error "Could not retrieve master_floating_ip from Terraform output" + exit 1 + fi + + if [[ -z "$MASTER_PRIVATE_IP" ]]; then + print_error "Could not retrieve master_ip from Terraform output" + exit 1 + fi + + print_success "Master floating IP: $MASTER_FLOATING_IP" + print_success "Master private IP: $MASTER_PRIVATE_IP" +} + +# Monitor cluster initialization: poll the master over SSH until it is initialized and every node is Ready; returns 0 on success, 1 on timeout +monitor_cluster_init() { + print_status "Monitoring Kubernetes cluster initialization..." + print_status "This may take 10-15 minutes for the complete process..." + + local max_attempts=120 # 120 attempts x 30s sleep between attempts = 60 minutes max + local attempt=1 + + while [[ $attempt -le $max_attempts ]]; do + print_progress "Attempt $attempt/$max_attempts: Checking cluster status..." 
+ + # Check if we can SSH to master + if ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no ubuntu@$MASTER_FLOATING_IP "echo 'SSH connection successful'" &>/dev/null; then + print_success "SSH connection to master node established" + + # Check if master initialization is complete + if ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no ubuntu@$MASTER_FLOATING_IP "test -f /opt/master-initialized" &>/dev/null; then + print_success "Master node initialization complete" + + # Check if all nodes are ready (grep -w matches "Ready" as a whole word, so "NotReady" rows are still counted as not ready) + local nodes_ready=$(ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no ubuntu@$MASTER_FLOATING_IP "kubectl get nodes --no-headers 2>/dev/null | wc -l" 2>/dev/null || echo "0") + local nodes_not_ready=$(ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no ubuntu@$MASTER_FLOATING_IP "kubectl get nodes --no-headers 2>/dev/null | grep -vw Ready | wc -l" 2>/dev/null || echo "1") + + if [[ "$nodes_ready" -ge 3 && "$nodes_not_ready" -eq 0 ]]; then + print_success "All nodes are Ready!" + return 0 + else + print_progress "Nodes status: $nodes_ready total, $nodes_not_ready not ready" + fi + else + print_progress "Master node still initializing..." + fi + else + print_progress "Waiting for master node to become accessible..." + fi + + sleep 30 + ((attempt++)) + done + + print_error "Cluster initialization timed out after $max_attempts attempts" + print_error "Check the cloud-init logs on the instances:" + print_error " ssh ubuntu@$MASTER_FLOATING_IP 'sudo tail -f /var/log/cloud-init-output.log'" + return 1 +} + +# Copy kubeconfig from master +setup_local_kubectl() { + print_status "Setting up local kubectl access..." 
+ + # Create backup of existing kubeconfig + if [[ -f "$HOME/.kube/config" ]]; then + BACKUP_PATH="$HOME/.kube/config.backup.$(date +%Y%m%d-%H%M%S)" + print_status "Backing up existing kubeconfig to $BACKUP_PATH" + cp "$HOME/.kube/config" "$BACKUP_PATH" + fi + + # Create .kube directory if it doesn't exist + mkdir -p "$HOME/.kube" + + # Copy kubeconfig from master + print_status "Copying kubeconfig from master node..." + scp -o StrictHostKeyChecking=no ubuntu@$MASTER_FLOATING_IP:/home/ubuntu/.kube/config "$HOME/.kube/config.new" + + # Update the server endpoint to use floating IP + sed -i.bak "s|server: https://$MASTER_PRIVATE_IP:6443|server: https://$MASTER_FLOATING_IP:6443|g" "$HOME/.kube/config.new" + + # Move the updated config into place + mv "$HOME/.kube/config.new" "$HOME/.kube/config" + + configure_kubectl_tls + + print_success "Local kubectl configured successfully" +} + +# Test kubectl connectivity +test_kubectl() { + print_status "Testing kubectl connectivity..." + + if kubectl cluster-info &>/dev/null; then + print_success "kubectl is working!" + kubectl get nodes + else + print_error "kubectl connection failed" + return 1 + fi +} + +# Setup NFS storage for the cluster +setup_nfs_storage() { + print_status "Setting up NFS storage for the cluster..." + + # First, setup NFS server on master node + print_status "Configuring NFS server on master node..." + ssh -o StrictHostKeyChecking=no ubuntu@$MASTER_FLOATING_IP ' + sudo apt update && sudo apt install -y nfs-kernel-server + sudo mkdir -p /srv/nfs/k8s-storage + sudo chown nobody:nogroup /srv/nfs/k8s-storage + sudo chmod 777 /srv/nfs/k8s-storage + echo "/srv/nfs/k8s-storage 10.244.0.0/16(rw,sync,no_subtree_check,no_root_squash) +/srv/nfs/k8s-storage 10.0.1.0/24(rw,sync,no_subtree_check,no_root_squash)" | sudo tee /etc/exports + sudo systemctl enable nfs-server + sudo systemctl restart nfs-server + sudo exportfs -ra + ' + + # Check if helm is available + if ! 
command -v helm &> /dev/null; then + print_error "Helm is required for NFS provisioner installation" + return 1 + fi + + print_status "Installing NFS subdir external provisioner..." + + # Add the NFS provisioner Helm repository + helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner/ + helm repo update + + # Install the NFS provisioner with hostNetwork enabled to work around pod networking issues + helm install nfs-subdir-external-provisioner nfs-subdir-external-provisioner/nfs-subdir-external-provisioner \ + --set nfs.server=$MASTER_PRIVATE_IP \ + --set nfs.path=/srv/nfs/k8s-storage \ + --set storageClass.defaultClass=true \ + --namespace kube-system \ + --wait --timeout=300s + + # Patch the deployment to use hostNetwork (required due to pod network isolation in OpenStack) + print_status "Applying hostNetwork patch for NFS provisioner..." + kubectl patch deployment nfs-subdir-external-provisioner -n kube-system -p '{"spec":{"template":{"spec":{"hostNetwork":true}}}}' + + # Wait for the provisioner to be ready + kubectl rollout status deployment/nfs-subdir-external-provisioner -n kube-system --timeout=300s + + print_success "NFS storage setup complete!" + print_status "Default storage class 'nfs-client' is now available" +} + +# Verify cluster readiness +verify_cluster_readiness() { + print_status "Verifying cluster readiness for OpenStudio Server..." + + # Check nodes + # Post-bootstrap networking healthcheck and quick remediation + post_bootstrap_network_healthcheck() { + print_status "Running post-bootstrap networking healthcheck..." 
+ + # Minimal checks: kube-proxy mode, CoreDNS service IP, NodeLocalDNS, Calico nodes + kubectl -n kube-system get ds kube-proxy >/dev/null 2>&1 || print_warning "kube-proxy not found" + kubectl -n kube-system get svc coredns >/dev/null 2>&1 || print_warning "CoreDNS service missing" + kubectl -n kube-system get ds nodelocaldns >/dev/null 2>&1 || print_warning "NodeLocalDNS not found" + kubectl -n kube-system get po -l k8s-app=calico-node >/dev/null 2>&1 || print_warning "Calico nodes not found" + + # Deploy a tiny netshoot pod for tests + kubectl apply -f network-debug-pod.yaml >/dev/null 2>&1 || true + sleep 5 + + # Test DNS resolution from a pod via NodeLocalDNS + POD=$(kubectl get pod -l app=network-debug -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [[ -n "$POD" ]]; then + if kubectl exec "$POD" -- nslookup kubernetes.default.svc.cluster.local 169.254.25.10 >/dev/null 2>&1; then + print_success "DNS resolution via NodeLocalDNS works" + else + print_warning "DNS resolution via NodeLocalDNS failed; attempting DNS component restarts" + kubectl -n kube-system rollout restart deploy/coredns >/dev/null 2>&1 || true + kubectl -n kube-system rollout restart ds/nodelocaldns >/dev/null 2>&1 || true + sleep 10 + fi + fi + + # If calico-kube-controllers cannot reach API, restart it + if kubectl -n kube-system get deploy calico-kube-controllers >/dev/null 2>&1; then + READY=$(kubectl -n kube-system get deploy calico-kube-controllers -o jsonpath='{.status.availableReplicas}' 2>/dev/null || echo 0) + if [[ "${READY:-0}" == "0" ]]; then + print_warning "calico-kube-controllers not available; restarting" + kubectl -n kube-system delete pod -l k8s-app=calico-kube-controllers >/dev/null 2>&1 || true + fi + fi + } + + local nodes_count=$(kubectl get nodes --no-headers | wc -l) + if [[ $nodes_count -ge 3 ]]; then + print_success "Cluster has $nodes_count nodes (minimum 3 required)" + else + print_warning "Cluster only has $nodes_count nodes, minimum 3 
recommended" + fi + + # Check node labels (nodegroup=web / nodegroup=worker, the values applied by label_and_taint_nodegroups) + local web_nodes=$(kubectl get nodes -l nodegroup=web --no-headers | wc -l) + local worker_nodes=$(kubectl get nodes -l nodegroup=worker --no-headers | wc -l) + + if [[ $web_nodes -gt 0 ]]; then + print_success "Found $web_nodes web nodes with proper labels" + else + print_warning "No web nodes with nodegroup=web labels found" + fi + + if [[ $worker_nodes -gt 0 ]]; then + print_success "Found $worker_nodes worker nodes with proper labels" + else + print_warning "No worker nodes with nodegroup=worker labels found" + fi + + # Check storage classes + local storage_classes=$(kubectl get storageclass --no-headers | wc -l) + if [[ $storage_classes -gt 0 ]]; then + print_success "Found $storage_classes storage class(es)" + kubectl get storageclass + else + print_warning "No storage classes found - you may need to configure Cinder CSI" + fi + + # Check system pods (a pod counts as ready only if its row contains "Running") + local system_pods_ready=$(kubectl get pods -n kube-system --no-headers | grep Running | wc -l) + local system_pods_total=$(kubectl get pods -n kube-system --no-headers | wc -l) + + print_status "System pods: $system_pods_ready/$system_pods_total running" + + if [[ $system_pods_ready -eq $system_pods_total ]]; then + print_success "All system pods are running" + else + print_warning "Some system pods are not running yet" + kubectl get pods -n kube-system | grep -v Running || true + fi +} + +# Main execution +main() { + echo "==================================" + echo "Kubernetes Bootstrap Script" + echo "==================================" + echo + + check_prerequisites + get_cluster_info + + echo + print_status "Starting cluster initialization monitoring..." + + if monitor_cluster_init; then + echo + print_success "Cluster initialization complete!" 
+ + setup_local_kubectl + test_kubectl + + # Ensure CCM has cloud-config secret (required for Octavia LB) + ensure_openstack_cloud_secret || true + + echo + setup_nfs_storage + + ensure_default_storageclass "nfs-client" + label_and_taint_nodegroups + + echo + verify_cluster_readiness + + # Networking healthcheck (best-effort) + echo + post_bootstrap_network_healthcheck || true + + echo + echo "==================================" + print_success "Kubernetes cluster is ready!" + echo "==================================" + echo + print_status "Next steps:" + echo "1. Install OpenStudio Server Helm chart with NFS storage:" + echo " helm install openstudio-server ../openstudio-server -f values-openstack-nfs.yaml -n openstudio-test --create-namespace" + echo "2. Verify NFS storage is working by checking PVCs" + echo + print_status "Useful commands:" + echo " kubectl get nodes" + echo " kubectl get pods --all-namespaces" + echo " kubectl get storageclass" + echo + else + echo + print_error "Cluster initialization failed!" + echo + print_status "Troubleshooting steps:" + echo "1. Check cloud-init logs: ssh ubuntu@$MASTER_FLOATING_IP 'sudo tail -f /var/log/cloud-init-output.log'" + echo "2. Check master init logs: ssh ubuntu@$MASTER_FLOATING_IP 'sudo tail -f /var/log/master-init.log'" + echo "3. 
Check worker join logs on worker nodes" + echo + exit 1 + fi +} + +# Run main function +main "$@" diff --git a/openstack/cinder-csi-storageclass.yaml b/openstack/cinder-csi-storageclass.yaml new file mode 100644 index 0000000..1c9f9a7 --- /dev/null +++ b/openstack/cinder-csi-storageclass.yaml @@ -0,0 +1,8 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cinder-csi +provisioner: cinder.csi.openstack.org +allowVolumeExpansion: true +reclaimPolicy: Delete +volumeBindingMode: WaitForFirstConsumer diff --git a/openstack/deploy-k8s-cluster.sh b/openstack/deploy-k8s-cluster.sh new file mode 100755 index 0000000..442c35e --- /dev/null +++ b/openstack/deploy-k8s-cluster.sh @@ -0,0 +1,392 @@ +#!/bin/bash + +# deploy-k8s-cluster.sh +# Master orchestration script for automated OpenStack Kubernetes deployment +# +# This script provides a complete one-command deployment: +# 1. Validates prerequisites +# 2. Deploys OpenStack infrastructure with Terraform +# 3. Waits for and monitors automatic Kubernetes bootstrap +# 4. Configures local kubectl access +# 5. 
Verifies cluster readiness for OpenStudio Server + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +PURPLE='\033[0;35m' +BOLD='\033[1m' +NC='\033[0m' # No Color + +# Configuration +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +# Function to print colored output +print_header() { + echo -e "${BOLD}${BLUE}================================================${NC}" + echo -e "${BOLD}${BLUE}$1${NC}" + echo -e "${BOLD}${BLUE}================================================${NC}" +} + +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +print_progress() { + echo -e "${PURPLE}[PROGRESS]${NC} $1" +} + +# Function to show usage +show_usage() { + cat << EOF +Usage: $0 [OPTIONS] + +Deploy a complete Kubernetes cluster on OpenStack with automatic bootstrap. 
+ +OPTIONS: + -h, --help Show this help message + -d, --destroy Destroy the existing cluster instead of creating + -p, --plan-only Show Terraform plan without applying changes + -s, --skip-bootstrap Skip the bootstrap monitoring (deploy infrastructure only) + --no-kubectl Skip kubectl configuration setup + +EXAMPLES: + $0 # Deploy complete cluster with bootstrap + $0 --plan-only # Show what would be deployed + $0 --skip-bootstrap # Deploy infrastructure only + $0 --destroy # Destroy existing cluster + +PREREQUISITES: + - OpenTofu/Terraform installed + - kubectl installed + - OpenStack credentials configured in .env file + - SSH key pair configured + +EOF +} + +# Parse command line arguments +DESTROY_MODE=false +PLAN_ONLY=false +SKIP_BOOTSTRAP=false +SKIP_KUBECTL=false + +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_usage + exit 0 + ;; + -d|--destroy) + DESTROY_MODE=true + shift + ;; + -p|--plan-only) + PLAN_ONLY=true + shift + ;; + -s|--skip-bootstrap) + SKIP_BOOTSTRAP=true + shift + ;; + --no-kubectl) + SKIP_KUBECTL=true + shift + ;; + *) + print_error "Unknown option: $1" + show_usage + exit 1 + ;; + esac +done + +# Validate prerequisites: run every check, count the failures, then exit 1 with a summary if any check failed +validate_prerequisites() { + print_status "Validating prerequisites..." + + local errors=0 # bumped via errors=$((errors + 1)); ((errors++)) returns status 1 when errors is 0 and would abort the script under 'set -e' + + # Check if we're in the right directory + if [[ ! -f "$SCRIPT_DIR/main.tf" ]]; then + print_error "main.tf not found in $SCRIPT_DIR" + errors=$((errors + 1)) + fi + + if [[ ! -f "$SCRIPT_DIR/.env" ]]; then + print_error ".env file not found. Please copy .env.template and configure your OpenStack credentials" + errors=$((errors + 1)) + fi + + # Check required tools + if ! 
command -v tofu &> /dev/null; then + print_error "OpenTofu (tofu) is not installed or not in PATH" + print_error "Install from: https://opentofu.org/docs/intro/install/" + errors=$((errors + 1)) + fi + + if ! 
command -v kubectl &> /dev/null && [[ "$SKIP_KUBECTL" == false ]]; then + print_error "kubectl is not installed or not in PATH" + print_error "Install from: https://kubernetes.io/docs/tasks/tools/" + errors=$((errors + 1)) + fi + + if ! command -v ssh &> /dev/null && [[ "$SKIP_BOOTSTRAP" == false ]]; then + print_error "ssh is not installed or not in PATH" + errors=$((errors + 1)) + fi + + # Check cloud-init file + if [[ ! -f "$SCRIPT_DIR/k8s-cloud-init.yaml" ]]; then + print_error "k8s-cloud-init.yaml not found" + errors=$((errors + 1)) + fi + + if [[ $errors -gt 0 ]]; then + print_error "Prerequisites validation failed with $errors error(s)" + exit 1 + fi + + print_success "Prerequisites validation passed" +} + +# Initialize Terraform +init_terraform() { + print_status "Initializing Terraform..." + + cd "$SCRIPT_DIR" + + if ./tofu-with-env.sh init; then + print_success "Terraform initialized successfully" + else + print_error "Terraform initialization failed" + exit 1 + fi +} + +# Deploy or destroy infrastructure +manage_infrastructure() { + cd "$SCRIPT_DIR" + + if [[ "$DESTROY_MODE" == true ]]; then + print_header "DESTROYING INFRASTRUCTURE" + print_warning "This will destroy all resources including:" + print_warning "- All Kubernetes nodes and data" + print_warning "- Floating IPs and networks" + print_warning "- Storage volumes and snapshots" + echo + read -p "Are you sure you want to continue? (yes/no): " confirm + + if [[ "$confirm" != "yes" ]]; then + print_status "Operation cancelled" + exit 0 + fi + + print_status "Destroying OpenStack infrastructure..." + if ./tofu-with-env.sh destroy -auto-approve; then + print_success "Infrastructure destroyed successfully" + exit 0 + else + print_error "Infrastructure destruction failed" + exit 1 + fi + + elif [[ "$PLAN_ONLY" == true ]]; then + print_header "TERRAFORM PLAN" + ./tofu-with-env.sh plan + print_status "Plan complete. Use '$0' (without --plan-only) to apply changes." 
+ exit 0 + + else + print_header "DEPLOYING INFRASTRUCTURE" + print_status "Deploying OpenStack infrastructure with automated Kubernetes bootstrap..." + + if ./tofu-with-env.sh apply -auto-approve; then + print_success "Infrastructure deployed successfully" + else + print_error "Infrastructure deployment failed" + exit 1 + fi + fi +} + +# Wait for infrastructure to be ready +wait_for_infrastructure() { + print_status "Waiting for infrastructure to be accessible..." + + # Get master floating IP + local master_ip + for i in {1..30}; do + master_ip=$(./tofu-with-env.sh output -raw master_floating_ip 2>/dev/null | tail -n 1 || echo "") + if [[ -n "$master_ip" && "$master_ip" != *"Error"* ]]; then + break + fi + print_progress "Attempt $i: Waiting for Terraform outputs..." + sleep 10 + done + + if [[ -z "$master_ip" ]]; then + print_error "Could not retrieve master floating IP" + exit 1 + fi + + print_success "Master floating IP: $master_ip" + + # Wait for SSH connectivity + print_status "Waiting for SSH connectivity to master node..." + for i in {1..60}; do + if ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no ubuntu@"$master_ip" "echo 'SSH ready'" &>/dev/null; then + print_success "SSH connectivity established" + return 0 + fi + print_progress "Attempt $i: Waiting for SSH connectivity..." + sleep 15 + done + + print_error "SSH connectivity timeout" + exit 1 +} + +# Monitor bootstrap process +monitor_bootstrap() { + if [[ "$SKIP_BOOTSTRAP" == true ]]; then + print_status "Skipping bootstrap monitoring as requested" + return 0 + fi + + print_header "MONITORING KUBERNETES BOOTSTRAP" + + # Use the bootstrap script + if [[ -f "$SCRIPT_DIR/bootstrap-k8s.sh" ]]; then + print_status "Starting bootstrap monitoring script..." 
+ if "$SCRIPT_DIR/bootstrap-k8s.sh"; then + print_success "Bootstrap monitoring completed successfully" + else + print_error "Bootstrap monitoring failed" + print_status "You can check the status manually by running:" + print_status " ./bootstrap-k8s.sh" + exit 1 + fi + else + print_warning "bootstrap-k8s.sh not found, skipping bootstrap monitoring" + fi +} + +# Setup kubectl (simplified version if bootstrap script is not available) +setup_kubectl_fallback() { + if [[ "$SKIP_KUBECTL" == true ]]; then + print_status "Skipping kubectl setup as requested" + return 0 + fi + + print_header "SETTING UP KUBECTL ACCESS" + + if [[ -f "$SCRIPT_DIR/setup-kubectl.sh" ]]; then + print_status "Using setup-kubectl.sh script..." + if "$SCRIPT_DIR/setup-kubectl.sh"; then + print_success "kubectl setup completed successfully" + else + print_warning "kubectl setup script failed, manual setup may be required" + fi + else + print_warning "setup-kubectl.sh not found, manual kubectl setup required" + fi +} + +# Show final status and next steps +show_completion() { + print_header "DEPLOYMENT COMPLETED" + + # Get cluster info + local master_ip=$(./tofu-with-env.sh output -raw master_floating_ip 2>/dev/null | tail -n 1 || echo "unknown") + local cluster_name=$(./tofu-with-env.sh output -json cluster_info 2>/dev/null | jq -r '.cluster_name' || echo "openstudio-server") + + echo + print_success "OpenStack Kubernetes cluster is ready!" + echo + print_status "Cluster Information:" + echo " โ€ข Cluster Name: $cluster_name" + echo " โ€ข Master IP: $master_ip" + echo " โ€ข API Server: https://$master_ip:6443" + echo + print_status "Useful Commands:" + echo " โ€ข Check cluster: kubectl get nodes" + echo " โ€ข View pods: kubectl get pods --all-namespaces" + echo " โ€ข SSH to master: ssh ubuntu@$master_ip" + echo + print_status "Next Steps:" + echo " 1. Verify cluster status: kubectl get nodes" + echo " 2. Install OpenStudio Server: Use Helm charts from ../charts/" + echo " 3. 
Configure storage: Create OpenStack credentials secret if needed" + echo + print_status "Troubleshooting:" + echo " โ€ข Bootstrap logs: ssh ubuntu@$master_ip 'sudo tail -f /var/log/cloud-init-output.log'" + echo " โ€ข Re-run bootstrap: ./bootstrap-k8s.sh" + echo " โ€ข Re-configure kubectl: ./setup-kubectl.sh" + echo +} + +# Main execution function +main() { + # Change to script directory + cd "$SCRIPT_DIR" + + print_header "OPENSTUDIO SERVER KUBERNETES DEPLOYMENT" + echo + print_status "Starting automated deployment process..." + echo + + # Validate prerequisites + validate_prerequisites + echo + + # Initialize Terraform + init_terraform + echo + + # Deploy or destroy infrastructure + manage_infrastructure + + # Only continue with bootstrap if we're not destroying + if [[ "$DESTROY_MODE" != true && "$PLAN_ONLY" != true ]]; then + echo + + # Wait for infrastructure + wait_for_infrastructure + echo + + # Monitor bootstrap + monitor_bootstrap + echo + + # Setup kubectl if bootstrap was skipped + if [[ "$SKIP_BOOTSTRAP" == true ]]; then + setup_kubectl_fallback + echo + fi + + # Show completion status + show_completion + fi +} + +# Handle script interruption +trap 'print_error "Script interrupted by user"; exit 130' INT + +# Run main function +main "$@" diff --git a/openstack/deploy-openstudio-cluster.sh b/openstack/deploy-openstudio-cluster.sh new file mode 100755 index 0000000..7a3eec3 --- /dev/null +++ b/openstack/deploy-openstudio-cluster.sh @@ -0,0 +1,501 @@ +#!/bin/bash +# OpenStudio Server Kubernetes Cluster Deployment Script for OpenStack +# This script automates the full deployment process with corporate firewall support + +set -euo pipefail + +# Script configuration +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +KUBESPRAY_PATH="${KUBESPRAY_PATH:-$HOME/kubespray}" +HELM_CHART_PATH="${HELM_CHART_PATH:-../openstudio-server}" +HELM_VALUES_FILE="${HELM_VALUES_FILE:-$SCRIPT_DIR/values-openstack.yaml}" 
+APP_SECRET_NAME="${APP_SECRET_NAME:-openstudio-app-secrets}" + +# Color codes for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Logging function +log() { + echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} $1" +} + +success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +error() { + echo -e "${RED}[ERROR]${NC} $1" + exit 1 +} + +# Help function +usage() { + cat << EOF +Usage: $0 [OPTIONS] + +Deploy OpenStudio Server Kubernetes cluster on OpenStack + +CLUSTER_SIZE: + small - Deploy small cluster configuration (2 web, 1 worker) + large - Deploy large cluster configuration (1 web, 1 worker, high-spec) + test - Deploy test cluster (single master only) + +OPTIONS: + -h, --help Show this help message + -c, --cleanup Clean up existing cluster before deployment + -s, --skip-terraform Skip Terraform infrastructure deployment + -k, --skip-kubespray Skip Kubespray Kubernetes deployment + -d, --skip-helm Skip Helm chart deployment + -v, --verbose Enable verbose output + +ENVIRONMENT VARIABLES: + Required OpenStack credentials: + - TF_VAR_openstack_user_name + - TF_VAR_openstack_password + - TF_VAR_openstack_auth_url + - TF_VAR_openstack_tenant_name + - TF_VAR_openstack_user_domain_name + - TF_VAR_openstack_project_domain_id + - TF_VAR_openstack_project_id + - HELM_VALUES_FILE (optional; default: ./values-openstack.yaml) + - APP_SECRET_NAME (optional; default: openstudio-app-secrets) + +Examples: + $0 small # Deploy small cluster + $0 large --cleanup # Clean up and deploy large cluster + $0 test --skip-helm # Deploy test cluster without helm + +EOF +} + +# Parse command line arguments +CLUSTER_SIZE="" +CLEANUP=false +SKIP_TERRAFORM=false +SKIP_KUBESPRAY=false +SKIP_HELM=false +VERBOSE=false + +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage + exit 0 + ;; + -c|--cleanup) + CLEANUP=true + shift + ;; + -s|--skip-terraform) + 
SKIP_TERRAFORM=true + shift + ;; + -k|--skip-kubespray) + SKIP_KUBESPRAY=true + shift + ;; + -d|--skip-helm) + SKIP_HELM=true + shift + ;; + -v|--verbose) + VERBOSE=true + set -x + shift + ;; + small|large|test) + CLUSTER_SIZE="$1" + shift + ;; + *) + error "Unknown option: $1" + ;; + esac +done + +# Validate cluster size +if [[ -z "$CLUSTER_SIZE" ]]; then + error "Cluster size is required. Use 'small', 'large', or 'test'" +fi + +# Validate cluster size options +case "$CLUSTER_SIZE" in + small|large|test) + ;; + *) + error "Invalid cluster size: $CLUSTER_SIZE. Use 'small', 'large', or 'test'" + ;; +esac + +# Check prerequisites +check_prerequisites() { + log "Checking prerequisites..." + + # Check required tools + local tools=("tofu" "ansible" "kubectl" "helm") + for tool in "${tools[@]}"; do + if ! command -v "$tool" &> /dev/null; then + error "$tool is not installed or not in PATH" + fi + done + + # Check OpenStack credentials + local required_vars=( + "TF_VAR_openstack_user_name" + "TF_VAR_openstack_password" + "TF_VAR_openstack_auth_url" + "TF_VAR_openstack_tenant_name" + ) + + for var in "${required_vars[@]}"; do + if [[ -z "${!var:-}" ]]; then + error "Required environment variable $var is not set" + fi + done + + # Check Kubespray + if [[ ! -d "$KUBESPRAY_PATH" ]] && [[ "$SKIP_KUBESPRAY" == false ]]; then + warning "Kubespray not found at $KUBESPRAY_PATH" + read -p "Do you want to clone Kubespray? (y/N): " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + git clone https://github.com/kubernetes-sigs/kubespray.git "$KUBESPRAY_PATH" + cd "$KUBESPRAY_PATH" + pip install -r requirements.txt + cd "$SCRIPT_DIR" + else + error "Kubespray is required for deployment" + fi + fi + + success "Prerequisites check passed" +} + +# Clean up existing infrastructure +cleanup_cluster() { + if [[ "$CLEANUP" == true ]]; then + log "Cleaning up existing cluster..." + + if [[ -f terraform.tfstate ]]; then + log "Destroying existing Terraform infrastructure..." 
+ tofu destroy -var-file="openstudio-${CLUSTER_SIZE}.tfvars" -auto-approve || warning "Terraform destroy had warnings" + fi + + success "Cleanup completed" + fi +} + +# Deploy infrastructure with Terraform +deploy_infrastructure() { + if [[ "$SKIP_TERRAFORM" == true ]]; then + log "Skipping Terraform deployment..." + return + fi + + log "Deploying infrastructure with Terraform..." + + local tfvars_file="openstudio-${CLUSTER_SIZE}.tfvars" + if [[ ! -f "$tfvars_file" ]]; then + error "Configuration file $tfvars_file not found" + fi + + # Initialize Terraform + tofu init + + # Plan deployment + log "Planning Terraform deployment..." + tofu plan -var-file="$tfvars_file" -out=tfplan + + # Apply deployment + log "Applying Terraform deployment..." + tofu apply tfplan + + # Wait for instances to be ready + log "Waiting for instances to be ready..." + sleep 60 + + success "Infrastructure deployment completed" +} + +# Generate Kubespray inventory +generate_inventory() { + log "Generating Kubespray inventory..." 
+ + # Get Terraform outputs + local master_ip + local web_ips + local worker_ips + + master_ip=$(tofu output -raw master_floating_ip 2>/dev/null || echo "") + web_ips=$(tofu output -json web_floating_ips 2>/dev/null | jq -r '.[]' || echo "") + worker_ips=$(tofu output -json worker_floating_ips 2>/dev/null | jq -r '.[]' || echo "") + + if [[ -z "$master_ip" ]]; then + error "Could not get master IP from Terraform output" + fi + + # Create inventory file + local inventory_file="inventory/inventory.ini" + mkdir -p inventory + + cat > "$inventory_file" << EOF +[all] +master ansible_host=$master_ip ip=$master_ip +EOF + + # Add web nodes if they exist + if [[ -n "$web_ips" ]]; then + local web_count=1 + while IFS= read -r web_ip; do + if [[ -n "$web_ip" ]]; then + echo "web-$web_count ansible_host=$web_ip ip=$web_ip" >> "$inventory_file" + ((web_count++)) + fi + done <<< "$web_ips" + fi + + # Add worker nodes if they exist + if [[ -n "$worker_ips" ]]; then + local worker_count=1 + while IFS= read -r worker_ip; do + if [[ -n "$worker_ip" ]]; then + echo "worker-$worker_count ansible_host=$worker_ip ip=$worker_ip" >> "$inventory_file" + ((worker_count++)) + fi + done <<< "$worker_ips" + fi + + # Add group definitions + cat >> "$inventory_file" << EOF + +[kube-master] +master + +[etcd] +master + +[kube-node] +EOF + + # Add web nodes to kube-node group + if [[ -n "$web_ips" ]]; then + local web_count=1 + while IFS= read -r web_ip; do + if [[ -n "$web_ip" ]]; then + echo "web-$web_count" >> "$inventory_file" + ((web_count++)) + fi + done <<< "$web_ips" + fi + + # Add worker nodes to kube-node group + if [[ -n "$worker_ips" ]]; then + local worker_count=1 + while IFS= read -r worker_ip; do + if [[ -n "$worker_ip" ]]; then + echo "worker-$worker_count" >> "$inventory_file" + ((worker_count++)) + fi + done <<< "$worker_ips" + fi + + cat >> "$inventory_file" << EOF + +[calico-rr] + +[k8s-cluster:children] +kube-master +kube-node +calico-rr + +[k8s-cluster:vars] 
+ansible_user=ubuntu +ansible_ssh_private_key_file=~/.ssh/id_rsa +EOF + + success "Inventory generated: $inventory_file" +} + +# Deploy Kubernetes with Kubespray +deploy_kubernetes() { + if [[ "$SKIP_KUBESPRAY" == true ]]; then + log "Skipping Kubespray deployment..." + return + fi + + log "Deploying Kubernetes with Kubespray..." + + # Copy custom group_vars + log "Copying custom group_vars..." + cp -r "$SCRIPT_DIR/kubespray/inventory/sample/group_vars" "$SCRIPT_DIR/inventory/" + + # Run Kubespray + cd "$KUBESPRAY_PATH" + + # Detect if we should use corporate firewall wrapper + local ansible_command="ansible-playbook" + local master_ip + master_ip=$(cd "$SCRIPT_DIR" && tofu output -raw master_floating_ip) + + # Check if corporate firewall wrapper exists on master node + if ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no "ubuntu@$master_ip" "test -f /usr/local/bin/kubespray-corporate-firewall.sh" 2>/dev/null; then + log "Corporate firewall detection found, using wrapper script" + export CORPORATE_FIREWALL_DETECTED=true + fi + + log "Running Kubespray playbook..." + $ansible_command -i "$SCRIPT_DIR/inventory/inventory.ini" \ + --become --become-user=root \ + cluster.yml + + cd "$SCRIPT_DIR" + + success "Kubernetes deployment completed" +} + +# Configure cluster post-deployment +configure_cluster() { + log "Configuring cluster..." + + # Get kubeconfig + local master_ip + master_ip=$(tofu output -raw master_floating_ip) + + log "Retrieving kubeconfig from master node..." + scp -o StrictHostKeyChecking=no "ubuntu@$master_ip:/etc/kubernetes/admin.conf" "./kubeconfig" + + # Update kubeconfig with external IP + sed -i.bak "s/127.0.0.1:6443/$master_ip:6443/g" "./kubeconfig" + sed -i.bak "s/localhost:6443/$master_ip:6443/g" "./kubeconfig" + + export KUBECONFIG="$SCRIPT_DIR/kubeconfig" + + # Wait for cluster to be ready + log "Waiting for cluster to be ready..." + local retries=0 + local max_retries=30 + + while ! 
kubectl get nodes &>/dev/null && [[ $retries -lt $max_retries ]]; do + sleep 10 + ((retries++)) + log "Waiting for cluster... (attempt $retries/$max_retries)" + done + + if [[ $retries -eq $max_retries ]]; then + error "Cluster did not become ready within expected time" + fi + + # Apply storage classes + log "Applying storage classes..." + kubectl apply -f storage-classes.yaml + + # Label nodes + log "Applying node labels..." + kubectl label nodes --overwrite --selector='!node-role.kubernetes.io/control-plane' nodegroup=worker-group workload=compute || warning "Node labeling had issues" + + success "Cluster configuration completed" +} + +# Deploy OpenStudio Server Helm chart +deploy_openstudio() { + if [[ "$SKIP_HELM" == true ]]; then + log "Skipping Helm chart deployment..." + return + fi + + log "Deploying OpenStudio Server Helm chart..." + + export KUBECONFIG="$SCRIPT_DIR/kubeconfig" + + # Add required Helm repositories + log "Adding Helm repositories..." + helm repo add nfs-server-provisioner https://kubernetes-sigs.github.io/nfs-ganesha-server-and-external-provisioner + helm repo update + + if [[ ! -f "$HELM_VALUES_FILE" ]]; then + error "Helm values file not found: $HELM_VALUES_FILE" + fi + + # Create namespace + kubectl create namespace openstudio-server || true + + local secret_validator="$SCRIPT_DIR/../scripts/validate-app-secret.sh" + if [[ -x "$secret_validator" ]]; then + "$secret_validator" --namespace openstudio-server --secret-name "$APP_SECRET_NAME" + else + warning "Secret validator not found at $secret_validator; skipping preflight validation in this workflow" + fi + + # Deploy OpenStudio Server + log "Installing OpenStudio Server..." 
+ helm upgrade --install openstudio-server "$HELM_CHART_PATH" \ + --namespace openstudio-server \ + --values "$HELM_CHART_PATH/values.yaml" \ + --values "$HELM_VALUES_FILE" \ + --set secrets.existingSecret="$APP_SECRET_NAME" \ + --set secrets.create=false \ + --set secrets.validateExistingSecret=true \ + --set global.provider.name=openstack \ + --timeout=20m \ + --wait + + success "OpenStudio Server deployment completed" +} + +# Display deployment summary +show_summary() { + log "Deployment Summary" + echo "==================" + + local master_ip + master_ip=$(tofu output -raw master_floating_ip 2>/dev/null || echo "N/A") + + echo "Cluster Size: $CLUSTER_SIZE" + echo "Master Node IP: $master_ip" + echo "Kubeconfig: $SCRIPT_DIR/kubeconfig" + + if [[ "$SKIP_HELM" == false ]]; then + export KUBECONFIG="$SCRIPT_DIR/kubeconfig" + echo "" + echo "OpenStudio Server Services:" + kubectl get services -n openstudio-server || warning "Could not get services" + echo "" + echo "To access the web interface, run:" + echo "kubectl port-forward -n openstudio-server service/web 8080:80" + echo "Then visit: http://localhost:8080" + fi + + echo "" + echo "SSH to master: ssh ubuntu@$master_ip" + echo "Kubectl access: export KUBECONFIG=$SCRIPT_DIR/kubeconfig" + + success "Deployment completed successfully!" 
+} + +# Main deployment flow +main() { + log "Starting OpenStudio Server deployment ($CLUSTER_SIZE)" + + check_prerequisites + cleanup_cluster + deploy_infrastructure + generate_inventory + deploy_kubernetes + configure_cluster + deploy_openstudio + show_summary +} + +# Trap signals for cleanup +trap 'error "Script interrupted"' INT TERM + +# Run main function +main "$@" diff --git a/openstack/deploy.sh b/openstack/deploy.sh new file mode 100755 index 0000000..606f759 --- /dev/null +++ b/openstack/deploy.sh @@ -0,0 +1,436 @@ +#!/bin/bash + +############################################################################## +# OpenStudio Server Kubernetes Deployment Automation +############################################################################## + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +DEPLOYMENT_NAME="openstudio-server" +LOG_FILE="deployment-$(date +%Y%m%d-%H%M%S).log" +MAX_RETRIES=5 +RETRY_DELAY=30 +KUBE_TLS_SERVER_NAME="${KUBE_TLS_SERVER_NAME:-kubernetes}" +OPENSTACK_ALLOW_INSECURE_KUBECTL="${OPENSTACK_ALLOW_INSECURE_KUBECTL:-false}" + +# Logging function +log() { + echo -e "${1}" | tee -a "${LOG_FILE}" +} + +error() { + log "${RED}[ERROR]${NC} $1" + exit 1 +} + +warning() { + log "${YELLOW}[WARNING]${NC} $1" +} + +success() { + log "${GREEN}[SUCCESS]${NC} $1" +} + +info() { + log "${BLUE}[INFO]${NC} $1" +} + +configure_kubectl_tls() { + if [[ "${OPENSTACK_ALLOW_INSECURE_KUBECTL}" == "true" ]]; then + kubectl config set-cluster kubernetes --insecure-skip-tls-verify=true &> /dev/null || true + warning "TLS verification disabled (OPENSTACK_ALLOW_INSECURE_KUBECTL=true)" + return + fi + + kubectl config set-cluster kubernetes --insecure-skip-tls-verify=false &> /dev/null || true + kubectl config set-cluster kubernetes --tls-server-name="${KUBE_TLS_SERVER_NAME}" &> /dev/null || true + info "TLS verification enabled 
(tls-server-name=${KUBE_TLS_SERVER_NAME})" +} + +# Function to check prerequisites +check_prerequisites() { + info "Checking prerequisites..." + + # Check if required commands exist + local commands=("tofu" "openstack" "ssh" "kubectl") + for cmd in "${commands[@]}"; do + if ! command -v "$cmd" &> /dev/null; then + error "Required command not found: $cmd" + fi + done + + # Check if we can authenticate with OpenStack + if ! openstack token issue &> /dev/null; then + error "OpenStack authentication failed. Please check your credentials." + fi + + # Check if Terraform configuration exists + if [[ ! -f "main.tf" ]]; then + error "Terraform configuration (main.tf) not found in current directory" + fi + + success "Prerequisites check passed" +} + +# Function to deploy infrastructure +deploy_infrastructure() { + info "Deploying OpenStack infrastructure with Terraform..." + + # Initialize Terraform if needed + if [[ ! -d ".terraform" ]]; then + info "Initializing Terraform..." + tofu init + fi + + # Plan deployment + info "Creating Terraform plan..." + tofu plan -out=deployment.tfplan + + # Apply deployment + info "Applying Terraform configuration..." + tofu apply -auto-approve deployment.tfplan + + # Clean up plan file + rm -f deployment.tfplan + + success "Infrastructure deployment completed" +} + +# Function to extract deployment information +get_deployment_info() { + info "Extracting deployment information..." 
+ + # Get master floating IP + MASTER_IP=$(tofu output -raw master_floating_ip 2>/dev/null || echo "") + if [[ -z "$MASTER_IP" ]]; then + error "Could not retrieve master floating IP from Terraform output" + fi + + # Get cluster info + CLUSTER_NAME=$(tofu output -json cluster_info | jq -r '.cluster_name' 2>/dev/null || echo "openstudio-server") + TOTAL_NODES=$(tofu output -json cluster_info | jq -r '.total_nodes' 2>/dev/null || echo "3") + + info "Master IP: $MASTER_IP" + info "Cluster Name: $CLUSTER_NAME" + info "Total Nodes: $TOTAL_NODES" +} + +# Function to test connectivity +test_connectivity() { + info "Testing network connectivity to deployed instances..." + + # Test ping first + if ping -c 3 -W 5 "$MASTER_IP" &> /dev/null; then + success "Ping test to master node successful" + return 0 + else + warning "Ping test to master node failed" + fi + + # Test SSH connectivity + local retry_count=0 + while [[ $retry_count -lt $MAX_RETRIES ]]; do + info "Testing SSH connectivity (attempt $((retry_count + 1))/$MAX_RETRIES)..." + + if timeout 15 ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no ubuntu@"$MASTER_IP" 'echo "SSH connection successful"' &> /dev/null; then + success "SSH connectivity test successful" + return 0 + fi + + warning "SSH connectivity test failed, retrying in $RETRY_DELAY seconds..." + sleep $RETRY_DELAY + ((retry_count++)) + done + + error "SSH connectivity test failed after $MAX_RETRIES attempts" +} + +# Function to monitor Kubernetes bootstrap +monitor_kubernetes_bootstrap() { + info "Monitoring Kubernetes cluster bootstrap process..." + info "This may take 10-15 minutes for the complete process..." + + local retry_count=0 + local max_bootstrap_retries=30 + local bootstrap_delay=30 + + while [[ $retry_count -lt $max_bootstrap_retries ]]; do + info "Bootstrap check (attempt $((retry_count + 1))/$max_bootstrap_retries)..." 
+ + # Check if cluster is accessible and nodes are ready + if ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no ubuntu@"$MASTER_IP" 'sudo kubectl get nodes --no-headers 2>/dev/null | wc -l' &> /dev/null; then + local ready_nodes=$(ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no ubuntu@"$MASTER_IP" 'sudo kubectl get nodes --no-headers 2>/dev/null | grep -c "Ready" || echo "0"') + + info "Ready nodes: $ready_nodes/$TOTAL_NODES" + + if [[ "$ready_nodes" == "$TOTAL_NODES" ]]; then + success "All nodes are ready!" + return 0 + fi + fi + + info "Cluster not ready yet, waiting $bootstrap_delay seconds..." + sleep $bootstrap_delay + ((retry_count++)) + done + + warning "Kubernetes bootstrap monitoring timed out. Cluster may still be initializing." + return 1 +} + +# Function to setup kubectl locally +setup_kubectl() { + info "Setting up kubectl configuration locally..." + + # Create kubeconfig directory if it doesn't exist + mkdir -p ~/.kube + + # Copy kubeconfig from master node + if scp -o StrictHostKeyChecking=no ubuntu@"$MASTER_IP":/etc/kubernetes/admin.conf ~/.kube/config-"$CLUSTER_NAME" &> /dev/null; then + + # Update kubeconfig to use floating IP + sed -i.bak "s/https:\/\/[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}\.[0-9]\{1,3\}:6443/https:\/\/$MASTER_IP:6443/g" ~/.kube/config-"$CLUSTER_NAME" + + # Merge with existing kubeconfig or set as default + if [[ -f ~/.kube/config ]]; then + info "Merging with existing kubeconfig..." 
+ KUBECONFIG=~/.kube/config:~/.kube/config-"$CLUSTER_NAME" kubectl config view --flatten > ~/.kube/config-merged + mv ~/.kube/config-merged ~/.kube/config + else + cp ~/.kube/config-"$CLUSTER_NAME" ~/.kube/config + fi + + # Set context + kubectl config use-context kubernetes-admin@kubernetes &> /dev/null || true + configure_kubectl_tls + + success "kubectl configuration completed" + return 0 + else + warning "Failed to copy kubeconfig from master node" + return 1 + fi +} + +# Function to setup NFS storage +setup_nfs_storage() { + info "Setting up NFS storage..." + + # Check if NFS storage class already exists + if kubectl get storageclass | grep -q nfs-client; then + success "NFS storage class is already available" + return 0 + fi + + warning "NFS storage not found, this should be handled by bootstrap script" + warning "Run: ./bootstrap-k8s.sh to set up NFS storage" + return 1 +} + +# Function to verify cluster +verify_cluster() { + info "Verifying Kubernetes cluster..." + + # Test kubectl connectivity + if ! 
kubectl cluster-info &> /dev/null; then + warning "kubectl cluster-info failed" + return 1 + fi + + # Check nodes + local ready_nodes=$(kubectl get nodes --no-headers | grep -c "Ready" || echo "0") + if [[ "$ready_nodes" != "$TOTAL_NODES" ]]; then + warning "Not all nodes are ready ($ready_nodes/$TOTAL_NODES)" + kubectl get nodes + return 1 + fi + + # Check system pods + local system_pods_ready=$(kubectl get pods -n kube-system --no-headers | grep -c "Running" || echo "0") + local total_system_pods=$(kubectl get pods -n kube-system --no-headers | wc -l || echo "0") + + info "System pods running: $system_pods_ready/$total_system_pods" + + if [[ "$system_pods_ready" -lt 5 ]]; then # At least 5 system pods should be running + warning "System pods may not be fully ready" + kubectl get pods -n kube-system + return 1 + fi + + success "Cluster verification passed" + return 0 +} + +# Function to display next steps +show_next_steps() { + info "===================================" + info "DEPLOYMENT COMPLETED SUCCESSFULLY!" + info "===================================" + echo "" + info "Cluster Information:" + info " Master IP: $MASTER_IP" + info " Cluster Name: $CLUSTER_NAME" + info " Total Nodes: $TOTAL_NODES" + echo "" + info "kubectl is configured and ready to use:" + info " kubectl get nodes" + info " kubectl get pods --all-namespaces" + echo "" + info "To deploy OpenStudio Helm chart:" + info " helm upgrade --install openstudio-server ../openstudio-server -f values-openstack-nfs.yaml -n openstudio-test --create-namespace" + echo "" + info "SSH Access:" + info " ssh ubuntu@$MASTER_IP" + echo "" + info "Log file: $LOG_FILE" +} + +# Function to handle deployment failure +handle_failure() { + error "Deployment failed. Check the log file: $LOG_FILE" + echo "" + warning "Common troubleshooting steps:" + warning "1. Check OpenStack authentication: openstack token issue" + warning "2. Check network connectivity to floating IPs" + warning "3. 
Verify security group rules allow SSH (port 22)" + warning "4. Check DNS resolution within OpenStack network" + warning "5. Review cloud-init logs: ssh ubuntu@ 'sudo journalctl -u cloud-final'" + echo "" + warning "To destroy and retry:" + warning " tofu destroy -auto-approve" + warning " ./deploy.sh" +} + +# Function to show troubleshooting information +show_troubleshooting() { + warning "===================================" + warning "TROUBLESHOOTING INFORMATION" + warning "===================================" + echo "" + warning "Current Status:" + + # Check if infrastructure exists + if tofu show &> /dev/null; then + info "โœ“ Infrastructure is deployed" + + # Get IPs from terraform + local master_ip=$(tofu output -raw master_floating_ip 2>/dev/null || echo "N/A") + local web_ips=$(tofu output -json web_floating_ips 2>/dev/null | jq -r '.[]' 2>/dev/null || echo "N/A") + local worker_ips=$(tofu output -json worker_floating_ips 2>/dev/null | jq -r '.[]' 2>/dev/null || echo "N/A") + + echo " Master IP: $master_ip" + echo " Web IPs: $web_ips" + echo " Worker IPs: $worker_ips" + + # Test connectivity + if [[ "$master_ip" != "N/A" ]]; then + if ping -c 1 -W 3 "$master_ip" &> /dev/null; then + info "โœ“ Master IP is pingable" + else + warning "โœ— Master IP is not reachable via ping" + fi + + if timeout 10 ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no ubuntu@"$master_ip" 'echo "SSH OK"' &> /dev/null; then + info "โœ“ Master is accessible via SSH" + else + warning "โœ— Master is not accessible via SSH" + fi + fi + else + warning "โœ— No infrastructure found" + fi + + echo "" + warning "Common Issues:" + warning "1. Network connectivity: Some networks block private IP ranges (10.x.x.x)" + warning "2. DNS resolution: Instances might not reach DNS servers" + warning "3. Security groups: Firewall rules might block connections" + warning "4. Cloud-init: Bootstrap scripts might have failed" + echo "" + warning "Manual Steps:" + warning "1. 
Check OpenStack instances: openstack server list" + warning "2. Check security groups: openstack security group show openstudio-server-secgroup" + warning "3. Check router: openstack router show openstudio-server-router" + warning "4. View console logs: openstack console log show openstudio-server-master" +} + +# Main execution function +main() { + local start_time=$(date +%s) + + info "===================================" + info "OpenStudio Server K8s Deployment" + info "===================================" + info "Started at: $(date)" + info "Log file: $LOG_FILE" + echo "" + + # Handle command line arguments + case "${1:-deploy}" in + "deploy") + # Full deployment workflow + check_prerequisites + deploy_infrastructure + get_deployment_info + + if test_connectivity; then + if monitor_kubernetes_bootstrap; then + setup_kubectl + setup_nfs_storage + + if verify_cluster; then + show_next_steps + else + warning "Cluster verification had issues, but deployment may still be usable" + show_next_steps + fi + else + warning "Bootstrap monitoring timed out, but cluster may still be initializing" + warning "You can manually check progress with: ./bootstrap-k8s.sh" + show_next_steps + fi + else + handle_failure + fi + ;; + "troubleshoot"|"status") + show_troubleshooting + ;; + "destroy") + warning "Destroying infrastructure..." + tofu destroy -auto-approve + success "Infrastructure destroyed" + ;; + "help"|"-h"|"--help") + echo "Usage: $0 [command]" + echo "" + echo "Commands:" + echo " deploy Deploy the complete Kubernetes cluster (default)" + echo " troubleshoot Show troubleshooting information" + echo " status Same as troubleshoot" + echo " destroy Destroy the infrastructure" + echo " help Show this help" + ;; + *) + error "Unknown command: $1. Use '$0 help' for usage information." 
+ ;; + esac + + local end_time=$(date +%s) + local duration=$((end_time - start_time)) + info "Total execution time: ${duration} seconds" +} + +# Trap to handle script interruption +trap 'error "Script interrupted by user"' INT TERM + +# Execute main function with all arguments +main "$@" diff --git a/openstack/k8s-cloud-init.yaml b/openstack/k8s-cloud-init.yaml new file mode 100644 index 0000000..d7141b6 --- /dev/null +++ b/openstack/k8s-cloud-init.yaml @@ -0,0 +1,321 @@ +#cloud-config +users: + - default + - name: ubuntu + sudo: ALL=(ALL) NOPASSWD:ALL + shell: /bin/bash + ssh_authorized_keys: + - ${public_key} + +# Corporate firewall detection and workaround setup +write_files: + - path: /usr/local/bin/detect-corporate-firewall.sh + permissions: '0755' + content: | + #!/bin/bash + # Corporate Firewall Detection Script + set -euo pipefail + + LOGFILE="/var/log/corporate-firewall-detection.log" + + log() { + echo "[$(date)] $*" | tee -a "$LOGFILE" + } + + log "Starting corporate firewall detection..." + + # Test registry access by checking certificates + test_registry_access() { + local registry="$1" + local expected_cn="$2" + + log "Testing access to $registry..." + + # Get the actual certificate subject + actual_cert=$(echo | timeout 10 openssl s_client -connect "$registry:443" -servername "$registry" 2>/dev/null | openssl x509 -noout -subject 2>/dev/null || echo "FAILED") + + if [[ "$actual_cert" == "FAILED" ]]; then + log "ERROR: Cannot connect to $registry" + return 1 + fi + + # Check if certificate matches expected + if [[ "$actual_cert" =~ "$expected_cn" ]]; then + log "SUCCESS: $registry certificate is valid" + return 0 + else + log "WARNING: $registry certificate mismatch: $actual_cert" + return 1 + fi + } + + # Detect corporate firewall by testing key registries + CORPORATE_FIREWALL_DETECTED=false + + if ! test_registry_access "registry-1.docker.io" "registry-1.docker.io"; then + CORPORATE_FIREWALL_DETECTED=true + fi + + if ! 
test_registry_access "ghcr.io" "ghcr.io"; then + CORPORATE_FIREWALL_DETECTED=true + fi + + # Test if registry.k8s.io is accessible (our fallback) + REGISTRY_K8S_ACCESSIBLE=false + if test_registry_access "registry.k8s.io" "registry.k8s.io"; then + REGISTRY_K8S_ACCESSIBLE=true + fi + + # Write detection results + cat > /etc/corporate-firewall-status << EOF + CORPORATE_FIREWALL_DETECTED=$CORPORATE_FIREWALL_DETECTED + REGISTRY_K8S_ACCESSIBLE=$REGISTRY_K8S_ACCESSIBLE + DETECTION_TIMESTAMP=$(date -Iseconds) + EOF + + log "Corporate firewall detection complete:" + log " CORPORATE_FIREWALL_DETECTED=$CORPORATE_FIREWALL_DETECTED" + log " REGISTRY_K8S_ACCESSIBLE=$REGISTRY_K8S_ACCESSIBLE" + + # Apply workarounds if corporate firewall detected + if [[ "$CORPORATE_FIREWALL_DETECTED" == "true" ]]; then + log "Applying corporate firewall workarounds..." + /usr/local/bin/setup-corporate-firewall-workarounds.sh + fi + + - path: /usr/local/bin/setup-corporate-firewall-workarounds.sh + permissions: '0755' + content: | + #!/bin/bash + # Corporate Firewall Workarounds Setup + set -euo pipefail + + LOGFILE="/var/log/corporate-firewall-workarounds.log" + + log() { + echo "[$(date)] $*" | tee -a "$LOGFILE" + } + + log "Setting up corporate firewall workarounds..." + + # 1. Configure containerd with TLS skip verification + setup_containerd_config() { + log "Configuring containerd for corporate firewall..." 
+ + mkdir -p /etc/containerd + cat > /etc/containerd/config.toml << 'EOF' + version = 2 + + [plugins] + [plugins."io.containerd.grpc.v1.cri"] + sandbox_image = "registry.k8s.io/pause:3.9" + max_container_log_line_size = -1 + enable_unprivileged_ports = false + enable_unprivileged_icmp = false + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "runc" + snapshotter = "overlayfs" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" + runtime_engine = "" + base_runtime_spec = "/etc/containerd/cri-base.json" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + systemdCgroup = true + binaryName = "/usr/local/bin/runc" + [plugins."io.containerd.grpc.v1.cri".registry] + [plugins."io.containerd.grpc.v1.cri".registry.mirrors] + [plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"] + endpoint = ["https://registry-1.docker.io"] + [plugins."io.containerd.grpc.v1.cri".registry.mirrors."quay.io"] + endpoint = ["https://quay.io"] + [plugins."io.containerd.grpc.v1.cri".registry.mirrors."ghcr.io"] + endpoint = ["https://ghcr.io"] + [plugins."io.containerd.grpc.v1.cri".registry.mirrors."registry.k8s.io"] + endpoint = ["https://registry.k8s.io"] + [plugins."io.containerd.grpc.v1.cri".registry.configs] + [plugins."io.containerd.grpc.v1.cri".registry.configs."registry-1.docker.io".tls] + insecure_skip_verify = true + [plugins."io.containerd.grpc.v1.cri".registry.configs."quay.io".tls] + insecure_skip_verify = true + [plugins."io.containerd.grpc.v1.cri".registry.configs."ghcr.io".tls] + insecure_skip_verify = true + [plugins."io.containerd.grpc.v1.cri".registry.configs."registry.k8s.io".tls] + insecure_skip_verify = false + EOF + + log "Containerd configuration updated for corporate firewall" + } + + # 2. 
Pre-create CNI bridge configuration + # setup_bridge_cni() { + # log "Setting up fallback bridge CNI configuration..." + # + # mkdir -p /etc/cni/net.d + # cat > /etc/cni/net.d/10-bridge.conf << 'EOF' + # { + # "cniVersion": "0.4.0", + # "name": "bridge-network", + # "type": "bridge", + # "bridge": "cni0", + # "isGateway": true, + # "ipMasq": true, + # "ipam": { + # "type": "host-local", + # "subnet": "10.244.0.0/16", + # "routes": [ + # { "dst": "0.0.0.0/0" } + # ] + # } + # } + # EOF + # + # log "Bridge CNI configuration created" + # } + + # 3. Create kubeadm configuration for external etcd with proper certificates + setup_kubeadm_config() { + log "Creating kubeadm configuration for external etcd..." + + mkdir -p /etc/kubernetes + cat > /etc/kubernetes/kubeadm-config.yaml << 'EOF' + apiVersion: kubeadm.k8s.io/v1beta3 + kind: ClusterConfiguration + kubernetesVersion: v1.28.11 + controlPlaneEndpoint: MASTER_IP:6443 + networking: + serviceSubnet: 10.96.0.0/12 + podSubnet: 10.244.0.0/16 + etcd: + external: + endpoints: + - https://MASTER_IP:2379 + caFile: /etc/kubernetes/pki/etcd-ca.crt + certFile: /etc/kubernetes/pki/apiserver-etcd-client.crt + keyFile: /etc/kubernetes/pki/apiserver-etcd-client.key + --- + apiVersion: kubeadm.k8s.io/v1beta3 + kind: InitConfiguration + localAPIEndpoint: + advertiseAddress: MASTER_IP + bindPort: 6443 + EOF + + log "Kubeadm configuration template created" + } + + # 4. Create etcd certificate setup script + setup_etcd_certificates() { + log "Creating etcd certificate setup script..." + + cat > /usr/local/bin/setup-etcd-certificates.sh << 'EOF' + #!/bin/bash + # Setup etcd client certificates for kube-apiserver + set -euo pipefail + + if [[ -d /etc/ssl/etcd/ssl ]]; then + echo "Setting up etcd client certificates..." 
+ mkdir -p /etc/kubernetes/pki + + # Copy etcd CA and client certificates + cp /etc/ssl/etcd/ssl/ca.pem /etc/kubernetes/pki/etcd-ca.crt + cp /etc/ssl/etcd/ssl/node-$(hostname).pem /etc/kubernetes/pki/apiserver-etcd-client.crt + cp /etc/ssl/etcd/ssl/node-$(hostname)-key.pem /etc/kubernetes/pki/apiserver-etcd-client.key + + # Fix permissions + chmod 600 /etc/kubernetes/pki/apiserver-etcd-client.key + chmod 644 /etc/kubernetes/pki/apiserver-etcd-client.crt + chmod 644 /etc/kubernetes/pki/etcd-ca.crt + + echo "etcd certificates configured successfully" + else + echo "etcd certificates not found, skipping..." + fi + EOF + + chmod +x /usr/local/bin/setup-etcd-certificates.sh + log "etcd certificate setup script created" + } + + # 5. Create corporate firewall kubespray script + setup_kubespray_workarounds() { + log "Creating kubespray corporate firewall wrapper..." + + cat > /usr/local/bin/kubespray-corporate-firewall.sh << 'EOF' + #!/bin/bash + # Kubespray with Corporate Firewall Workarounds + set -euo pipefail + + echo "Running kubespray with corporate firewall workarounds..." 
+ + # Source firewall detection results + if [[ -f /etc/corporate-firewall-status ]]; then + source /etc/corporate-firewall-status + echo "Corporate firewall detected: $CORPORATE_FIREWALL_DETECTED" + echo "registry.k8s.io accessible: $REGISTRY_K8S_ACCESSIBLE" + fi + + # Set environment variables for corporate firewall + export DOWNLOAD_VALIDATE_CERTS=false + export DOWNLOAD_RETRIES=10 + export DOWNLOAD_TIMEOUT=900 + + # Run the actual kubespray command with additional flags + exec "$@" + EOF + + chmod +x /usr/local/bin/kubespray-corporate-firewall.sh + log "Kubespray corporate firewall wrapper created" + } + + # Execute all setup functions + setup_containerd_config + setup_kubeadm_config + setup_etcd_certificates + setup_kubespray_workarounds + + log "Corporate firewall workarounds setup complete" + + - path: /etc/systemd/system/corporate-firewall-detection.service + content: | + [Unit] + Description=Corporate Firewall Detection and Workaround Setup + After=network-online.target + Wants=network-online.target + + [Service] + Type=oneshot + ExecStart=/usr/local/bin/detect-corporate-firewall.sh + RemainAfterExit=yes + StandardOutput=journal + StandardError=journal + + [Install] + WantedBy=multi-user.target + +# System updates and basic setup +package_update: true +package_upgrade: false + +# Install required packages +packages: + - curl + - wget + - git + - python3 + - python3-pip + - jq + - openssl + +# Enable the corporate firewall detection service +runcmd: + - systemctl daemon-reload + - systemctl enable corporate-firewall-detection.service + - systemctl start corporate-firewall-detection.service + +# Configure SSH +ssh_authorized_keys: + - ${public_key} + +# Ensure cloud-init completes +final_message: "Corporate firewall detection and workaround setup complete. System ready for Kubespray deployment." 
diff --git a/openstack/kubespray/inventory/sample/group_vars/all.yml b/openstack/kubespray/inventory/sample/group_vars/all.yml new file mode 100644 index 0000000..2009623 --- /dev/null +++ b/openstack/kubespray/inventory/sample/group_vars/all.yml @@ -0,0 +1,102 @@ +# Kubespray all.yml configuration for OpenStudio Server +# Optimized for corporate firewall environments + +# Basic cluster configuration (plain literal: a variable that templates itself, "{{ cluster_name | default(...) }}", aborts with Ansible's recursive-loop error; override with -e cluster_name=NAME) +cluster_name: openstudio-server +kubernetes_version: v1.31.3 +kube_version: v1.31.3 + +# Network Configuration +kube_service_addresses: 10.233.0.0/18 +kube_pods_subnet: 10.233.64.0/18 +kube_network_node_prefix: 24 + +# Container runtime +container_manager: containerd + +# Corporate firewall detection and configuration +# Use wrapper script if corporate firewall is detected (env lookup yields '' when unset, so default() needs its second argument to apply) +use_corporate_firewall_wrapper: "{{ lookup('env', 'CORPORATE_FIREWALL_DETECTED') | default('false', true) | bool }}" + +# Download configuration - adjusted for corporate environments +download_run_once: true +download_localhost: false +download_cache_dir: /tmp/kubespray_cache +download_keep_remote_cache: false +download_force_cache: false + +# Download validation (set based on corporate firewall detection; every expression coerces the flag with | bool) +download_validate_certs: "{{ not (use_corporate_firewall_wrapper | bool) }}" +download_retries: "{{ 10 if (use_corporate_firewall_wrapper | bool) else 3 }}" +download_delay: "{{ 5 if (use_corporate_firewall_wrapper | bool) else 1 }}" +download_timeout: "{{ 900 if (use_corporate_firewall_wrapper | bool) else 300 }}" + +# OpenStack cloud provider configuration (the in-tree "openstack" provider no longer exists in Kubernetes v1.31; the external controller is selected with cloud_provider: external plus external_cloud_provider: openstack) +cloud_provider: external +openstack_cloud_provider: true +openstack_blockstorage_version: "v3" + +# External cloud provider (for LoadBalancer services) +external_openstack_cloud_provider: true + +# Enable addons required for OpenStudio Server +dns_mode: coredns +ingress_nginx_enabled: true +cert_manager_enabled: false +metallb_enabled: false + +# Storage configuration +csi_openstack_cinder_enabled: true 
+csi_openstack_cinder_blockstorage_version: "v3" +persistent_volumes_enabled: true +expand_persistent_volumes: true + +# Node labeling for workload separation +node_labels: + master: "node-role.kubernetes.io/control-plane=true" + web: "nodegroup=web-group,workload=web" + worker: "nodegroup=worker-group,workload=compute" + +# Resource reservations +kube_reserved: true +system_reserved: true + +# Kubernetes API server configuration (PodSecurityPolicy was removed in Kubernetes v1.25 and is rejected by v1.31.3; PodSecurity is its built-in replacement) +kube_apiserver_enable_admission_plugins: + - NodeRestriction + - PodSecurity + +# Kubelet configuration +kubelet_max_pods: 250 +kubelet_pods_per_core: 10 + +# Docker/containerd configuration +containerd_extra_args: {} +docker_containerd_extra_args: {} + +# Enable necessary features +podsecuritypolicy_enabled: false +kubernetes_audit: false + +# Etcd configuration (etcd_deployment_type: docker is only valid with container_manager: docker; this inventory uses containerd, so etcd runs as a host service) +etcd_deployment_type: host +etcd_memory_limit: 2G +etcd_cpu_limit: 1000m + +# CNI plugin configuration (Calico recommended for OpenStack); versions are literals because a self-referencing default aborts with Ansible's recursive-loop error +kube_network_plugin: calico +calico_version: v3.28.2 +calico_cni_version: v3.28.2 +calico_felix_default_endpoint_to_host_action: "ACCEPT" +calico_pool_cidr: "{{ kube_pods_subnet }}" + +# Registry mirrors (configured by corporate firewall detection) +containerd_registries: + "docker.io": + - "https://registry-1.docker.io" + "quay.io": + - "https://quay.io" + "registry.k8s.io": + - "https://registry.k8s.io" + "ghcr.io": + - "https://ghcr.io" diff --git a/openstack/kubespray/inventory/sample/group_vars/k8s_cluster.yml b/openstack/kubespray/inventory/sample/group_vars/k8s_cluster.yml new file mode 100644 index 0000000..88293db --- /dev/null +++ b/openstack/kubespray/inventory/sample/group_vars/k8s_cluster.yml @@ -0,0 +1,90 @@ +# Kubespray k8s_cluster.yml configuration for OpenStudio Server +# OpenStack cloud provider and storage configuration + +# OpenStack cloud provider configuration +openstack_cloud_provider: true +openstack_cloud_config: 
+ global: + auth_url: "{{ openstack_auth_url }}" + username: "{{ openstack_username }}" + password: "{{ openstack_password }}" + tenant_name: "{{ openstack_tenant_name }}" + domain_name: "{{ openstack_user_domain_name | default('Default') }}" + region: "{{ openstack_region | default('RegionOne') }}" + block_storage: + bs_version: v3 + trust_device_path: false + ignore_volume_az: true + load_balancer: + use_octavia: true + subnet_id: "{{ openstack_subnet_id }}" + floating_network_id: "{{ openstack_external_network_id }}" + create_monitor: true + monitor_delay: 5 + monitor_timeout: 3 + monitor_max_retries: 3 + +# External cloud provider for newer Kubernetes versions +external_cloud_provider: openstack +external_openstack_cloud_provider: true + +# Cinder CSI driver configuration +csi_openstack_cinder_enabled: true +cinder_csi_controller_replicas: 1 +cinder_csi_plugin_image_tag: "latest"  # NOTE(review): unpinned tag; consider a released cinder-csi-plugin version for reproducible deploys + +# Storage classes for OpenStudio Server requirements +openstack_cinder_storage_classes: + - name: ssd + is_default: true + volume_type: __DEFAULT__ + availability_zone: nova + reclaim_policy: Retain + allow_volume_expansion: true + parameters: + type: __DEFAULT__ + - name: nfs + provisioner: nfs.csi.k8s.io + reclaim_policy: Retain + allow_volume_expansion: true + volume_binding_mode: Immediate + +# Node taints for workload separation +node_taints: + - key: "nodegroup" + value: "web-group" + effect: "NoSchedule" + nodes_selector: "nodegroup=web-group" + - key: "nodegroup" + value: "worker-group" + effect: "NoSchedule" + nodes_selector: "nodegroup=worker-group" + +# Kubernetes feature gates (none needed: CSI migration and volume expansion are always on in v1.31) +kube_feature_gates: [] +# CSIMigration, CSIMigrationOpenStack, ExpandCSIVolumes and +# ExpandInUsePersistentVolumes graduated to GA and were removed from +# Kubernetes before v1.31; passing a removed gate makes the control-plane +# components and kubelet fail at startup with an unrecognized feature gate. + +# API server admission controllers +kube_apiserver_enable_admission_plugins: + - NodeRestriction + - ResourceQuota + - LimitRanger + - ServiceAccount + - DefaultStorageClass + - MutatingAdmissionWebhook + - ValidatingAdmissionWebhook + +# Enable pod 
security standards +pod_security_policy_enabled: false +kube_pod_security_use_default: true + +# Resource management +kube_system_reserved: true +kube_system_reserved_namespace: kube-system + +# Networking +kube_proxy_mode: iptables +kube_proxy_strict_arp: true diff --git a/openstack/main.tf b/openstack/main.tf new file mode 100644 index 0000000..a55df87 --- /dev/null +++ b/openstack/main.tf @@ -0,0 +1,315 @@ +# Simplified OpenStack Infrastructure for Kubespray Kubernetes Deployment +# +# This creates basic Ubuntu instances with SSH access for Kubespray to configure + +provider "openstack" { + user_name = var.openstack_user_name + password = var.openstack_password + auth_url = var.openstack_auth_url + tenant_name = var.openstack_tenant_name + user_domain_name = var.openstack_user_domain_name + project_domain_id = var.openstack_project_domain_id + tenant_id = var.openstack_project_id + region = var.openstack_region +} + +# Create a network +resource "openstack_networking_network_v2" "k8s_network" { + name = "${var.cluster_name}-network" + admin_state_up = "true" +} + +# Create a subnet +resource "openstack_networking_subnet_v2" "k8s_subnet" { + name = "${var.cluster_name}-subnet" + network_id = openstack_networking_network_v2.k8s_network.id + cidr = "10.0.1.0/24" + ip_version = 4 + # Remove external DNS servers since port 53 is blocked by network policy + # Let OpenStack provide default DNS servers or use none + # dns_nameservers = [] +} + +# Get external network for router gateway +data "openstack_networking_network_v2" "external_network" { + name = "external" +} + +# Create a router for external connectivity +resource "openstack_networking_router_v2" "k8s_router" { + name = "${var.cluster_name}-router" + admin_state_up = true + external_network_id = data.openstack_networking_network_v2.external_network.id +} + +# Attach the subnet to the router +resource "openstack_networking_router_interface_v2" "k8s_router_interface" { + router_id = 
openstack_networking_router_v2.k8s_router.id + subnet_id = openstack_networking_subnet_v2.k8s_subnet.id +} + +# Create a security group with basic access +resource "openstack_networking_secgroup_v2" "k8s_secgroup" { + name = "${var.cluster_name}-secgroup" + description = "Security group for Kubespray Kubernetes cluster" +} + +# SSH access +resource "openstack_networking_secgroup_rule_v2" "ssh_access" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 22 + port_range_max = 22 + remote_ip_prefix = var.admin_access_cidr + security_group_id = openstack_networking_secgroup_v2.k8s_secgroup.id +} + +# ICMP for ping testing +resource "openstack_networking_secgroup_rule_v2" "icmp_access" { + direction = "ingress" + ethertype = "IPv4" + protocol = "icmp" + remote_ip_prefix = var.admin_access_cidr + security_group_id = openstack_networking_secgroup_v2.k8s_secgroup.id +} + +# Kubernetes API server access +resource "openstack_networking_secgroup_rule_v2" "k8s_api_access" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 6443 + port_range_max = 6443 + remote_ip_prefix = var.k8s_api_access_cidr + security_group_id = openstack_networking_secgroup_v2.k8s_secgroup.id +} + +# Internal communication (every protocol and port between cluster nodes) +resource "openstack_networking_secgroup_rule_v2" "internal_all" { + direction = "ingress" + ethertype = "IPv4" + # protocol and port range are deliberately omitted so the rule matches all + # protocols: a TCP-only rule drops node-to-node UDP (DNS, VXLAN) and + # IP-in-IP, which overlay networks such as Calico need between nodes. + remote_ip_prefix = openstack_networking_subnet_v2.k8s_subnet.cidr + security_group_id = openstack_networking_secgroup_v2.k8s_secgroup.id +} + +# Pod network communication (allow pod network to access hosts); the CIDR must equal kube_pods_subnet in kubespray/inventory/sample/group_vars/all.yml +resource "openstack_networking_secgroup_rule_v2" "pod_network_internal" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 1 + port_range_max = 65535 + remote_ip_prefix = "10.233.64.0/18" + security_group_id = openstack_networking_secgroup_v2.k8s_secgroup.id +} + +# NodePort range for Kubernetes services 
(30000-32767) +resource "openstack_networking_secgroup_rule_v2" "nodeport_http" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 30749 + port_range_max = 30749 + remote_ip_prefix = var.nodeport_access_cidr + security_group_id = openstack_networking_secgroup_v2.k8s_secgroup.id +} + +# NodePort HTTPS for OpenStudio Server +resource "openstack_networking_secgroup_rule_v2" "nodeport_https" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 31385 + port_range_max = 31385 + remote_ip_prefix = var.nodeport_access_cidr + security_group_id = openstack_networking_secgroup_v2.k8s_secgroup.id +} + +# General NodePort range (optional - allows any NodePort services) +resource "openstack_networking_secgroup_rule_v2" "nodeport_range" { + direction = "ingress" + ethertype = "IPv4" + protocol = "tcp" + port_range_min = 30000 + port_range_max = 32767 + remote_ip_prefix = var.nodeport_access_cidr + security_group_id = openstack_networking_secgroup_v2.k8s_secgroup.id +} + +# Get the Ubuntu image +data "openstack_images_image_v2" "ubuntu_image" { + name = var.image_name + most_recent = true +} + +# Enhanced cloud-init with corporate firewall detection and workarounds +locals { + user_data = base64encode(templatefile("${path.module}/k8s-cloud-init.yaml", { + public_key = var.public_key + })) +} + +# Floating IPs for external access +resource "openstack_networking_floatingip_v2" "master_fip" { + pool = "external" +} + +resource "openstack_networking_floatingip_v2" "worker_fip" { + count = var.worker_count + pool = "external" +} + +resource "openstack_networking_floatingip_v2" "web_fip" { + count = var.web_count + pool = "external" +} + +# Create volumes for instances (required for CS.Tiny flavor with zero disk) +resource "openstack_blockstorage_volume_v3" "master_volume" { + name = "${var.cluster_name}-master-volume" + size = var.volume_size + image_id = data.openstack_images_image_v2.ubuntu_image.id +} + +resource 
"openstack_blockstorage_volume_v3" "worker_volume" { + count = var.worker_count + name = "${var.cluster_name}-worker-${count.index + 1}-volume" + size = var.volume_size + image_id = data.openstack_images_image_v2.ubuntu_image.id +} + +resource "openstack_blockstorage_volume_v3" "web_volume" { + count = var.web_count + name = "${var.cluster_name}-web-${count.index + 1}-volume" + size = var.volume_size + image_id = data.openstack_images_image_v2.ubuntu_image.id +} + +# Create network ports for proper floating IP association +resource "openstack_networking_port_v2" "master_port" { + name = "${var.cluster_name}-master-port" + network_id = openstack_networking_network_v2.k8s_network.id + admin_state_up = "true" + security_group_ids = [openstack_networking_secgroup_v2.k8s_secgroup.id] + + fixed_ip { + subnet_id = openstack_networking_subnet_v2.k8s_subnet.id + } +} + +resource "openstack_networking_port_v2" "worker_port" { + count = var.worker_count + name = "${var.cluster_name}-worker-${count.index + 1}-port" + network_id = openstack_networking_network_v2.k8s_network.id + admin_state_up = "true" + security_group_ids = [openstack_networking_secgroup_v2.k8s_secgroup.id] + + fixed_ip { + subnet_id = openstack_networking_subnet_v2.k8s_subnet.id + } +} + +resource "openstack_networking_port_v2" "web_port" { + count = var.web_count + name = "${var.cluster_name}-web-${count.index + 1}-port" + network_id = openstack_networking_network_v2.k8s_network.id + admin_state_up = "true" + security_group_ids = [openstack_networking_secgroup_v2.k8s_secgroup.id] + + fixed_ip { + subnet_id = openstack_networking_subnet_v2.k8s_subnet.id + } +} + +# Master node +resource "openstack_compute_instance_v2" "k8s_master" { + name = "${var.cluster_name}-master" + flavor_name = var.master_flavor + key_pair = var.key_pair + user_data = local.user_data + + block_device { + uuid = openstack_blockstorage_volume_v3.master_volume.id + source_type = "volume" + destination_type = "volume" + boot_index = 0 + 
delete_on_termination = true + } + + network { + port = openstack_networking_port_v2.master_port.id + } + + depends_on = [openstack_networking_router_interface_v2.k8s_router_interface] +} + +# Worker nodes +resource "openstack_compute_instance_v2" "k8s_worker" { + count = var.worker_count + name = "${var.cluster_name}-worker-${count.index + 1}" + flavor_name = var.worker_flavor + key_pair = var.key_pair + user_data = local.user_data + + block_device { + uuid = openstack_blockstorage_volume_v3.worker_volume[count.index].id + source_type = "volume" + destination_type = "volume" + boot_index = 0 + delete_on_termination = true + } + + network { + port = openstack_networking_port_v2.worker_port[count.index].id + } + + depends_on = [openstack_networking_router_interface_v2.k8s_router_interface] +} + +# Web nodes +resource "openstack_compute_instance_v2" "k8s_web" { + count = var.web_count + name = "${var.cluster_name}-web-${count.index + 1}" + flavor_name = var.web_flavor + key_pair = var.key_pair + user_data = local.user_data + + block_device { + uuid = openstack_blockstorage_volume_v3.web_volume[count.index].id + source_type = "volume" + destination_type = "volume" + boot_index = 0 + delete_on_termination = true + } + + network { + port = openstack_networking_port_v2.web_port[count.index].id + } + + depends_on = [openstack_networking_router_interface_v2.k8s_router_interface] +} + +# Associate floating IPs +resource "openstack_networking_floatingip_associate_v2" "master_fip_assoc" { + floating_ip = openstack_networking_floatingip_v2.master_fip.address + port_id = openstack_networking_port_v2.master_port.id + depends_on = [openstack_compute_instance_v2.k8s_master] +} + +resource "openstack_networking_floatingip_associate_v2" "worker_fip_assoc" { + count = var.worker_count + floating_ip = openstack_networking_floatingip_v2.worker_fip[count.index].address + port_id = openstack_networking_port_v2.worker_port[count.index].id + depends_on = 
[openstack_compute_instance_v2.k8s_worker] +} + +resource "openstack_networking_floatingip_associate_v2" "web_fip_assoc" { + count = var.web_count + floating_ip = openstack_networking_floatingip_v2.web_fip[count.index].address + port_id = openstack_networking_port_v2.web_port[count.index].id + depends_on = [openstack_compute_instance_v2.k8s_web] +} diff --git a/openstack/openstudio-large.tfvars b/openstack/openstudio-large.tfvars new file mode 100644 index 0000000..6930ff8 --- /dev/null +++ b/openstack/openstudio-large.tfvars @@ -0,0 +1,21 @@ +# OpenStudio Server Large Cluster Configuration +# Equivalent to AWS EKS large configuration + +cluster_name = "openstudio-server-large" + +# Master node configuration +master_flavor = "CS.2XMedium" # 32 vCPUs, 128GB RAM + +# Web node group - for web interface and API +web_count = 1 +web_flavor = "CE.2XMedium" # 32 vCPUs, 154GB RAM (enterprise flavor) + +# Worker node group - for computation +worker_count = 1 +worker_flavor = "CE.XLarge" # 32 vCPUs, 300GB RAM (compute enterprise) + +# Storage configuration +volume_size = 550 # GB for web nodes (matches EKS config) + +# SSH key configuration +key_pair = "achapin" diff --git a/openstack/openstudio-micro.tfvars b/openstack/openstudio-micro.tfvars new file mode 100644 index 0000000..82ca251 --- /dev/null +++ b/openstack/openstudio-micro.tfvars @@ -0,0 +1,22 @@ +# OpenStudio Server Micro Cluster Configuration +# Ultra-minimal configuration to fit within tight RAM quotas +# Total RAM usage: ~36GB (well within 245GB quota) + +cluster_name = "openstudio-server-micro" + +# Master node configuration - minimal but functional +master_flavor = "CS.Tiny" # 4 vCPUs, 16GB RAM + +# Web node group - single node, minimal +web_count = 1 +web_flavor = "CS.Tiny" # 4 vCPUs, 16GB RAM + +# Worker node group - single node, minimal +worker_count = 1 +worker_flavor = "CC.Tiny" # 4 vCPUs, 4GB RAM + +# Storage configuration - reduced for testing +volume_size = 50 # GB for each node + +# SSH key 
configuration +key_pair = "achapin" diff --git a/openstack/openstudio-small.tfvars b/openstack/openstudio-small.tfvars new file mode 100644 index 0000000..789c68c --- /dev/null +++ b/openstack/openstudio-small.tfvars @@ -0,0 +1,21 @@ +# OpenStudio Server Small Cluster Configuration +# Quota-conscious configuration: Total RAM ~84GB + +cluster_name = "openstudio-server-small" + +# Master node configuration +master_flavor = "CS.Wee" # 8 vCPUs, 32GB RAM + +# Web node group - for web interface and API +web_count = 2 +web_flavor = "CS.Tiny" # 4 vCPUs, 16GB RAM each (32GB total) + +# Worker node group - for computation +worker_count = 1 +worker_flavor = "shared_c8m16d50" # 8 vCPUs, 16GB RAM + +# Storage configuration +volume_size = 100 # GB for each node + +# SSH key configuration +key_pair = "achapin" diff --git a/openstack/outputs.tf b/openstack/outputs.tf new file mode 100644 index 0000000..1badab6 --- /dev/null +++ b/openstack/outputs.tf @@ -0,0 +1,86 @@ +# Output values for Kubespray deployment + +output "cluster_info" { + description = "Basic cluster information" + value = { + cluster_name = var.cluster_name + master_private_ip = openstack_networking_port_v2.master_port.all_fixed_ips[0] + master_public_ip = openstack_networking_floatingip_v2.master_fip.address + total_nodes = 1 + var.worker_count + var.web_count + worker_node_count = var.worker_count + web_node_count = var.web_count + } +} + +output "master_floating_ip" { + value = openstack_networking_floatingip_v2.master_fip.address +} + +output "master_ip" { + value = openstack_networking_port_v2.master_port.all_fixed_ips[0] +} + +output "worker_floating_ips" { + value = openstack_networking_floatingip_v2.worker_fip[*].address +} + +output "worker_ips" { + value = openstack_networking_port_v2.worker_port[*].all_fixed_ips[0] +} + +output "web_floating_ips" { + value = openstack_networking_floatingip_v2.web_fip[*].address +} + +output "web_node_ips" { + value = 
openstack_networking_port_v2.web_port[*].all_fixed_ips[0] +} + +output "ssh_connection_info" { + description = "SSH connection details" + value = { + master_ssh_command = "ssh ubuntu@${openstack_networking_floatingip_v2.master_fip.address}" + worker_ssh_commands = [for ip in openstack_networking_floatingip_v2.worker_fip[*].address : "ssh ubuntu@${ip}"] + web_ssh_commands = [for ip in openstack_networking_floatingip_v2.web_fip[*].address : "ssh ubuntu@${ip}"] + note = "Use your SSH private key (${var.key_pair}) for authentication" + } +} + +output "kubespray_inventory_template" { # group names use the underscore form so group_vars/k8s_cluster.yml matches the k8s_cluster group + description = "Template for Kubespray inventory file" + value = <<-EOT +[all] +master ansible_host=${openstack_networking_floatingip_v2.master_fip.address} ip=${openstack_networking_port_v2.master_port.all_fixed_ips[0]} +%{for i, ip in openstack_networking_floatingip_v2.worker_fip[*].address} +worker-${i + 1} ansible_host=${ip} ip=${openstack_networking_port_v2.worker_port[i].all_fixed_ips[0]} +%{endfor} +%{for i, ip in openstack_networking_floatingip_v2.web_fip[*].address} +web-${i + 1} ansible_host=${ip} ip=${openstack_networking_port_v2.web_port[i].all_fixed_ips[0]} +%{endfor} + +[kube_control_plane] +master + +[etcd] +master + +[kube_node] +%{for i, ip in openstack_networking_floatingip_v2.worker_fip[*].address} +worker-${i + 1} +%{endfor} +%{for i, ip in openstack_networking_floatingip_v2.web_fip[*].address} +web-${i + 1} +%{endfor} + +[calico_rr] + +[k8s_cluster:children] +kube_control_plane +kube_node +calico_rr + +[k8s_cluster:vars] +ansible_user=ubuntu +ansible_ssh_private_key_file=~/.ssh/id_rsa +EOT +} diff --git a/openstack/setup-kubectl.sh b/openstack/setup-kubectl.sh new file mode 100755 index 0000000..b158fbb --- /dev/null +++ b/openstack/setup-kubectl.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +# setup-kubectl.sh +# Automatically configure kubectl for OpenStack Kubernetes cluster +# +# This script: +# 1. Gets the floating IP from Terraform output +# 2. 
Updates kubectl configuration to use the floating IP +# 3. Configures TLS verification for floating IP access +# 4. Tests the connection + +set -e + +KUBE_TLS_SERVER_NAME="${KUBE_TLS_SERVER_NAME:-kubernetes}" +OPENSTACK_ALLOW_INSECURE_KUBECTL="${OPENSTACK_ALLOW_INSECURE_KUBECTL:-false}" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Function to print colored output +print_status() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +configure_kubectl_tls() { + if [[ "${OPENSTACK_ALLOW_INSECURE_KUBECTL}" == "true" ]]; then + kubectl config set-cluster kubernetes --insecure-skip-tls-verify=true >/dev/null + print_warning "TLS verification disabled (OPENSTACK_ALLOW_INSECURE_KUBECTL=true)" + return + fi + + kubectl config set-cluster kubernetes --insecure-skip-tls-verify=false >/dev/null + kubectl config set-cluster kubernetes --tls-server-name="${KUBE_TLS_SERVER_NAME}" >/dev/null + print_success "TLS verification enabled (tls-server-name=${KUBE_TLS_SERVER_NAME})" +} + +# Check if tofu is available +if ! command -v tofu &> /dev/null; then + print_error "OpenTofu (tofu) is not installed or not in PATH" + print_error "Please install OpenTofu: https://opentofu.org/docs/intro/install/" + exit 1 +fi + +# Check if kubectl is available +if ! command -v kubectl &> /dev/null; then + print_error "kubectl is not installed or not in PATH" + print_error "Please install kubectl: https://kubernetes.io/docs/tasks/tools/" + exit 1 +fi + +print_status "Setting up kubectl for OpenStack Kubernetes cluster..." + +# Get the cluster information from Terraform +print_status "Retrieving cluster information from Terraform..." + +# Check if we're in the right directory +if [[ ! -f "main.tf" ]]; then + print_error "main.tf not found. 
Please run this script from the openstack/ directory." + exit 1 +fi + +# Use the tofu-with-env.sh script if available, otherwise use tofu directly +if [[ -f "./tofu-with-env.sh" ]]; then + TOFU_CMD="./tofu-with-env.sh" + print_status "Using tofu-with-env.sh for environment variables" +else + TOFU_CMD="tofu" + print_warning "tofu-with-env.sh not found, using tofu directly" + print_warning "Make sure your OpenStack environment variables are set!" +fi + +# Get the floating IP (tail -n 1 keeps only the last stdout line, skipping the banner lines tofu-with-env.sh prints first) +print_status "Extracting floating IP from Terraform output..." +FLOATING_IP=$(${TOFU_CMD} output -raw master_floating_ip 2>/dev/null | tail -n 1) +if [[ -z "$FLOATING_IP" || "$FLOATING_IP" == *"Error"* ]]; then + print_error "Could not retrieve master_floating_ip from Terraform output" + print_error "Make sure your infrastructure is deployed and Terraform state is available" + print_error "Output was: $FLOATING_IP" + exit 1 +fi + +print_success "Found master floating IP: $FLOATING_IP" + +# Test connectivity to the API server (5s connect timeout so a filtered port fails fast instead of hanging) +print_status "Testing connectivity to Kubernetes API server..." +if nc -z -w 5 "$FLOATING_IP" 6443 &>/dev/null; then + print_success "Kubernetes API server is reachable at $FLOATING_IP:6443" +else + print_error "Cannot reach Kubernetes API server at $FLOATING_IP:6443" + print_error "Please check:" + print_error " 1. The cluster is fully deployed and running" + print_error " 2. Security groups allow port 6443" + print_error " 3. 
The floating IP is correctly assigned" + exit 1 +fi + +# Backup existing kubeconfig if it exists +KUBECONFIG_PATH="${HOME}/.kube/config" +if [[ -f "$KUBECONFIG_PATH" ]]; then + BACKUP_PATH="${KUBECONFIG_PATH}.backup.$(date +%Y%m%d-%H%M%S)" + print_status "Backing up existing kubeconfig to $BACKUP_PATH" + cp "$KUBECONFIG_PATH" "$BACKUP_PATH" +fi + +# List available contexts before making changes +print_status "Current kubectl contexts:" +kubectl config get-contexts 2>/dev/null || print_warning "No existing kubectl contexts found" + +# Update kubectl configuration +print_status "Updating kubectl configuration..." + +# Set the cluster endpoint to use the floating IP +kubectl config set-cluster kubernetes --server="https://${FLOATING_IP}:6443" +print_success "Updated cluster endpoint to https://${FLOATING_IP}:6443" + +configure_kubectl_tls + +# Set the context to use (assuming it exists) +if kubectl config get-contexts kubernetes-admin@kubernetes &>/dev/null; then + kubectl config use-context kubernetes-admin@kubernetes + print_success "Switched to context: kubernetes-admin@kubernetes" +else + print_warning "Context 'kubernetes-admin@kubernetes' not found" + print_warning "Available contexts:" + kubectl config get-contexts + echo + print_warning "You may need to manually switch to the correct context with:" + print_warning "kubectl config use-context <context-name>" +fi + +# Test the connection +print_status "Testing kubectl connection..." +if kubectl cluster-info &>/dev/null; then + print_success "kubectl is now configured and working!" + echo + print_status "Cluster information:" + kubectl cluster-info + echo + print_status "Node status:" + kubectl get nodes +else + print_error "kubectl configuration failed" + print_error "Please check the troubleshooting section in the README.md" + exit 1 +fi + +echo +print_success "kubectl setup complete!" 
+print_status "You can now use kubectl to manage your OpenStack Kubernetes cluster" +if [[ "${OPENSTACK_ALLOW_INSECURE_KUBECTL}" == "true" ]]; then + print_warning "Note: TLS verification is disabled due to OPENSTACK_ALLOW_INSECURE_KUBECTL=true" +else + print_status "TLS verification is enabled. Override server name with KUBE_TLS_SERVER_NAME if needed." +fi diff --git a/openstack/storage-classes.yaml b/openstack/storage-classes.yaml new file mode 100644 index 0000000..72e4139 --- /dev/null +++ b/openstack/storage-classes.yaml @@ -0,0 +1,57 @@ +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cinder-csi + annotations: + storageclass.kubernetes.io/is-default-class: "true" +provisioner: cinder.csi.openstack.org +parameters: + type: __DEFAULT__ +reclaimPolicy: Delete +allowVolumeExpansion: true +volumeBindingMode: WaitForFirstConsumer +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cinder +provisioner: cinder.csi.openstack.org +parameters: + type: __DEFAULT__ +reclaimPolicy: Delete +allowVolumeExpansion: true +volumeBindingMode: WaitForFirstConsumer +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cinder-ssd +provisioner: cinder.csi.openstack.org +parameters: + type: SSD +reclaimPolicy: Delete +allowVolumeExpansion: true +volumeBindingMode: WaitForFirstConsumer +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cinder-hdd +provisioner: cinder.csi.openstack.org +parameters: + type: HDD +reclaimPolicy: Delete +allowVolumeExpansion: true +volumeBindingMode: WaitForFirstConsumer +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cinder-retain +provisioner: cinder.csi.openstack.org +parameters: + type: __DEFAULT__ +reclaimPolicy: Retain +allowVolumeExpansion: true +volumeBindingMode: WaitForFirstConsumer diff --git a/openstack/tofu-with-env.sh b/openstack/tofu-with-env.sh new file mode 100755 index 0000000..947eab8 --- /dev/null +++ 
b/openstack/tofu-with-env.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# OpenTofu wrapper script that automatically loads environment variables +# Usage: ./tofu-with-env.sh plan +# ./tofu-with-env.sh apply +# ./tofu-with-env.sh destroy + +set -e + +# Check if .env file exists +if [ ! -f ".env" ]; then + echo "โŒ Error: .env file not found!" + echo "" + echo "Please create a .env file from the template:" + echo " cp .env.template .env" + echo " # Edit .env with your credentials" + echo "" + exit 1 +fi + +# Load environment variables +echo "๐Ÿ“„ Loading environment variables from .env..." +source .env + +# Verify required variables are set +if [ -z "$TF_VAR_openstack_user_name" ] || [ -z "$TF_VAR_openstack_password" ] || [ -z "$TF_VAR_openstack_tenant_name" ]; then + echo "โŒ Error: Missing required environment variables!" + echo "" + echo "Please check your .env file contains:" + echo " TF_VAR_openstack_user_name" + echo " TF_VAR_openstack_password" + echo " TF_VAR_openstack_tenant_name" + echo "" + exit 1 +fi + +# Add default flavor variables if not set +export TF_VAR_master_flavor_name="${TF_VAR_master_flavor_name:-CS.Tiny}" +export TF_VAR_web_flavor_name="${TF_VAR_web_flavor_name:-CS.Wee}" +export TF_VAR_worker_flavor_name="${TF_VAR_worker_flavor_name:-CM.XLarge}" + +echo "โœ… Environment variables loaded successfully" +echo "๐Ÿš€ Running: tofu $@" +echo "" + +# Run tofu with the provided arguments +tofu "$@" diff --git a/openstack/values-openstack-nfs-small.yaml b/openstack/values-openstack-nfs-small.yaml new file mode 100644 index 0000000..4e38da6 --- /dev/null +++ b/openstack/values-openstack-nfs-small.yaml @@ -0,0 +1,180 @@ +# OpenStack-specific values template for openstudio-server (external NFS class, small disks) +# Copy this file to a local, untracked values file before deployment. +# Do not commit populated credentials. 
+ +# Set to openstack provider +global: + provider: + name: "openstack" + storageClasses: + block: "cinder-csi" + +storageclass: + createSsd: false + +cluster: + name: "openstudio-server" + +secrets: + # Recommended: pre-create this secret in the release namespace. + existingSecret: "openstudio-app-secrets" + validateExistingSecret: true + create: false + +nfs-server-provisioner: + enabled: false + persistence: + enabled: true + storageClass: "nfs-client" + size: 5Gi # Reduced from 110Gi + storageClass: + allowVolumeExpansion: false + mountOptions: + - vers=4 + - sync + +db: + name: "db" + label: "db" + username: "" + password: "" + container: + name: "mongo-db" + image: "mongo:6.0.7" + resources: + requests: + cpu: 1 + memory: "4Gi" + ports: + db_port: 27017 + persistence: + enabled: true + storageClass: "nfs-client" + size: 2Gi # Reduced from 100Gi + accessModes: + - "ReadWriteOnce" + +load_balancer: + name: "ingress-load-balancer" + externalTrafficPolicy: "Local" + label: "web" + internal: false + ports: + http_name: "http" + http_port: 80 + http_protocol: "TCP" + https_name: "https" + https_port: 443 + https_protocol: "TCP" + +nfs: + name: "nfs" + +nfs_pvc: + name: "nfs-pvc" + accessModes: + - "ReadWriteMany" + storage_class: "nfs-client" + storage: "5Gi" # Reduced from 100Gi + +redis: + name: "redis" + label: "redis" + password: "" + container: + name: "redis" + image: "redis:6.0.9" + resources: + requests: + cpu: 0.25 + memory: "1Gi" + port: 6379 + persistence: + enabled: true + storageClass: "nfs-client" + size: 1Gi # Reduced from 100Gi + accessModes: + - "ReadWriteOnce" + +redis_svc: + name: "queue" + label: "redis" + port: 6379 + url: "redis://:${REDIS_PASSWORD}@queue:6379" + +rserve: + name: "rserve" + label: "rserve" + number_of_workers: "1" + container: + name: "rserve" + image: "nrel/openstudio-rserve:3.8.0-1" + resources: + requests: + cpu: 1 + memory: "2Gi" + +rserve_svc: + name: "rserve" + label: "rserve" + port: 6311 + +web_background: + name: 
"web-background" + label: "web-background" + replicas: 1 + container: + name: "web-background" + image: "nrel/openstudio-server:3.8.0-1" + resources: + requests: + cpu: 0.25 + memory: "512Mi" + +web: + name: "web" + label: "web" + secret_key_value: "" + passenger_memory_per_process: 250 + container: + name: "web" + image: "nrel/openstudio-server:3.8.0-1" + resources: + requests: + cpu: 1 + memory: "2Gi" + port: + http: 80 + https: 443 + +web_hpa: + name: "web" + minReplicas: 1 + maxReplicas: 1 + targetCPUUtilizationPercentage: 50 + +web_svc: + name: "web" + label: "web" + ports: + http: 80 + https: 443 + +worker: + name: "worker" + label: "worker" + container: + name: "worker" + image: "nrel/openstudio-server:3.8.0-1" + resources: + requests: + cpu: 700m + memory: "900Mi" + terminationGracePeriodSeconds: 5200 + +worker_hpa: + name: "worker" + minReplicas: 2 + maxReplicas: 20 + targetCPUUtilizationPercentage: 50 + stabilizationWindowSeconds: 3600 diff --git a/openstack/values-openstack-nfs.yaml b/openstack/values-openstack-nfs.yaml new file mode 100644 index 0000000..31b37d7 --- /dev/null +++ b/openstack/values-openstack-nfs.yaml @@ -0,0 +1,180 @@ +# OpenStack-specific values template for openstudio-server (external NFS class) +# Copy this file to a local, untracked values file before deployment. +# Do not commit populated credentials. + +# Set to openstack provider +global: + provider: + name: "openstack" + storageClasses: + block: "cinder-csi" + +storageclass: + createSsd: false + +cluster: + name: "openstudio-server" + +secrets: + # Recommended: pre-create this secret in the release namespace. 
+ existingSecret: "openstudio-app-secrets" + validateExistingSecret: true + create: false + +nfs-server-provisioner: + enabled: false + persistence: + enabled: true + storageClass: "nfs-client" + size: 110Gi + storageClass: + allowVolumeExpansion: false + mountOptions: + - vers=4 + - sync + +db: + name: "db" + label: "db" + username: "" + password: "" + container: + name: "mongo-db" + image: "mongo:6.0.7" + resources: + requests: + cpu: 1 + memory: "4Gi" + ports: + db_port: 27017 + persistence: + enabled: true + storageClass: "nfs-client" + size: 100Gi + accessModes: + - "ReadWriteOnce" + +load_balancer: + name: "ingress-load-balancer" + externalTrafficPolicy: "Local" + label: "web" + internal: false + ports: + http_name: "http" + http_port: 80 + http_protocol: "TCP" + https_name: "https" + https_port: 443 + https_protocol: "TCP" + +nfs: + name: "nfs" + +nfs_pvc: + name: "nfs-pvc" + accessModes: + - "ReadWriteMany" + storage_class: "nfs-client" + storage: "100Gi" + +redis: + name: "redis" + label: "redis" + password: "" + container: + name: "redis" + image: "redis:6.0.9" + resources: + requests: + cpu: 0.25 + memory: "1Gi" + port: 6379 + persistence: + enabled: true + storageClass: "nfs-client" + size: 100Gi + accessModes: + - "ReadWriteOnce" + +redis_svc: + name: "queue" + label: "redis" + port: 6379 + url: "redis://:${REDIS_PASSWORD}@queue:6379" + +rserve: + name: "rserve" + label: "rserve" + number_of_workers: "1" + container: + name: "rserve" + image: "nrel/openstudio-rserve:3.8.0-1" + resources: + requests: + cpu: 1 + memory: "2Gi" + +rserve_svc: + name: "rserve" + label: "rserve" + port: 6311 + +web_background: + name: "web-background" + label: "web-background" + replicas: 1 + container: + name: "web-background" + image: "nrel/openstudio-server:3.8.0-1" + resources: + requests: + cpu: 0.25 + memory: "512Mi" + +web: + name: "web" + label: "web" + secret_key_value: "" + passenger_memory_per_process: 250 + container: + name: "web" + image: 
"nrel/openstudio-server:3.8.0-1" + resources: + requests: + cpu: 1 + memory: "2Gi" + port: + http: 80 + https: 443 + +web_hpa: + name: "web" + minReplicas: 1 + maxReplicas: 1 + targetCPUUtilizationPercentage: 50 + +web_svc: + name: "web" + label: "web" + ports: + http: 80 + https: 443 + +worker: + name: "worker" + label: "worker" + container: + name: "worker" + image: "nrel/openstudio-server:3.8.0-1" + resources: + requests: + cpu: 700m + memory: "900Mi" + terminationGracePeriodSeconds: 5200 + +worker_hpa: + name: "worker" + minReplicas: 2 + maxReplicas: 20 + targetCPUUtilizationPercentage: 50 + stabilizationWindowSeconds: 3600 diff --git a/openstack/values-openstack.yaml b/openstack/values-openstack.yaml new file mode 100644 index 0000000..2932de2 --- /dev/null +++ b/openstack/values-openstack.yaml @@ -0,0 +1,182 @@ +# OpenStack-specific values template for openstudio-server +# Copy this file to a local, untracked values file before deployment. +# Do not commit populated credentials. + +# Set to openstack provider +global: + provider: + name: "openstack" + storageClasses: + block: "cinder-csi" + +cluster: + name: "openstudio-server" + +priorityClasses: + enabled: true + create: true + highName: "high-priority" + lowName: "low-priority" + +secrets: + # Recommended: pre-create this secret in the release namespace. 
+ existingSecret: "openstudio-app-secrets" + validateExistingSecret: true + create: false + +nfs-server-provisioner: + persistence: + enabled: true + storageClass: "cinder-csi" + size: 110Gi + storageClass: + allowVolumeExpansion: false + mountOptions: + - vers=4 + - sync + +db: + name: "db" + label: "db" + username: "" + password: "" + container: + name: "mongo-db" + image: "mongo:6.0.7" + resources: + requests: + cpu: 1 + memory: "4Gi" + ports: + db_port: 27017 + persistence: + enabled: true + storageClass: "cinder-csi" + size: 100Gi + accessModes: + - "ReadWriteOnce" + +load_balancer: + name: "ingress-load-balancer" + externalTrafficPolicy: "Local" + label: "web" + internal: false + ports: + http_name: "http" + http_port: 80 + http_protocol: "TCP" + https_name: "https" + https_port: 443 + https_protocol: "TCP" + +nfs: + name: "nfs" + +nfs_pvc: + name: "nfs-pvc" + accessModes: + - "ReadWriteMany" + storage_class: "nfs" + storage: "100Gi" + +redis: + name: "redis" + label: "redis" + password: "" + container: + name: "redis" + image: "redis:6.0.9" + resources: + requests: + cpu: 0.25 + memory: "1Gi" + port: 6379 + persistence: + enabled: true + storageClass: "cinder-csi" + size: 100Gi + accessModes: + - "ReadWriteOnce" + +redis_svc: + name: "queue" + label: "redis" + port: 6379 + url: "redis://:${REDIS_PASSWORD}@queue:6379" + +rserve: + name: "rserve" + label: "rserve" + number_of_workers: "1" + container: + name: "rserve" + image: "nrel/openstudio-rserve:3.8.0-1" + resources: + requests: + cpu: 1 + memory: "2Gi" + +rserve_svc: + name: "rserve" + label: "rserve" + port: 6311 + +web_background: + name: "web-background" + label: "web-background" + replicas: 1 + container: + name: "web-background" + image: "nrel/openstudio-server:3.8.0-1" + resources: + requests: + cpu: 0.25 + memory: "512Mi" + +web: + name: "web" + label: "web" + secret_key_value: "" + passenger_memory_per_process: 250 + container: + name: "web" + image: "nrel/openstudio-server:3.8.0-1" + resources: 
+ requests: + cpu: 1 + memory: "2Gi" + port: + http: 80 + https: 443 + +web_hpa: + name: "web" + minReplicas: 1 + maxReplicas: 1 + targetCPUUtilizationPercentage: 50 + +web_svc: + name: "web" + label: "web" + ports: + http: 80 + https: 443 + +worker: + name: "worker" + label: "worker" + container: + name: "worker" + image: "nrel/openstudio-server:3.8.0-1" + resources: + requests: + cpu: 700m + memory: "900Mi" + terminationGracePeriodSeconds: 5200 + +worker_hpa: + name: "worker" + minReplicas: 2 + maxReplicas: 20 + targetCPUUtilizationPercentage: 50 + stabilizationWindowSeconds: 3600 diff --git a/openstack/variables.tf b/openstack/variables.tf new file mode 100644 index 0000000..e813dd2 --- /dev/null +++ b/openstack/variables.tf @@ -0,0 +1,138 @@ +variable "openstack_user_name" { + description = "The username for OpenStack. Can be set via TF_VAR_openstack_user_name environment variable." + type = string + default = null +} + +variable "openstack_password" { + description = "The password for OpenStack. Can be set via TF_VAR_openstack_password environment variable." + type = string + sensitive = true + default = null +} + +variable "openstack_auth_url" { + description = "The authentication URL for OpenStack. Can be set via TF_VAR_openstack_auth_url environment variable." + type = string + default = "https://vs-api.hpc.nrel.gov:5000" +} + +variable "openstack_tenant_name" { + description = "The tenant name for OpenStack. Can be set via TF_VAR_openstack_tenant_name environment variable." + type = string + default = null +} + +variable "openstack_user_domain_name" { + description = "The user domain name for OpenStack. Can be set via TF_VAR_openstack_user_domain_name environment variable." + type = string + default = null +} + +variable "openstack_project_domain_id" { + description = "The project domain ID for OpenStack. Can be set via TF_VAR_openstack_project_domain_id environment variable." 
+ type = string + default = null +} + +variable "openstack_project_id" { + description = "The project ID for OpenStack. Can be set via TF_VAR_openstack_project_id environment variable." + type = string + default = null +} + +variable "openstack_region" { + description = "The region for OpenStack. Can be set via TF_VAR_openstack_region environment variable." + type = string + default = "RegionOne" +} + +variable "cluster_name" { + description = "The name of the Kubernetes cluster." + type = string + default = "openstudio-server" +} + +variable "master_flavor" { + description = "The OpenStack flavor for the master node (should have sufficient resources for control plane)." + type = string + default = "CS.Wee" # 8 vCPUs, 32GB RAM +} + +variable "web_count" { + description = "The number of web nodes (for OpenStudio web services)." + type = number + default = 1 +} + +variable "web_flavor" { + description = "The OpenStack flavor for web nodes (equivalent to EKS m7i.8xlarge for web workloads)." + type = string + default = "CS.2XMedium" # 32 vCPUs, 128GB RAM +} + +variable "worker_count" { + description = "The number of worker nodes (for compute-intensive simulations)." + type = number + default = 1 +} + +variable "worker_flavor" { + description = "The OpenStack flavor for worker nodes (should be compute-optimized for simulations)." + type = string + default = "CM.XLarge" # 64 vCPUs, 128GB RAM, compute-optimized +} + +variable "image_name" { + description = "The name of the OpenStack image to use." + type = string + default = "ubuntu-jammy-kube-v1.33.2-250701-1108" +} + +variable "volume_size" { + description = "The size of the boot volume in GB." + type = number + default = 20 +} + +variable "key_pair" { + description = "The name of the SSH key pair to use. Set via tfvars or TF_VAR_key_pair." + type = string +} + +variable "os_username" { + description = "The username for console access." 
+ type = string + default = "ubuntu" +} + +variable "os_password" { + description = "The password for console access." + type = string + sensitive = true + default = null +} + +variable "public_key" { + description = "The SSH public key content. Set via tfvars or TF_VAR_public_key." + type = string + sensitive = true +} + +variable "admin_access_cidr" { + description = "Ingress CIDR for admin access (SSH/ICMP). Default is permissive; narrow this in tfvars for production." + type = string + default = "0.0.0.0/0" +} + +variable "k8s_api_access_cidr" { + description = "Ingress CIDR for Kubernetes API port 6443. Default is permissive; narrow this in tfvars for production." + type = string + default = "0.0.0.0/0" +} + +variable "nodeport_access_cidr" { + description = "Ingress CIDR for NodePort services. Default is permissive; narrow this in tfvars for production." + type = string + default = "0.0.0.0/0" +} diff --git a/openstack/versions.tf b/openstack/versions.tf new file mode 100644 index 0000000..1000464 --- /dev/null +++ b/openstack/versions.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + openstack = { + source = "terraform-provider-openstack/openstack" + version = "~> 1.54" + } + } + required_version = ">= 1.0" +} From 69d18be631a05dac9654146038da8e981ae5929b Mon Sep 17 00:00:00 2001 From: achapin Date: Fri, 5 Jun 2026 18:06:13 -0400 Subject: [PATCH 2/2] fix(openstack): update OpenStack overlay values and documentation - Trim values-openstack.yaml, values-openstack-nfs.yaml, values-openstack-nfs-small.yaml to delta-only overrides; remove fields duplicated from base values.yaml - Add values-prod.local.yaml for production-specific overrides - Fix storageClass name: cinder-csi -> csi-cinder across all overlays and docs - Update README, QUICKSTART, TROUBLESHOOTING with corrected storageClass references and improved NFS sizing guidance - Remove allowVolumeExpansion: false from primary overlay (blocks Cinder expansion) - Add global.provider.name: openstack to 
all overlay files now that base values.yaml requires explicit provider selection - Update deploy-openstudio-cluster.sh with corrected storageClass default Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- openstack/QUICKSTART.md | 6 +- openstack/README.md | 40 ++++- openstack/TROUBLESHOOTING.md | 57 +++++++- openstack/deploy-openstudio-cluster.sh | 7 +- openstack/values-openstack-nfs-small.yaml | 124 ++-------------- openstack/values-openstack-nfs.yaml | 122 ++-------------- openstack/values-openstack.yaml | 170 +--------------------- openstack/values-prod.local.yaml | 41 ++++++ 8 files changed, 161 insertions(+), 406 deletions(-) create mode 100644 openstack/values-prod.local.yaml diff --git a/openstack/QUICKSTART.md b/openstack/QUICKSTART.md index 5a3e45c..54d354b 100644 --- a/openstack/QUICKSTART.md +++ b/openstack/QUICKSTART.md @@ -33,9 +33,9 @@ All tracked values files in this repository are templates. Keep real credentials The production template sets OpenStack storage hardening defaults: - `nfs-server-provisioner.persistence.size: 1Ti` -- `db.persistence.storageClass: cinder-csi` -- `redis.persistence.storageClass: cinder-csi` -- `global.storageClasses.block: cinder-csi` (override this if your Cinder class name differs) +- `db.persistence.storageClass: csi-cinder` +- `redis.persistence.storageClass: csi-cinder` +- `global.storageClasses.block: csi-cinder` (override this if your Cinder class name differs) ## Legacy Self-Managed Deployment (Optional, Untested) diff --git a/openstack/README.md b/openstack/README.md index a6c6915..8d55a5a 100644 --- a/openstack/README.md +++ b/openstack/README.md @@ -322,20 +322,52 @@ Additional OpenStack defaults are automatically applied when omitted in values: - `redis.persistence.storageClass`: `nfs` - `load_balancer.externalTrafficPolicy`: `Cluster` -For OpenStack block-backed PVCs, the chart now uses `global.storageClasses.block` (default `cinder-csi`) as the backing class for the NFS provisioner 
PVC. +> [!IMPORTANT] +> The OpenStack `nfs` defaults for `db` and `redis` are compatibility defaults, not production-safe defaults. +> In production, explicitly set both to block storage (`csi-cinder` or your `global.storageClasses.block` class). +> Keep `nfs` for shared artifacts (`nfs_pvc`) only. + +For OpenStack block-backed PVCs, the chart now uses `global.storageClasses.block` (default `csi-cinder`) as the backing class for the NFS provisioner PVC. `openstack/storage-classes.yaml` also includes a `csi-cinder` compatibility alias for older clusters/configs. For production hardening, the tracked `openstudio-server/values_production.templateyaml` explicitly sets: -- `db.persistence.storageClass: cinder-csi` -- `redis.persistence.storageClass: cinder-csi` +- `db.persistence.storageClass: csi-cinder` +- `redis.persistence.storageClass: csi-cinder` - `nfs-server-provisioner.persistence.size: 1Ti` +Incident retrospective (June 2026): + +- Observed failure mode: analyses failed before simulation start with MongoDB WiredTiger `Operation not permitted` errors. +- Root cause: DB PVC was configured to NFS. +- Operational policy now: DB/Redis must stay on block storage; NFS is for shared simulation artifacts only. + +Rollout warning triage (OpenStack): + +- `UpdateLoadBalancerFailed` / `SyncLoadBalancerFailed` events can occur transiently during node/pool membership updates. +- Treat these as **warning noise** if all are true: + - `kubectl get svc -n openstudio-server ingress-load-balancer` shows an external IP, + - `/` and `/status.json` return HTTP 200, + - web/worker deployments are fully available. +- Escalate when warnings persist and service health fails (missing external IP, non-200 health checks, or unavailable web deployment). + +Pod termination caveats: + +- `FailedKillPod` events during rollout usually indicate node/container-runtime cleanup delays for replaced pods. 
+- If replacement pods are healthy and workloads continue, this is typically infra-side and not a chart-level app failure. +- Track affected node(s) and coordinate runtime remediation (containerd/kubelet health, node pressure, host IO saturation). + +Mongo host tuning note: + +- Mongo startup may warn `vm.max_map_count is too low`. This is host-level kernel tuning and should be remediated on worker nodes hosting Mongo. + Preflight check before deploy/upgrade: ```bash kubectl get storageclass -kubectl get sc cinder-csi +kubectl get sc csi-cinder +kubectl get pvc -n openstudio-server +kubectl get pv | grep -E "openstudio-server/(db|redis|nfs-pvc|nfs-pvc-data)" ``` If your cluster uses a different Cinder class name, set it explicitly in your values file: diff --git a/openstack/TROUBLESHOOTING.md b/openstack/TROUBLESHOOTING.md index c55eb89..032e74d 100644 --- a/openstack/TROUBLESHOOTING.md +++ b/openstack/TROUBLESHOOTING.md @@ -7,6 +7,59 @@ This guide covers common issues encountered when deploying OpenStudio Server on ## 🚨 Critical Issues and Solutions +### 0. Analyses fail before simulation start (MongoDB on NFS) + +**Symptoms:** +- Analyses are accepted but fail during initialization before any simulation jobs run. +- `web-background` logs include: + - `lhs.rb failed with [8]: 1: Operation not permitted (on db:27017...)` +- `db` (MongoDB) logs include WiredTiger errors: + - `__posix_open_file ... /data/db/collection-...wt ... Operation not permitted` + - New `collection-*.wt` files may appear as zero-byte files. + +**Root Cause:** +- MongoDB/WiredTiger is running on an NFS-backed PVC (`storageClass: nfs`), which is not a safe backend for this workload in this environment. 
+ +**Diagnosis:** +```bash +# Check DB/Redis PVC backing classes +kubectl get pvc -n openstudio-server + +# Confirm PV storage classes +kubectl get pv | grep -E "openstudio-server/(db|redis)" + +# Confirm app-side failure signatures +kubectl logs -n openstudio-server deploy/web-background --tail=200 | \ + grep -E "Operation not permitted|lhs\\.rb failed" + +# Confirm MongoDB-side WiredTiger failures +kubectl logs -n openstudio-server deploy/db --tail=200 | \ + grep -E "WiredTiger|Operation not permitted|__posix_open_file" +``` + +**Fix:** +1. Set DB/Redis persistence to block storage in values: + - `db.persistence.storageClass: csi-cinder` + - `redis.persistence.storageClass: csi-cinder` +2. Migrate DB/Redis PVCs (requires downtime for those services): + - scale down `db` and `redis` + - delete old DB/Redis PVCs (depending on the reclaim policy this can permanently discard the stored data; back up anything you need first) + - run `helm upgrade` with corrected values (and desired sizes) +3. Verify: + - new DB/Redis PVCs are `csi-cinder` + - no new WiredTiger `Operation not permitted` messages + - new analyses move past initialization into queued/running stages + +**Prevention (preflight before every upgrade):** +```bash +# Render-time values check +grep -nE "db:|redis:|storageClass" ./openstudio-server/values*.yaml + +# Runtime check (live cluster) +kubectl get pvc -n openstudio-server +kubectl get pv | grep -E "openstudio-server/(db|redis|nfs-pvc|nfs-pvc-data)" +``` + ### 1. Pod Network Isolation (Most Common Issue) **Symptoms:** @@ -97,7 +150,7 @@ kubectl exec -n kube-system -- curl -k If events show errors like: -- `storageclass.storage.k8s.io "cinder-csi" not found` +- `storageclass.storage.k8s.io "csi-cinder" not found` your chart values are likely using the wrong Cinder StorageClass name for this cluster. @@ -111,7 +164,7 @@ Set the OpenStack block class explicitly in values and redeploy: ```yaml global: storageClasses: - block: cinder-csi + block: csi-cinder ``` ### 3. 
Container Image Pull Failures diff --git a/openstack/deploy-openstudio-cluster.sh b/openstack/deploy-openstudio-cluster.sh index 7a3eec3..3b536d4 100755 --- a/openstack/deploy-openstudio-cluster.sh +++ b/openstack/deploy-openstudio-cluster.sh @@ -428,11 +428,10 @@ deploy_openstudio() { kubectl create namespace openstudio-server || true local secret_validator="$SCRIPT_DIR/../scripts/validate-app-secret.sh" - if [[ -x "$secret_validator" ]]; then - "$secret_validator" --namespace openstudio-server --secret-name "$APP_SECRET_NAME" - else - warning "Secret validator not found at $secret_validator; skipping preflight validation in this workflow" + if [[ ! -x "$secret_validator" ]]; then + error "Missing executable secret validator: $secret_validator" fi + "$secret_validator" --namespace openstudio-server --secret-name "$APP_SECRET_NAME" # Deploy OpenStudio Server log "Installing OpenStudio Server..." diff --git a/openstack/values-openstack-nfs-small.yaml b/openstack/values-openstack-nfs-small.yaml index 4e38da6..de0a635 100644 --- a/openstack/values-openstack-nfs-small.yaml +++ b/openstack/values-openstack-nfs-small.yaml @@ -1,32 +1,14 @@ -# OpenStack-specific values template for openstudio-server (external NFS class, small disks) -# Copy this file to a local, untracked values file before deployment. -# Do not commit populated credentials. +# Small OpenStack overlay for the external NFS-class deployment. -# Set to openstack provider global: provider: name: "openstack" - storageClasses: - block: "cinder-csi" - -storageclass: - createSsd: false - -cluster: - name: "openstudio-server" - -secrets: - # Recommended: pre-create this secret in the release namespace. 
- existingSecret: "openstudio-app-secrets" - validateExistingSecret: true - create: false nfs-server-provisioner: enabled: false persistence: - enabled: true + size: 5Gi storageClass: "nfs-client" - size: 5Gi # Reduced from 110Gi storageClass: allowVolumeExpansion: false mountOptions: @@ -34,146 +16,60 @@ nfs-server-provisioner: - sync db: - name: "db" - label: "db" - username: "" - password: "" + persistence: + size: 2Gi + storageClass: "nfs-client" # NOTE(review): README policy in this patch is block storage for DB, not NFS - confirm. container: - name: "mongo-db" - image: "mongo:6.0.7" resources: requests: cpu: 1 memory: "4Gi" - ports: - db_port: 27017 - persistence: - enabled: true - storageClass: "nfs-client" - size: 2Gi # Reduced from 100Gi - accessModes: - - "ReadWriteOnce" - -load_balancer: - name: "ingress-load-balancer" - externalTrafficPolicy: "Local" - label: "web" - internal: false - ports: - http_name: "http" - http_port: 80 - http_protocol: "TCP" - https_name: "https" - https_port: 443 - https_protocol: "TCP" - -nfs: - name: "nfs" nfs_pvc: - name: "nfs-pvc" - accessModes: - - "ReadWriteMany" - storage_class: "nfs-client" - storage: "5Gi" # Reduced from 100Gi + storage: "5Gi" + storageClass: "nfs-client" # NOTE(review): key was storage_class before this change - confirm the chart reads nfs_pvc.storageClass. redis: - name: "redis" - label: "redis" - password: "" + persistence: + size: 1Gi + storageClass: "nfs-client" # NOTE(review): README policy in this patch is block storage for Redis, not NFS - confirm. 
container: - name: "redis" - image: "redis:6.0.9" resources: requests: cpu: 0.25 memory: "1Gi" - port: 6379 - persistence: - enabled: true - storageClass: "nfs-client" - size: 1Gi # Reduced from 100Gi - accessModes: - - "ReadWriteOnce" - -redis_svc: - name: "queue" - label: "redis" - port: 6379 - url: "redis://:${REDIS_PASSWORD}@queue:6379" rserve: - name: "rserve" - label: "rserve" number_of_workers: "1" container: - name: "rserve" - image: "nrel/openstudio-rserve:3.8.0-1" resources: requests: cpu: 1 memory: "2Gi" -rserve_svc: - name: "rserve" - label: "rserve" - port: 6311 - web_background: - name: "web-background" - label: "web-background" replicas: 1 container: - name: "web-background" - image: 
"nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 0.25 memory: "512Mi" web: - name: "web" - label: "web" - secret_key_value: "" - passenger_memory_per_process: 250 container: - name: "web" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 1 memory: "2Gi" - port: - http: 80 - https: 443 - -web_hpa: - name: "web" - minReplicas: 1 - maxReplicas: 1 - targetCPUUtilizationPercentage: 50 - -web_svc: - name: "web" - label: "web" - ports: - http: 80 - https: 443 worker: - name: "worker" - label: "worker" container: - name: "worker" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 700m memory: "900Mi" - terminationGracePeriodSeconds: 5200 worker_hpa: - name: "worker" minReplicas: 2 maxReplicas: 20 targetCPUUtilizationPercentage: 50 diff --git a/openstack/values-openstack-nfs.yaml b/openstack/values-openstack-nfs.yaml index 31b37d7..e84c6b6 100644 --- a/openstack/values-openstack-nfs.yaml +++ b/openstack/values-openstack-nfs.yaml @@ -1,32 +1,14 @@ -# OpenStack-specific values template for openstudio-server (external NFS class) -# Copy this file to a local, untracked values file before deployment. -# Do not commit populated credentials. +# OpenStack overlay for the external NFS-class deployment. -# Set to openstack provider global: provider: name: "openstack" - storageClasses: - block: "cinder-csi" - -storageclass: - createSsd: false - -cluster: - name: "openstudio-server" - -secrets: - # Recommended: pre-create this secret in the release namespace. 
- existingSecret: "openstudio-app-secrets" - validateExistingSecret: true - create: false nfs-server-provisioner: enabled: false persistence: - enabled: true - storageClass: "nfs-client" size: 110Gi + storageClass: "nfs-client" storageClass: allowVolumeExpansion: false mountOptions: @@ -34,146 +16,60 @@ nfs-server-provisioner: - sync db: - name: "db" - label: "db" - username: "" - password: "" + persistence: + size: 100Gi + storageClass: "nfs-client" container: - name: "mongo-db" - image: "mongo:6.0.7" resources: requests: cpu: 1 memory: "4Gi" - ports: - db_port: 27017 - persistence: - enabled: true - storageClass: "nfs-client" - size: 100Gi - accessModes: - - "ReadWriteOnce" - -load_balancer: - name: "ingress-load-balancer" - externalTrafficPolicy: "Local" - label: "web" - internal: false - ports: - http_name: "http" - http_port: 80 - http_protocol: "TCP" - https_name: "https" - https_port: 443 - https_protocol: "TCP" - -nfs: - name: "nfs" nfs_pvc: - name: "nfs-pvc" - accessModes: - - "ReadWriteMany" - storage_class: "nfs-client" storage: "100Gi" + storageClass: "nfs-client" redis: - name: "redis" - label: "redis" - password: "" + persistence: + size: 100Gi + storageClass: "nfs-client" container: - name: "redis" - image: "redis:6.0.9" resources: requests: cpu: 0.25 memory: "1Gi" - port: 6379 - persistence: - enabled: true - storageClass: "nfs-client" - size: 100Gi - accessModes: - - "ReadWriteOnce" - -redis_svc: - name: "queue" - label: "redis" - port: 6379 - url: "redis://:${REDIS_PASSWORD}@queue:6379" rserve: - name: "rserve" - label: "rserve" number_of_workers: "1" container: - name: "rserve" - image: "nrel/openstudio-rserve:3.8.0-1" resources: requests: cpu: 1 memory: "2Gi" -rserve_svc: - name: "rserve" - label: "rserve" - port: 6311 - web_background: - name: "web-background" - label: "web-background" replicas: 1 container: - name: "web-background" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 0.25 memory: "512Mi" web: - name: "web" - 
label: "web" - secret_key_value: "" - passenger_memory_per_process: 250 container: - name: "web" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 1 memory: "2Gi" - port: - http: 80 - https: 443 - -web_hpa: - name: "web" - minReplicas: 1 - maxReplicas: 1 - targetCPUUtilizationPercentage: 50 - -web_svc: - name: "web" - label: "web" - ports: - http: 80 - https: 443 worker: - name: "worker" - label: "worker" container: - name: "worker" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 700m memory: "900Mi" - terminationGracePeriodSeconds: 5200 worker_hpa: - name: "worker" minReplicas: 2 maxReplicas: 20 targetCPUUtilizationPercentage: 50 diff --git a/openstack/values-openstack.yaml b/openstack/values-openstack.yaml index 2932de2..69407be 100644 --- a/openstack/values-openstack.yaml +++ b/openstack/values-openstack.yaml @@ -1,182 +1,20 @@ -# OpenStack-specific values template for openstudio-server -# Copy this file to a local, untracked values file before deployment. -# Do not commit populated credentials. +# OpenStack-only overrides for openstudio-server. +# Keep this file limited to values that differ from openstudio-server/values.yaml. -# Set to openstack provider global: provider: name: "openstack" - storageClasses: - block: "cinder-csi" - -cluster: - name: "openstudio-server" - -priorityClasses: - enabled: true - create: true - highName: "high-priority" - lowName: "low-priority" - -secrets: - # Recommended: pre-create this secret in the release namespace. 
- existingSecret: "openstudio-app-secrets" - validateExistingSecret: true - create: false nfs-server-provisioner: - persistence: - enabled: true - storageClass: "cinder-csi" - size: 110Gi storageClass: - allowVolumeExpansion: false mountOptions: - vers=4 - sync db: - name: "db" - label: "db" - username: "" - password: "" - container: - name: "mongo-db" - image: "mongo:6.0.7" - resources: - requests: - cpu: 1 - memory: "4Gi" - ports: - db_port: 27017 persistence: - enabled: true - storageClass: "cinder-csi" - size: 100Gi - accessModes: - - "ReadWriteOnce" - -load_balancer: - name: "ingress-load-balancer" - externalTrafficPolicy: "Local" - label: "web" - internal: false - ports: - http_name: "http" - http_port: 80 - http_protocol: "TCP" - https_name: "https" - https_port: 443 - https_protocol: "TCP" - -nfs: - name: "nfs" - -nfs_pvc: - name: "nfs-pvc" - accessModes: - - "ReadWriteMany" - storage_class: "nfs" - storage: "100Gi" + storageClass: "csi-cinder" redis: - name: "redis" - label: "redis" - password: "" - container: - name: "redis" - image: "redis:6.0.9" - resources: - requests: - cpu: 0.25 - memory: "1Gi" - port: 6379 persistence: - enabled: true - storageClass: "cinder-csi" - size: 100Gi - accessModes: - - "ReadWriteOnce" - -redis_svc: - name: "queue" - label: "redis" - port: 6379 - url: "redis://:${REDIS_PASSWORD}@queue:6379" - -rserve: - name: "rserve" - label: "rserve" - number_of_workers: "1" - container: - name: "rserve" - image: "nrel/openstudio-rserve:3.8.0-1" - resources: - requests: - cpu: 1 - memory: "2Gi" - -rserve_svc: - name: "rserve" - label: "rserve" - port: 6311 - -web_background: - name: "web-background" - label: "web-background" - replicas: 1 - container: - name: "web-background" - image: "nrel/openstudio-server:3.8.0-1" - resources: - requests: - cpu: 0.25 - memory: "512Mi" - -web: - name: "web" - label: "web" - secret_key_value: "" - passenger_memory_per_process: 250 - container: - name: "web" - image: "nrel/openstudio-server:3.8.0-1" - 
resources: - requests: - cpu: 1 - memory: "2Gi" - port: - http: 80 - https: 443 - -web_hpa: - name: "web" - minReplicas: 1 - maxReplicas: 1 - targetCPUUtilizationPercentage: 50 - -web_svc: - name: "web" - label: "web" - ports: - http: 80 - https: 443 - -worker: - name: "worker" - label: "worker" - container: - name: "worker" - image: "nrel/openstudio-server:3.8.0-1" - resources: - requests: - cpu: 700m - memory: "900Mi" - terminationGracePeriodSeconds: 5200 - -worker_hpa: - name: "worker" - minReplicas: 2 - maxReplicas: 20 - targetCPUUtilizationPercentage: 50 - stabilizationWindowSeconds: 3600 + storageClass: "csi-cinder" diff --git a/openstack/values-prod.local.yaml b/openstack/values-prod.local.yaml new file mode 100644 index 0000000..6cf5570 --- /dev/null +++ b/openstack/values-prod.local.yaml @@ -0,0 +1,41 @@ +# Local OpenStack production overrides. + +global: + provider: + name: "openstack" + +autoscaler: + extraArgs: [] + openstackNodeGroups: [] + +cluster: + name: openstudio-server-prod + +nfs-server-provisioner: + persistence: + size: 1740Gi + storageClass: + allowVolumeExpansion: true + mountOptions: + - vers=4 + +nfs_pvc: + storage: 1.5Ti + +db: + persistence: + size: 200Gi + storageClass: csi-cinder + +redis: + persistence: + size: 200Gi + storageClass: csi-cinder + +web_background: + replicas: 2 + +worker_hpa: + minReplicas: 5 + maxReplicas: 50 + stabilizationWindowSeconds: 1800