From cb84e48f80449814f196b97b412c974d0568cc94 Mon Sep 17 00:00:00 2001 From: achapin Date: Fri, 5 Jun 2026 01:18:08 -0400 Subject: [PATCH 1/2] feat(chart): isolate runtime and provider-aware Helm template hardening Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../templates/deployment.yaml | 34 ++- .../nfs-server-provisioner/templates/pvc.yaml | 26 +- .../charts/nfs-server-provisioner/values.yaml | 12 +- openstudio-server/templates/_scheduling.tpl | 257 +++++++++++++++++ .../cluster-autoscaler-autodiscover.yaml | 112 ++++++-- openstudio-server/templates/db/db-deploy.yaml | 43 +-- openstudio-server/templates/db/db-pvc.yaml | 9 +- openstudio-server/templates/db/db-svc.yaml | 10 +- .../templates/hooks/pre-delete-hook.yaml | 71 ++++- .../templates/loadbalancer/loadbalancer.yaml | 31 +- openstudio-server/templates/nfs/nfs-pvc.yaml | 11 +- .../priority-class/priority_high.yaml | 10 +- .../priority-class/priority_low.yaml | 8 +- .../templates/redis/redis-deploy.yaml | 41 +-- .../templates/redis/redis-pvc.yaml | 9 +- .../templates/redis/redis-svc.yaml | 6 +- .../templates/rserve/rserve-deploy.yaml | 50 ++-- .../templates/rserve/rserve-svc.yaml | 6 +- .../templates/storageclass/storageclass.yaml | 24 +- .../web-background/web-background-deploy.yaml | 50 ++-- .../templates/web/web-deploy.yaml | 60 ++-- openstudio-server/templates/web/web-hpa.yaml | 4 +- openstudio-server/templates/web/web-svc.yaml | 11 +- .../templates/worker/worker-deploy.yaml | 52 ++-- .../templates/worker/worker-hpa.yaml | 4 +- openstudio-server/values.yaml | 265 ++++++++++-------- openstudio-server/values_large.templateyaml | 149 +++++----- .../values_production.templateyaml | 167 +++++++++++ openstudio-server/values_small.templateyaml | 149 +++++----- 29 files changed, 1186 insertions(+), 495 deletions(-) create mode 100644 openstudio-server/templates/_scheduling.tpl create mode 100644 openstudio-server/values_production.templateyaml diff --git 
a/openstudio-server/charts/nfs-server-provisioner/templates/deployment.yaml b/openstudio-server/charts/nfs-server-provisioner/templates/deployment.yaml index 3fb3753..56e058d 100755 --- a/openstudio-server/charts/nfs-server-provisioner/templates/deployment.yaml +++ b/openstudio-server/charts/nfs-server-provisioner/templates/deployment.yaml @@ -102,9 +102,41 @@ spec: nodeSelector: {{- toYaml . | nindent 8 }} {{- end }} - {{- with .Values.affinity }} + {{- if .Values.affinity }} affinity: - {{- toYaml . | nindent 8 }} + {{- toYaml .Values.affinity | nindent 8 }} + {{- else }} + {{- /* No explicit affinity: render a provider-aware node-group affinity from global.provider and global.nodeGroups. */}} + {{- $global := default (dict) .Values.global }} + {{- $globalProvider := default (dict) (get $global "provider") }} + {{- $legacyProvider := default (dict) .Values.provider }} + {{- $legacyProviderName := lower (default "" (get $legacyProvider "name")) }} + {{- /* "default" treats an explicit false as empty, so read the flag via hasKey to honour allowLegacyName=false. */}} + {{- $allowLegacyProviderName := ternary (get $globalProvider "allowLegacyName") true (hasKey $globalProvider "allowLegacyName") }} + {{- $providerName := lower (default "" (get $globalProvider "name")) }} + {{- if ne $legacyProviderName "" }} + {{- if not $allowLegacyProviderName }} + {{- fail "provider.name is deprecated and disabled by default. Set global.provider.name, or set global.provider.allowLegacyName=true temporarily during migration." }} + {{- end }} + {{- if and (ne $providerName "") (ne $providerName $legacyProviderName) }} + {{- fail (printf "provider.name=%q conflicts with global.provider.name=%q. Remove provider.name and keep global.provider.name."
$legacyProviderName $providerName) }} + {{- end }} + {{- if eq $providerName "" }} + {{- $providerName = $legacyProviderName }} + {{- end }} + {{- end }} + {{- $nodeGroups := default (dict) (get $global "nodeGroups") }} + {{- $labelKeyOverride := default "" (get $nodeGroups "labelKey") }} + {{- $webGroupOverride := default "" (get $nodeGroups "web") }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ if ne $labelKeyOverride "" }}{{ $labelKeyOverride }}{{ else if eq $providerName "openstack" }}capi.stackhpc.com/node-group{{ else }}nodegroup{{ end }} + operator: In + values: + - {{ if ne $webGroupOverride "" }}{{ $webGroupOverride }}{{ else if eq $providerName "openstack" }}web{{ else }}web-group{{ end }} {{- end }} {{- with .Values.tolerations }} tolerations: diff --git a/openstudio-server/charts/nfs-server-provisioner/templates/pvc.yaml b/openstudio-server/charts/nfs-server-provisioner/templates/pvc.yaml index 120e588..bd259cc 100755 --- a/openstudio-server/charts/nfs-server-provisioner/templates/pvc.yaml +++ b/openstudio-server/charts/nfs-server-provisioner/templates/pvc.yaml @@ -1,9 +1,35 @@ +{{- /* Resolve the provider name (global.provider.name, then legacy provider.name) to pick this PVC's default StorageClass. */ -}} +{{- $global := default (dict) .Values.global -}} +{{- $globalProvider := default (dict) (get $global "provider") -}} +{{- $legacyProvider := default (dict) .Values.provider -}} +{{- $legacyProviderName := lower (default "" (get $legacyProvider "name")) -}} +{{- /* "default" treats an explicit false as empty, so read the flag via hasKey to honour allowLegacyName=false. */ -}} +{{- $allowLegacyProviderName := ternary (get $globalProvider "allowLegacyName") true (hasKey $globalProvider "allowLegacyName") -}} +{{- $providerName := lower (default "" (get $globalProvider "name")) -}} +{{- if ne $legacyProviderName "" -}} +{{- if not $allowLegacyProviderName -}} +{{- fail "provider.name is deprecated and disabled by default. Set global.provider.name, or set global.provider.allowLegacyName=true temporarily during migration."
-}} +{{- end -}} +{{- if and (ne $providerName "") (ne $providerName $legacyProviderName) -}} +{{- fail (printf "provider.name=%q conflicts with global.provider.name=%q. Remove provider.name and keep global.provider.name." $legacyProviderName $providerName) -}} +{{- end -}} +{{- if eq $providerName "" -}} +{{- $providerName = $legacyProviderName -}} +{{- end -}} +{{- end -}} +{{- /* Neither name is set: default to "aws". This must sit outside the legacy-name branch, where it was unreachable. */ -}} +{{- if eq $providerName "" -}} +{{- $providerName = "aws" -}} +{{- end -}} +{{- $storageClasses := default (dict) (get $global "storageClasses") -}} +{{- $openstackBlockStorageClass := default "cinder-csi" (get $storageClasses "block") -}} +{{- $defaultStorageClass := ternary $openstackBlockStorageClass "ssd" (eq $providerName "openstack") }} kind: PersistentVolumeClaim apiVersion: v1 metadata: name: {{ .Values.persistence.name }} spec: - storageClassName: {{ .Values.persistence.storageClass }} + storageClassName: {{ default $defaultStorageClass .Values.persistence.storageClass | quote }} accessModes: {{ .Values.persistence.accessModes }} resources: diff --git a/openstudio-server/charts/nfs-server-provisioner/values.yaml b/openstudio-server/charts/nfs-server-provisioner/values.yaml index 2e790d1..dde46dc 100755 --- a/openstudio-server/charts/nfs-server-provisioner/values.yaml +++ b/openstudio-server/charts/nfs-server-provisioner/values.yaml @@ -93,12 +93,6 @@ nodeSelector: {} tolerations: [] -affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nodegroup - operator: In - values: - - web-group \ No newline at end of file +# Affinity configuration +# Default: empty. When left empty, templates/deployment.yaml renders a provider-aware node-group affinity from global.provider and global.nodeGroups.
+affinity: {} \ No newline at end of file diff --git a/openstudio-server/templates/_scheduling.tpl b/openstudio-server/templates/_scheduling.tpl new file mode 100644 index 0000000..f1f367b --- /dev/null +++ b/openstudio-server/templates/_scheduling.tpl @@ -0,0 +1,259 @@ +{{- define "openstudio.providerName" -}} +{{- /* Resolve the lower-cased provider name: global.provider.name, else legacy provider.name, else "aws". Fails on a disallowed, conflicting or unsupported name. */ -}} +{{- $global := default (dict) .Values.global -}} +{{- $globalProvider := default (dict) (get $global "provider") -}} +{{- $legacyProvider := default (dict) .Values.provider -}} +{{- $legacyProviderName := lower (default "" (get $legacyProvider "name")) -}} +{{- /* "default" treats an explicit false as empty, so read the flag via hasKey to honour allowLegacyName=false. */ -}} +{{- $allowLegacyProviderName := ternary (get $globalProvider "allowLegacyName") true (hasKey $globalProvider "allowLegacyName") -}} +{{- $provider := lower (default "" (get $globalProvider "name")) -}} +{{- if ne $legacyProviderName "" -}} +{{- if not $allowLegacyProviderName -}} +{{- fail "provider.name is deprecated and disabled by default. Set global.provider.name, or set global.provider.allowLegacyName=true temporarily during migration." -}} +{{- end -}} +{{- if and (ne $provider "") (ne $provider $legacyProviderName) -}} +{{- fail (printf "provider.name=%q conflicts with global.provider.name=%q. Remove provider.name and keep global.provider.name." $legacyProviderName $provider) -}} +{{- end -}} +{{- if eq $provider "" -}} +{{- $provider = $legacyProviderName -}} +{{- end -}} +{{- end -}} +{{- if eq $provider "" -}} +{{- $provider = "aws" -}} +{{- end -}} +{{- if not (has $provider (list "aws" "google" "azure" "openstack")) -}} +{{- fail (printf "global.provider.name=%q is unsupported. Supported values: aws, google, azure, openstack." $provider) -}} +{{- end -}} +{{- $provider -}} +{{- end -}} + +{{- define "openstudio.nodeGroupLabelKey" -}} +{{- $global := default (dict) .Values.global -}} +{{- $nodeGroups := default (dict) (get $global "nodeGroups") -}} +{{- $labelKey := default "" (get $nodeGroups "labelKey") -}} +{{- if ne $labelKey "" -}} +{{- $labelKey -}} +{{- else if eq (include "openstudio.providerName" .)
"openstack" -}} +{{- "capi.stackhpc.com/node-group" -}} +{{- else -}} +{{- "nodegroup" -}} +{{- end -}} +{{- end -}} + +{{- define "openstudio.webNodeGroupValue" -}} +{{- $global := default (dict) .Values.global -}} +{{- $nodeGroups := default (dict) (get $global "nodeGroups") -}} +{{- $web := default "" (get $nodeGroups "web") -}} +{{- if ne $web "" -}} +{{- $web -}} +{{- else if eq (include "openstudio.providerName" .) "openstack" -}} +{{- "web" -}} +{{- else -}} +{{- "web-group" -}} +{{- end -}} +{{- end -}} + +{{- define "openstudio.workerNodeGroupValue" -}} +{{- $global := default (dict) .Values.global -}} +{{- $nodeGroups := default (dict) (get $global "nodeGroups") -}} +{{- $worker := default "" (get $nodeGroups "worker") -}} +{{- if ne $worker "" -}} +{{- $worker -}} +{{- else if eq (include "openstudio.providerName" .) "openstack" -}} +{{- "worker" -}} +{{- else -}} +{{- "worker-group" -}} +{{- end -}} +{{- end -}} + +{{- define "openstudio.nodeGroupValueForRole" -}} +{{- if eq .role "worker" -}} +{{- include "openstudio.workerNodeGroupValue" .root -}} +{{- else -}} +{{- include "openstudio.webNodeGroupValue" .root -}} +{{- end -}} +{{- end -}} + +{{- define "openstudio.nodeGroupAffinityMode" -}} +{{- $global := default (dict) .Values.global -}} +{{- $nodeGroups := default (dict) (get $global "nodeGroups") -}} +{{- $mode := lower (default "" (get $nodeGroups "affinityMode")) -}} +{{- if ne $mode "" -}} +{{- if not (has $mode (list "required" "preferred" "disabled")) -}} +{{- fail (printf "global.nodeGroups.affinityMode=%q is unsupported. Supported values: required, preferred, disabled." $mode) -}} +{{- end -}} +{{- $mode -}} +{{- else if eq (include "openstudio.providerName" .) 
"openstack" -}} +{{- "preferred" -}} +{{- else -}} +{{- "required" -}} +{{- end -}} +{{- end -}} + +{{- define "openstudio.affinityForRole" -}} +{{- $mode := include "openstudio.nodeGroupAffinityMode" .root -}} +{{- if ne $mode "disabled" -}} +affinity: + nodeAffinity: + {{- if eq $mode "preferred" }} + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + preference: + matchExpressions: + - key: {{ include "openstudio.nodeGroupLabelKey" .root }} + operator: In + values: + - {{ include "openstudio.nodeGroupValueForRole" . }} + {{- else }} + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: {{ include "openstudio.nodeGroupLabelKey" .root }} + operator: In + values: + - {{ include "openstudio.nodeGroupValueForRole" . }} + {{- end }} +{{- end -}} +{{- end -}} + +{{- define "openstudio.defaultAppPersistenceStorageClass" -}} +{{- if eq (include "openstudio.providerName" .) "openstack" -}} +{{- "nfs" -}} +{{- else -}} +{{- "ssd" -}} +{{- end -}} +{{- end -}} + +{{- define "openstudio.openstackBlockStorageClass" -}} +{{- $global := default (dict) .Values.global -}} +{{- $storageClasses := default (dict) (get $global "storageClasses") -}} +{{- default "cinder-csi" (get $storageClasses "block") -}} +{{- end -}} + +{{- define "openstudio.defaultNfsProvisionerBackingStorageClass" -}} +{{- if eq (include "openstudio.providerName" .) "openstack" -}} +{{- include "openstudio.openstackBlockStorageClass" . -}} +{{- else -}} +{{- "ssd" -}} +{{- end -}} +{{- end -}} + +{{- define "openstudio.defaultLoadBalancerExternalTrafficPolicy" -}} +{{- if eq (include "openstudio.providerName" .) 
"openstack" -}} +{{- "Cluster" -}} +{{- else -}} +{{- "Local" -}} +{{- end -}} +{{- end -}} + +{{- define "openstudio.dbName" -}} +{{- default "db" .Values.db.name -}} +{{- end -}} + +{{- define "openstudio.redisName" -}} +{{- default "redis" .Values.redis.name -}} +{{- end -}} + +{{- define "openstudio.nfsPvcName" -}} +{{- $nfsPvc := default (dict) (get .Values "nfs_pvc") -}} +{{- default "nfs-pvc" (get $nfsPvc "name") -}} +{{- end -}} + +{{- define "openstudio.redisServiceName" -}} +{{- $redisSvc := default (dict) (get .Values "redis_svc") -}} +{{- default "queue" (get $redisSvc "name") -}} +{{- end -}} + +{{- define "openstudio.webName" -}} +{{- default "web" .Values.web.name -}} +{{- end -}} + +{{- define "openstudio.webServiceName" -}} +{{- $webSvc := default (dict) (get .Values "web_svc") -}} +{{- default (include "openstudio.webName" .) (get $webSvc "name") -}} +{{- end -}} + +{{- define "openstudio.webBackgroundName" -}} +{{- default "web-background" .Values.web_background.name -}} +{{- end -}} + +{{- define "openstudio.workerName" -}} +{{- default "worker" .Values.worker.name -}} +{{- end -}} + +{{- define "openstudio.rserveName" -}} +{{- default "rserve" .Values.rserve.name -}} +{{- end -}} + +{{- define "openstudio.rserveServiceName" -}} +{{- $rserveSvc := default (dict) (get .Values "rserve_svc") -}} +{{- default (include "openstudio.rserveName" .) (get $rserveSvc "name") -}} +{{- end -}} + +{{- define "openstudio.webHpaName" -}} +{{- $webHpa := default (dict) (get .Values "web_hpa") -}} +{{- default (include "openstudio.webName" .) (get $webHpa "name") -}} +{{- end -}} + +{{- define "openstudio.workerHpaName" -}} +{{- $workerHpa := default (dict) (get .Values "worker_hpa") -}} +{{- default (include "openstudio.workerName" .) 
(get $workerHpa "name") -}} +{{- end -}} + +{{- define "openstudio.secretName" -}} +{{- $secrets := default (dict) .Values.secrets -}} +{{- $existingSecret := default "" (get $secrets "existingSecret") -}} +{{- $create := true -}} +{{- if hasKey $secrets "create" -}} +{{- $create = (get $secrets "create") -}} +{{- end -}} +{{- if and (ne $existingSecret "") $create -}} +{{- fail "secrets.existingSecret and secrets.create=true cannot both be set; choose one secret source" -}} +{{- end -}} +{{- if ne $existingSecret "" -}} +{{- $existingSecret -}} +{{- else -}} +{{- if not $create -}} +{{- fail "Either secrets.existingSecret must be set or secrets.create must be true" -}} +{{- end -}} +{{- default (printf "%s-app-secrets" .Release.Name) (get $secrets "nameOverride") -}} +{{- end -}} +{{- end -}} + +{{- define "openstudio.secretKeyDbUsername" -}} +{{- $keys := default (dict) (get (default (dict) .Values.secrets) "keys") -}} +{{- default "db-username" (get $keys "dbUsername") -}} +{{- end -}} + +{{- define "openstudio.secretKeyDbPassword" -}} +{{- $keys := default (dict) (get (default (dict) .Values.secrets) "keys") -}} +{{- default "db-password" (get $keys "dbPassword") -}} +{{- end -}} + +{{- define "openstudio.secretKeyRedisPassword" -}} +{{- $keys := default (dict) (get (default (dict) .Values.secrets) "keys") -}} +{{- default "redis-password" (get $keys "redisPassword") -}} +{{- end -}} + +{{- define "openstudio.secretKeyWebSecret" -}} +{{- $keys := default (dict) (get (default (dict) .Values.secrets) "keys") -}} +{{- default "web-secret-key" (get $keys "webSecret") -}} +{{- end -}} + +{{- define "openstudio.serverImage" -}} +{{- $global := default (dict) .Values.global -}} +{{- $images := (get $global "images") | default (dict) -}} +{{- $org := default "nrel" (get $images "org") -}} +{{- $repo := default "openstudio-server" (get $images "serverRepository") -}} +{{- $tag := default "latest" (get $images "tag") -}} +{{- printf "%s/%s:%s" $org $repo $tag -}} +{{- 
end -}} + +{{- define "openstudio.rserveImage" -}} +{{- $global := default (dict) .Values.global -}} +{{- $images := (get $global "images") | default (dict) -}} +{{- $org := default "nrel" (get $images "org") -}} +{{- $repo := default "openstudio-rserve" (get $images "rserveRepository") -}} +{{- $tag := default "latest" (get $images "tag") -}} +{{- printf "%s/%s:%s" $org $repo $tag -}} +{{- end -}} \ No newline at end of file diff --git a/openstudio-server/templates/autoscaler/cluster-autoscaler-autodiscover.yaml b/openstudio-server/templates/autoscaler/cluster-autoscaler-autodiscover.yaml index 6adb2b8..26dd2ea 100644 --- a/openstudio-server/templates/autoscaler/cluster-autoscaler-autodiscover.yaml +++ b/openstudio-server/templates/autoscaler/cluster-autoscaler-autodiscover.yaml @@ -1,3 +1,52 @@ +{{- $providerName := include "openstudio.providerName" . -}} +{{- $autoscaler := default (dict) .Values.autoscaler -}} +{{- /* Enabled by default only on aws. Read the flag via hasKey because "default" would discard an explicit enabled=false. */ -}} +{{- $autoscalerEnabled := ternary (get $autoscaler "enabled") (eq $providerName "aws") (hasKey $autoscaler "enabled") -}} +{{- $openstackNodeGroups := default (list) (get $autoscaler "openstackNodeGroups") -}} +{{- $autoscalerImage := default (dict) (get $autoscaler "image") -}} +{{- $kubeMinor := regexFind "^[0-9]+" .Capabilities.KubeVersion.Minor -}} +{{- $defaultAutoscalerTag := printf "v%s.%s.0" .Capabilities.KubeVersion.Major $kubeMinor -}} +{{- $autoscalerImageTag := default $defaultAutoscalerTag (get $autoscalerImage "tag") -}} +{{- $autoscalerImageRepo := default "registry.k8s.io/autoscaling/cluster-autoscaler" (get $autoscalerImage "repository") -}} +{{- $autoscalerImagePullPolicy := default "IfNotPresent" (get $autoscalerImage "pullPolicy") -}} +{{- $expectedTagPrefix := printf "v%s.%s."
.Capabilities.KubeVersion.Major $kubeMinor -}} +{{- $openstackAutoscaler := default (dict) (get $autoscaler "openstack") -}} +{{- $cloudConfigSecretName := default "" (get $openstackAutoscaler "cloudConfigSecretName") -}} +{{- $cloudConfigSecretKey := default "cloud.conf" (get $openstackAutoscaler "cloudConfigSecretKey") -}} +{{- $cloudConfigMountPath := default "/etc/kubernetes/cloud.conf" (get $openstackAutoscaler "cloudConfigMountPath") -}} +{{- $caBundleSecretName := default "" (get $openstackAutoscaler "caBundleSecretName") -}} +{{- $caBundleSecretKey := default "ca.crt" (get $openstackAutoscaler "caBundleSecretKey") -}} +{{- $caBundleMountPath := default "/etc/ssl/certs/openstack-ca.crt" (get $openstackAutoscaler "caBundleMountPath") -}} +{{- /* Read via hasKey, not "default": an explicit false must switch the ownership check off. */ -}}{{- $checkExistingOwnership := ternary (get $openstackAutoscaler "checkExistingDeploymentOwnership") true (hasKey $openstackAutoscaler "checkExistingDeploymentOwnership") -}} +{{- $extraArgs := default (list) (get $autoscaler "extraArgs") -}} +{{- $hasCloudConfigArg := false -}} +{{- range $arg := $extraArgs -}} + {{- if regexMatch "^--cloud-config=.*" (printf "%v" $arg) -}} + {{- $hasCloudConfigArg = true -}} + {{- end -}} +{{- end -}} +{{- if and $autoscalerEnabled (eq $providerName "openstack") (eq (len $openstackNodeGroups) 0) -}} +{{- fail "autoscaler.enabled=true for OpenStack requires autoscaler.openstackNodeGroups with at least one {name,min,max} entry." -}} +{{- end -}} +{{- if and $autoscalerEnabled (not (hasPrefix $expectedTagPrefix $autoscalerImageTag)) -}} +{{- fail (printf "autoscaler.image.tag=%q should match cluster minor version prefix %q for compatibility with Kubernetes %s. Set autoscaler.image.tag explicitly." $autoscalerImageTag $expectedTagPrefix .Capabilities.KubeVersion.Version) -}} +{{- end -}} +{{- if and $autoscalerEnabled (eq $providerName "openstack") (eq $cloudConfigSecretName "") (not $hasCloudConfigArg) -}} +{{- fail "OpenStack autoscaler requires cloud config.
Set autoscaler.openstack.cloudConfigSecretName or provide --cloud-config= in autoscaler.extraArgs." -}} +{{- end -}} +{{- if and $autoscalerEnabled (eq $providerName "openstack") $checkExistingOwnership -}} +{{- $existingAutoscaler := lookup "apps/v1" "Deployment" "kube-system" "cluster-autoscaler" -}} +{{- if $existingAutoscaler -}} +{{- $existingMetadata := default (dict) (get $existingAutoscaler "metadata") -}} +{{- $existingAnnotations := default (dict) (get $existingMetadata "annotations") -}} +{{- $existingReleaseName := default "" (get $existingAnnotations "meta.helm.sh/release-name") -}} +{{- $existingReleaseNamespace := default "" (get $existingAnnotations "meta.helm.sh/release-namespace") -}} +{{- if not (and (eq $existingReleaseName .Release.Name) (eq $existingReleaseNamespace .Release.Namespace)) -}} +{{- fail "autoscaler.enabled=true on OpenStack conflicts with an existing kube-system/cluster-autoscaler deployment not owned by this release. Use Azimuth/platform autoscaling only, or remove the existing deployment before enabling chart autoscaler." -}} +{{- end -}} +{{- end -}} +{{- end -}} +{{- if $autoscalerEnabled }} --- apiVersion: v1 kind: ServiceAccount @@ -137,18 +185,10 @@ spec: prometheus.io/scrape: 'true' prometheus.io/port: '8085' spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nodegroup - operator: In - values: - - web-group + {{- include "openstudio.affinityForRole" (dict "root" .
"role" "web") | nindent 6 }} serviceAccountName: cluster-autoscaler containers: - - image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.26.6 + - image: {{ printf "%s:%s" $autoscalerImageRepo $autoscalerImageTag }} name: cluster-autoscaler resources: limits: @@ -161,17 +201,57 @@ spec: - ./cluster-autoscaler - --v=4 - --stderrthreshold=info - - --cloud-provider={{ .Values.provider.name }} + {{- /* cluster-autoscaler uses its own provider names: there is no "google" or "openstack". Map them to gce and magnum; magnum is presumed from the cloud.conf and --nodes flags below (TODO confirm; set autoscaler.cloudProvider to override, e.g. clusterapi). */}} + {{- $caCloudProvider := default (default $providerName (get (dict "google" "gce" "openstack" "magnum") $providerName)) (get $autoscaler "cloudProvider") }} + - --cloud-provider={{ $caCloudProvider }} - --skip-nodes-with-local-storage=false - --expander=least-waste + {{- if eq $providerName "aws" }} - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{{ .Values.cluster.name }} + {{- else if eq $providerName "openstack" }} + {{- range $group := $openstackNodeGroups }} + - --nodes={{ required "autoscaler.openstackNodeGroups[].min is required when autoscaler is enabled for openstack" $group.min }}:{{ required "autoscaler.openstackNodeGroups[].max is required when autoscaler is enabled for openstack" $group.max }}:{{ required "autoscaler.openstackNodeGroups[].name is required when autoscaler is enabled for openstack" $group.name }} + {{- end }} + {{- if and (ne $cloudConfigSecretName "") (not $hasCloudConfigArg) }} + - --cloud-config={{ $cloudConfigMountPath }} + {{- end }} + {{- end }} + {{- range $arg := $extraArgs }} + - {{ $arg | quote }} + {{- end }} - --scale-down-unneeded-time=5m + {{- if and (eq $providerName "openstack") (ne $caBundleSecretName "") }} + env: + - name: SSL_CERT_FILE + value: {{ $caBundleMountPath | quote }} + {{- end }} + imagePullPolicy: {{ $autoscalerImagePullPolicy | quote }} + {{- if and (eq $providerName "openstack") (or (ne $cloudConfigSecretName "") (ne $caBundleSecretName "")) }} volumeMounts: - - name: ssl-certs - mountPath: /etc/ssl/certs/ca-certificates.crt + {{- if ne $cloudConfigSecretName "" }} + - name: openstack-cloud-config + mountPath: {{ $cloudConfigMountPath | quote }} + subPath: {{ $cloudConfigSecretKey | quote }} readOnly: true - imagePullPolicy:
"Always" + {{- end }} + {{- if ne $caBundleSecretName "" }} + - name: openstack-ca-bundle + mountPath: {{ $caBundleMountPath | quote }} + subPath: {{ $caBundleSecretKey | quote }} + readOnly: true + {{- end }} + {{- end }} + {{- if and (eq $providerName "openstack") (or (ne $cloudConfigSecretName "") (ne $caBundleSecretName "")) }} volumes: - - name: ssl-certs - hostPath: - path: "/etc/ssl/certs/ca-bundle.crt" + {{- if ne $cloudConfigSecretName "" }} + - name: openstack-cloud-config + secret: + secretName: {{ $cloudConfigSecretName | quote }} + {{- end }} + {{- if ne $caBundleSecretName "" }} + - name: openstack-ca-bundle + secret: + secretName: {{ $caBundleSecretName | quote }} + {{- end }} + {{- end }} +{{- end }} diff --git a/openstudio-server/templates/db/db-deploy.yaml b/openstudio-server/templates/db/db-deploy.yaml index 1c9f89f..942d1be 100644 --- a/openstudio-server/templates/db/db-deploy.yaml +++ b/openstudio-server/templates/db/db-deploy.yaml @@ -1,12 +1,12 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ .Values.db.name }} + name: {{ include "openstudio.dbName" . }} spec: replicas: 1 selector: matchLabels: - app: {{ .Values.db.name }} + app: {{ default "db" .Values.db.label }} release: {{ .Release.Name }} strategy: type: RollingUpdate @@ -16,38 +16,42 @@ spec: template: metadata: labels: - app: {{ .Values.db.name }} + app: {{ default "db" .Values.db.label }} release: {{ .Release.Name }} + {{ $dbContainer := (get .Values.db "container") | default (dict) }} + {{ $dbContainerPorts := (get $dbContainer "ports") | default (dict) }} + {{ $priorityClasses := default (dict) .Values.priorityClasses }} spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nodegroup - operator: In - values: - - web-group + {{- include "openstudio.affinityForRole" (dict "root" .
"role" "web") | nindent 6 }} + {{- /* Read via hasKey, not "default": priorityClasses.enabled=false must drop priorityClassName. */}} + {{- if ternary (get $priorityClasses "enabled") true (hasKey $priorityClasses "enabled") }} + priorityClassName: {{ default "high-priority" (get $priorityClasses "highName") }} + {{- end }} containers: - - name: {{ .Values.db.container.name }} + - name: {{ default "mongo-db" .Values.db.container.name }} image: {{ .Values.db.container.image }} ports: - - containerPort: {{ .Values.db.container.ports.db_port }} + - containerPort: {{ default 27017 (get $dbContainerPorts "db_port") }} volumeMounts: - mountPath: /data/db - name: {{ .Values.db.name }} + name: db resources: requests: cpu: {{ .Values.db.container.resources.requests.cpu }} memory: {{ .Values.db.container.resources.requests.memory }} env: - name: MONGO_INITDB_ROOT_USERNAME - value: {{ .Values.db.username }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyDbUsername" . }} - name: MONGO_INITDB_ROOT_PASSWORD - value: {{ .Values.db.password }} - priorityClassName: high-priority + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyDbPassword" . }} volumes: - - name: {{ .Values.db.name }} + - name: db persistentVolumeClaim: - claimName: {{ .Values.db.name }} + claimName: {{ include "openstudio.dbName" . }} diff --git a/openstudio-server/templates/db/db-pvc.yaml b/openstudio-server/templates/db/db-pvc.yaml index 8ce57fb..4166663 100644 --- a/openstudio-server/templates/db/db-pvc.yaml +++ b/openstudio-server/templates/db/db-pvc.yaml @@ -1,11 +1,14 @@ kind: PersistentVolumeClaim apiVersion: v1 metadata: - name: {{ .Values.db.name }} + name: {{ include "openstudio.dbName" . }} +{{- $dbPersistence := default (dict) (get .Values.db "persistence") }} spec: - storageClassName: {{ .Values.db.persistence.storageClass }} + storageClassName: {{ default (include "openstudio.defaultAppPersistenceStorageClass" .)
(get $dbPersistence "storageClass") | quote }} accessModes: - {{ .Values.db.persistence.accessModes }} +{{- range $mode := default (list "ReadWriteOnce") (get $dbPersistence "accessModes") }} + - {{ $mode | quote }} +{{- end }} resources: requests: storage: {{ .Values.db.persistence.size }} \ No newline at end of file diff --git a/openstudio-server/templates/db/db-svc.yaml b/openstudio-server/templates/db/db-svc.yaml index 26f4628..14c68ac 100644 --- a/openstudio-server/templates/db/db-svc.yaml +++ b/openstudio-server/templates/db/db-svc.yaml @@ -1,12 +1,14 @@ apiVersion: v1 kind: Service metadata: - name: {{ .Values.db.name }} + name: {{ include "openstudio.dbName" . }} +{{ $dbContainer := (get .Values.db "container") | default (dict) }} +{{ $dbContainerPorts := (get $dbContainer "ports") | default (dict) }} spec: selector: - app: {{ .Values.db.label }} + app: {{ default "db" .Values.db.label }} release: {{ .Release.Name }} ports: - - name: {{ .Values.db.name }} + - name: db protocol: TCP - port: {{ .Values.db.container.ports.db_port }} + port: {{ default 27017 (get $dbContainerPorts "db_port") }} diff --git a/openstudio-server/templates/hooks/pre-delete-hook.yaml b/openstudio-server/templates/hooks/pre-delete-hook.yaml index 7dc5bd3..4bf00d1 100644 --- a/openstudio-server/templates/hooks/pre-delete-hook.yaml +++ b/openstudio-server/templates/hooks/pre-delete-hook.yaml @@ -1,29 +1,85 @@ +{{- $hooks := default (dict) .Values.hooks -}} +{{- $preDeleteHook := default (dict) (get $hooks "preDeleteCleanup") -}} +{{- /* Enabled unless hooks.preDeleteCleanup.enabled is explicitly false. Read via hasKey because "default" would discard false. */ -}} +{{- if ternary (get $preDeleteHook "enabled") true (hasKey $preDeleteHook "enabled") }} +{{- $cleanupName := printf "%s-nfs-client-cleanup" .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- $cleanupServiceAccount := printf "%s-sa" $cleanupName | trunc 63 | trimSuffix "-" -}} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ $cleanupServiceAccount }} + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-weight": "1" +
"helm.sh/hook-delete-policy": hook-succeeded +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ $cleanupName }} + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-weight": "1" + "helm.sh/hook-delete-policy": hook-succeeded +rules: + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "list", "delete", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ $cleanupName }} + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": pre-delete + "helm.sh/hook-weight": "2" + "helm.sh/hook-delete-policy": hook-succeeded +subjects: + - kind: ServiceAccount + name: {{ $cleanupServiceAccount }} + namespace: {{ .Release.Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ $cleanupName }} +--- apiVersion: batch/v1 kind: Job metadata: - name: nfs-client-cleanup - namespace: default + name: {{ $cleanupName }} + namespace: {{ .Release.Namespace }} annotations: "helm.sh/hook": pre-delete "helm.sh/hook-weight": "3" "helm.sh/hook-delete-policy": hook-succeeded spec: + backoffLimit: 1 template: metadata: name: nfs-client-cleanup spec: + serviceAccountName: {{ $cleanupServiceAccount }} containers: - name: kubectl - image: "k8s.gcr.io/hyperkube:v1.12.1" + image: "bitnami/kubectl:1.31" securityContext: - privileged: true + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + capabilities: + drop: + - ALL imagePullPolicy: "IfNotPresent" command: - /bin/sh - -c - > - kubectl delete deployment web; - kubectl delete deployment web-background; - kubectl delete deployment rserve; - sleep 60; + kubectl -n {{ .Release.Namespace }} delete deployment {{ include "openstudio.webName" . }} --ignore-not-found=true --wait=true --timeout=180s; + kubectl -n {{ .Release.Namespace }} delete deployment {{ include "openstudio.webBackgroundName" . 
}} --ignore-not-found=true --wait=true --timeout=180s; + kubectl -n {{ .Release.Namespace }} delete deployment {{ include "openstudio.rserveName" . }} --ignore-not-found=true --wait=true --timeout=180s; restartPolicy: Never +{{- end }} diff --git a/openstudio-server/templates/loadbalancer/loadbalancer.yaml b/openstudio-server/templates/loadbalancer/loadbalancer.yaml index 19febc3..a6743ad 100644 --- a/openstudio-server/templates/loadbalancer/loadbalancer.yaml +++ b/openstudio-server/templates/loadbalancer/loadbalancer.yaml @@ -1,25 +1,30 @@ apiVersion: v1 kind: Service +{{ $loadBalancer := (get .Values "load_balancer") | default (dict) }} +{{ $ports := (get $loadBalancer "ports") | default (dict) }} +{{ $providerName := include "openstudio.providerName" . }} metadata: - name: {{ .Values.load_balancer.name }} + name: {{ default "ingress-load-balancer" (get $loadBalancer "name") }} annotations: -{{- if and (eq .Values.provider.name "aws") .Values.load_balancer.internal }} +{{- if and (eq $providerName "aws") (get $loadBalancer "internal") }} service.beta.kubernetes.io/aws-load-balancer-internal: "true" -{{- else if and (eq .Values.provider.name "azure") .Values.load_balancer.internal }} +{{- else if and (eq $providerName "azure") (get $loadBalancer "internal") }} service.beta.kubernetes.io/azure-load-balancer-internal: "true" -{{- else if and (eq .Values.provider.name "google") .Values.load_balancer.internal }} +{{- else if and (eq $providerName "google") (get $loadBalancer "internal") }} cloud.google.com/load-balancer-type: "Internal" +{{- else if and (eq $providerName "openstack") (get $loadBalancer "internal") }} + service.beta.kubernetes.io/openstack-internal-load-balancer: "true" {{- end }} spec: - type: LoadBalancer - externalTrafficPolicy: {{ .Values.load_balancer.externalTrafficPolicy }} + type: {{ default "LoadBalancer" (get $loadBalancer "type") }} + externalTrafficPolicy: {{ default (include "openstudio.defaultLoadBalancerExternalTrafficPolicy" .) 
(get $loadBalancer "externalTrafficPolicy") | quote }} selector: - app: {{ .Values.load_balancer.label }} + app: {{ default "web" (get $loadBalancer "label") }} release: {{ .Release.Name }} ports: - - name: {{ .Values.load_balancer.ports.http_name }} - protocol: {{ .Values.load_balancer.ports.http_protocol }} - port: {{ .Values.load_balancer.ports.http_port }} - - name: {{ .Values.load_balancer.ports.https_name }} - protocol: {{ .Values.load_balancer.ports.https_protocol }} - port: {{ .Values.load_balancer.ports.https_port }} + - name: {{ default "http" (get $ports "http_name") }} + protocol: {{ default "TCP" (get $ports "http_protocol") }} + port: {{ default 80 (get $ports "http_port") }} + - name: {{ default "https" (get $ports "https_name") }} + protocol: {{ default "TCP" (get $ports "https_protocol") }} + port: {{ default 443 (get $ports "https_port") }} diff --git a/openstudio-server/templates/nfs/nfs-pvc.yaml b/openstudio-server/templates/nfs/nfs-pvc.yaml index 1e0a902..9977add 100644 --- a/openstudio-server/templates/nfs/nfs-pvc.yaml +++ b/openstudio-server/templates/nfs/nfs-pvc.yaml @@ -1,11 +1,14 @@ +{{- $nfsPvc := default (dict) (get .Values "nfs_pvc") -}} kind: PersistentVolumeClaim apiVersion: v1 metadata: - name: {{ .Values.nfs_pvc.name }} + name: {{ include "openstudio.nfsPvcName" . 
}} spec: accessModes: - {{ .Values.nfs_pvc.accessModes }} - storageClassName: "nfs" +{{- range $mode := default (list "ReadWriteMany") (get $nfsPvc "accessModes") }} + - {{ $mode }} +{{- end }} + storageClassName: {{ default "nfs" (get $nfsPvc "storage_class") | quote }} resources: requests: - storage: {{ .Values.nfs_pvc.storage }} \ No newline at end of file + storage: {{ default "2Gi" (get $nfsPvc "storage") }} \ No newline at end of file diff --git a/openstudio-server/templates/priority-class/priority_high.yaml b/openstudio-server/templates/priority-class/priority_high.yaml index 329a7fc..b8e395c 100644 --- a/openstudio-server/templates/priority-class/priority_high.yaml +++ b/openstudio-server/templates/priority-class/priority_high.yaml @@ -1,9 +1,13 @@ -{{- if not (lookup "scheduling.k8s.io/v1" "PriorityClass" "" "high-priority") }} +{{- $priorityClasses := default (dict) .Values.priorityClasses -}} +{{- /* Explicit create=false must win: sprig default treats false as empty, so "default true" could never be switched off. A missing key still means true. */ -}}{{- if ne (toString (get $priorityClasses "create")) "false" -}} +{{- $highName := default "high-priority" (get $priorityClasses "highName") -}} +{{- if not (lookup "scheduling.k8s.io/v1" "PriorityClass" "" $highName) }} apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: - name: high-priority + name: {{ $highName }} value: 1000000 globalDefault: false description: "Used for core rails web service pods only."
-{{- end }} \ No newline at end of file +{{- end }} +{{- end }} diff --git a/openstudio-server/templates/priority-class/priority_low.yaml b/openstudio-server/templates/priority-class/priority_low.yaml index e86cbc5..085a3df 100644 --- a/openstudio-server/templates/priority-class/priority_low.yaml +++ b/openstudio-server/templates/priority-class/priority_low.yaml @@ -1,9 +1,13 @@ -{{- if not (lookup "scheduling.k8s.io/v1" "PriorityClass" "" "high-priority") }} +{{- $priorityClasses := default (dict) .Values.priorityClasses -}} +{{- /* Explicit create=false must win: sprig default treats false as empty, so "default true" could never be switched off. A missing key still means true. */ -}}{{- if ne (toString (get $priorityClasses "create")) "false" -}} +{{- $lowName := default "low-priority" (get $priorityClasses "lowName") -}} +{{- if not (lookup "scheduling.k8s.io/v1" "PriorityClass" "" $lowName) }} apiVersion: scheduling.k8s.io/v1 kind: PriorityClass metadata: - name: low-priority + name: {{ $lowName }} value: 10000 globalDefault: true description: "Used for non-core service pods only. This is default" {{- end }} +{{- end }} diff --git a/openstudio-server/templates/redis/redis-deploy.yaml b/openstudio-server/templates/redis/redis-deploy.yaml index fc60e78..6dda4d3 100644 --- a/openstudio-server/templates/redis/redis-deploy.yaml +++ b/openstudio-server/templates/redis/redis-deploy.yaml @@ -1,43 +1,46 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ .Values.redis.name }} + name: {{ include "openstudio.redisName" . }} spec: replicas: 1 selector: matchLabels: - app: {{ .Values.redis.name }} + app: {{ default "redis" .Values.redis.label }} release: {{ .Release.Name }} template: metadata: labels: - app: {{ .Values.redis.name }} + app: {{ default "redis" .Values.redis.label }} release: {{ .Release.Name }} + {{ $priorityClasses := default (dict) .Values.priorityClasses }} spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nodegroup - operator: In - values: - - web-group + {{- include "openstudio.affinityForRole" (dict "root" .
"role" "web") | nindent 6 }} + {{- /* Explicit enabled=false must win: sprig default treats false as empty, so "default true" could never be switched off. A missing key still means true. */ -}}{{- if ne (toString (get $priorityClasses "enabled")) "false" }} + priorityClassName: {{ default "high-priority" (get $priorityClasses "highName") }} + {{- end }} containers: - - name: {{ .Values.redis.container.name }} + - name: {{ default "redis" .Values.redis.container.name }} image: {{ .Values.redis.container.image }} resources: requests: cpu: {{ .Values.redis.container.resources.requests.cpu }} memory: {{ .Values.redis.container.resources.requests.memory }} ports: - - containerPort: {{ .Values.redis.container.port }} + - containerPort: {{ default 6379 .Values.redis.container.port }} volumeMounts: - mountPath: /data - name: {{ .Values.redis.name }} - args: ["redis-server", "--appendonly yes", "--requirepass", "{{ .Values.redis.password }}"] - priorityClassName: high-priority + name: redis + env: + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyRedisPassword" . }} + command: ["/bin/sh", "-c"] + args: + - exec redis-server --appendonly yes --requirepass "$REDIS_PASSWORD" volumes: - - name: {{ .Values.redis.name }} + - name: redis persistentVolumeClaim: - claimName: {{ .Values.redis.name }} + claimName: {{ include "openstudio.redisName" . }} diff --git a/openstudio-server/templates/redis/redis-pvc.yaml b/openstudio-server/templates/redis/redis-pvc.yaml index a7b2c22..49d3037 100644 --- a/openstudio-server/templates/redis/redis-pvc.yaml +++ b/openstudio-server/templates/redis/redis-pvc.yaml @@ -1,11 +1,14 @@ kind: PersistentVolumeClaim apiVersion: v1 metadata: - name: {{ .Values.redis.name }} + name: {{ include "openstudio.redisName" . }} +{{- $redisPersistence := default (dict) (get .Values.redis "persistence") }} spec: - storageClassName: {{ .Values.redis.persistence.storageClass }} + storageClassName: {{ default (include "openstudio.defaultAppPersistenceStorageClass" .)
(get $redisPersistence "storageClass") | quote }} accessModes: - {{ .Values.redis.persistence.accessModes }} +{{- range $mode := default (list "ReadWriteOnce") (get $redisPersistence "accessModes") }} + - {{ $mode | quote }} +{{- end }} resources: requests: storage: {{ .Values.redis.persistence.size }} \ No newline at end of file diff --git a/openstudio-server/templates/redis/redis-svc.yaml b/openstudio-server/templates/redis/redis-svc.yaml index 06260f9..b242b79 100644 --- a/openstudio-server/templates/redis/redis-svc.yaml +++ b/openstudio-server/templates/redis/redis-svc.yaml @@ -1,12 +1,12 @@ apiVersion: v1 kind: Service metadata: - name: {{ .Values.redis_svc.name }} + name: {{ include "openstudio.redisServiceName" . }} spec: selector: - app: {{ .Values.redis.name }} + app: {{ default "redis" .Values.redis.label }} release: {{ .Release.Name }} ports: - name: redis protocol: TCP - port: {{ .Values.redis_svc.port }} + port: {{ default 6379 (get ((get .Values "redis_svc") | default (dict)) "port") }} diff --git a/openstudio-server/templates/rserve/rserve-deploy.yaml b/openstudio-server/templates/rserve/rserve-deploy.yaml index 880713c..84f6cd1 100644 --- a/openstudio-server/templates/rserve/rserve-deploy.yaml +++ b/openstudio-server/templates/rserve/rserve-deploy.yaml @@ -1,12 +1,12 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ .Values.rserve.name }} + name: {{ include "openstudio.rserveName" . 
}} spec: replicas: 1 selector: matchLabels: - app: {{ .Values.rserve.name }} + app: {{ default "rserve" .Values.rserve.label }} release: {{ .Release.Name }} strategy: type: RollingUpdate @@ -16,21 +16,18 @@ spec: template: metadata: labels: - app: {{ .Values.rserve.name }} + app: {{ default "rserve" .Values.rserve.label }} release: {{ .Release.Name }} + {{ $priorityClasses := default (dict) .Values.priorityClasses }} + {{ $redis := default (dict) .Values.redis }} spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nodegroup - operator: In - values: - - web-group + {{- include "openstudio.affinityForRole" (dict "root" . "role" "web") | nindent 6 }} + {{- /* Explicit enabled=false must win: sprig default treats false as empty, so "default true" could never be switched off. A missing key still means true. */ -}}{{- if ne (toString (get $priorityClasses "enabled")) "false" }} + priorityClassName: {{ default "high-priority" (get $priorityClasses "highName") }} + {{- end }} containers: - - name: {{ .Values.rserve.container.name }} - image: {{ .Values.rserve.container.image }} + - name: {{ default "rserve" .Values.rserve.container.name }} + image: {{ default (include "openstudio.rserveImage" .) .Values.rserve.container.image }} resources: requests: cpu: {{ .Values.rserve.container.resources.requests.cpu }} @@ -42,13 +39,27 @@ spec: - name: OS_SERVER_NUMBER_OF_WORKERS value: {{ .Values.rserve.number_of_workers | quote }} - name: SECRET_KEY_BASE - value: {{ .Values.web.secret_key_value }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyWebSecret" . }} + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyRedisPassword" .
}} - name: REDIS_URL - value: {{ .Values.redis_svc.url }} + value: {{ default (printf "redis://:$(REDIS_PASSWORD)@%s:6379" (include "openstudio.redisServiceName" .)) (get $redis "url") | quote }} - name: MONGO_USER - value: {{ .Values.db.username }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyDbUsername" . }} - name: MONGO_PASSWORD - value: {{ .Values.db.password }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyDbPassword" . }} livenessProbe: exec: command: ["grep", "-qs", "/mnt/openstudio ", "/proc/mounts"] @@ -56,8 +67,7 @@ spec: periodSeconds: 30 timeoutSeconds: 30 failureThreshold: 3 - priorityClassName: high-priority volumes: - name: osdata persistentVolumeClaim: - claimName: nfs-pvc + claimName: {{ include "openstudio.nfsPvcName" . }} diff --git a/openstudio-server/templates/rserve/rserve-svc.yaml b/openstudio-server/templates/rserve/rserve-svc.yaml index d9e47d8..efc81dd 100644 --- a/openstudio-server/templates/rserve/rserve-svc.yaml +++ b/openstudio-server/templates/rserve/rserve-svc.yaml @@ -1,12 +1,12 @@ apiVersion: v1 kind: Service metadata: - name: {{ .Values.rserve_svc.name }} + name: {{ include "openstudio.rserveServiceName" . 
}} spec: selector: - app: {{ .Values.rserve.label }} + app: {{ default (default "rserve" .Values.rserve.label) .Values.rserve_svc.label }} release: {{ .Release.Name }} ports: - name: rserve protocol: TCP - port: {{ .Values.rserve_svc.port }} + port: {{ default 6311 .Values.rserve_svc.port }} diff --git a/openstudio-server/templates/storageclass/storageclass.yaml b/openstudio-server/templates/storageclass/storageclass.yaml index 7320550..9218ac0 100644 --- a/openstudio-server/templates/storageclass/storageclass.yaml +++ b/openstudio-server/templates/storageclass/storageclass.yaml @@ -1,24 +1,36 @@ -{{- if not (lookup "storage.k8s.io/v1" "StorageClass" "" "ssd") }} +{{- $storageclass := (get .Values "storageclass") | default (dict) -}} +{{- $providerName := include "openstudio.providerName" . -}} +{{- $defaultAppStorageClass := include "openstudio.defaultAppPersistenceStorageClass" . -}} +{{- $dbStorageClass := default $defaultAppStorageClass .Values.db.persistence.storageClass -}} +{{- $redisStorageClass := default $defaultAppStorageClass .Values.redis.persistence.storageClass -}} +{{- $nfsProvisioner := default (dict) (get .Values "nfs-server-provisioner") -}} +{{- $nfsPersistence := default (dict) (get $nfsProvisioner "persistence") -}} +{{- $defaultNfsStorageClass := include "openstudio.defaultNfsProvisionerBackingStorageClass" . 
-}} +{{- $nfsStorageClass := default $defaultNfsStorageClass (get $nfsPersistence "storageClass") -}} +{{- $shouldCreateSsd := or (default false (get $storageclass "createSsd")) (eq $dbStorageClass "ssd") (eq $redisStorageClass "ssd") (eq $nfsStorageClass "ssd") -}} +{{- if and $shouldCreateSsd (not (lookup "storage.k8s.io/v1" "StorageClass" "" "ssd")) }} apiVersion: storage.k8s.io/v1 kind: StorageClass metadata: name: ssd -{{ if eq .Values.provider.name "aws" }} +{{ if eq $providerName "aws" }} provisioner: kubernetes.io/aws-ebs parameters: type: gp3 -{{ end }} -{{ if eq .Values.provider.name "google" }} +{{ else if eq $providerName "google" }} provisioner: pd.csi.storage.gke.io parameters: type: pd-ssd replication-type: none -{{ end }} -{{ if eq .Values.provider.name "azure" }} +{{ else if eq $providerName "azure" }} provisioner: kubernetes.io/azure-disk parameters: kind: Managed storageaccounttype: StandardSSD_LRS +{{ else if eq $providerName "openstack" }} +provisioner: cinder.csi.openstack.org +{{ else }} +{{- fail (printf "storageclass: cannot create \"ssd\" for unsupported provider %q. Set storageclass.createSsd=false and provide an existing storageClass, or set a supported provider." $providerName) -}} {{ end }} reclaimPolicy: Delete allowVolumeExpansion: true diff --git a/openstudio-server/templates/web-background/web-background-deploy.yaml b/openstudio-server/templates/web-background/web-background-deploy.yaml index 45022d7..9f314a2 100644 --- a/openstudio-server/templates/web-background/web-background-deploy.yaml +++ b/openstudio-server/templates/web-background/web-background-deploy.yaml @@ -1,12 +1,12 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ .Values.web_background.name }} + name: {{ include "openstudio.webBackgroundName" . 
}} spec: replicas: {{ .Values.web_background.replicas }} selector: matchLabels: - app: {{ .Values.web_background.name }} + app: {{ default "web-background" .Values.web_background.label }} release: {{ .Release.Name }} strategy: type: RollingUpdate @@ -16,21 +16,18 @@ spec: template: metadata: labels: - app: {{ .Values.web_background.name }} + app: {{ default "web-background" .Values.web_background.label }} release: {{ .Release.Name }} + {{ $priorityClasses := default (dict) .Values.priorityClasses }} + {{ $redis := default (dict) .Values.redis }} spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nodegroup - operator: In - values: - - web-group + {{- include "openstudio.affinityForRole" (dict "root" . "role" "web") | nindent 6 }} + {{- /* Explicit enabled=false must win: sprig default treats false as empty, so "default true" could never be switched off. A missing key still means true. */ -}}{{- if ne (toString (get $priorityClasses "enabled")) "false" }} + priorityClassName: {{ default "high-priority" (get $priorityClasses "highName") }} + {{- end }} containers: - - name: {{ .Values.web_background.container.name }} - image: {{ .Values.web_background.container.image }} + - name: {{ default "web-background" .Values.web_background.container.name }} + image: {{ default (include "openstudio.serverImage" .) .Values.web_background.container.image }} imagePullPolicy: Always resources: requests: @@ -45,13 +42,27 @@ spec: - name: QUEUES value: background,analyses - name: SECRET_KEY_BASE - value: {{ .Values.web.secret_key_value }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyWebSecret" . }} + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyRedisPassword" .
}} - name: REDIS_URL - value: {{ .Values.redis_svc.url }} + value: {{ default (printf "redis://:$(REDIS_PASSWORD)@%s:6379" (include "openstudio.redisServiceName" .)) (get $redis "url") | quote }} - name: MONGO_USER - value: {{ .Values.db.username }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyDbUsername" . }} - name: MONGO_PASSWORD - value: {{ .Values.db.password }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyDbPassword" . }} command: ["/usr/local/bin/start-web-background"] livenessProbe: exec: @@ -60,8 +71,7 @@ spec: periodSeconds: 30 timeoutSeconds: 30 failureThreshold: 3 - priorityClassName: high-priority volumes: - name: nfs persistentVolumeClaim: - claimName: nfs-pvc + claimName: {{ include "openstudio.nfsPvcName" . }} diff --git a/openstudio-server/templates/web/web-deploy.yaml b/openstudio-server/templates/web/web-deploy.yaml index 6d2fcc9..9826027 100644 --- a/openstudio-server/templates/web/web-deploy.yaml +++ b/openstudio-server/templates/web/web-deploy.yaml @@ -1,12 +1,12 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ .Values.web.name }} + name: {{ include "openstudio.webName" . 
}} spec: replicas: 1 selector: matchLabels: - app: {{ .Values.web.name }} + app: {{ default "web" .Values.web.label }} release: {{ .Release.Name }} strategy: type: RollingUpdate @@ -16,33 +16,34 @@ spec: template: metadata: labels: - app: {{ .Values.web.name }} + app: {{ default "web" .Values.web.label }} release: {{ .Release.Name }} + {{ $dbContainer := (get .Values.db "container") | default (dict) }} + {{ $dbContainerPorts := (get $dbContainer "ports") | default (dict) }} + {{ $webContainer := (get .Values.web "container") | default (dict) }} + {{ $webContainerPorts := (get $webContainer "port") | default (dict) }} + {{ $redis := default (dict) .Values.redis }} + {{ $priorityClasses := default (dict) .Values.priorityClasses }} spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nodegroup - operator: In - values: - - web-group + {{- include "openstudio.affinityForRole" (dict "root" . "role" "web") | nindent 6 }} + {{- /* Explicit enabled=false must win: sprig default treats false as empty, so "default true" could never be switched off. A missing key still means true. */ -}}{{- if ne (toString (get $priorityClasses "enabled")) "false" }} + priorityClassName: {{ default "high-priority" (get $priorityClasses "highName") }} + {{- end }} initContainers: - name: init-wait-for-db image: alpine - command: ["/bin/sh", "-c", "for i in $(seq 1 300); do nc -zvw1 {{ .Values.db.name }} {{ .Values.db.container.ports.db_port }} && exit 0 || sleep 3; done; exit 1"] + command: ["/bin/sh", "-c", "for i in $(seq 1 300); do nc -zvw1 {{ include "openstudio.dbName" . }} {{ default 27017 (get $dbContainerPorts "db_port") }} && exit 0 || sleep 3; done; exit 1"] containers: - - name: {{ .Values.web.container.name }} - image: {{ .Values.web.container.image }} + - name: {{ default "web" .Values.web.container.name }} + image: {{ default (include "openstudio.serverImage" .)
.Values.web.container.image }} imagePullPolicy: Always resources: requests: cpu: {{ .Values.web.container.resources.requests.cpu }} memory: {{ .Values.web.container.resources.requests.memory }} ports: - - containerPort: {{ .Values.web.container.port.http }} - - containerPort: {{ .Values.web.container.port.https }} + - containerPort: {{ default 80 (get $webContainerPorts "http") }} + - containerPort: {{ default 443 (get $webContainerPorts "https") }} volumeMounts: - name: nfs mountPath: "/mnt/openstudio" @@ -52,13 +53,27 @@ spec: - name: QUEUES value: analysis_wrappers - name: SECRET_KEY_BASE - value: {{ .Values.web.secret_key_value }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyWebSecret" . }} + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyRedisPassword" . }} - name: REDIS_URL - value: {{ .Values.redis_svc.url }} + value: {{ default (printf "redis://:$(REDIS_PASSWORD)@%s:6379" (include "openstudio.redisServiceName" .)) (get $redis "url") | quote }} - name: MONGO_USER - value: {{ .Values.db.username }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyDbUsername" . }} - name: MONGO_PASSWORD - value: {{ .Values.db.password }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyDbPassword" . }} - name: MAX_REQUESTS value: {{ (ceil (mulf .Values.worker_hpa.maxReplicas 1.05)) | quote }} - name: MAX_POOL @@ -79,8 +94,7 @@ spec: periodSeconds: 200 timeoutSeconds: 120 failureThreshold: 3 - priorityClassName: high-priority volumes: - name: nfs persistentVolumeClaim: - claimName: nfs-pvc + claimName: {{ include "openstudio.nfsPvcName" . 
}} diff --git a/openstudio-server/templates/web/web-hpa.yaml b/openstudio-server/templates/web/web-hpa.yaml index fec0dab..d3888b2 100644 --- a/openstudio-server/templates/web/web-hpa.yaml +++ b/openstudio-server/templates/web/web-hpa.yaml @@ -1,12 +1,12 @@ apiVersion: autoscaling/v1 kind: HorizontalPodAutoscaler metadata: - name: {{ .Values.web.name }} + name: {{ include "openstudio.webHpaName" . }} spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment - name: {{ .Values.web.name }} + name: {{ include "openstudio.webName" . }} minReplicas: {{ .Values.web_hpa.minReplicas }} maxReplicas: {{ .Values.web_hpa.maxReplicas }} targetCPUUtilizationPercentage: {{ .Values.web_hpa.targetCPUUtilizationPercentage }} diff --git a/openstudio-server/templates/web/web-svc.yaml b/openstudio-server/templates/web/web-svc.yaml index 9208298..a1fbbe9 100644 --- a/openstudio-server/templates/web/web-svc.yaml +++ b/openstudio-server/templates/web/web-svc.yaml @@ -1,15 +1,18 @@ apiVersion: v1 kind: Service metadata: - name: {{ .Values.web_svc.name }} + name: {{ include "openstudio.webServiceName" . 
}} +{{ $webSvc := (get .Values "web_svc") | default (dict) }} +{{ $web := (get .Values "web") | default (dict) }} +{{ $webSvcPorts := (get $webSvc "ports") | default (dict) }} spec: selector: - app: {{ .Values.web.name }} + app: {{ default (default "web" (get $web "label")) (get $webSvc "label") }} release: {{ .Release.Name }} ports: - name: http protocol: TCP - port: {{ .Values.web_svc.ports.http }} + port: {{ default 80 (get $webSvcPorts "http") }} - name: https protocol: TCP - port: {{ .Values.web_svc.ports.https }} + port: {{ default 443 (get $webSvcPorts "https") }} diff --git a/openstudio-server/templates/worker/worker-deploy.yaml b/openstudio-server/templates/worker/worker-deploy.yaml index 8e55b0f..d4d8790 100644 --- a/openstudio-server/templates/worker/worker-deploy.yaml +++ b/openstudio-server/templates/worker/worker-deploy.yaml @@ -1,12 +1,12 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ .Values.worker.name }} + name: {{ include "openstudio.workerName" . }} spec: replicas: 1 selector: matchLabels: - app: {{ .Values.worker.name }} + app: {{ default "worker" .Values.worker.label }} release: {{ .Release.Name }} strategy: type: RollingUpdate @@ -16,21 +16,18 @@ spec: template: metadata: labels: - app: {{ .Values.worker.name }} + app: {{ default "worker" .Values.worker.label }} release: {{ .Release.Name }} + {{ $priorityClasses := default (dict) .Values.priorityClasses }} + {{ $redis := default (dict) .Values.redis }} spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nodegroup - operator: In - values: - - worker-group + {{- include "openstudio.affinityForRole" (dict "root" .
"role" "worker") | nindent 6 }} + {{- /* Explicit enabled=false must win: sprig default treats false as empty, so "default true" could never be switched off. A missing key still means true. */ -}}{{- if ne (toString (get $priorityClasses "enabled")) "false" }} + priorityClassName: {{ default "low-priority" (get $priorityClasses "lowName") }} + {{- end }} containers: - - name: {{ .Values.worker.container.name }} - image: {{ .Values.worker.container.image }} + - name: {{ default "worker" .Values.worker.container.name }} + image: {{ default (include "openstudio.serverImage" .) .Values.worker.container.image }} imagePullPolicy: Always lifecycle: preStop: @@ -45,20 +42,35 @@ spec: mountPath: "/mnt/openstudio" env: - name: QUEUES - value: simulations + value: {{ .Values.worker.queues | quote }} + - name: QUEUE + value: {{ .Values.worker.queues | quote }} - name: COUNT value: "1" - name: SECRET_KEY_BASE - value: {{ .Values.web.secret_key_value }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyWebSecret" . }} + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyRedisPassword" . }} - name: REDIS_URL - value: {{ .Values.redis_svc.url }} + value: {{ default (printf "redis://:$(REDIS_PASSWORD)@%s:6379" (include "openstudio.redisServiceName" .)) (get $redis "url") | quote }} - name: MONGO_USER - value: {{ .Values.db.username }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyDbUsername" . }} - name: MONGO_PASSWORD - value: {{ .Values.db.password }} + valueFrom: + secretKeyRef: + name: {{ include "openstudio.secretName" . }} + key: {{ include "openstudio.secretKeyDbPassword" . }} command: ["/usr/local/bin/start-workers"] terminationGracePeriodSeconds: {{ .Values.worker.container.terminationGracePeriodSeconds }} # for long openstudio jobs.
- priorityClassName: low-priority volumes: - name: osdata-worker emptyDir: {} diff --git a/openstudio-server/templates/worker/worker-hpa.yaml b/openstudio-server/templates/worker/worker-hpa.yaml index 2c8670f..0b83307 100644 --- a/openstudio-server/templates/worker/worker-hpa.yaml +++ b/openstudio-server/templates/worker/worker-hpa.yaml @@ -1,12 +1,12 @@ apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: - name: {{ .Values.worker_hpa.name }} + name: {{ include "openstudio.workerHpaName" . }} spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment - name: {{ .Values.worker.name }} + name: {{ include "openstudio.workerName" . }} minReplicas: {{ .Values.worker_hpa.minReplicas }} maxReplicas: {{ .Values.worker_hpa.maxReplicas }} behavior: diff --git a/openstudio-server/values.yaml b/openstudio-server/values.yaml index 07bb11b..da8bff6 100644 --- a/openstudio-server/values.yaml +++ b/openstudio-server/values.yaml @@ -1,178 +1,207 @@ -# Default values for openstudio-server. -# This is a YAML-formatted file. +# Tracked baseline values for openstudio-server. +# Copy and override for your target environment. # Declare variables to be passed into your templates. - -# Set to google, aws or azure -provider: - name: "" +global: + provider: + # Required. Set one of: aws, google, azure, openstack. + name: "" + # Migration compatibility toggle. When true, legacy provider.name can be used + # only if global.provider.name is unset. Keep false for strict mode. + allowLegacyName: false + # Optional storage class defaults used by provider-aware chart logic. + # For OpenStack, block is used as the backing class for the NFS provisioner PVC. + storageClasses: + block: "cinder-csi" + # Optional node-group scheduling controls. + # labelKey defaults by provider: openstack => capi.stackhpc.com/node-group, others => nodegroup. + # web/worker defaults by provider: openstack => web/worker, others => web-group/worker-group. 
+ # affinityMode: required | preferred | disabled + # defaults by provider: openstack => preferred, others => required + nodeGroups: + # labelKey: "" + # web: "" + # worker: "" + # affinityMode: "" + images: + org: "nrel" + serverRepository: "openstudio-server" + rserveRepository: "openstudio-rserve" + tag: "3.10.0" cluster: name: "openstudio-server" +autoscaler: + # Defaults to enabled on AWS and disabled on other providers unless explicitly set. + # enabled: true + # Safety guard: when enabled on OpenStack, install/upgrade fails if a pre-existing + # kube-system/cluster-autoscaler deployment is detected and not owned by this Helm release. + # For OpenStack, enable autoscaler and provide one or more node groups in min:max:name format. + # Example: + # openstackNodeGroups: + # - name: web-group + # min: 1 + # max: 5 + # - name: worker-group + # min: 1 + # max: 50 + image: + repository: "registry.k8s.io/autoscaling/cluster-autoscaler" + # When empty, defaults to v<major>.<minor>.0 (presumably the cluster's Kubernetes major/minor version; confirm in the autoscaler template). + tag: "" + pullPolicy: "IfNotPresent" + openstack: + # If set, chart mounts this secret and passes --cloud-config=<cloudConfigMountPath>. + # If unset, provide --cloud-config=<path> in autoscaler.extraArgs. + cloudConfigSecretName: "" + cloudConfigSecretKey: "cloud.conf" + cloudConfigMountPath: "/etc/kubernetes/cloud.conf" + # Optional custom CA bundle for private OpenStack API endpoints. + # When set, chart mounts this secret key and sets SSL_CERT_FILE for autoscaler. + caBundleSecretName: "" + caBundleSecretKey: "ca.crt" + caBundleMountPath: "/etc/ssl/certs/openstack-ca.crt" + # Safety guard for OpenStack installs. When true, chart checks for a pre-existing + # kube-system/cluster-autoscaler deployment and fails if it's not owned by this release. + # Set false only when release-time RBAC cannot read kube-system deployments.
+ checkExistingDeploymentOwnership: true + openstackNodeGroups: [] + extraArgs: [] + +priorityClasses: + enabled: true + create: true + highName: "high-priority" + lowName: "low-priority" + +secrets: + # Primary path: use an externally managed secret by name. + # If set, chart will not create a secret from db/redis/web values. + existingSecret: "openstudio-app-secrets" + # Strict by default for connected clusters. + # Set false for offline/render-only workflows (for example CI helm template jobs). + validateExistingSecret: true + # Create a secret from db.username/db.password/redis.password/web.secret_key_value. + # Alternate path. Must be false when existingSecret is set. + create: false + # Optional override for chart-managed secret name. + nameOverride: "" + keys: + dbUsername: "db-username" + dbPassword: "db-password" + redisPassword: "redis-password" + webSecret: "web-secret-key" + +# NFS Server Provisioner - Increased storage for production workloads nfs-server-provisioner: persistence: enabled: true - storageClass: "ssd" - size: 110Gi + # Backend storage for the in-cluster NFS server (nfs-pvc-data). + # Keep nfs_pvc.storage below this value to avoid NFS provisioning failures. + size: 50Gi # Increased from 5Gi for production storageClass: - allowVolumeExpansion: false + allowVolumeExpansion: true # Enable expansion for future growth mountOptions: - vers=4 - - sync + +# Database - Enhanced for production db: - name: "db" - label: "db" - username: "openstudio" - password: "openstudio" + # Required when secrets.create=true. 
+ username: "" + password: "" container: - name: "mongo-db" image: "mongo:6.0.7" resources: requests: - cpu: 1 - memory: "4Gi" - ports: - db_port: 27017 + cpu: 2 # Increased from 1 for better performance + memory: "8Gi" # Increased from 4Gi for production workloads persistence: - enabled: true - storageClass: "ssd" - size: 100Gi - accessModes: - - "ReadWriteOnce" - -load_balancer: - name: "ingress-load-balancer" - externalTrafficPolicy: "Local" - label: "web" - internal: false - ports: - http_name: "http" - http_port: 80 - http_protocol: "TCP" - https_name: "https" - https_port: 443 - https_protocol: "TCP" - -nfs: - name: "nfs" + size: 20Gi # Increased from 2Gi for production data +# NFS PVC - Increased storage for production nfs_pvc: - name: "nfs-pvc" - accessModes: - - "ReadWriteMany" storage_class: "nfs" - storage: "100Gi" - + # Frontend shared RWX claim consumed by web/rserve/background pods. + # Keep this below nfs-server-provisioner.persistence.size (recommended 85-95%). + storage: "10Gi" # Increased from 2Gi for production shared storage +# Redis - Enhanced for production redis: - name: "redis" - label: "redis" - password: "openstudio" + # Required when secrets.create=true. + password: "" + # Optional explicit Redis URI. When set, templates use this value for REDIS_URL. + # Use this if credentials include URI-reserved characters. 
+ url: "" container: - name: "redis" image: "redis:6.0.9" resources: requests: - cpu: 0.25 - memory: "1Gi" - port: 6379 + cpu: 0.5 # Increased from 0.25 for better queue performance + memory: "2Gi" # Increased from 1Gi for larger queues persistence: - enabled: true - storageClass: "ssd" - size: 100Gi - accessModes: - - "ReadWriteOnce" - -redis_svc: - name: "queue" - label: "redis" - port: 6379 - url: "redis://:openstudio@queue:6379" + size: 5Gi # Increased from 1Gi for production queues +# RServe - Enhanced for production rserve: - name: "rserve" - label: "rserve" - number_of_workers: "1" + number_of_workers: "2" # Increased from 1 for better R processing container: - name: "rserve" - image: "nrel/openstudio-rserve:3.8.0-1" resources: requests: - cpu: 1 - memory: "2Gi" + cpu: 2 # Increased from 1 for better R performance + memory: "4Gi" # Increased from 2Gi for larger R workloads rserve_svc: - name: "rserve" - label: "rserve" - port: 6311 + {} +# Web Background - Enhanced for production web_background: - name: "web-background" - label: "web-background" - replicas: 1 + replicas: 2 # Increased from 1 for redundancy container: - name: "web-background" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: - cpu: 0.25 - memory: "512Mi" + cpu: 0.5 # Increased from 0.25 for better background processing + memory: "1Gi" # Increased from 512Mi +# Web - Enhanced for production with proper passenger configuration web: - name: "web" - label: "web" - secret_key_value: "c4ab6d293e4bf52ee92e8dda6e16dc9b5448d0c5f7908ee40c66736d515f3c29142d905b283d73e5e9cef6b13cd8e38be6fd3b5e25d00f35b259923a86c7c473" - # passenger_max_request_queue_size: "1600" + # Required when secrets.create=true. 
+ secret_key_value: "" # Use this formula (1024 * memory of web pod in Gi * 0.75) / memory per passenger process - # passenger_max_pool_size: "21" - # This value is typically between 150 and 400, find this by ssh into web pod and doing `passenger-status` - passenger_memory_per_process: 250 + passenger_memory_per_process: 250 # Keep existing optimized value container: - name: "web" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: - cpu: 1 - memory: "2Gi" - port: - http: 80 - https: 443 - -# For this to work and have more than 1 web pod we'll need to implement -# a distributed file locking scheme such a https://github.com/antirez/redlock-rbs -# as even with using NFS via sync it cannot guarantee file locking using multiple clients + cpu: 2 # Increased from 1 for better web performance + memory: "8Gi" # Increased from 2Gi for production load + +# Web HPA - Configured for production scaling web_hpa: - name: "web" minReplicas: 1 maxReplicas: 1 - targetCPUUtilizationPercentage: 50 + targetCPUUtilizationPercentage: 70 # Slightly higher threshold for stability + # Note: Distributed file locking still needed for >1 replica web_svc: - name: "web" - label: "web" - ports: - http: 80 - https: 443 + {} +# Worker - Enhanced for production worker: - name: "worker" - label: "worker" + queues: "simulations,requeued" container: - name: "worker" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: - cpu: 700m - memory: "900Mi" + cpu: 1 # Increased from 700m for better performance + memory: "1Gi" # Increased from 900Mi for larger simulations terminationGracePeriodSeconds: 5200 +# Worker HPA - Optimized for production workloads worker_hpa: - name: "worker" - minReplicas: 2 - maxReplicas: 20 - targetCPUUtilizationPercentage: 50 - stabilizationWindowSeconds: 3600 - - + minReplicas: 5 # Increased from 2 for baseline production capacity + maxReplicas: 50 # Increased from 20 for larger simulation batches + targetCPUUtilizationPercentage: 60 # Slightly higher for 
stability + stabilizationWindowSeconds: 1800 # Reduced from 3600 for faster scaling response +hooks: + preDeleteCleanup: + enabled: true diff --git a/openstudio-server/values_large.templateyaml b/openstudio-server/values_large.templateyaml index bc2f0a0..84279aa 100644 --- a/openstudio-server/values_large.templateyaml +++ b/openstudio-server/values_large.templateyaml @@ -3,164 +3,147 @@ # Declare variables to be passed into your templates. -# Set to google, aws or azure -provider: - name: "" +global: + provider: + name: "" + # Temporary migration toggle. Keep false unless upgrading legacy values + # that still set provider.name instead of global.provider.name. + allowLegacyName: false + images: + org: "nrel" + serverRepository: "openstudio-server" + rserveRepository: "openstudio-rserve" + tag: "3.10.0" cluster: name: "openstudio-server" +autoscaler: + # Defaults to enabled on AWS and disabled on other providers unless explicitly set. + # enabled: true + # For OpenStack, enable autoscaler and provide one or more node groups. + # openstackNodeGroups: + # - name: web-group + # min: 1 + # max: 5 + # - name: worker-group + # min: 1 + # max: 50 + openstackNodeGroups: [] + extraArgs: [] + +priorityClasses: + enabled: true + create: true + highName: "high-priority" + lowName: "low-priority" + +secrets: + # Primary path: pre-create this secret once and reference it. + existingSecret: "openstudio-app-secrets" + # Optional install-time guard for connected clusters. + # When true, rendering fails if existingSecret is not found in the release namespace. + validateExistingSecret: false + # Alternate path. Set to true only if you intentionally want chart-managed secret creation. + create: false + nameOverride: "" + keys: + dbUsername: "db-username" + dbPassword: "db-password" + redisPassword: "redis-password" + webSecret: "web-secret-key" + nfs-server-provisioner: persistence: enabled: true - storageClass: "ssd" + # Backend storage for the in-cluster NFS server (nfs-pvc-data). 
+ # Keep nfs_pvc.storage below this value to avoid NFS provisioning failures. size: 550Gi storageClass: allowVolumeExpansion: false mountOptions: - vers=4 - - sync db: - name: "db" - label: "db" - username: "openstudio" - password: "openstudio" + # Required when secrets.create=true. + username: "" + password: "" container: - name: "mongo-db" image: "mongo:6.0.7" resources: requests: cpu: 4 memory: "22Gi" - ports: - db_port: 27017 persistence: - enabled: true - storageClass: "ssd" size: 200Gi - accessModes: - - "ReadWriteOnce" - -load_balancer: - name: "ingress-load-balancer" - externalTrafficPolicy: "Local" - label: "web" - internal: false - ports: - http_name: "http" - http_port: 80 - http_protocol: "TCP" - https_name: "https" - https_port: 443 - https_protocol: "TCP" - -nfs: - name: "nfs" nfs_pvc: - name: "nfs-pvc" - accessModes: - - "ReadWriteMany" - storage_class: "" + # Frontend shared RWX claim consumed by web/rserve/background pods. + # Keep this below nfs-server-provisioner.persistence.size (recommended 85-95%). storage: "500Gi" redis: - name: "redis" - label: "redis" - password: "openstudio" + # Required when secrets.create=true. + password: "" + # Optional explicit Redis URI for REDIS_URL. + # Use this when credentials include URI-reserved characters. 
+ url: "" container: - name: "redis" image: "redis:6.0.9" resources: requests: cpu: 3 memory: "4Gi" - port: 6379 persistence: - enabled: true - storageClass: "ssd" size: 200Gi - accessModes: - - "ReadWriteOnce" - -redis_svc: - name: "queue" - label: "redis" - port: 6379 - url: "redis://:openstudio@queue:6379" rserve: - name: "rserve" - label: "rserve" number_of_workers: "1" container: - name: "rserve" - image: "nrel/openstudio-rserve:3.8.0-1" resources: requests: cpu: 1 memory: "1Gi" rserve_svc: - name: "rserve" - label: "rserve" - port: 6311 + {} web_background: - name: "web-background" - label: "web-background" replicas: 1 container: - name: "web-background" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 2 memory: "4Gi" web: - name: "web" - label: "web" - secret_key_value: "c4ab6d293e4bf52ee92e8dda6e16dc9b5448d0c5f7908ee40c66736d515f3c29142d905b283d73e5e9cef6b13cd8e38be6fd3b5e25d00f35b259923a86c7c473" + # Required when secrets.create=true. + secret_key_value: "" # passenger_max_request_queue_size: "1600" # Use this formula (1024 * memory of web pod in Gi * 0.75) / memory per passenger process # passenger_max_pool_size: "21" # This value is typically between 150 and 400, find this by ssh into web pod and doing `passenger-status` passenger_memory_per_process: 250 container: - name: "web" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 6 memory: "50Gi" - port: - http: 80 - https: 443 # For this to work and have more than 1 web pod we'll need to implement # a distributed file locking scheme such a https://github.com/antirez/redlock-rbs # as even with using NFS via sync it cannot guarantee file locking using multiple clients web_hpa: - name: "web" minReplicas: 1 maxReplicas: 1 targetCPUUtilizationPercentage: 50 web_svc: - name: "web" - label: "web" - ports: - http: 80 - https: 443 + {} worker: - name: "worker" - label: "worker" + queues: "simulations,requeued" container: - name: "worker" - image: 
"nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 700m @@ -168,11 +151,11 @@ worker: terminationGracePeriodSeconds: 5200 worker_hpa: - name: "worker" minReplicas: 2 maxReplicas: 3800 targetCPUUtilizationPercentage: 50 stabilizationWindowSeconds: 3600 - - +hooks: + preDeleteCleanup: + enabled: true diff --git a/openstudio-server/values_production.templateyaml b/openstudio-server/values_production.templateyaml new file mode 100644 index 0000000..a3fcb66 --- /dev/null +++ b/openstudio-server/values_production.templateyaml @@ -0,0 +1,167 @@ +# Production values for openstudio-server. +# This is a YAML-formatted file optimized for production workloads. +# Declare variables to be passed into your templates. + +global: + provider: + name: "openstack" + # Temporary migration toggle. Keep false unless upgrading legacy values + # that still set provider.name instead of global.provider.name. + allowLegacyName: false + images: + org: "nrel" + serverRepository: "openstudio-server" + rserveRepository: "openstudio-rserve" + tag: "3.10.0" + +cluster: + name: "openstudio-server-prod" + +autoscaler: + # Defaults to enabled on AWS and disabled on other providers unless explicitly set. + # enabled: true + # For OpenStack, enable autoscaler and provide one or more node groups. + # openstackNodeGroups: + # - name: web-group + # min: 1 + # max: 5 + # - name: worker-group + # min: 1 + # max: 50 + openstackNodeGroups: [] + extraArgs: [] + +priorityClasses: + enabled: true + create: true + highName: "high-priority" + lowName: "low-priority" + +secrets: + # Primary path: pre-create this secret once and reference it. + existingSecret: "openstudio-app-secrets" + # Optional install-time guard for connected clusters. + # When true, rendering fails if existingSecret is not found in the release namespace. + validateExistingSecret: false + # Alternate path. Set to true only if you intentionally want chart-managed secret creation. 
+ create: false + nameOverride: "" + keys: + dbUsername: "db-username" + dbPassword: "db-password" + redisPassword: "redis-password" + webSecret: "web-secret-key" + +# NFS Server Provisioner - Increased storage for production workloads +nfs-server-provisioner: + persistence: + enabled: true + # Backend storage for the in-cluster NFS server (nfs-pvc-data). + # Keep nfs_pvc.storage below this value to avoid NFS provisioning failures. + size: 1Ti # Sized for no-pruning retention and high worker throughput + storageClass: + allowVolumeExpansion: true # Enable expansion for future growth + mountOptions: + - vers=4 + +# Database - Enhanced for production +db: + # Required when secrets.create=true. + username: "" + password: "" + container: + image: "mongo:6.0.7" + resources: + requests: + cpu: 2 # Increased from 1 for better performance + memory: "8Gi" # Increased from 4Gi for production workloads + persistence: + storageClass: "cinder-csi" # Keep DB state off shared NFS assets volume + size: 20Gi # Increased from 2Gi for production data + +# NFS PVC - Increased storage for production +nfs_pvc: + # Frontend shared RWX claim consumed by web/rserve/background pods. + # Keep this below nfs-server-provisioner.persistence.size (recommended 85-95%). + storage: "900Gi" + +# Redis - Enhanced for production +redis: + # Required when secrets.create=true. + password: "" + # Optional explicit Redis URI for REDIS_URL. + # Use this when credentials include URI-reserved characters. 
+ url: "" + container: + image: "redis:6.0.9" + resources: + requests: + cpu: 0.5 # Increased from 0.25 for better queue performance + memory: "2Gi" # Increased from 1Gi for larger queues + persistence: + storageClass: "cinder-csi" # Keep queue state off shared NFS assets volume + size: 5Gi # Increased from 1Gi for production queues + +# RServe - Enhanced for production +rserve: + number_of_workers: "2" # Increased from 1 for better R processing + container: + resources: + requests: + cpu: 2 # Increased from 1 for better R performance + memory: "4Gi" # Increased from 2Gi for larger R workloads + +rserve_svc: + {} + +# Web Background - Enhanced for production +web_background: + replicas: 2 # Increased from 1 for redundancy + container: + resources: + requests: + cpu: 0.5 # Increased from 0.25 for better background processing + memory: "1Gi" # Increased from 512Mi + +# Web - Enhanced for production with proper passenger configuration +web: + # Required when secrets.create=true. + secret_key_value: "" + # Use this formula (1024 * memory of web pod in Gi * 0.75) / memory per passenger process + passenger_memory_per_process: 250 # Keep existing optimized value + container: + resources: + requests: + cpu: 2 # Increased from 1 for better web performance + memory: "8Gi" # Increased from 2Gi for production load + +# Web HPA - Configured for production scaling +web_hpa: + minReplicas: 1 + maxReplicas: 1 + targetCPUUtilizationPercentage: 70 # Slightly higher threshold for stability + # Note: Distributed file locking still needed for >1 replica + +web_svc: + {} + +# Worker - Enhanced for production +worker: + queues: "simulations,requeued" + container: + resources: + requests: + cpu: 1 # Increased from 700m for better performance + memory: "2Gi" # Increased from 900Mi for larger simulations + terminationGracePeriodSeconds: 5200 + +# Worker HPA - Optimized for production workloads +worker_hpa: + minReplicas: 5 # Increased from 2 for baseline production capacity + maxReplicas: 
50 # Increased from 20 for larger simulation batches + targetCPUUtilizationPercentage: 60 # Slightly higher for stability + stabilizationWindowSeconds: 1800 # Reduced from 3600 for faster scaling response + +hooks: + preDeleteCleanup: + enabled: true diff --git a/openstudio-server/values_small.templateyaml b/openstudio-server/values_small.templateyaml index 4649ded..13bb9a6 100644 --- a/openstudio-server/values_small.templateyaml +++ b/openstudio-server/values_small.templateyaml @@ -3,164 +3,147 @@ # Declare variables to be passed into your templates. -# Set to google, aws or azure -provider: - name: "" +global: + provider: + name: "" + # Temporary migration toggle. Keep false unless upgrading legacy values + # that still set provider.name instead of global.provider.name. + allowLegacyName: false + images: + org: "nrel" + serverRepository: "openstudio-server" + rserveRepository: "openstudio-rserve" + tag: "3.10.0" cluster: name: "openstudio-server" +autoscaler: + # Defaults to enabled on AWS and disabled on other providers unless explicitly set. + # enabled: true + # For OpenStack, enable autoscaler and provide one or more node groups. + # openstackNodeGroups: + # - name: web-group + # min: 1 + # max: 5 + # - name: worker-group + # min: 1 + # max: 50 + openstackNodeGroups: [] + extraArgs: [] + +priorityClasses: + enabled: true + create: true + highName: "high-priority" + lowName: "low-priority" + +secrets: + # Primary path: pre-create this secret once and reference it. + existingSecret: "openstudio-app-secrets" + # Optional install-time guard for connected clusters. + # When true, rendering fails if existingSecret is not found in the release namespace. + validateExistingSecret: false + # Alternate path. Set to true only if you intentionally want chart-managed secret creation. 
+ create: false + nameOverride: "" + keys: + dbUsername: "db-username" + dbPassword: "db-password" + redisPassword: "redis-password" + webSecret: "web-secret-key" + nfs-server-provisioner: persistence: enabled: true - storageClass: "ssd" + # Backend storage for the in-cluster NFS server (nfs-pvc-data). + # Keep nfs_pvc.storage below this value to avoid NFS provisioning failures. size: 550Gi storageClass: allowVolumeExpansion: false mountOptions: - vers=4 - - sync db: - name: "db" - label: "db" - username: "openstudio" - password: "openstudio" + # Required when secrets.create=true. + username: "" + password: "" container: - name: "mongo-db" image: "mongo:6.0.7" resources: requests: cpu: 1 memory: "4Gi" - ports: - db_port: 27017 persistence: - enabled: true - storageClass: "ssd" size: 200Gi - accessModes: - - "ReadWriteOnce" - -load_balancer: - name: "ingress-load-balancer" - externalTrafficPolicy: "Local" - label: "web" - internal: false - ports: - http_name: "http" - http_port: 80 - http_protocol: "TCP" - https_name: "https" - https_port: 443 - https_protocol: "TCP" - -nfs: - name: "nfs" nfs_pvc: - name: "nfs-pvc" - accessModes: - - "ReadWriteMany" - storage_class: "" + # Frontend shared RWX claim consumed by web/rserve/background pods. + # Keep this below nfs-server-provisioner.persistence.size (recommended 85-95%). storage: "500Gi" redis: - name: "redis" - label: "redis" - password: "openstudio" + # Required when secrets.create=true. + password: "" + # Optional explicit Redis URI for REDIS_URL. + # Use this when credentials include URI-reserved characters. 
+ url: "" container: - name: "redis" image: "redis:6.0.9" resources: requests: cpu: 0.25 memory: "1Gi" - port: 6379 persistence: - enabled: true - storageClass: "ssd" size: 200Gi - accessModes: - - "ReadWriteOnce" - -redis_svc: - name: "queue" - label: "redis" - port: 6379 - url: "redis://:openstudio@queue:6379" rserve: - name: "rserve" - label: "rserve" number_of_workers: "1" container: - name: "rserve" - image: "nrel/openstudio-rserve:3.8.0-1" resources: requests: cpu: 1 memory: "2Gi" rserve_svc: - name: "rserve" - label: "rserve" - port: 6311 + {} web_background: - name: "web-background" - label: "web-background" replicas: 1 container: - name: "web-background" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 0.25 memory: "512Mi" web: - name: "web" - label: "web" - secret_key_value: "c4ab6d293e4bf52ee92e8dda6e16dc9b5448d0c5f7908ee40c66736d515f3c29142d905b283d73e5e9cef6b13cd8e38be6fd3b5e25d00f35b259923a86c7c473" + # Required when secrets.create=true. + secret_key_value: "" # passenger_max_request_queue_size: "1600" # Use this formula (1024 * memory of web pod in Gi * 0.75) / memory per passenger process # passenger_max_pool_size: "21" # This value is typically between 150 and 400, find this by ssh into web pod and doing `passenger-status` passenger_memory_per_process: 250 container: - name: "web" - image: "nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 1 memory: "2Gi" - port: - http: 80 - https: 443 # For this to work and have more than 1 web pod we'll need to implement # a distributed file locking scheme such a https://github.com/antirez/redlock-rbs # as even with using NFS via sync it cannot guarantee file locking using multiple clients web_hpa: - name: "web" minReplicas: 1 maxReplicas: 1 targetCPUUtilizationPercentage: 50 web_svc: - name: "web" - label: "web" - ports: - http: 80 - https: 443 + {} worker: - name: "worker" - label: "worker" + queues: "simulations,requeued" container: - name: "worker" - image: 
"nrel/openstudio-server:3.8.0-1" resources: requests: cpu: 700m @@ -168,11 +151,11 @@ worker: terminationGracePeriodSeconds: 5200 worker_hpa: - name: "worker" minReplicas: 2 maxReplicas: 20 targetCPUUtilizationPercentage: 50 stabilizationWindowSeconds: 3600 - - +hooks: + preDeleteCleanup: + enabled: true From 705282d93b6c6e2f6b69789217cb1223871e0d49 Mon Sep 17 00:00:00 2001 From: achapin Date: Fri, 5 Jun 2026 18:07:30 -0400 Subject: [PATCH 2/2] fix(chart): apply rubber duck review improvements to Helm templates and values - Switch db and redis Deployments from RollingUpdate to Recreate strategy to prevent Multi-Attach errors on RWO PVCs during helm upgrade - Upgrade web-hpa from deprecated autoscaling/v1 to autoscaling/v2 with behavior.scaleDown.stabilizationWindowSeconds block - Clear hardcoded global.provider.name: openstack from values.yaml (now empty); fail guard enforces explicit provider selection on every deployment - Rename nfs_pvc.storage_class -> nfs_pvc.storageClass for camelCase consistency; template supports both keys for backward compatibility - Add inline comments documenting web_hpa.maxReplicas: 1 file-locking requirement - Add inline comments documenting that worker_hpa.maxReplicas drives both HPA scaling and web MAX_REQUESTS (x1.05) - Add PVC lookup guards to preserve immutable fields on upgrade - Update app-secrets.yaml template Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../templates/deployment.yaml | 4 +- .../nfs-server-provisioner/templates/pvc.yaml | 16 +++-- openstudio-server/templates/_scheduling.tpl | 24 +++---- openstudio-server/templates/db/db-deploy.yaml | 7 +- openstudio-server/templates/db/db-pvc.yaml | 18 ++++- .../templates/hooks/pre-delete-hook.yaml | 3 +- .../templates/loadbalancer/loadbalancer.yaml | 9 +++ openstudio-server/templates/nfs/nfs-pvc.yaml | 11 +++- .../templates/redis/redis-deploy.yaml | 4 ++ .../templates/redis/redis-pvc.yaml | 18 ++++- .../templates/secrets/app-secrets.yaml | 56 
++++++++++++++++ .../web-background/web-background-deploy.yaml | 25 ++++++- .../templates/web/web-deploy.yaml | 4 +- openstudio-server/templates/web/web-hpa.yaml | 21 ++++-- .../templates/worker/worker-deploy.yaml | 64 +++++++++++++++++- .../templates/worker/worker-hpa.yaml | 2 + openstudio-server/values.yaml | 66 ++++++++++++++----- 17 files changed, 289 insertions(+), 63 deletions(-) create mode 100644 openstudio-server/templates/secrets/app-secrets.yaml diff --git a/openstudio-server/charts/nfs-server-provisioner/templates/deployment.yaml b/openstudio-server/charts/nfs-server-provisioner/templates/deployment.yaml index 56e058d..3ef516a 100755 --- a/openstudio-server/charts/nfs-server-provisioner/templates/deployment.yaml +++ b/openstudio-server/charts/nfs-server-provisioner/templates/deployment.yaml @@ -110,7 +110,7 @@ spec: {{- $globalProvider := default (dict) (get $global "provider") }} {{- $legacyProvider := default (dict) .Values.provider }} {{- $legacyProviderName := lower (default "" (get $legacyProvider "name")) }} - {{- $allowLegacyProviderName := default true (get $globalProvider "allowLegacyName") }} + {{- $allowLegacyProviderName := default false (get $globalProvider "allowLegacyName") }} {{- $providerName := lower (default "" (get $globalProvider "name")) }} {{- if ne $legacyProviderName "" }} {{- if not $allowLegacyProviderName }} @@ -123,7 +123,7 @@ spec: {{- $providerName = $legacyProviderName }} {{- end }} {{- end }} - {{- $nodeGroups := default (dict) (get $global "nodeGroups") }} + {{- $nodeGroups := default (dict) .Values.global.nodeGroups }} {{- $labelKeyOverride := default "" (get $nodeGroups "labelKey") }} {{- $webGroupOverride := default "" (get $nodeGroups "web") }} affinity: diff --git a/openstudio-server/charts/nfs-server-provisioner/templates/pvc.yaml b/openstudio-server/charts/nfs-server-provisioner/templates/pvc.yaml index bd259cc..3814ce0 100755 --- a/openstudio-server/charts/nfs-server-provisioner/templates/pvc.yaml +++ 
b/openstudio-server/charts/nfs-server-provisioner/templates/pvc.yaml @@ -2,7 +2,7 @@ {{- $globalProvider := default (dict) (get $global "provider") -}} {{- $legacyProvider := default (dict) .Values.provider -}} {{- $legacyProviderName := lower (default "" (get $legacyProvider "name")) -}} -{{- $allowLegacyProviderName := default true (get $globalProvider "allowLegacyName") -}} +{{- $allowLegacyProviderName := default false (get $globalProvider "allowLegacyName") -}} {{- $providerName := lower (default "" (get $globalProvider "name")) -}} {{- if ne $legacyProviderName "" -}} {{- if not $allowLegacyProviderName -}} @@ -12,16 +12,14 @@ {{- fail (printf "provider.name=%q conflicts with global.provider.name=%q. Remove provider.name and keep global.provider.name." $legacyProviderName $providerName) -}} {{- end -}} {{- if eq $providerName "" -}} -{{- if ne $legacyProviderName "" -}} {{- $providerName = $legacyProviderName -}} -{{- else -}} -{{- $providerName = "aws" -}} -{{- end -}} {{- end -}} {{- end -}} {{- $storageClasses := default (dict) (get $global "storageClasses") -}} {{- $openstackBlockStorageClass := default "cinder-csi" (get $storageClasses "block") -}} {{- $defaultStorageClass := ternary $openstackBlockStorageClass "ssd" (eq $providerName "openstack") }} +{{- $existingPVC := lookup "v1" "PersistentVolumeClaim" .Release.Namespace .Values.persistence.name -}} +{{- $existingSpec := default (dict) (get $existingPVC "spec") -}} kind: PersistentVolumeClaim apiVersion: v1 metadata: @@ -29,7 +27,11 @@ metadata: spec: storageClassName: {{ default $defaultStorageClass .Values.persistence.storageClass | quote }} accessModes: - {{ .Values.persistence.accessModes }} +{{- if $existingPVC }} +{{ toYaml (default (list "ReadWriteOnce") (get $existingSpec "accessModes")) | nindent 4 }} +{{- else }} +{{ toYaml .Values.persistence.accessModes | nindent 4 }} +{{- end }} resources: requests: - storage: {{ .Values.persistence.size }} \ No newline at end of file + storage: {{ 
.Values.persistence.size | quote }} \ No newline at end of file diff --git a/openstudio-server/templates/_scheduling.tpl b/openstudio-server/templates/_scheduling.tpl index f1f367b..b47033a 100644 --- a/openstudio-server/templates/_scheduling.tpl +++ b/openstudio-server/templates/_scheduling.tpl @@ -3,7 +3,7 @@ {{- $globalProvider := default (dict) (get $global "provider") -}} {{- $legacyProvider := default (dict) .Values.provider -}} {{- $legacyProviderName := lower (default "" (get $legacyProvider "name")) -}} -{{- $allowLegacyProviderName := default true (get $globalProvider "allowLegacyName") -}} +{{- $allowLegacyProviderName := default false (get $globalProvider "allowLegacyName") -}} {{- $provider := lower (default "" (get $globalProvider "name")) -}} {{- if ne $legacyProviderName "" -}} {{- if not $allowLegacyProviderName -}} @@ -17,7 +17,7 @@ {{- end -}} {{- end -}} {{- if eq $provider "" -}} -{{- $provider = "aws" -}} +{{- fail "global.provider.name is required. Set one of: aws, google, azure, openstack." -}} {{- end -}} {{- if not (has $provider (list "aws" "google" "azure" "openstack")) -}} {{- fail (printf "global.provider.name=%q is unsupported. Supported values: aws, google, azure, openstack." 
$provider) -}} @@ -26,8 +26,7 @@ {{- end -}} {{- define "openstudio.nodeGroupLabelKey" -}} -{{- $global := default (dict) .Values.global -}} -{{- $nodeGroups := default (dict) (get $global "nodeGroups") -}} +{{- $nodeGroups := default (dict) .Values.global.nodeGroups -}} {{- $labelKey := default "" (get $nodeGroups "labelKey") -}} {{- if ne $labelKey "" -}} {{- $labelKey -}} @@ -39,8 +38,7 @@ {{- end -}} {{- define "openstudio.webNodeGroupValue" -}} -{{- $global := default (dict) .Values.global -}} -{{- $nodeGroups := default (dict) (get $global "nodeGroups") -}} +{{- $nodeGroups := default (dict) .Values.global.nodeGroups -}} {{- $web := default "" (get $nodeGroups "web") -}} {{- if ne $web "" -}} {{- $web -}} @@ -52,8 +50,7 @@ {{- end -}} {{- define "openstudio.workerNodeGroupValue" -}} -{{- $global := default (dict) .Values.global -}} -{{- $nodeGroups := default (dict) (get $global "nodeGroups") -}} +{{- $nodeGroups := default (dict) .Values.global.nodeGroups -}} {{- $worker := default "" (get $nodeGroups "worker") -}} {{- if ne $worker "" -}} {{- $worker -}} @@ -73,8 +70,7 @@ {{- end -}} {{- define "openstudio.nodeGroupAffinityMode" -}} -{{- $global := default (dict) .Values.global -}} -{{- $nodeGroups := default (dict) (get $global "nodeGroups") -}} +{{- $nodeGroups := default (dict) .Values.global.nodeGroups -}} {{- $mode := lower (default "" (get $nodeGroups "affinityMode")) -}} {{- if ne $mode "" -}} {{- if not (has $mode (list "required" "preferred" "disabled")) -}} @@ -125,7 +121,7 @@ affinity: {{- define "openstudio.openstackBlockStorageClass" -}} {{- $global := default (dict) .Values.global -}} {{- $storageClasses := default (dict) (get $global "storageClasses") -}} -{{- default "cinder-csi" (get $storageClasses "block") -}} +{{- default "csi-cinder" (get $storageClasses "block") -}} {{- end -}} {{- define "openstudio.defaultNfsProvisionerBackingStorageClass" -}} @@ -239,8 +235,7 @@ affinity: {{- end -}} {{- define "openstudio.serverImage" -}} -{{- 
$global := default (dict) .Values.global -}} -{{- $images := (get $global "images") | default (dict) -}} +{{- $images := (get .Values.global "images") | default (dict) -}} {{- $org := default "nrel" (get $images "org") -}} {{- $repo := default "openstudio-server" (get $images "serverRepository") -}} {{- $tag := default "latest" (get $images "tag") -}} @@ -248,8 +243,7 @@ affinity: {{- end -}} {{- define "openstudio.rserveImage" -}} -{{- $global := default (dict) .Values.global -}} -{{- $images := (get $global "images") | default (dict) -}} +{{- $images := (get .Values.global "images") | default (dict) -}} {{- $org := default "nrel" (get $images "org") -}} {{- $repo := default "openstudio-rserve" (get $images "rserveRepository") -}} {{- $tag := default "latest" (get $images "tag") -}} diff --git a/openstudio-server/templates/db/db-deploy.yaml b/openstudio-server/templates/db/db-deploy.yaml index 942d1be..2425873 100644 --- a/openstudio-server/templates/db/db-deploy.yaml +++ b/openstudio-server/templates/db/db-deploy.yaml @@ -9,10 +9,9 @@ spec: app: {{ default "db" .Values.db.label }} release: {{ .Release.Name }} strategy: - type: RollingUpdate - rollingUpdate: - maxSurge: 1 - maxUnavailable: 1 + # Recreate is required for RWO PVCs — RollingUpdate can cause Multi-Attach errors + # when the new pod attempts to mount before the old pod fully releases the volume. + type: Recreate template: metadata: labels: diff --git a/openstudio-server/templates/db/db-pvc.yaml b/openstudio-server/templates/db/db-pvc.yaml index 4166663..8b7a6b7 100644 --- a/openstudio-server/templates/db/db-pvc.yaml +++ b/openstudio-server/templates/db/db-pvc.yaml @@ -1,14 +1,26 @@ +{{- $dbPersistence := default (dict) (get .Values.db "persistence") }} +{{- $existingPVC := lookup "v1" "PersistentVolumeClaim" .Release.Namespace (include "openstudio.dbName" .) 
-}} +{{- $existingSpec := default (dict) (get $existingPVC "spec") -}} +{{- $storageClassName := default (include "openstudio.defaultAppPersistenceStorageClass" .) (get $dbPersistence "storageClass") -}} +{{- if $existingPVC -}} +{{- $storageClassName = default $storageClassName (get $existingSpec "storageClassName") -}} +{{- end -}} kind: PersistentVolumeClaim apiVersion: v1 metadata: name: {{ include "openstudio.dbName" . }} -{{- $dbPersistence := default (dict) (get .Values.db "persistence") }} spec: - storageClassName: {{ default (include "openstudio.defaultAppPersistenceStorageClass" .) (get $dbPersistence "storageClass") | quote }} + storageClassName: {{ $storageClassName | quote }} accessModes: +{{- if $existingPVC }} +{{- range $mode := default (list "ReadWriteOnce") (get $existingSpec "accessModes") }} + - {{ $mode | quote }} +{{- end }} +{{- else }} {{- range $mode := default (list "ReadWriteOnce") (get $dbPersistence "accessModes") }} - {{ $mode | quote }} +{{- end }} {{- end }} resources: requests: - storage: {{ .Values.db.persistence.size }} \ No newline at end of file + storage: {{ .Values.db.persistence.size | quote }} \ No newline at end of file diff --git a/openstudio-server/templates/hooks/pre-delete-hook.yaml b/openstudio-server/templates/hooks/pre-delete-hook.yaml index 4bf00d1..4447eea 100644 --- a/openstudio-server/templates/hooks/pre-delete-hook.yaml +++ b/openstudio-server/templates/hooks/pre-delete-hook.yaml @@ -1,5 +1,4 @@ -{{- $hooks := default (dict) .Values.hooks -}} -{{- $preDeleteHook := default (dict) (get $hooks "preDeleteCleanup") -}} +{{- $preDeleteHook := default (dict) .Values.hooks.preDeleteCleanup -}} {{- if default true (get $preDeleteHook "enabled") }} {{- $cleanupName := printf "%s-nfs-client-cleanup" .Release.Name | trunc 63 | trimSuffix "-" -}} {{- $cleanupServiceAccount := printf "%s-sa" $cleanupName | trunc 63 | trimSuffix "-" -}} diff --git a/openstudio-server/templates/loadbalancer/loadbalancer.yaml 
b/openstudio-server/templates/loadbalancer/loadbalancer.yaml index a6743ad..fc428d5 100644 --- a/openstudio-server/templates/loadbalancer/loadbalancer.yaml +++ b/openstudio-server/templates/loadbalancer/loadbalancer.yaml @@ -2,6 +2,8 @@ apiVersion: v1 kind: Service {{ $loadBalancer := (get .Values "load_balancer") | default (dict) }} {{ $ports := (get $loadBalancer "ports") | default (dict) }} +{{ $annotations := (get $loadBalancer "annotations") | default (dict) }} +{{ $sourceRanges := (get $loadBalancer "sourceRanges") | default (list) }} {{ $providerName := include "openstudio.providerName" . }} metadata: name: {{ default "ingress-load-balancer" (get $loadBalancer "name") }} @@ -15,9 +17,16 @@ metadata: {{- else if and (eq $providerName "openstack") (get $loadBalancer "internal") }} service.beta.kubernetes.io/openstack-internal-load-balancer: "true" {{- end }} +{{- range $k, $v := $annotations }} + {{ $k | quote }}: {{ $v | quote }} +{{- end }} spec: type: {{ default "LoadBalancer" (get $loadBalancer "type") }} externalTrafficPolicy: {{ default (include "openstudio.defaultLoadBalancerExternalTrafficPolicy" .) (get $loadBalancer "externalTrafficPolicy") | quote }} +{{- if gt (len $sourceRanges) 0 }} + loadBalancerSourceRanges: +{{- toYaml $sourceRanges | nindent 4 }} +{{- end }} selector: app: {{ default "web" (get $loadBalancer "label") }} release: {{ .Release.Name }} diff --git a/openstudio-server/templates/nfs/nfs-pvc.yaml b/openstudio-server/templates/nfs/nfs-pvc.yaml index 9977add..b80bf94 100644 --- a/openstudio-server/templates/nfs/nfs-pvc.yaml +++ b/openstudio-server/templates/nfs/nfs-pvc.yaml @@ -1,4 +1,11 @@ {{- $nfsPvc := default (dict) (get .Values "nfs_pvc") -}} +{{- $existingPVC := lookup "v1" "PersistentVolumeClaim" .Release.Namespace (include "openstudio.nfsPvcName" .) 
-}} +{{- $existingSpec := default (dict) (get $existingPVC "spec") -}} +{{- /* Support both storageClass (preferred) and legacy storage_class key */ -}} +{{- $storageClassName := default (default "nfs" (get $nfsPvc "storage_class")) (get $nfsPvc "storageClass") -}} +{{- if $existingPVC -}} +{{- $storageClassName = default $storageClassName (get $existingSpec "storageClassName") -}} +{{- end -}} kind: PersistentVolumeClaim apiVersion: v1 metadata: @@ -8,7 +15,7 @@ spec: {{- range $mode := default (list "ReadWriteMany") (get $nfsPvc "accessModes") }} - {{ $mode }} {{- end }} - storageClassName: {{ default "nfs" (get $nfsPvc "storage_class") | quote }} + storageClassName: {{ $storageClassName | quote }} resources: requests: - storage: {{ default "2Gi" (get $nfsPvc "storage") }} \ No newline at end of file + storage: {{ default "2Gi" (get $nfsPvc "storage") | quote }} \ No newline at end of file diff --git a/openstudio-server/templates/redis/redis-deploy.yaml b/openstudio-server/templates/redis/redis-deploy.yaml index 6dda4d3..1db3188 100644 --- a/openstudio-server/templates/redis/redis-deploy.yaml +++ b/openstudio-server/templates/redis/redis-deploy.yaml @@ -8,6 +8,10 @@ spec: matchLabels: app: {{ default "redis" .Values.redis.label }} release: {{ .Release.Name }} + strategy: + # Recreate is required for RWO PVCs — RollingUpdate can cause Multi-Attach errors + # when the new pod attempts to mount before the old pod fully releases the volume. + type: Recreate template: metadata: labels: diff --git a/openstudio-server/templates/redis/redis-pvc.yaml b/openstudio-server/templates/redis/redis-pvc.yaml index 49d3037..ddb4b43 100644 --- a/openstudio-server/templates/redis/redis-pvc.yaml +++ b/openstudio-server/templates/redis/redis-pvc.yaml @@ -1,14 +1,26 @@ +{{- $redisPersistence := default (dict) (get .Values.redis "persistence") }} +{{- $existingPVC := lookup "v1" "PersistentVolumeClaim" .Release.Namespace (include "openstudio.redisName" .) 
-}} +{{- $existingSpec := default (dict) (get $existingPVC "spec") -}} +{{- $storageClassName := default (include "openstudio.defaultAppPersistenceStorageClass" .) (get $redisPersistence "storageClass") -}} +{{- if $existingPVC -}} +{{- $storageClassName = default $storageClassName (get $existingSpec "storageClassName") -}} +{{- end -}} kind: PersistentVolumeClaim apiVersion: v1 metadata: name: {{ include "openstudio.redisName" . }} -{{- $redisPersistence := default (dict) (get .Values.redis "persistence") }} spec: - storageClassName: {{ default (include "openstudio.defaultAppPersistenceStorageClass" .) (get $redisPersistence "storageClass") | quote }} + storageClassName: {{ $storageClassName | quote }} accessModes: +{{- if $existingPVC }} +{{- range $mode := default (list "ReadWriteOnce") (get $existingSpec "accessModes") }} + - {{ $mode | quote }} +{{- end }} +{{- else }} {{- range $mode := default (list "ReadWriteOnce") (get $redisPersistence "accessModes") }} - {{ $mode | quote }} +{{- end }} {{- end }} resources: requests: - storage: {{ .Values.redis.persistence.size }} \ No newline at end of file + storage: {{ .Values.redis.persistence.size | quote }} \ No newline at end of file diff --git a/openstudio-server/templates/secrets/app-secrets.yaml b/openstudio-server/templates/secrets/app-secrets.yaml new file mode 100644 index 0000000..5fcea66 --- /dev/null +++ b/openstudio-server/templates/secrets/app-secrets.yaml @@ -0,0 +1,56 @@ +{{- $secrets := default (dict) .Values.secrets -}} +{{- $existingSecret := default "" (get $secrets "existingSecret") -}} +{{- $create := true -}} +{{- $validateExistingSecret := true -}} +{{- if hasKey $secrets "create" -}} +{{- $create = (get $secrets "create") -}} +{{- end -}} +{{- if hasKey $secrets "validateExistingSecret" -}} +{{- $validateExistingSecret = (get $secrets "validateExistingSecret") -}} +{{- end -}} +{{- if and (ne $existingSecret "") $create -}} +{{- fail "secrets.existingSecret and secrets.create=true cannot both 
be set; choose one secret source" -}} +{{- end -}} +{{- if and (ne $existingSecret "") (not $create) $validateExistingSecret -}} +{{- $existingSecretObject := lookup "v1" "Secret" .Release.Namespace $existingSecret -}} +{{- if not $existingSecretObject -}} +{{- fail (printf "secrets.existingSecret=%q was not found in namespace %q. Create it first or disable secrets.validateExistingSecret." $existingSecret .Release.Namespace) -}} +{{- end -}} +{{- $secretData := default (dict) (get $existingSecretObject "data") -}} +{{- $requiredKeys := list + (include "openstudio.secretKeyDbUsername" .) + (include "openstudio.secretKeyDbPassword" .) + (include "openstudio.secretKeyRedisPassword" .) + (include "openstudio.secretKeyWebSecret" .) -}} +{{- $missingKeys := list -}} +{{- $emptyKeys := list -}} +{{- range $key := $requiredKeys -}} + {{- if not (hasKey $secretData $key) -}} + {{- $missingKeys = append $missingKeys $key -}} + {{- else if eq (index $secretData $key) "" -}} + {{- $emptyKeys = append $emptyKeys $key -}} + {{- end -}} +{{- end -}} +{{- if gt (len $missingKeys) 0 -}} +{{- fail (printf "secrets.existingSecret=%q in namespace %q is missing required keys: %s" $existingSecret .Release.Namespace (join ", " $missingKeys)) -}} +{{- end -}} +{{- if gt (len $emptyKeys) 0 -}} +{{- fail (printf "secrets.existingSecret=%q in namespace %q has empty values for required keys: %s" $existingSecret .Release.Namespace (join ", " $emptyKeys)) -}} +{{- end -}} +{{- end -}} +{{- if and (eq $existingSecret "") $create }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "openstudio.secretName" . }} + namespace: {{ .Release.Namespace }} + labels: + app.kubernetes.io/name: openstudio-server + app.kubernetes.io/instance: {{ .Release.Name }} +type: Opaque +stringData: + {{ include "openstudio.secretKeyDbUsername" . }}: {{ required "db.username must be set to a non-empty value when secrets.create=true" .Values.db.username | quote }} + {{ include "openstudio.secretKeyDbPassword" . 
}}: {{ required "db.password must be set to a non-empty value when secrets.create=true" .Values.db.password | quote }} + {{ include "openstudio.secretKeyRedisPassword" . }}: {{ required "redis.password must be set to a non-empty value when secrets.create=true" .Values.redis.password | quote }} + {{ include "openstudio.secretKeyWebSecret" . }}: {{ required "web.secret_key_value must be set to a non-empty value when secrets.create=true" .Values.web.secret_key_value | quote }} +{{- end }} diff --git a/openstudio-server/templates/web-background/web-background-deploy.yaml b/openstudio-server/templates/web-background/web-background-deploy.yaml index 9f314a2..9d2da33 100644 --- a/openstudio-server/templates/web-background/web-background-deploy.yaml +++ b/openstudio-server/templates/web-background/web-background-deploy.yaml @@ -20,6 +20,8 @@ spec: release: {{ .Release.Name }} {{ $priorityClasses := default (dict) .Values.priorityClasses }} {{ $redis := default (dict) .Values.redis }} + {{ $webBackgroundContainer := default (dict) .Values.web_background.container }} + {{ $webBackgroundStartup := default (dict) (get $webBackgroundContainer "startup") }} spec: {{- include "openstudio.affinityForRole" (dict "root" . "role" "web") | nindent 6 }} {{- if (default true (get $priorityClasses "enabled")) }} @@ -53,6 +55,10 @@ spec: key: {{ include "openstudio.secretKeyRedisPassword" . }} - name: REDIS_URL value: {{ default (printf "redis://:$(REDIS_PASSWORD)@%s:6379" (include "openstudio.redisServiceName" .)) (get $redis "url") | quote }} + - name: STARTUP_MAX_RETRIES + value: {{ default 12 (get $webBackgroundStartup "maxRetries") | quote }} + - name: STARTUP_RETRY_DELAY_SECONDS + value: {{ default 10 (get $webBackgroundStartup "retryDelaySeconds") | quote }} - name: MONGO_USER valueFrom: secretKeyRef: @@ -63,7 +69,24 @@ spec: secretKeyRef: name: {{ include "openstudio.secretName" . }} key: {{ include "openstudio.secretKeyDbPassword" . 
}} - command: ["/usr/local/bin/start-web-background"] + command: + - /bin/sh + - -c + - | + set -eu + retries="${STARTUP_MAX_RETRIES:-12}" + delay="${STARTUP_RETRY_DELAY_SECONDS:-10}" + attempt=0 + while true; do + /usr/local/bin/start-web-background && exit 0 + attempt="$((attempt + 1))" + if [ "$attempt" -ge "$retries" ]; then + echo "start-web-background failed after ${attempt} attempts; exiting" + exit 1 + fi + echo "start-web-background failed; retry ${attempt}/${retries} in ${delay}s" + sleep "$delay" + done livenessProbe: exec: command: ["grep", "-qs", "/mnt/openstudio ", "/proc/mounts"] diff --git a/openstudio-server/templates/web/web-deploy.yaml b/openstudio-server/templates/web/web-deploy.yaml index 9826027..61c7adc 100644 --- a/openstudio-server/templates/web/web-deploy.yaml +++ b/openstudio-server/templates/web/web-deploy.yaml @@ -3,7 +3,6 @@ kind: Deployment metadata: name: {{ include "openstudio.webName" . }} spec: - replicas: 1 selector: matchLabels: app: {{ default "web" .Values.web.label }} @@ -75,6 +74,9 @@ spec: name: {{ include "openstudio.secretName" . }} key: {{ include "openstudio.secretKeyDbPassword" . }} - name: MAX_REQUESTS + # Sized to worker_hpa.maxReplicas × 1.05 so the web connection pool can + # handle all possible concurrent worker requests. Changing worker_hpa.maxReplicas + # in an overlay will also change this value — tune both together. 
value: {{ (ceil (mulf .Values.worker_hpa.maxReplicas 1.05)) | quote }} - name: MAX_POOL value: {{ (ceil (divf (mulf (trimSuffix "Gi" .Values.web.container.resources.requests.memory) 1024 0.75) .Values.web.passenger_memory_per_process)) | quote }} diff --git a/openstudio-server/templates/web/web-hpa.yaml b/openstudio-server/templates/web/web-hpa.yaml index d3888b2..d353e0c 100644 --- a/openstudio-server/templates/web/web-hpa.yaml +++ b/openstudio-server/templates/web/web-hpa.yaml @@ -1,4 +1,4 @@ -apiVersion: autoscaling/v1 +apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: name: {{ include "openstudio.webHpaName" . }} @@ -7,6 +7,19 @@ spec: apiVersion: apps/v1 kind: Deployment name: {{ include "openstudio.webName" . }} - minReplicas: {{ .Values.web_hpa.minReplicas }} - maxReplicas: {{ .Values.web_hpa.maxReplicas }} - targetCPUUtilizationPercentage: {{ .Values.web_hpa.targetCPUUtilizationPercentage }} + minReplicas: {{ .Values.web_hpa.minReplicas }} + maxReplicas: {{ .Values.web_hpa.maxReplicas }} + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.web_hpa.targetCPUUtilizationPercentage }} + behavior: + scaleDown: + stabilizationWindowSeconds: {{ default 600 .Values.web_hpa.stabilizationWindowSeconds }} + policies: + - type: Percent + value: 25 + periodSeconds: 60 diff --git a/openstudio-server/templates/worker/worker-deploy.yaml b/openstudio-server/templates/worker/worker-deploy.yaml index d4d8790..30259c8 100644 --- a/openstudio-server/templates/worker/worker-deploy.yaml +++ b/openstudio-server/templates/worker/worker-deploy.yaml @@ -3,7 +3,6 @@ kind: Deployment metadata: name: {{ include "openstudio.workerName" . 
}} spec: - replicas: 1 selector: matchLabels: app: {{ default "worker" .Values.worker.label }} @@ -20,6 +19,9 @@ spec: release: {{ .Release.Name }} {{ $priorityClasses := default (dict) .Values.priorityClasses }} {{ $redis := default (dict) .Values.redis }} + {{ $workerContainer := default (dict) .Values.worker.container }} + {{ $workerStartup := default (dict) (get $workerContainer "startup") }} + {{ $workerPreStop := default (dict) (get $workerContainer "preStop") }} spec: {{- include "openstudio.affinityForRole" (dict "root" . "role" "worker") | nindent 6 }} {{- if (default true (get $priorityClasses "enabled")) }} @@ -29,10 +31,37 @@ spec: - name: {{ default "worker" .Values.worker.container.name }} image: {{ default (include "openstudio.serverImage" .) .Values.worker.container.image }} imagePullPolicy: Always +{{- if ne (toString (get $workerPreStop "enabled")) "false" }} lifecycle: preStop: exec: - command: ['/bin/sh','-c', 'pkill -3 -f resque; while [ $(eval "pgrep -c ruby") -gt 1 ] || [ $(eval "pgrep -c openstudio") -gt 0 ] ; do sleep 300; done; pkill -P 1;'] + command: + - /bin/sh + - -c + - | + set +e + signal="${WORKER_PRESTOP_SIGNAL:-3}" + max_wait="${WORKER_PRESTOP_MAX_WAIT_SECONDS:-5100}" + interval="${WORKER_PRESTOP_POLL_INTERVAL_SECONDS:-30}" + start_ts="$(date +%s)" + pkill "-${signal}" -f '[r]esque' >/dev/null 2>&1 || true + while true; do + ruby_count="$(pgrep -c ruby || true)" + os_count="$(pgrep -c openstudio || true)" + if [ "${ruby_count:-0}" -le 1 ] && [ "${os_count:-0}" -eq 0 ]; then + break + fi + now="$(date +%s)" + elapsed="$((now - start_ts))" + if [ "$elapsed" -ge "$max_wait" ]; then + echo "worker preStop reached max wait (${max_wait}s); allowing termination" + break + fi + sleep "$interval" + done + pkill -P 1 >/dev/null 2>&1 || true + exit 0 +{{- end }} resources: requests: cpu: {{ .Values.worker.container.resources.requests.cpu }} @@ -59,6 +88,18 @@ spec: key: {{ include "openstudio.secretKeyRedisPassword" . 
}} - name: REDIS_URL value: {{ default (printf "redis://:$(REDIS_PASSWORD)@%s:6379" (include "openstudio.redisServiceName" .)) (get $redis "url") | quote }} + - name: STARTUP_MAX_RETRIES + value: {{ default 12 (get $workerStartup "maxRetries") | quote }} + - name: STARTUP_RETRY_DELAY_SECONDS + value: {{ default 10 (get $workerStartup "retryDelaySeconds") | quote }} +{{- if ne (toString (get $workerPreStop "enabled")) "false" }} + - name: WORKER_PRESTOP_MAX_WAIT_SECONDS + value: {{ default 5100 (get $workerPreStop "maxWaitSeconds") | quote }} + - name: WORKER_PRESTOP_POLL_INTERVAL_SECONDS + value: {{ default 30 (get $workerPreStop "pollIntervalSeconds") | quote }} + - name: WORKER_PRESTOP_SIGNAL + value: {{ default "3" (get $workerPreStop "signal") | quote }} +{{- end }} - name: MONGO_USER valueFrom: secretKeyRef: @@ -69,7 +110,26 @@ spec: secretKeyRef: name: {{ include "openstudio.secretName" . }} key: {{ include "openstudio.secretKeyDbPassword" . }} - command: ["/usr/local/bin/start-workers"] + command: + - /bin/sh + - -c + - | + set -eu + # Exit on SIGTERM/SIGINT so a terminating pod never relaunches start-workers. + trap 'exit 143' TERM INT + retries="${STARTUP_MAX_RETRIES:-12}" + delay="${STARTUP_RETRY_DELAY_SECONDS:-10}" + attempt=0 + while true; do + /usr/local/bin/start-workers && exit 0 + attempt="$((attempt + 1))" + if [ "$attempt" -ge "$retries" ]; then + echo "start-workers failed after ${attempt} attempts; exiting" + exit 1 + fi + echo "start-workers failed; retry ${attempt}/${retries} in ${delay}s" + sleep "$delay" + done terminationGracePeriodSeconds: {{ .Values.worker.container.terminationGracePeriodSeconds }} # for long openstudio jobs. volumes: - name: osdata-worker diff --git a/openstudio-server/templates/worker/worker-hpa.yaml b/openstudio-server/templates/worker/worker-hpa.yaml index 0b83307..6a35973 100644 --- a/openstudio-server/templates/worker/worker-hpa.yaml +++ b/openstudio-server/templates/worker/worker-hpa.yaml @@ -2,6 +2,8 @@ apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: name: {{ include "openstudio.workerHpaName" . 
}} + annotations: + helm.sh/resource-policy: keep spec: scaleTargetRef: apiVersion: apps/v1 diff --git a/openstudio-server/values.yaml b/openstudio-server/values.yaml index da8bff6..8386807 100644 --- a/openstudio-server/values.yaml +++ b/openstudio-server/values.yaml @@ -5,6 +5,7 @@ global: provider: # Required. Set one of: aws, google, azure, openstack. + # Leave empty here and supply via --set global.provider.name= or your env overlay. name: "" # Migration compatibility toggle. When true, legacy provider.name can be used # only if global.provider.name is unset. Keep false for strict mode. @@ -12,7 +13,7 @@ global: # Optional storage class defaults used by provider-aware chart logic. # For OpenStack, block is used as the backing class for the NFS provisioner PVC. storageClasses: - block: "cinder-csi" + block: "csi-cinder" # Optional node-group scheduling controls. # labelKey defaults by provider: openstack => capi.stackhpc.com/node-group, others => nodegroup. # web/worker defaults by provider: openstack => web/worker, others => web-group/worker-group. @@ -99,7 +100,7 @@ nfs-server-provisioner: enabled: true # Backend storage for the in-cluster NFS server (nfs-pvc-data). # Keep nfs_pvc.storage below this value to avoid NFS provisioning failures. - size: 50Gi # Increased from 5Gi for production + size: 1.5Ti # Increased from 5Gi for production storageClass: allowVolumeExpansion: true # Enable expansion for future growth mountOptions: @@ -117,14 +118,22 @@ db: cpu: 2 # Increased from 1 for better performance memory: "8Gi" # Increased from 4Gi for production workloads persistence: - size: 20Gi # Increased from 2Gi for production data + size: 300Gi # Increased from 2Gi for production data -# NFS PVC - Increased storage for production +load_balancer: + # Keep provider defaults unless explicitly required. + # OpenStack default externalTrafficPolicy is Cluster. + # sourceRanges may be ignored by some OpenStack Octavia providers. 
+ externalTrafficPolicy: "" + annotations: {} + sourceRanges: [] + + # NFS PVC - Increased storage for production nfs_pvc: - storage_class: "nfs" + storageClass: "nfs" # Frontend shared RWX claim consumed by web/rserve/background pods. # Keep this below nfs-server-provisioner.persistence.size (recommended 85-95%). - storage: "10Gi" # Increased from 2Gi for production shared storage + storage: "300Gi" # Increased from 2Gi for production shared storage # Redis - Enhanced for production redis: @@ -140,7 +149,7 @@ redis: cpu: 0.5 # Increased from 0.25 for better queue performance memory: "2Gi" # Increased from 1Gi for larger queues persistence: - size: 5Gi # Increased from 1Gi for production queues + size: 300Gi # Increased from 1Gi for production queues # RServe - Enhanced for production rserve: @@ -162,6 +171,9 @@ web_background: requests: cpu: 0.5 # Increased from 0.25 for better background processing memory: "1Gi" # Increased from 512Mi + startup: + maxRetries: 12 + retryDelaySeconds: 10 # Web - Enhanced for production with proper passenger configuration web: @@ -176,29 +188,51 @@ web: memory: "8Gi" # Increased from 2Gi for production load # Web HPA - Configured for production scaling -web_hpa: - minReplicas: 1 - maxReplicas: 1 - targetCPUUtilizationPercentage: 70 # Slightly higher threshold for stability - # Note: Distributed file locking still needed for >1 replica +# web_hpa: +# minReplicas: 1 +# maxReplicas: 1 +# targetCPUUtilizationPercentage: 70 # Slightly higher threshold for stability +# # Note: Distributed file locking still needed for >1 replica web_svc: {} +# Web HPA - Disabled-by-default baseline values for chart rendering. +web_hpa: + name: "web" + minReplicas: 1 + # Intentionally capped at 1. Increase only after implementing distributed file locking + # on /mnt/openstudio — multiple web replicas without locking can cause race conditions. 
+ maxReplicas: 1 + targetCPUUtilizationPercentage: 50 + # stabilizationWindowSeconds controls how long to wait before scaling down. + stabilizationWindowSeconds: 600 + # Worker - Enhanced for production worker: queues: "simulations,requeued" container: resources: requests: - cpu: 1 # Increased from 700m for better performance - memory: "1Gi" # Increased from 900Mi for larger simulations + cpu: 900m # Increased from 700m for better performance + memory: "900Mi" terminationGracePeriodSeconds: 5200 + startup: + maxRetries: 12 + retryDelaySeconds: 10 + preStop: + enabled: true + signal: "3" + pollIntervalSeconds: 30 + maxWaitSeconds: 5100 # Worker HPA - Optimized for production workloads worker_hpa: - minReplicas: 5 # Increased from 2 for baseline production capacity - maxReplicas: 50 # Increased from 20 for larger simulation batches + minReplicas: 1 # Baseline floor; override in your env overlay for production capacity + # Set to ~90% of cluster worker capacity (node_count × CPUs_per_node × 0.9 / worker_CPU_request). + # IMPORTANT: This value also drives MAX_REQUESTS in the web container (maxReplicas × 1.05). + # Override in your env overlay to match actual node capacity. + maxReplicas: 1443 targetCPUUtilizationPercentage: 60 # Slightly higher for stability stabilizationWindowSeconds: 1800 # Reduced from 3600 for faster scaling response