Skip to content

Commit 8786c3c

Browse files
authored
Merge branch 'main' into refactor/ingress-config
2 parents fa2c3e5 + 78d2240 commit 8786c3c

File tree

23 files changed

+208
-85
lines changed

23 files changed

+208
-85
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
---
name: Publish vLLM CPU images

on:
  # NOTE(sd109): Since this is checking out an external
  # repository, it's probably safer to leave this as
  # workflow dispatch only so that we can manually build
  # images from specific refs rather than automatically
  # pulling in the latest content from the remote repo.
  workflow_dispatch:
    inputs:
      vllm_ref:
        type: string
        description: The vLLM GitHub ref (tag, branch or commit) to build.
        required: true

jobs:
  build_push_x86_image:
    # Distinct display name so the two jobs are distinguishable in the Actions UI
    name: Build and push x86 image
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write  # needed for signing the images with GitHub OIDC Token
      packages: write  # required for pushing container images
      security-events: write  # required for pushing SARIF files
    steps:
      - name: Check out the vLLM repository
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          ref: ${{ inputs.vllm_ref }}

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push image
        run: |
          IMAGE=ghcr.io/stackhpc/vllm-cpu:${{ inputs.vllm_ref }}
          docker build -f Dockerfile.cpu -t $IMAGE --shm-size=4g .
          docker push $IMAGE

  build_push_arm64_image:
    name: Build and push arm64 image
    runs-on: ubuntu-24.04-arm
    permissions:
      contents: read
      id-token: write  # needed for signing the images with GitHub OIDC Token
      packages: write  # required for pushing container images
      security-events: write  # required for pushing SARIF files
    steps:
      - name: Check out the vLLM repository
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          ref: ${{ inputs.vllm_ref }}

      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push image
        run: |
          # NOTE(review): arm64 tag is suffixed to avoid clobbering the x86 tag;
          # presumably consumers select the suffix explicitly — confirm.
          IMAGE=ghcr.io/stackhpc/vllm-cpu:${{ inputs.vllm_ref }}-arm64
          docker build -f Dockerfile.arm -t $IMAGE --shm-size=4g .
          docker push $IMAGE

.github/workflows/test-pr.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ jobs:
6666
with:
6767
cluster_name: ${{ env.CLUSTER_NAME }}
6868

69-
# NOTE(scott): Since the local Chart.yaml uses "appVersion: latest" and this
69+
# NOTE(scott): Since the local Chart.yaml uses "appVersion: latest" and this
7070
# only gets overwritten to the correct commit SHA during Helm chart build,
7171
# we need to pull these published images and load them into the kind cluster
7272
# with the correct tag.
Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,24 @@
11
---
# CI values for the chat app chart, reconstructed to the post-change state
# of this diff (deleted lines dropped, added lines kept).
azimuth-llm:
  huggingface:
    # Use the smallest LLM we can find
    model: &model HuggingFaceTB/SmolLM2-135M-Instruct
  api:
    # CI Kind cluster doesn't have kube-prometheus-stack
    monitoring:
      enabled: false
    # No GPUs in CI runners
    gpus: 0
  ui:
    service:
      zenith:
        enabled: false
    appSettings:
      model_name: *model
      # Verify that we can set non-standard LLM params
      llm_params:
        max_tokens: 101
        temperature: 0.1
        top_k: 2
        top_p: 0.15
        presence_penalty: 0.9
        frequency_penalty: 1
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
---
# CI values for the image-analysis app chart (new file in this commit).
azimuth-llm:
  huggingface:
    # Use the smallest vision model we can find
    model: &model HuggingFaceTB/SmolVLM-256M-Instruct
  api:
    # CI Kind cluster doesn't have kube-prometheus-stack
    monitoring:
      enabled: false
    # No GPUs in CI runners
    gpus: 0
  ui:
    service:
      zenith:
        enabled: false
    appSettings:
      model_name: *model
      # Verify that we can set non-standard LLM params
      llm_params:
        max_tokens: 10  # Constrain response tokens to speed up CI test
        temperature: 0.1
        top_p: 0.15
        presence_penalty: 0.9
        frequency_penalty: 1

charts/azimuth-image-analysis/ci/ui-only-values.yaml

Lines changed: 0 additions & 15 deletions
This file was deleted.
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
---
# This is intended to test the default chart values
# as close as possible given the constraints of running
# inside a Kind cluster within a CI runner
huggingface:
  # Use the smallest LLM we can find
  model: &model HuggingFaceTB/SmolLM2-135M-Instruct
api:
  # CI Kind cluster doesn't have kube-prometheus-stack
  monitoring:
    enabled: false
  # No GPUs in CI runners
  gpus: 0
ui:
  service:
    zenith:
      enabled: false
  appSettings:
    model_name: *model

charts/azimuth-llm/ci/no-api-values.yaml

Lines changed: 0 additions & 6 deletions
This file was deleted.

charts/azimuth-llm/templates/api/deployment.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ spec:
1919
spec:
2020
containers:
2121
- name: {{ .Release.Name }}-api
22-
image: {{ printf "%s:%s" .Values.api.image.repository .Values.api.image.version }}
22+
{{ $imageRepo := .Values.api.image.repository | default (ternary "ghcr.io/stackhpc/vllm-cpu" "vllm-project/vllm" (eq (.Values.api.gpus | int) 0)) -}}
23+
image: {{ printf "%s:%s" $imageRepo .Values.api.image.version }}
2324
ports:
2425
- name: api
2526
containerPort: 8000
@@ -29,7 +30,7 @@ spec:
2930
args:
3031
- --model
3132
- {{ .Values.huggingface.model }}
32-
{{- include "azimuth-llm.chatTemplate" . | nindent 10 }}
33+
{{- include "azimuth-llm.chatTemplate" . | nindent 10 -}}
3334
{{- if .Values.api.modelMaxContextLength -}}
3435
- --max-model-len
3536
- {{ .Values.api.modelMaxContextLength | quote }}
@@ -41,7 +42,7 @@ spec:
4142
{{- if .Values.api.extraArgs -}}
4243
{{- .Values.api.extraArgs | toYaml | nindent 10 }}
4344
{{- end -}}
44-
{{- if .Values.huggingface.secretName }}
45+
{{- if .Values.huggingface.secretName -}}
4546
envFrom:
4647
- secretRef:
4748
name: {{ .Values.huggingface.secretName }}

charts/azimuth-llm/templates/api/service.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
apiVersion: v1
33
kind: Service
44
metadata:
5-
name: {{ .Values.api.service.name }}
5+
name: {{ .Release.Name }}-api
66
labels:
77
{{- include "azimuth-llm.api-selectorLabels" . | nindent 4 }}
88
spec:

charts/azimuth-llm/templates/api/zenith-client.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ metadata:
88
spec:
99
reservationName: {{ .Release.Name }}-api
1010
upstream:
11-
serviceName: {{ .Values.api.service.name }}
11+
serviceName: {{ .Release.Name }}-api
1212
auth:
1313
skip: {{ .Values.api.service.zenith.skipAuth }}
1414
{{- end -}}

0 commit comments

Comments
 (0)