From f08d7c7a3f02ca3a1d83c61e7ec472a200128e09 Mon Sep 17 00:00:00 2001
From: Arthur Cheng
Date: Mon, 22 Jun 2026 19:37:54 -0700
Subject: [PATCH] add model and multi node runtime for GLM 5.2

Add the manifests needed to serve zai-org/GLM-5.2-FP8 with vLLM across
two nodes:

- ClusterBaseModel glm-5-2-fp8, sourced from hf://zai-org/GLM-5.2-FP8.
- ClusterServingRuntime vllm-glm-5-2-fp8-multi: an smg router plus a
  leader/worker engine pair (--nnodes=2, tensor parallel 8, pipeline
  parallel 2) restricted to BM.GPU.H100.8 / BM.GPU.H100-NC.8 nodes.
  The leader serves HTTP on port 8080 and carries the probes; the
  worker runs with --headless and has no HTTP probes.
- Sample InferenceService glm-5-2-fp8-multi that binds the model to the
  runtime with one engine replica and one router replica.
- Register the new runtime in config/runtimes/kustomization.yaml.

The extra Linux capability is listed as SYS_ADMIN: Kubernetes expects
capability names without the CAP_ prefix.

---
Reviewer note: the capability rename and the reworded --max-model-len
comment replace lines in place, so hunk line counts are unchanged; the
blob ids on the "index" lines below were not regenerated.

 config/models/zai-org/GLM-5.2-FP8.yaml        |  15 +
 config/runtimes/kustomization.yaml            |   1 +
 .../vllm/zai-org/glm-5-2-fp8-multi-rt.yaml    | 313 ++++++++++++++++++
 .../isvc/zai-org/glm-5-2-fp8-multi.yaml       |  16 +
 4 files changed, 345 insertions(+)
 create mode 100644 config/models/zai-org/GLM-5.2-FP8.yaml
 create mode 100644 config/runtimes/vllm/zai-org/glm-5-2-fp8-multi-rt.yaml
 create mode 100644 config/samples/isvc/zai-org/glm-5-2-fp8-multi.yaml

diff --git a/config/models/zai-org/GLM-5.2-FP8.yaml b/config/models/zai-org/GLM-5.2-FP8.yaml
new file mode 100644
index 000000000..a0bb70e41
--- /dev/null
+++ b/config/models/zai-org/GLM-5.2-FP8.yaml
@@ -0,0 +1,15 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: glm-5-2-fp8
+spec:
+  modelCapabilities:
+    - TEXT_TO_TEXT
+  vendor: zai-org
+  displayName: zai-org.glm-5.2-fp8
+  disabled: false
+  version: "1.0.0"
+  storage:
+    storageUri: hf://zai-org/GLM-5.2-FP8
+    path: /raid/models/zai-org/GLM-5.2-FP8
+    key: hf-token
diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml
index 0b78b5905..cb860be5f 100644
--- a/config/runtimes/kustomization.yaml
+++ b/config/runtimes/kustomization.yaml
@@ -52,3 +52,4 @@ resources:
 - vllm/mixtral-8x7b-instruct-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
+- vllm/zai-org/glm-5-2-fp8-multi-rt.yaml
diff --git a/config/runtimes/vllm/zai-org/glm-5-2-fp8-multi-rt.yaml b/config/runtimes/vllm/zai-org/glm-5-2-fp8-multi-rt.yaml
new file mode 100644
index 000000000..ec2dab023
--- /dev/null
+++ b/config/runtimes/vllm/zai-org/glm-5-2-fp8-multi-rt.yaml
@@ -0,0 +1,313 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-glm-5-2-fp8-multi
+spec:
+  disabled: false
+  routerConfig:
+    annotations:
+      prometheus.io/path: /metrics
+      prometheus.io/port: '29000'
+      prometheus.io/scrape: 'true'
+    labels:
+      logging-forward: enabled
+    runner:
+      name: router
+      image: fra.ocir.io/idqj093njucb/smg:1.5.0
+      ports:
+        - containerPort: 8080
+          name: http
+      resources:
+        requests:
+          cpu: "1"
+          memory: 2Gi
+        limits:
+          cpu: "8"
+          memory: 16Gi
+      args:
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)
+        - --service-discovery-port
+        - "8080"
+        - --selector
+        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
+        - --enable-igw
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+        - --disable-retries
+        - --disable-circuit-breaker
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
+      readinessProbe:
+        httpGet:
+          path: /readiness
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 30
+        timeoutSeconds: 10
+      livenessProbe:
+        httpGet:
+          path: /liveness
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 30
+        timeoutSeconds: 10
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 20
+        timeoutSeconds: 10
+        initialDelaySeconds: 30
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "5.12.0"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: GlmMoeDsaForCausalLM
+      quantization: fp8
+      autoSelect: true
+      priority: 1
+  modelSizeRange:
+    min: 700B
+    max: 800B
+  protocolVersions:
+    - openAI
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+      rdma.ome.io/auto-inject: "true"
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"
+    labels:
+      logging-forward: enabled
+    leader:
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node.kubernetes.io/instance-type
+                    operator: In
+                    values:
+                      - BM.GPU.H100.8
+                      - BM.GPU.H100-NC.8
+      dnsPolicy: ClusterFirstWithHostNet
+      hostNetwork: true
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+        - name: devinf
+          hostPath:
+            path: /dev/infiniband
+      runner:
+        name: ome-container
+        image: fra.ocir.io/idqj093njucb/vllm-openai:v0.23.0-cu129
+        ports:
+          - containerPort: 8080
+            name: http1
+            protocol: TCP
+        command:
+          - vllm
+          - serve
+        args:
+          - $(MODEL_PATH)
+          - --trust-remote-code
+          - --kv-cache-dtype=fp8
+          - --tensor-parallel-size=8
+          - -cc.cudagraph_num_of_warmups=10
+          - --pipeline-parallel-size=2
+          - --nnodes=2
+          - --node-rank=0
+          - --master-addr=$(LWS_LEADER_ADDRESS)
+          - --enable-expert-parallel
+          - --gpu-memory-utilization=0.9
+          - --max-num-seqs=32
+          - --max-num-batched-tokens=512
+          - --enable-chunked-prefill
+          - --enable-prefix-caching
+          # -1 = auto-size context to fit memory (around 500K here) -- TODO confirm
+          - --max-model-len=-1
+          - --tool-call-parser=glm47
+          - --enable-auto-tool-choice
+          - --reasoning-parser=glm45
+          - --safetensors-load-strategy=prefetch
+          - --served-model-name=vllm-model
+          - --port=8080
+        env:
+          - name: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
+            value: '1'
+          - name: VLLM_ENGINE_READY_TIMEOUT_S
+            value: '3600'
+          - name: VLLM_LOGGING_LEVEL
+            value: 'INFO'
+          - name: GLOO_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NODE_IP
+            valueFrom:
+              fieldRef:
+                fieldPath: status.hostIP
+        volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /dev/infiniband
+            name: devinf
+        securityContext:
+          capabilities:
+            add:
+              - IPC_LOCK
+              - SYS_ADMIN
+          privileged: true
+        resources:
+          requests:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+          limits:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          failureThreshold: 3
+          successThreshold: 1
+          periodSeconds: 90
+          timeoutSeconds: 60
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          failureThreshold: 5
+          successThreshold: 1
+          periodSeconds: 60
+          timeoutSeconds: 60
+        startupProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          failureThreshold: 190
+          successThreshold: 1
+          periodSeconds: 6
+          initialDelaySeconds: 60
+          timeoutSeconds: 30
+    worker:
+      size: 1
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node.kubernetes.io/instance-type
+                    operator: In
+                    values:
+                      - BM.GPU.H100.8
+                      - BM.GPU.H100-NC.8
+      dnsPolicy: ClusterFirstWithHostNet
+      hostNetwork: true
+      enableServiceLinks: false
+      hostIPC: true
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+        - name: devinf
+          hostPath:
+            path: /dev/infiniband
+      runner:
+        name: ome-container
+        image: fra.ocir.io/idqj093njucb/vllm-openai:v0.23.0-cu129
+        command:
+          - vllm
+          - serve
+        args:
+          - $(MODEL_PATH)
+          - --trust-remote-code
+          - --kv-cache-dtype=fp8
+          - --tensor-parallel-size=8
+          - -cc.cudagraph_num_of_warmups=10
+          - --pipeline-parallel-size=2
+          - --nnodes=2
+          - --node-rank=1
+          - --master-addr=$(LWS_LEADER_ADDRESS)
+          - --headless
+          - --enable-expert-parallel
+          - --gpu-memory-utilization=0.9
+          - --max-num-seqs=32
+          - --max-num-batched-tokens=512
+          - --enable-chunked-prefill
+          - --enable-prefix-caching
+          # -1 = auto-size context to fit memory (around 500K here) -- TODO confirm
+          - --max-model-len=-1
+          - --tool-call-parser=glm47
+          - --enable-auto-tool-choice
+          - --reasoning-parser=glm45
+          - --safetensors-load-strategy=prefetch
+          - --served-model-name=vllm-model
+          - --port=8080
+        env:
+          - name: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
+            value: '1'
+          - name: VLLM_ENGINE_READY_TIMEOUT_S
+            value: '3600'
+          - name: VLLM_LOGGING_LEVEL
+            value: 'INFO'
+          - name: GLOO_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NODE_IP
+            valueFrom:
+              fieldRef:
+                fieldPath: status.hostIP
+        volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /dev/infiniband
+            name: devinf
+        securityContext:
+          capabilities:
+            add:
+              - IPC_LOCK
+              - SYS_ADMIN
+          privileged: true
+        resources:
+          requests:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+          limits:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+        # omit HTTP probes since worker does not expose HTTP endpoints
diff --git a/config/samples/isvc/zai-org/glm-5-2-fp8-multi.yaml b/config/samples/isvc/zai-org/glm-5-2-fp8-multi.yaml
new file mode 100644
index 000000000..3b8d582d1
--- /dev/null
+++ b/config/samples/isvc/zai-org/glm-5-2-fp8-multi.yaml
@@ -0,0 +1,16 @@
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: glm-5-2-fp8-multi
+  namespace: glm
+spec:
+  model:
+    name: glm-5-2-fp8
+  engine:
+    minReplicas: 1
+    maxReplicas: 1
+  router:
+    minReplicas: 1
+    maxReplicas: 1
+  runtime:
+    name: vllm-glm-5-2-fp8-multi