From f08d7c7a3f02ca3a1d83c61e7ec472a200128e09 Mon Sep 17 00:00:00 2001
From: Arthur Cheng <arthur.cheng@oracle.com>
Date: Mon, 22 Jun 2026 19:37:54 -0700
Subject: [PATCH] Add model and multi-node runtime for GLM 5.2

---
 config/models/zai-org/GLM-5.2-FP8.yaml        |  15 +
 config/runtimes/kustomization.yaml            |   1 +
 .../vllm/zai-org/glm-5-2-fp8-multi-rt.yaml    | 313 ++++++++++++++++++
 .../isvc/zai-org/glm-5-2-fp8-multi.yaml       |  16 +
 4 files changed, 345 insertions(+)
 create mode 100644 config/models/zai-org/GLM-5.2-FP8.yaml
 create mode 100644 config/runtimes/vllm/zai-org/glm-5-2-fp8-multi-rt.yaml
 create mode 100644 config/samples/isvc/zai-org/glm-5-2-fp8-multi.yaml

diff --git a/config/models/zai-org/GLM-5.2-FP8.yaml b/config/models/zai-org/GLM-5.2-FP8.yaml
new file mode 100644
index 000000000..a0bb70e41
--- /dev/null
+++ b/config/models/zai-org/GLM-5.2-FP8.yaml
@@ -0,0 +1,15 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: glm-5-2-fp8  # the sample InferenceService in this patch selects the model by this name
+spec:
+  displayName: zai-org.glm-5.2-fp8
+  vendor: zai-org
+  version: '1.0.0'
+  disabled: false
+  modelCapabilities:
+    - TEXT_TO_TEXT
+  storage:
+    key: hf-token  # presumably the secret holding the Hugging Face token - confirm
+    storageUri: hf://zai-org/GLM-5.2-FP8
+    path: /raid/models/zai-org/GLM-5.2-FP8
diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml
index 0b78b5905..cb860be5f 100644
--- a/config/runtimes/kustomization.yaml
+++ b/config/runtimes/kustomization.yaml
@@ -52,3 +52,4 @@ resources:
 - vllm/mixtral-8x7b-instruct-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
+- vllm/zai-org/glm-5-2-fp8-multi-rt.yaml
diff --git a/config/runtimes/vllm/zai-org/glm-5-2-fp8-multi-rt.yaml b/config/runtimes/vllm/zai-org/glm-5-2-fp8-multi-rt.yaml
new file mode 100644
index 000000000..ec2dab023
--- /dev/null
+++ b/config/runtimes/vllm/zai-org/glm-5-2-fp8-multi-rt.yaml
@@ -0,0 +1,313 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-glm-5-2-fp8-multi  # vLLM runtime spanning a leader plus one worker (--nnodes=2)
+spec:
+  disabled: false
+  routerConfig:
+    labels:
+      logging-forward: enabled
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "29000"
+      prometheus.io/path: /metrics
+    runner:
+      name: router
+      image: fra.ocir.io/idqj093njucb/smg:1.5.0
+      ports:
+        - name: http
+          containerPort: 8080
+      env:  # Downward API values, referenced as $(VAR) in args
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
+      args:
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)
+        - --service-discovery-port
+        - "8080"
+        - --selector
+        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
+        - --enable-igw
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+        - --disable-retries
+        - --disable-circuit-breaker
+      resources:
+        limits:
+          cpu: "8"
+          memory: 16Gi
+        requests:
+          cpu: "1"
+          memory: 2Gi
+      startupProbe:
+        initialDelaySeconds: 30
+        periodSeconds: 20
+        timeoutSeconds: 10
+        failureThreshold: 5
+        httpGet:
+          path: /health
+          port: 8080
+      readinessProbe:
+        periodSeconds: 30
+        timeoutSeconds: 10
+        failureThreshold: 5
+        httpGet:
+          path: /readiness
+          port: 8080
+      livenessProbe:
+        periodSeconds: 30
+        timeoutSeconds: 10
+        failureThreshold: 5
+        httpGet:
+          path: /liveness
+          port: 8080
+  supportedModelFormats:
+    - modelArchitecture: GlmMoeDsaForCausalLM
+      quantization: fp8
+      modelFramework:
+        name: transformers
+        version: '5.12.0'
+      modelFormat:
+        name: safetensors
+        version: '1.0.0'
+      priority: 1
+      autoSelect: true  # presumably allows automatic runtime selection for matching models - confirm
+  protocolVersions:
+    - openAI
+  modelSizeRange:  # size window from 700B to 800B
+    min: 700B
+    max: 800B
+  engineConfig:
+    labels:
+      logging-forward: enabled
+    annotations:
+      rdma.ome.io/auto-inject: 'true'
+      rdma.ome.io/profile: oci-roce
+      rdma.ome.io/container-name: ome-container  # same name as the leader/worker runner container
+      prometheus.io/scrape: 'true'
+      prometheus.io/port: '8080'  # the port vLLM serves on (--port=8080)
+      prometheus.io/path: /metrics
+    leader:  # node rank 0: runs the API server on port 8080
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node.kubernetes.io/instance-type
+                    operator: In
+                    values:
+                      - BM.GPU.H100.8
+                      - BM.GPU.H100-NC.8
+      dnsPolicy: ClusterFirstWithHostNet
+      hostNetwork: true  # NOTE(review): worker also sets enableServiceLinks: false and hostIPC: true - confirm the omission here is intentional
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+        - name: devinf
+          hostPath:
+            path: /dev/infiniband
+      runner:
+        name: ome-container  # same name as the rdma.ome.io/container-name annotation
+        image: fra.ocir.io/idqj093njucb/vllm-openai:v0.23.0-cu129
+        ports:
+          - containerPort: 8080
+            name: http1
+            protocol: TCP
+        command:
+          - vllm
+          - serve
+        args:
+          - $(MODEL_PATH)
+          - --trust-remote-code
+          - --kv-cache-dtype=fp8
+          - --tensor-parallel-size=8  # 8 GPUs per node (see resources below)
+          - -cc.cudagraph_num_of_warmups=10
+          - --pipeline-parallel-size=2  # TP 8 x PP 2 = 16 GPUs across the two nodes
+          - --nnodes=2
+          - --node-rank=0
+          - --master-addr=$(LWS_LEADER_ADDRESS)
+          - --enable-expert-parallel
+          - --gpu-memory-utilization=0.9
+          - --max-num-seqs=32
+          - --max-num-batched-tokens=512
+          - --enable-chunked-prefill
+          - --enable-prefix-caching
+          # -1 lets vLLM pick the largest context that fits in GPU memory (around 500K here)
+          - --max-model-len=-1
+          - --tool-call-parser=glm47
+          - --enable-auto-tool-choice
+          - --reasoning-parser=glm45
+          - --safetensors-load-strategy=prefetch
+          - --served-model-name=vllm-model
+          - --port=8080
+        env:
+          - name: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
+            value: '1'
+          - name: VLLM_ENGINE_READY_TIMEOUT_S
+            value: '3600'
+          - name: VLLM_LOGGING_LEVEL
+            value: 'INFO'
+          - name: GLOO_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NODE_IP
+            valueFrom:
+              fieldRef:
+                fieldPath: status.hostIP
+        volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /dev/infiniband
+            name: devinf
+        securityContext:
+          capabilities:
+            add:
+              - IPC_LOCK
+              - SYS_ADMIN  # Kubernetes capability names omit the CAP_ prefix
+          privileged: true
+        resources:
+          requests:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+          limits:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          failureThreshold: 3
+          successThreshold: 1
+          periodSeconds: 90
+          timeoutSeconds: 60
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          failureThreshold: 5
+          successThreshold: 1
+          periodSeconds: 60
+          timeoutSeconds: 60
+        startupProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          failureThreshold: 190  # 190 x 6s = 1140s (~19 min) startup budget after the initial delay
+          successThreshold: 1
+          periodSeconds: 6
+          initialDelaySeconds: 60
+          timeoutSeconds: 30
+    worker:  # node rank 1: headless, no API server
+      size: 1  # one worker plus the leader = 2 nodes, matching --nnodes=2
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node.kubernetes.io/instance-type
+                    operator: In
+                    values:
+                      - BM.GPU.H100.8
+                      - BM.GPU.H100-NC.8
+      dnsPolicy: ClusterFirstWithHostNet
+      hostNetwork: true
+      enableServiceLinks: false
+      hostIPC: true
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+        - name: devinf
+          hostPath:
+            path: /dev/infiniband
+      runner:
+        name: ome-container  # same name as the rdma.ome.io/container-name annotation
+        image: fra.ocir.io/idqj093njucb/vllm-openai:v0.23.0-cu129
+        command:
+          - vllm
+          - serve
+        args:
+          - $(MODEL_PATH)
+          - --trust-remote-code
+          - --kv-cache-dtype=fp8
+          - --tensor-parallel-size=8  # 8 GPUs per node (see resources below)
+          - -cc.cudagraph_num_of_warmups=10
+          - --pipeline-parallel-size=2  # TP 8 x PP 2 = 16 GPUs across the two nodes
+          - --nnodes=2
+          - --node-rank=1
+          - --master-addr=$(LWS_LEADER_ADDRESS)
+          - --headless
+          - --enable-expert-parallel
+          - --gpu-memory-utilization=0.9
+          - --max-num-seqs=32
+          - --max-num-batched-tokens=512
+          - --enable-chunked-prefill
+          - --enable-prefix-caching
+          # -1 lets vLLM pick the largest context that fits in GPU memory (around 500K here)
+          - --max-model-len=-1
+          - --tool-call-parser=glm47
+          - --enable-auto-tool-choice
+          - --reasoning-parser=glm45
+          - --safetensors-load-strategy=prefetch
+          - --served-model-name=vllm-model
+          - --port=8080
+        env:
+          - name: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
+            value: '1'
+          - name: VLLM_ENGINE_READY_TIMEOUT_S
+            value: '3600'
+          - name: VLLM_LOGGING_LEVEL
+            value: 'INFO'
+          - name: GLOO_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NODE_IP
+            valueFrom:
+              fieldRef:
+                fieldPath: status.hostIP
+        volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /dev/infiniband
+            name: devinf
+        securityContext:
+          capabilities:
+            add:
+              - IPC_LOCK
+              - SYS_ADMIN  # Kubernetes capability names omit the CAP_ prefix
+          privileged: true
+        resources:
+          requests:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+          limits:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+        # No HTTP probes: the worker runs --headless and exposes no HTTP endpoint.
diff --git a/config/samples/isvc/zai-org/glm-5-2-fp8-multi.yaml b/config/samples/isvc/zai-org/glm-5-2-fp8-multi.yaml
new file mode 100644
index 000000000..3b8d582d1
--- /dev/null
+++ b/config/samples/isvc/zai-org/glm-5-2-fp8-multi.yaml
@@ -0,0 +1,16 @@
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  namespace: glm
+  name: glm-5-2-fp8-multi
+spec:
+  runtime:
+    name: vllm-glm-5-2-fp8-multi  # the ClusterServingRuntime added by this patch
+  model:
+    name: glm-5-2-fp8  # the ClusterBaseModel added by this patch
+  router:
+    maxReplicas: 1
+    minReplicas: 1
+  engine:
+    maxReplicas: 1
+    minReplicas: 1