Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions config/models/zai-org/GLM-5.2-FP8.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Cluster-scoped model registration for the FP8 build of GLM-5.2.
# InferenceService objects refer to this model through metadata.name.
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
  name: glm-5-2-fp8
spec:
  # Identification / catalogue fields.
  vendor: zai-org
  displayName: zai-org.glm-5.2-fp8
  version: "1.0.0"
  disabled: false
  modelCapabilities:
    - TEXT_TO_TEXT
  # Where the weights come from and where they are placed on the node.
  storage:
    # Hugging Face repository holding the weights.
    storageUri: hf://zai-org/GLM-5.2-FP8
    # Local destination path for the downloaded weights.
    path: /raid/models/zai-org/GLM-5.2-FP8
    # NOTE(review): presumably the name of the secret/key carrying the
    # Hugging Face token -- confirm against the OME storage documentation.
    key: hf-token
1 change: 1 addition & 0 deletions config/runtimes/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ resources:
- vllm/mixtral-8x7b-instruct-rt.yaml
- vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
- vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
- vllm/zai-org/glm-5-2-fp8-multi-rt.yaml
313 changes: 313 additions & 0 deletions config/runtimes/vllm/zai-org/glm-5-2-fp8-multi-rt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,313 @@
# Multi-node vLLM serving runtime for GLM-5.2 FP8.
#
# Topology described by this manifest:
#   * one router pod (routerConfig) that discovers engine pods by label;
#   * one leader pod  (--node-rank=0) that serves HTTP on port 8080;
#   * one worker pod  (--node-rank=1, --headless) with no HTTP endpoint.
# Leader and worker each request 8 GPUs; tensor-parallel 8 x pipeline-parallel 2
# spans the 16 GPUs of the two nodes (--nnodes=2).
#
# The leader and worker vLLM argument lists are intentionally identical apart
# from --node-rank and --headless; keep them in sync when editing.
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
  name: vllm-glm-5-2-fp8-multi
spec:
  disabled: false

  # ---------------------------------------------------------------- router --
  routerConfig:
    annotations:
      prometheus.io/path: /metrics
      # Metrics port differs from the HTTP port (8080) declared below.
      prometheus.io/port: '29000'
      prometheus.io/scrape: 'true'
    labels:
      logging-forward: enabled
    runner:
      name: router
      image: fra.ocir.io/idqj093njucb/smg:1.5.0
      ports:
        - containerPort: 8080
          name: http
      resources:
        requests:
          cpu: "1"
          memory: 2Gi
        limits:
          cpu: "8"
          memory: 16Gi
      args:
        - --host
        - 0.0.0.0
        - --port
        - "8080"
        - --service-discovery
        - --service-discovery-namespace
        - $(NAMESPACE)
        - --service-discovery-port
        - "8080"
        # Both label selectors are passed as ONE argument, separated by a space.
        - --selector
        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
        - --enable-igw
        - --request-id-headers
        - opc-request-id
        - --log-json
        - --disable-retries
        - --disable-circuit-breaker
      env:
        # Downward-API values substituted into the args above.
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: INFERENCESERVICE_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.labels['ome.io/inferenceservice']
      readinessProbe:
        httpGet:
          path: /readiness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10
      livenessProbe:
        httpGet:
          path: /liveness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10
      # Startup budget: 30s initial delay + 5 failures x 20s period.
      startupProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 5
        periodSeconds: 20
        timeoutSeconds: 10
        initialDelaySeconds: 30

  # ------------------------------------------------------- model matching --
  supportedModelFormats:
    - modelFramework:
        name: transformers
        version: "5.12.0"
      modelFormat:
        name: safetensors
        version: "1.0.0"
      modelArchitecture: GlmMoeDsaForCausalLM
      quantization: fp8
      autoSelect: true
      priority: 1
  modelSizeRange:
    min: 700B
    max: 800B
  protocolVersions:
    - openAI

  # ---------------------------------------------------------------- engine --
  engineConfig:
    annotations:
      prometheus.io/scrape: "true"
      prometheus.io/port: "8080"
      prometheus.io/path: "/metrics"
      # NOTE(review): presumably consumed by an OME RDMA auto-injection
      # mechanism that targets the container named below -- confirm.
      rdma.ome.io/auto-inject: "true"
      rdma.ome.io/profile: "oci-roce"
      rdma.ome.io/container-name: "ome-container"
    labels:
      logging-forward: enabled

    # Rank 0: runs the API server and coordinates the worker.
    leader:
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node.kubernetes.io/instance-type
                    operator: In
                    values:
                      - BM.GPU.H100.8
                      - BM.GPU.H100-NC.8
      dnsPolicy: ClusterFirstWithHostNet
      hostNetwork: true
      # Kept identical to the worker pod spec. Disabling service links also
      # stops Kubernetes from injecting <SERVICE>_PORT-style variables that
      # could collide with VLLM_* environment settings.
      enableServiceLinks: false
      hostIPC: true
      volumes:
        # Memory-backed /dev/shm.
        - name: dshm
          emptyDir:
            medium: Memory
        # Host InfiniBand/RDMA device nodes.
        - name: devinf
          hostPath:
            path: /dev/infiniband
      runner:
        name: ome-container
        image: fra.ocir.io/idqj093njucb/vllm-openai:v0.23.0-cu129
        ports:
          - containerPort: 8080
            name: http1
            protocol: TCP
        command:
          - vllm
          - serve
        args:
          - $(MODEL_PATH)
          - --trust-remote-code
          - --kv-cache-dtype=fp8
          - --tensor-parallel-size=8
          - -cc.cudagraph_num_of_warmups=10
          - --pipeline-parallel-size=2
          - --nnodes=2
          - --node-rank=0
          # NOTE(review): LWS_LEADER_ADDRESS is not defined in this file;
          # presumably injected by the LeaderWorkerSet controller -- confirm.
          - --master-addr=$(LWS_LEADER_ADDRESS)
          - --enable-expert-parallel
          - --gpu-memory-utilization=0.9
          - --max-num-seqs=32
          - --max-num-batched-tokens=512
          - --enable-chunked-prefill
          - --enable-prefix-caching
          # -1 asks vLLM to size the context window automatically instead of
          # pinning it. The original author noted this works out to roughly
          # 500K tokens on this hardware (not verifiable from this file).
          - --max-model-len=-1
          - --tool-call-parser=glm47
          - --enable-auto-tool-choice
          - --reasoning-parser=glm45
          - --safetensors-load-strategy=prefetch
          - --served-model-name=vllm-model
          - --port=8080
        env:
          - name: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
            value: '1'
          - name: VLLM_ENGINE_READY_TIMEOUT_S
            value: '3600'
          - name: VLLM_LOGGING_LEVEL
            value: 'INFO'
          - name: GLOO_SOCKET_IFNAME
            value: eth0
          - name: NCCL_SOCKET_IFNAME
            value: eth0
          - name: NODE_IP
            valueFrom:
              fieldRef:
                fieldPath: status.hostIP
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /dev/infiniband
            name: devinf
        securityContext:
          capabilities:
            add:
              # Kubernetes capability names are written WITHOUT the CAP_
              # prefix (SYS_ADMIN, not CAP_SYS_ADMIN).
              - IPC_LOCK
              - SYS_ADMIN
          privileged: true
        resources:
          requests:
            cpu: 64
            memory: 512Gi
            nvidia.com/gpu: 8
          limits:
            cpu: 64
            memory: 512Gi
            nvidia.com/gpu: 8
        readinessProbe:
          httpGet:
            path: /health
            port: 8080
          failureThreshold: 3
          successThreshold: 1
          periodSeconds: 90
          timeoutSeconds: 60
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          failureThreshold: 5
          successThreshold: 1
          periodSeconds: 60
          timeoutSeconds: 60
        # Startup budget: 60s initial delay + 190 failures x 6s period
        # (= 1200s). NOTE(review): this is shorter than the 3600s
        # VLLM_ENGINE_READY_TIMEOUT_S above -- confirm that is intended.
        startupProbe:
          httpGet:
            path: /health
            port: 8080
          failureThreshold: 190
          successThreshold: 1
          periodSeconds: 6
          initialDelaySeconds: 60
          timeoutSeconds: 30

    # Rank 1: headless participant, no API server.
    worker:
      size: 1
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: node.kubernetes.io/instance-type
                    operator: In
                    values:
                      - BM.GPU.H100.8
                      - BM.GPU.H100-NC.8
      dnsPolicy: ClusterFirstWithHostNet
      hostNetwork: true
      enableServiceLinks: false
      hostIPC: true
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        - name: devinf
          hostPath:
            path: /dev/infiniband
      runner:
        name: ome-container
        image: fra.ocir.io/idqj093njucb/vllm-openai:v0.23.0-cu129
        command:
          - vllm
          - serve
        args:
          - $(MODEL_PATH)
          - --trust-remote-code
          - --kv-cache-dtype=fp8
          - --tensor-parallel-size=8
          - -cc.cudagraph_num_of_warmups=10
          - --pipeline-parallel-size=2
          - --nnodes=2
          - --node-rank=1
          - --master-addr=$(LWS_LEADER_ADDRESS)
          - --headless
          - --enable-expert-parallel
          - --gpu-memory-utilization=0.9
          - --max-num-seqs=32
          - --max-num-batched-tokens=512
          - --enable-chunked-prefill
          - --enable-prefix-caching
          # -1 asks vLLM to size the context window automatically; must match
          # the leader's setting.
          - --max-model-len=-1
          - --tool-call-parser=glm47
          - --enable-auto-tool-choice
          - --reasoning-parser=glm45
          - --safetensors-load-strategy=prefetch
          - --served-model-name=vllm-model
          - --port=8080
        env:
          - name: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
            value: '1'
          - name: VLLM_ENGINE_READY_TIMEOUT_S
            value: '3600'
          - name: VLLM_LOGGING_LEVEL
            value: 'INFO'
          - name: GLOO_SOCKET_IFNAME
            value: eth0
          - name: NCCL_SOCKET_IFNAME
            value: eth0
          - name: NODE_IP
            valueFrom:
              fieldRef:
                fieldPath: status.hostIP
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /dev/infiniband
            name: devinf
        securityContext:
          capabilities:
            add:
              # Capability names omit the CAP_ prefix in Kubernetes manifests.
              - IPC_LOCK
              - SYS_ADMIN
          privileged: true
        resources:
          requests:
            cpu: 64
            memory: 512Gi
            nvidia.com/gpu: 8
          limits:
            cpu: 64
            memory: 512Gi
            nvidia.com/gpu: 8
        # omit HTTP probes since worker does not expose HTTP endpoints
16 changes: 16 additions & 0 deletions config/samples/isvc/zai-org/glm-5-2-fp8-multi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Sample InferenceService that serves the glm-5-2-fp8 model through the
# vllm-glm-5-2-fp8-multi runtime, with exactly one engine replica and one
# router replica (min == max, so no scaling range).
apiVersion: ome.io/v1beta1
kind: InferenceService
metadata:
  name: glm-5-2-fp8-multi
  namespace: glm
spec:
  # Model to serve, referenced by name.
  model:
    name: glm-5-2-fp8
  # Runtime to serve it with, referenced by name.
  runtime:
    name: vllm-glm-5-2-fp8-multi
  # Fixed replica counts for each component.
  engine:
    minReplicas: 1
    maxReplicas: 1
  router:
    minReplicas: 1
    maxReplicas: 1
Loading