Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/runtimes/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ resources:
- vllm/mixtral-8x7b-instruct-rt.yaml
- vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
- vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
- vllm/deepseek-ai/deepseek-v4-pro-multi-rt.yaml
313 changes: 313 additions & 0 deletions config/runtimes/vllm/deepseek-ai/deepseek-v4-pro-multi-rt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,313 @@
# Cluster-wide serving runtime for DeepSeek V4 Pro on vLLM (multi-node variant).
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
  name: vllm-deepseek-v4-pro-multi
spec:
  disabled: false

  # Router pod placed in front of the engine pods.
  routerConfig:
    annotations:
      # Metrics are scraped from /metrics on port 29000 (not the HTTP port 8080).
      prometheus.io/path: /metrics
      prometheus.io/port: "29000"
      prometheus.io/scrape: "true"
    labels:
      logging-forward: enabled
    runner:
      name: router
      image: docker.io/lightseekorg/smg:1.5.0
      ports:
        - containerPort: 8080
          name: http
      resources:
        requests:
          cpu: "1"
          memory: 2Gi
        limits:
          cpu: "8"
          memory: 16Gi
      args:
        # Listen on every interface, port 8080.
        - --host
        - 0.0.0.0
        - --port
        - "8080"
        # Service discovery is restricted to this pod's namespace and to pods
        # matching the selector below (engine pods of the same InferenceService).
        - --service-discovery
        - --service-discovery-namespace
        - $(NAMESPACE)
        - --service-discovery-port
        - "8080"
        - --selector
        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
        - --enable-igw
        - --request-id-headers
        - opc-request-id
        - --log-json
        # Retries and the circuit breaker are both switched off.
        - --disable-retries
        - --disable-circuit-breaker
      env:
        # Both values come from the pod itself via the downward API and are
        # substituted into the args above.
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: INFERENCESERVICE_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.labels['ome.io/inferenceservice']
      # Startup budget: 30s initial delay, then 5 attempts 20s apart.
      startupProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 5
        periodSeconds: 20
        timeoutSeconds: 10
        initialDelaySeconds: 30
      readinessProbe:
        httpGet:
          path: /readiness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10
      livenessProbe:
        httpGet:
          path: /liveness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10

  # Models this runtime may be selected for: fp8 safetensors checkpoints of the
  # DeepseekV4ForCausalLM architecture, sized between 800B and 900B.
  supportedModelFormats:
    - modelFramework:
        name: transformers
        version: "4.57.1"
      modelFormat:
        name: safetensors
        version: "1.0.0"
      modelArchitecture: DeepseekV4ForCausalLM
      quantization: fp8
      autoSelect: true
      priority: 1
  modelSizeRange:
    min: 800B
    max: 900B
  protocolVersions:
    - openAI
# Engine pods: a leader plus worker(s) running vLLM on 8-GPU H100 bare-metal shapes.
engineConfig:
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8080"
    prometheus.io/path: /metrics
    # RDMA settings are injected into the container named ome-container.
    rdma.ome.io/auto-inject: "true"
    rdma.ome.io/profile: oci-roce
    rdma.ome.io/container-name: ome-container
  labels:
    logging-forward: enabled

  # Leader pod (node rank 0); this is the pod that exposes port 8080.
  leader:
    tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: node.kubernetes.io/instance-type
                  operator: In
                  values:
                    - BM.GPU.H100.8
                    - BM.GPU.H100-NC.8
    # NOTE(review): the worker additionally sets `enableServiceLinks: false` and
    # `hostIPC: true`; confirm the leader intentionally omits them.
    dnsPolicy: ClusterFirstWithHostNet
    hostNetwork: true
    volumes:
      # Memory-backed emptyDir, mounted by the runner at /dev/shm.
      - name: dshm
        emptyDir:
          medium: Memory
      # Host InfiniBand device directory.
      - name: devinf
        hostPath:
          path: /dev/infiniband
    runner:
      name: ome-container
      image: docker.io/vllm/vllm-openai:v0.23.0-cu129
      ports:
        - containerPort: 8080
          name: http1
          protocol: TCP
      command:
        - vllm
        - serve
      args:
        - $(MODEL_PATH)
        - --trust-remote-code
        - --kv-cache-dtype=fp8
        - --block-size=256
        - --enable-expert-parallel
        # Two nodes: tensor parallel 8 and pipeline parallel 2; this is node rank 0.
        - --tensor-parallel-size=8
        - --pipeline-parallel-size=2
        - --nnodes=2
        - --node-rank=0
        - -cc.pass_config.fuse_allreduce_rms=False
        - --master-addr=$(LWS_LEADER_ADDRESS)
        - --gpu-memory-utilization=0.95
        # Per the review discussion on this line, a lower value eases memory pressure.
        - --max-num-seqs=256
        # (the leader's argument list continues on the lines that follow)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does decreasing this increase or improve the batched_tokens?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Decreasing this helps with the memory pressure.

# Batched-token limit handed to vLLM on the leader (512).
# NOTE(review): presumably kept this low for the same memory-pressure reason as
# --max-num-seqs — confirm with the author.
- --max-num-batched-tokens=512

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😢

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, sadly.

# Remaining leader arguments. Order is kept exactly as written.
- --max-model-len=-1
- --no-enable-flashinfer-autotune
# Inline JSON compilation config: mode 0 with cudagraph_mode FULL_DECODE_ONLY.
# NOTE(review): `-cc.pass_config.fuse_allreduce_rms=False` is also passed earlier in
# this list — confirm vLLM merges both compilation settings rather than one winning.
- '--compilation-config={"mode": 0, "cudagraph_mode": "FULL_DECODE_ONLY"}'
# Tokenizer mode, tool-call parser and reasoning parser all use the deepseek_v4 variant.
- --tokenizer-mode=deepseek_v4
- --tool-call-parser=deepseek_v4
- --enable-auto-tool-choice
- --reasoning-parser=deepseek_v4
# Model is served under the fixed name "vllm-model" on port 8080, the same port
# the leader container declares and its probes target.
- --served-model-name=vllm-model
- --port=8080
# Environment, mounts, security settings, resources and probes of the leader's
# vLLM container (these keys are siblings of the `args` list above).
env:
  - name: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
    value: '1'
  # Engine readiness timeout, in seconds (3600 = 1 hour).
  - name: VLLM_ENGINE_READY_TIMEOUT_S
    value: '3600'
  - name: VLLM_LOGGING_LEVEL
    value: 'INFO'
  # Both socket-interface variables point Gloo and NCCL at eth0.
  - name: GLOO_SOCKET_IFNAME
    value: eth0
  - name: NCCL_SOCKET_IFNAME
    value: eth0
  # IP of the node hosting this pod, taken from the downward API.
  - name: NODE_IP
    valueFrom:
      fieldRef:
        fieldPath: status.hostIP
volumeMounts:
  - mountPath: /dev/shm
    name: dshm
  - mountPath: /dev/infiniband
    name: devinf
securityContext:
  capabilities:
    add:
      # Kubernetes capability names are written without the CAP_ prefix,
      # so CAP_SYS_ADMIN is spelled SYS_ADMIN here.
      - IPC_LOCK
      - SYS_ADMIN
  privileged: true
# Requests equal limits for CPU, memory and GPUs (one full 8-GPU node).
resources:
  requests:
    cpu: 64
    memory: 512Gi
    nvidia.com/gpu: 8
  limits:
    cpu: 64
    memory: 512Gi
    nvidia.com/gpu: 8
# All three probes hit /health on port 8080.
readinessProbe:
  httpGet:
    path: /health
    port: 8080
  failureThreshold: 3
  successThreshold: 1
  periodSeconds: 90
  timeoutSeconds: 60
livenessProbe:
  httpGet:
    path: /health
    port: 8080
  failureThreshold: 5
  successThreshold: 1
  periodSeconds: 60
  timeoutSeconds: 60
# Startup budget: 60s initial delay plus up to 190 attempts 6s apart
# (roughly 20 minutes in total).
startupProbe:
  httpGet:
    path: /health
    port: 8080
  failureThreshold: 190
  successThreshold: 1
  periodSeconds: 6
  initialDelaySeconds: 60
  timeoutSeconds: 30
# Worker pod(s) of the engine group: one worker, running vLLM as node rank 1
# against the leader's address. Scheduling constraints, volumes, image,
# environment and resources mirror the leader's.
worker:
  size: 1
  tolerations:
    - key: "nvidia.com/gpu"
      operator: "Exists"
      effect: "NoSchedule"
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: node.kubernetes.io/instance-type
                operator: In
                values:
                  - BM.GPU.H100.8
                  - BM.GPU.H100-NC.8
  dnsPolicy: ClusterFirstWithHostNet
  hostNetwork: true
  enableServiceLinks: false
  hostIPC: true
  volumes:
    # Memory-backed emptyDir, mounted by the runner at /dev/shm.
    - name: dshm
      emptyDir:
        medium: Memory
    # Host InfiniBand device directory.
    - name: devinf
      hostPath:
        path: /dev/infiniband
  runner:
    name: ome-container
    image: docker.io/vllm/vllm-openai:v0.23.0-cu129
    command:
      - vllm
      - serve
    args:
      - $(MODEL_PATH)
      - --trust-remote-code
      - --kv-cache-dtype=fp8
      - --block-size=256
      - --enable-expert-parallel
      # Same parallel layout as the leader (TP 8, PP 2, 2 nodes); this is rank 1.
      - --tensor-parallel-size=8
      - --pipeline-parallel-size=2
      - --nnodes=2
      - --node-rank=1
      - -cc.pass_config.fuse_allreduce_rms=False
      - --master-addr=$(LWS_LEADER_ADDRESS)
      # Only the worker passes --headless, consistent with the note at the end of
      # this block that the worker exposes no HTTP endpoints.
      - --headless
      - --gpu-memory-utilization=0.95
      - --max-num-seqs=256
      - --max-num-batched-tokens=512
      - --max-model-len=-1
      - --no-enable-flashinfer-autotune
      # NOTE(review): compilation settings are passed both here and via the
      # `-cc.` argument above — confirm vLLM merges them.
      - '--compilation-config={"mode": 0, "cudagraph_mode": "FULL_DECODE_ONLY"}'
      - --tokenizer-mode=deepseek_v4
      - --tool-call-parser=deepseek_v4
      - --enable-auto-tool-choice
      - --reasoning-parser=deepseek_v4
      - --served-model-name=vllm-model
      - --port=8080
    env:
      - name: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
        value: '1'
      # Engine readiness timeout, in seconds (3600 = 1 hour).
      - name: VLLM_ENGINE_READY_TIMEOUT_S
        value: '3600'
      - name: VLLM_LOGGING_LEVEL
        value: 'INFO'
      # Both socket-interface variables point Gloo and NCCL at eth0.
      - name: GLOO_SOCKET_IFNAME
        value: eth0
      - name: NCCL_SOCKET_IFNAME
        value: eth0
      # IP of the node hosting this pod, taken from the downward API.
      - name: NODE_IP
        valueFrom:
          fieldRef:
            fieldPath: status.hostIP
    volumeMounts:
      - mountPath: /dev/shm
        name: dshm
      - mountPath: /dev/infiniband
        name: devinf
    securityContext:
      capabilities:
        add:
          # Kubernetes capability names are written without the CAP_ prefix,
          # so CAP_SYS_ADMIN is spelled SYS_ADMIN here.
          - IPC_LOCK
          - SYS_ADMIN
      privileged: true
    # Requests equal limits for CPU, memory and GPUs (one full 8-GPU node).
    resources:
      requests:
        cpu: 64
        memory: 512Gi
        nvidia.com/gpu: 8
      limits:
        cpu: 64
        memory: 512Gi
        nvidia.com/gpu: 8
    # Omit HTTP probes since the worker does not expose HTTP endpoints.
21 changes: 21 additions & 0 deletions config/samples/isvc/deepseek-ai/deepseek-v4-pro-multi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Namespace manifest is left commented out. The InferenceService below targets
# `deepseek-v4`, so that namespace presumably has to exist already (or uncomment
# these lines to create it together with the sample).
# apiVersion: v1
# kind: Namespace
# metadata:
#   name: deepseek-v4
---
# Sample InferenceService: serves the deepseek-v4-pro model through the
# vllm-deepseek-v4-pro-multi runtime with exactly one engine and one router replica.
apiVersion: ome.io/v1beta1
kind: InferenceService
metadata:
  name: deepseek-v4-pro-multi
  namespace: deepseek-v4
spec:
  model:
    name: deepseek-v4-pro
  runtime:
    name: vllm-deepseek-v4-pro-multi
  # min == max pins each component to a single replica.
  engine:
    minReplicas: 1
    maxReplicas: 1
  router:
    minReplicas: 1
    maxReplicas: 1
Loading