ome-projects · YouNeedCryDear · Jun 17, 2026 · shenoyvvarun · Jun 18, 2026 · YouNeedCryDear
@@ -52,3 +52,4 @@ resources:
 - vllm/mixtral-8x7b-instruct-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
+- vllm/deepseek-ai/deepseek-v4-pro-multi-rt.yaml
@@ -0,0 +1,313 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-deepseek-v4-pro-multi
+spec:
+  disabled: false
+  routerConfig:
+    annotations:  # values quoted so they stay strings, same style as engineConfig.annotations
+      prometheus.io/path: "/metrics"
+      prometheus.io/port: "29000"
+      prometheus.io/scrape: "true"
+    labels:
+      logging-forward: enabled
+    runner:
+      name: router
+      image: docker.io/lightseekorg/smg:1.5.0
+      ports:
+        - containerPort: 8080
+          name: http
+      resources:
+        requests:
+          cpu: "1"
+          memory: 2Gi
+        limits:
+          cpu: "8"
+          memory: 16Gi
+      args:
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)
+        - --service-discovery-port
+        - "8080"
+        - --selector  # leader pods only: the LWS worker runs vllm --headless and serves no HTTP
+        - component=engine leaderworkerset.sigs.k8s.io/worker-index=0 ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
+        - --enable-igw
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+        - --disable-retries
+        - --disable-circuit-breaker
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
+      readinessProbe:
+        httpGet:
+          path: /readiness
+          port: 8080
+        failureThreshold: 5  # unready after 5 consecutive failures (5 x 30s)
+        periodSeconds: 30
+        timeoutSeconds: 10
+      livenessProbe:
+        httpGet:
+          path: /liveness
+          port: 8080
+        failureThreshold: 5  # container restarted after 5 consecutive failures (5 x 30s)
+        periodSeconds: 30
+        timeoutSeconds: 10
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5  # startup window: 30s initial delay + 5 x 20s
+        periodSeconds: 20
+        timeoutSeconds: 10
+        initialDelaySeconds: 30
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.57.1"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: DeepseekV4ForCausalLM
+      quantization: fp8
+      autoSelect: true
+      priority: 1
+  modelSizeRange:
+    min: 800B
+    max: 900B
+  protocolVersions:
+    - openAI
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"  # same port the leader passes to vllm via --port=8080
+      prometheus.io/path: "/metrics"
+      rdma.ome.io/auto-inject: "true"  # NOTE(review): presumably consumed by an OME RDMA injector — confirm
+      rdma.ome.io/profile: "oci-roce"
+      rdma.ome.io/container-name: "ome-container"  # same name as the leader/worker runner container
+    labels:
+      logging-forward: enabled
+    leader:
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node.kubernetes.io/instance-type
+                    operator: In
+                    values:
+                      - BM.GPU.H100.8
+                      - BM.GPU.H100-NC.8
+      dnsPolicy: ClusterFirstWithHostNet  # keeps cluster DNS resolution for a hostNetwork pod
+      hostNetwork: true  # NOTE(review): worker also sets hostIPC: true and enableServiceLinks: false; leader does not — confirm intended
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory  # memory-backed volume, mounted at /dev/shm by the runner
+        - name: devinf
+          hostPath:
+            path: /dev/infiniband  # host directory mounted at the same path in the runner
+      runner:
+        name: ome-container
+        image: docker.io/vllm/vllm-openai:v0.23.0-cu129
+        ports:
+          - containerPort: 8080
+            name: http1
+            protocol: TCP
+        command:
+          - vllm
+          - serve
+        args:
+          - $(MODEL_PATH)  # not defined in env below; presumably injected by the OME controller — confirm
+          - --trust-remote-code
+          - --kv-cache-dtype=fp8
+          - --block-size=256
+          - --enable-expert-parallel
+          - --tensor-parallel-size=8  # equals the 8 GPUs requested per pod (nvidia.com/gpu: 8)
+          - --pipeline-parallel-size=2  # TP 8 x PP 2 = 16 GPUs = 2 nodes x 8 GPUs
+          - --nnodes=2
+          - --node-rank=0  # leader is rank 0; the worker template passes --node-rank=1
+          - -cc.pass_config.fuse_allreduce_rms=False
+          - --master-addr=$(LWS_LEADER_ADDRESS)  # not defined in env below; presumably injected by LeaderWorkerSet — confirm
+          - --gpu-memory-utilization=0.95
+          - --max-num-seqs=256
+          - --max-num-batched-tokens=512
+          - --max-model-len=-1
+          - --no-enable-flashinfer-autotune
+          - '--compilation-config={"mode": 0, "cudagraph_mode": "FULL_DECODE_ONLY"}'
+          - --tokenizer-mode=deepseek_v4
+          - --tool-call-parser=deepseek_v4
+          - --enable-auto-tool-choice
+          - --reasoning-parser=deepseek_v4
+          - --served-model-name=vllm-model
+          - --port=8080  # HTTP port used by containerPort, the probes and the router's discovery port
+        env:  # numeric-looking values are quoted because env values must be strings
+          - name: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
+            value: "1"
+          - name: VLLM_ENGINE_READY_TIMEOUT_S
+            value: "3600"
+          - name: VLLM_LOGGING_LEVEL
+            value: INFO
+          - name: GLOO_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NODE_IP  # host IP via the downward API; not referenced by the args in this template
+            valueFrom:
+              fieldRef:
+                fieldPath: status.hostIP
+        volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /dev/infiniband
+            name: devinf
+        securityContext:
+          capabilities:
+            add:
+              - IPC_LOCK
+              - SYS_ADMIN  # Kubernetes capability names omit the CAP_ prefix
+          privileged: true  # privileged containers already receive all capabilities
+        resources:
+          requests:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+          limits:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          failureThreshold: 3  # unready after 3 consecutive failures (3 x 90s)
+          successThreshold: 1
+          periodSeconds: 90
+          timeoutSeconds: 60
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          failureThreshold: 5  # container restarted after 5 consecutive failures (5 x 60s)
+          successThreshold: 1
+          periodSeconds: 60
+          timeoutSeconds: 60
+        startupProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          failureThreshold: 190  # startup window: 60s delay + 190 x 6s = 1200s (~20 min)
+          successThreshold: 1
+          periodSeconds: 6
+          initialDelaySeconds: 60  # NOTE(review): window is shorter than VLLM_ENGINE_READY_TIMEOUT_S=3600 — confirm intended
+          timeoutSeconds: 30
+    worker:
+      size: 1
+      tolerations:
+        - key: "nvidia.com/gpu"
+          operator: "Exists"
+          effect: "NoSchedule"
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: node.kubernetes.io/instance-type
+                    operator: In
+                    values:
+                      - BM.GPU.H100.8
+                      - BM.GPU.H100-NC.8
+      dnsPolicy: ClusterFirstWithHostNet
+      hostNetwork: true
+      enableServiceLinks: false
+      hostIPC: true
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+        - name: devinf
+          hostPath:
+            path: /dev/infiniband
+      runner:
+        name: ome-container
+        image: docker.io/vllm/vllm-openai:v0.23.0-cu129
+        command:
+          - vllm
+          - serve
+        args:
+          - $(MODEL_PATH)  # not defined in env below; presumably injected by the OME controller — confirm
+          - --trust-remote-code
+          - --kv-cache-dtype=fp8
+          - --block-size=256
+          - --enable-expert-parallel
+          - --tensor-parallel-size=8  # equals the 8 GPUs requested per pod (nvidia.com/gpu: 8)
+          - --pipeline-parallel-size=2  # TP 8 x PP 2 = 16 GPUs = 2 nodes x 8 GPUs
+          - --nnodes=2
+          - --node-rank=1  # worker is rank 1; the leader template passes --node-rank=0
+          - -cc.pass_config.fuse_allreduce_rms=False
+          - --master-addr=$(LWS_LEADER_ADDRESS)  # not defined in env below; presumably injected by LeaderWorkerSet — confirm
+          - --headless  # worker-only flag; the leader template omits it
+          - --gpu-memory-utilization=0.95
+          - --max-num-seqs=256
+          - --max-num-batched-tokens=512
+          - --max-model-len=-1
+          - --no-enable-flashinfer-autotune
+          - '--compilation-config={"mode": 0, "cudagraph_mode": "FULL_DECODE_ONLY"}'
+          - --tokenizer-mode=deepseek_v4
+          - --tool-call-parser=deepseek_v4
+          - --enable-auto-tool-choice
+          - --reasoning-parser=deepseek_v4
+          - --served-model-name=vllm-model
+          - --port=8080  # NOTE(review): presumably unused under --headless; kept in sync with the leader — confirm
+        env:  # numeric-looking values are quoted because env values must be strings
+          - name: VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
+            value: "1"
+          - name: VLLM_ENGINE_READY_TIMEOUT_S
+            value: "3600"
+          - name: VLLM_LOGGING_LEVEL
+            value: INFO
+          - name: GLOO_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NODE_IP  # host IP via the downward API; not referenced by the args in this template
+            valueFrom:
+              fieldRef:
+                fieldPath: status.hostIP
+        volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /dev/infiniband
+            name: devinf
+        securityContext:
+          capabilities:
+            add:
+              - IPC_LOCK
+              - SYS_ADMIN  # Kubernetes capability names omit the CAP_ prefix
+          privileged: true  # privileged containers already receive all capabilities
+        resources:
+          requests:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+          limits:
+            cpu: 64
+            memory: 512Gi
+            nvidia.com/gpu: 8
+        # No HTTP probes: the worker runs with --headless and exposes no HTTP endpoint.
@@ -0,0 +1,21 @@
+# apiVersion: v1
+# kind: Namespace
+# metadata:
+#   name: deepseek-v4
+---
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: deepseek-v4-pro-multi
+  namespace: deepseek-v4  # must already exist; the Namespace manifest above is commented out
+spec:
+  model:
+    name: deepseek-v4-pro
+  engine:
+    minReplicas: 1  # min == max: fixed at a single engine replica
+    maxReplicas: 1
+  router:
+    minReplicas: 1  # min == max: fixed at a single router replica
+    maxReplicas: 1
+  runtime:
+    name: vllm-deepseek-v4-pro-multi  # matches metadata.name of the ClusterServingRuntime