Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config/models/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ resources:

# moonshotai
- moonshotai/Kimi-K2-Instruct.yaml
- moonshotai/Kimi-K2.6.yaml
- moonshotai/Kimi-K2.7-Code.yaml
- moonshotai/Kimi-VL-A3B-Instruct.yaml

# nvidia
Expand Down
15 changes: 15 additions & 0 deletions config/models/moonshotai/Kimi-K2.6.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# ClusterBaseModel entry for moonshotai/Kimi-K2.6.
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
  name: kimi-k2-6
spec:
  # Declared for both text-only and image+text input.
  modelCapabilities:
    - TEXT_TO_TEXT
    - IMAGE_TEXT_TO_TEXT
  vendor: moonshotai
  displayName: moonshotai.kimi-k2-6
  disabled: false
  # Quoted so the version stays a string.
  version: "1.0.0"
  storage:
    # Where the weights are fetched from (hf:// URI).
    storageUri: hf://moonshotai/Kimi-K2.6
    # NOTE(review): presumably the on-node directory the weights are stored
    # in -- confirm against the OME model agent configuration.
    path: /raid/models/moonshotai/Kimi-K2.6
15 changes: 15 additions & 0 deletions config/models/moonshotai/Kimi-K2.7-Code.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# ClusterBaseModel entry for moonshotai/Kimi-K2.7-Code.
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
  name: kimi-k2-7-code
spec:
  # Declared for both text-only and image+text input.
  modelCapabilities:
    - TEXT_TO_TEXT
    - IMAGE_TEXT_TO_TEXT
  vendor: moonshotai
  displayName: moonshotai.kimi-k2-7-code
  disabled: false
  # Quoted so the version stays a string.
  version: "1.0.0"
  storage:
    # Where the weights are fetched from (hf:// URI).
    storageUri: hf://moonshotai/Kimi-K2.7-Code
    # NOTE(review): presumably the on-node directory the weights are stored
    # in -- confirm against the OME model agent configuration.
    path: /raid/models/moonshotai/Kimi-K2.7-Code
1 change: 1 addition & 0 deletions config/runtimes/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ resources:
- vllm/mixtral-8x7b-instruct-rt.yaml
- vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
- vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
- vllm/moonshotai/kimi-k25-tp8-rt.yaml
178 changes: 178 additions & 0 deletions config/runtimes/vllm/moonshotai/kimi-k25-tp8-rt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# ClusterServingRuntime that serves KimiK25ForConditionalGeneration models
# with vLLM at tensor-parallel size 8, fronted by a router container.
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
  name: vllm-kimi-k25-tp8
  annotations:
    # Same 3600 s budget as VLLM_ENGINE_READY_TIMEOUT_S and both startup
    # probes further down.
    ome.io/engine-ready-timeout-sec: "3600"
spec:
  acceleratorRequirements:
    acceleratorClasses:
      - nvidia-b200-8
      - nvidia-h200-8
  disabled: false
  routerConfig:
    annotations:
      prometheus.io/path: /metrics
      # NOTE(review): 29000 is neither a declared containerPort nor set via
      # the router args below; presumably the router's default metrics
      # port -- confirm.
      prometheus.io/port: '29000'
      prometheus.io/scrape: 'true'
    labels:
      logging-forward: enabled
    runner:
      name: router
      image: docker.io/lightseekorg/smg:1.5.0
      ports:
        - containerPort: 8080
          name: http
      resources:
        requests:
          cpu: "1"
          memory: 2Gi
        limits:
          cpu: "1"
          memory: 2Gi
      args:
        - --host
        - 0.0.0.0
        - --port
        - "8080"
        - --service-discovery
        - --service-discovery-namespace
        - $(NAMESPACE)
        - --service-discovery-port
        - "8080"
        - --selector
        # Single argument carrying two space-separated label selectors.
        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
        - --enable-igw
        - --request-id-headers
        - opc-request-id
        - --log-json
        - --disable-retries
        - --disable-circuit-breaker
        - --disable-tokenizer-autoload
      env:
        # Both variables are expanded inside args via $(...).
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: INFERENCESERVICE_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.labels['ome.io/inferenceservice']
      readinessProbe:
        httpGet:
          path: /readiness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10
      livenessProbe:
        httpGet:
          path: /liveness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10
      # Startup budget: 60 s initial delay + 177 * 20 s = 3600 s.
      startupProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 177
        periodSeconds: 20
        timeoutSeconds: 10
        initialDelaySeconds: 60
  supportedModelFormats:
    - modelFramework:
        name: transformers
        version: "4.57.1"
      modelFormat:
        name: safetensors
        version: "1.0.0"
      modelArchitecture: KimiK25ForConditionalGeneration
      autoSelect: true
      priority: 1
      # Per-accelerator-class overrides; both classes use the same TP size
      # as the engine's --tensor-parallel-size flag.
      acceleratorConfig:
        nvidia-b200-8:
          tensorParallelismOverride:
            tensorParallelSize: 8
        nvidia-h200-8:
          tensorParallelismOverride:
            tensorParallelSize: 8
  # Review note (from the PR author): the model is ~1T parameters logically,
  # but OME matches on the safetensors element count, which ignores dtype.
  # The weights are int4-packed, so the count comes out at ~150-300B, not 1T.
  # A 900-1100B range was tried first and autoSelect failed for that reason.
  modelSizeRange:
    min: 150B
    max: 300B
  protocolVersions:
    - openAI
  engineConfig:
    annotations:
      prometheus.io/scrape: "true"
      prometheus.io/port: "8080"
      prometheus.io/path: "/metrics"
    labels:
      logging-forward: enabled
    volumes:
      # Memory-backed emptyDir, mounted at /dev/shm by the runner below.
      - name: dshm
        emptyDir:
          medium: Memory
    runner:
      name: ome-container
      image: docker.io/vllm/vllm-openai:v0.23.0-cu129
      ports:
        - containerPort: 8080
          name: http1
          protocol: TCP
      args:
        - $(MODEL_PATH)
        - --port=8080
        - --max-log-len=0
        - --served-model-name=vllm-model
        - --trust-remote-code
        - --enable-expert-parallel
        - --tensor-parallel-size=8
        - --max-model-len=262144
        # Quoted because the value is JSON containing braces and colons.
        - '--limit-mm-per-prompt={"image":5,"video":1}'
        - --tool-call-parser=kimi_k2
        - --enable-auto-tool-choice
        - --reasoning-parser=kimi_k2
        - --mm-encoder-tp-mode=data
      env:
        - name: VLLM_ENGINE_READY_TIMEOUT_S
          value: '3600'
        - name: VLLM_RPC_TIMEOUT
          value: '600000'
      volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      resources:
        # Requests equal limits for every resource.
        requests:
          cpu: 64
          memory: 512Gi
          nvidia.com/gpu: 8
        limits:
          cpu: 64
          memory: 512Gi
          nvidia.com/gpu: 8
      readinessProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 3
        successThreshold: 1
        periodSeconds: 90
        timeoutSeconds: 60
      livenessProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 5
        successThreshold: 1
        periodSeconds: 60
        timeoutSeconds: 60
      # Startup budget: 120 s initial delay + 348 * 10 s = 3600 s.
      startupProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 348
        successThreshold: 1
        periodSeconds: 10
        initialDelaySeconds: 120
        timeoutSeconds: 30