ome-projects · Juno13340 · Jun 22, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
@@ -116,6 +116,8 @@ resources:
 
   # moonshotai
   - moonshotai/Kimi-K2-Instruct.yaml
+  - moonshotai/Kimi-K2.6.yaml
+  - moonshotai/Kimi-K2.7-Code.yaml
   - moonshotai/Kimi-VL-A3B-Instruct.yaml
 
   # nvidia

@@ -0,0 +1,15 @@
+apiVersion: ome.io/v1beta1  # ClusterBaseModel entry for moonshotai/Kimi-K2.6
+kind: ClusterBaseModel
+metadata:
+  name: kimi-k2-6
+spec:
+  vendor: moonshotai
+  displayName: moonshotai.kimi-k2-6  # "<vendor>.<metadata.name>"
+  version: "1.0.0"  # quoted so the value stays a string
+  disabled: false
+  modelCapabilities:  # capability tags declared for this model
+    - TEXT_TO_TEXT
+    - IMAGE_TEXT_TO_TEXT
+  storage:
+    storageUri: hf://moonshotai/Kimi-K2.6  # where the weights are fetched from
+    path: /raid/models/moonshotai/Kimi-K2.6  # presumably the on-node target dir; confirm
@@ -0,0 +1,15 @@
+apiVersion: ome.io/v1beta1  # ClusterBaseModel entry for moonshotai/Kimi-K2.7-Code
+kind: ClusterBaseModel
+metadata:
+  name: kimi-k2-7-code
+spec:
+  vendor: moonshotai
+  displayName: moonshotai.kimi-k2-7-code  # "<vendor>.<metadata.name>"
+  version: "1.0.0"  # quoted so the value stays a string
+  disabled: false
+  modelCapabilities:  # NOTE(review): confirm the -Code variant really accepts image input
+    - TEXT_TO_TEXT
+    - IMAGE_TEXT_TO_TEXT
+  storage:
+    storageUri: hf://moonshotai/Kimi-K2.7-Code  # where the weights are fetched from
+    path: /raid/models/moonshotai/Kimi-K2.7-Code  # presumably the on-node target dir; confirm
@@ -52,3 +52,4 @@ resources:
 - vllm/mixtral-8x7b-instruct-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
+- vllm/moonshotai/kimi-k25-tp8-rt.yaml
@@ -0,0 +1,178 @@
+apiVersion: ome.io/v1beta1  # vLLM runtime (router + engine) for KimiK25ForConditionalGeneration, TP=8
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-kimi-k25-tp8
+  annotations:
+    ome.io/engine-ready-timeout-sec: "3600"  # same 3600 s budget as both startup probes below
+spec:
+  disabled: false
+  acceleratorRequirements:
+    acceleratorClasses:  # each class has a matching acceleratorConfig entry further down
+      - nvidia-b200-8
+      - nvidia-h200-8
+  routerConfig:  # router pod: smg listening on 8080, discovering engine pods by label selector
+    labels:
+      logging-forward: enabled
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "29000"  # NOTE(review): not a declared containerPort (only 8080); confirm
+      prometheus.io/path: /metrics
+    runner:
+      name: router
+      image: docker.io/lightseekorg/smg:1.5.0
+      ports:
+        - name: http
+          containerPort: 8080
+      resources:  # requests == limits
+        requests:
+          cpu: "1"
+          memory: 2Gi
+        limits:
+          cpu: "1"
+          memory: 2Gi
+      args:  # order matters: each flag is followed by its value as a separate list item
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)
+        - --service-discovery-port
+        - "8080"
+        - --selector  # next item holds two space-separated label pairs in a single argument
+        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
+        - --enable-igw
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+        - --disable-retries
+        - --disable-circuit-breaker
+        - --disable-tokenizer-autoload
+      env:  # values for the $(NAMESPACE) / $(INFERENCESERVICE_NAME) references in args
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
+      readinessProbe:
+        httpGet:
+          path: /readiness
+          port: 8080
+        periodSeconds: 30
+        timeoutSeconds: 10
+        failureThreshold: 5
+      livenessProbe:
+        httpGet:
+          path: /liveness
+          port: 8080
+        periodSeconds: 30
+        timeoutSeconds: 10
+        failureThreshold: 5
+      startupProbe:  # 60 s delay + 177 x 20 s = 3600 s, equal to ome.io/engine-ready-timeout-sec
+        httpGet:
+          path: /health
+          port: 8080
+        initialDelaySeconds: 60
+        periodSeconds: 20
+        timeoutSeconds: 10
+        failureThreshold: 177
+  supportedModelFormats:  # single entry: safetensors checkpoints of one architecture
+    - modelArchitecture: KimiK25ForConditionalGeneration
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelFramework:
+        name: transformers
+        version: "4.57.1"
+      autoSelect: true
+      priority: 1
+      acceleratorConfig:  # both classes set TP to 8, same as --tensor-parallel-size=8 in engine args
+        nvidia-b200-8:
+          tensorParallelismOverride:
+            tensorParallelSize: 8
+        nvidia-h200-8:
+          tensorParallelismOverride:
+            tensorParallelSize: 8
+  protocolVersions:
+    - openAI
+  modelSizeRange:  # NOTE(review): confirm the target checkpoints report a size inside this window
+    min: 150B
+    max: 300B
+  engineConfig:  # engine pod: vLLM OpenAI server on port 8080 with 8 GPUs
+    labels:
+      logging-forward: enabled
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"  # same port the server listens on (--port=8080)
+      prometheus.io/path: /metrics
+    volumes:
+      - name: dshm  # memory-backed emptyDir, mounted at /dev/shm below; no sizeLimit set
+        emptyDir:
+          medium: Memory
+    runner:
+      name: ome-container
+      image: docker.io/vllm/vllm-openai:v0.23.0-cu129
+      ports:
+        - name: http1
+          containerPort: 8080
+          protocol: TCP
+      args:
+        - $(MODEL_PATH)  # NOTE(review): not set in env below; presumably injected by OME - confirm
+        - --port=8080
+        - --max-log-len=0
+        - --served-model-name=vllm-model
+        - --trust-remote-code
+        - --enable-expert-parallel
+        - --tensor-parallel-size=8  # equals tensorParallelSize in acceleratorConfig and the GPU count
+        - --max-model-len=262144  # 262144 = 256 * 1024
+        - '--limit-mm-per-prompt={"image":5,"video":1}'  # single-quoted: value embeds double quotes
+        - --tool-call-parser=kimi_k2
+        - --enable-auto-tool-choice
+        - --reasoning-parser=kimi_k2
+        - --mm-encoder-tp-mode=data
+      env:
+        - name: VLLM_ENGINE_READY_TIMEOUT_S
+          value: "3600"  # same number as ome.io/engine-ready-timeout-sec
+        - name: VLLM_RPC_TIMEOUT
+          value: "600000"  # presumably milliseconds (600 s); confirm against vLLM env var docs
+      volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      resources:  # requests == limits
+        requests:
+          cpu: 64
+          memory: 512Gi
+          nvidia.com/gpu: 8
+        limits:
+          cpu: 64
+          memory: 512Gi
+          nvidia.com/gpu: 8
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        periodSeconds: 90
+        timeoutSeconds: 60
+        successThreshold: 1
+        failureThreshold: 3
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        periodSeconds: 60
+        timeoutSeconds: 60
+        successThreshold: 1
+        failureThreshold: 5
+      startupProbe:  # 120 s delay + 348 x 10 s = 3600 s, equal to VLLM_ENGINE_READY_TIMEOUT_S
+        httpGet:
+          path: /health
+          port: 8080
+        initialDelaySeconds: 120
+        periodSeconds: 10
+        timeoutSeconds: 30
+        successThreshold: 1
+        failureThreshold: 348