# Patch summary (text before the first "diff --git" header is ignored by patch tools):
#   - registers two new runtime manifests in config/runtimes/kustomization.yaml
#   - adds ClusterServingRuntime vllm-gpt-oss-120b-imported (tensor-parallel-size=2, 2 GPUs)
#   - adds ClusterServingRuntime vllm-gpt-oss-20b-imported (tensor-parallel-size=1, 1 GPU)
#   - adds an InferenceService sample for gpt-oss-120b
#   - pins the gpt-oss-20b sample to the new runtime and adds a router section
diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml
index 0b78b5905..a76c578a6 100644
--- a/config/runtimes/kustomization.yaml
+++ b/config/runtimes/kustomization.yaml
@@ -52,3 +52,5 @@ resources:
 - vllm/mixtral-8x7b-instruct-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
+- vllm/openai/gpt-oss-20b-imported-rt.yaml
+- vllm/openai/gpt-oss-120b-imported-rt.yaml
diff --git a/config/runtimes/vllm/openai/gpt-oss-120b-imported-rt.yaml b/config/runtimes/vllm/openai/gpt-oss-120b-imported-rt.yaml
new file mode 100644
index 000000000..487504799
--- /dev/null
+++ b/config/runtimes/vllm/openai/gpt-oss-120b-imported-rt.yaml
@@ -0,0 +1,172 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-gpt-oss-120b-imported
+spec:
+  disabled: false
+  routerConfig:
+    annotations:
+      prometheus.io/path: /metrics
+      prometheus.io/port: "29000"
+      prometheus.io/scrape: "true"
+    labels:
+      logging-forward: enabled
+    runner:
+      name: router
+      image: fra.ocir.io/idqj093njucb/smg:v1.4.1.post2-hotfix
+      ports:
+        - containerPort: 8080
+          name: http
+      resources:
+        requests:
+          cpu: "1"
+          memory: 2Gi
+        limits:
+          cpu: "1"
+          memory: 2Gi
+      args:
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)
+        - --service-discovery-port
+        - "8080"
+        - --selector
+        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
+        - --enable-igw
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+        - --disable-retries
+        - --disable-circuit-breaker
+        - --disable-tokenizer-autoload
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
+      readinessProbe:
+        httpGet:
+          path: /readiness
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 30
+        timeoutSeconds: 10
+      livenessProbe:
+        httpGet:
+          path: /liveness
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 30
+        timeoutSeconds: 10
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 20
+        timeoutSeconds: 10
+        initialDelaySeconds: 30
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.55.0"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: GptOssForCausalLM
+      autoSelect: true
+      priority: 2
+  modelSizeRange:
+    min: 60B
+    max: 125B
+  protocolVersions:
+    - openAI
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    runner:
+      name: ome-container
+      image: fra.ocir.io/idqj093njucb/official-vllm-openai:v0.22.2-hotfix-cuda12.9
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - vllm
+        - serve
+      args:
+        - $(MODEL_PATH)
+        - --host=0.0.0.0
+        - --port=8080
+        - --max-log-len=0
+        - --served-model-name=vllm-model
+        - --max-model-len=131072
+        - --tensor-parallel-size=2
+        - --kv-cache-dtype=auto
+        - --async-scheduling
+        - --max-cudagraph-capture-size=256
+        - --enable-auto-tool-choice
+        - --tool-call-parser=openai
+        - --reasoning-parser=openai_gptoss
+      env:
+        - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
+          value: "120"
+        - name: VLLM_WORKER_MULTIPROC_METHOD
+          value: "spawn"
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 25
+          memory: 100Gi
+          nvidia.com/gpu: 2
+        limits:
+          cpu: 25
+          memory: 100Gi
+          nvidia.com/gpu: 2
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
diff --git a/config/runtimes/vllm/openai/gpt-oss-20b-imported-rt.yaml b/config/runtimes/vllm/openai/gpt-oss-20b-imported-rt.yaml
new file mode 100644
index 000000000..86256730c
--- /dev/null
+++ b/config/runtimes/vllm/openai/gpt-oss-20b-imported-rt.yaml
@@ -0,0 +1,172 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-gpt-oss-20b-imported
+spec:
+  disabled: false
+  routerConfig:
+    annotations:
+      prometheus.io/path: /metrics
+      prometheus.io/port: "29000"
+      prometheus.io/scrape: "true"
+    labels:
+      logging-forward: enabled
+    runner:
+      name: router
+      image: fra.ocir.io/idqj093njucb/smg:v1.4.1.post2-hotfix
+      ports:
+        - containerPort: 8080
+          name: http
+      resources:
+        requests:
+          cpu: "1"
+          memory: 2Gi
+        limits:
+          cpu: "1"
+          memory: 2Gi
+      args:
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)
+        - --service-discovery-port
+        - "8080"
+        - --selector
+        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
+        - --enable-igw
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+        - --disable-retries
+        - --disable-circuit-breaker
+        - --disable-tokenizer-autoload
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
+      readinessProbe:
+        httpGet:
+          path: /readiness
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 30
+        timeoutSeconds: 10
+      livenessProbe:
+        httpGet:
+          path: /liveness
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 30
+        timeoutSeconds: 10
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 20
+        timeoutSeconds: 10
+        initialDelaySeconds: 30
+  supportedModelFormats:
+    - modelFramework:
+        name: transformers
+        version: "4.55.0"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: GptOssForCausalLM
+      autoSelect: true
+      priority: 2
+  modelSizeRange:
+    min: 10B
+    max: 25B
+  protocolVersions:
+    - openAI
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    runner:
+      name: ome-container
+      image: fra.ocir.io/idqj093njucb/official-vllm-openai:v0.22.2-hotfix-cuda12.9
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - vllm
+        - serve
+      args:
+        - $(MODEL_PATH)
+        - --host=0.0.0.0
+        - --port=8080
+        - --max-log-len=0
+        - --served-model-name=vllm-model
+        - --max-model-len=131072
+        - --tensor-parallel-size=1
+        - --kv-cache-dtype=auto
+        - --async-scheduling
+        - --max-cudagraph-capture-size=256
+        - --enable-auto-tool-choice
+        - --tool-call-parser=openai
+        - --reasoning-parser=openai_gptoss
+      env:
+        - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
+          value: "120"
+        - name: VLLM_WORKER_MULTIPROC_METHOD
+          value: "spawn"
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:
+          cpu: 10
+          memory: 60Gi
+          nvidia.com/gpu: 1
+        limits:
+          cpu: 10
+          memory: 60Gi
+          nvidia.com/gpu: 1
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 200
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 150
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30
diff --git a/config/samples/isvc/openai/gpt-oss-120b.yaml b/config/samples/isvc/openai/gpt-oss-120b.yaml
new file mode 100644
index 000000000..c1a3dbe24
--- /dev/null
+++ b/config/samples/isvc/openai/gpt-oss-120b.yaml
@@ -0,0 +1,18 @@
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: gpt-oss-120b
+  namespace: openai-test
+  annotations:
+    ome.io/deploymentMode: RawDeployment
+spec:
+  model:
+    name: gpt-oss-120b
+  engine:
+    minReplicas: 1
+    maxReplicas: 1
+  runtime:
+    name: vllm-gpt-oss-120b-imported
+  router:
+    minReplicas: 1
+    maxReplicas: 1
diff --git a/config/samples/isvc/openai/gpt-oss-20b.yaml b/config/samples/isvc/openai/gpt-oss-20b.yaml
index cf9074de8..2acea33e8 100644
--- a/config/samples/isvc/openai/gpt-oss-20b.yaml
+++ b/config/samples/isvc/openai/gpt-oss-20b.yaml
@@ -11,3 +11,8 @@ spec:
   engine:
     minReplicas: 1
     maxReplicas: 1
+  runtime:
+    name: vllm-gpt-oss-20b-imported
+  router:
+    minReplicas: 1
+    maxReplicas: 1