Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config/runtimes/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,5 @@ resources:
- vllm/mixtral-8x7b-instruct-rt.yaml
- vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
- vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
- vllm/openai/gpt-oss-20b-imported-rt.yaml
- vllm/openai/gpt-oss-120b-imported-rt.yaml
172 changes: 172 additions & 0 deletions config/runtimes/vllm/openai/gpt-oss-120b-imported-rt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# ClusterServingRuntime "vllm-gpt-oss-120b-imported": a router container in
# front of a vLLM engine container for GptOssForCausalLM safetensors models
# whose size falls in the 60B-125B range declared below.
# NOTE(review): the nesting here was rebuilt from a whitespace-stripped diff
# view; confirm it against the committed file.
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
  name: vllm-gpt-oss-120b-imported
spec:
  disabled: false

  # Router pod. The annotations ask Prometheus to scrape /metrics on 29000.
  # NOTE(review): 29000 is not among the container ports declared below
  # (only 8080) - confirm the router image serves metrics on that port.
  routerConfig:
    annotations:
      prometheus.io/path: /metrics
      prometheus.io/port: "29000"
      prometheus.io/scrape: "true"
    labels:
      logging-forward: enabled
    runner:
      name: router
      image: fra.ocir.io/idqj093njucb/smg:v1.4.1.post2-hotfix
      ports:
        - containerPort: 8080
          name: http
      # Requests equal limits for both CPU and memory.
      resources:
        requests:
          cpu: "1"
          memory: 2Gi
        limits:
          cpu: "1"
          memory: 2Gi
      # Binds 0.0.0.0:8080 and discovers backends on port 8080 in $(NAMESPACE)
      # using the label selector; both variables are filled in by the
      # fieldRef env entries below.
      args:
        - --host
        - 0.0.0.0
        - --port
        - "8080"
        - --service-discovery
        - --service-discovery-namespace
        - $(NAMESPACE)
        - --service-discovery-port
        - "8080"
        - --selector
        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
        - --enable-igw
        - --request-id-headers
        - opc-request-id
        - --log-json
        - --disable-retries
        - --disable-circuit-breaker
        - --disable-tokenizer-autoload
      env:
        # Namespace of the pod itself.
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        # Value of the pod's own ome.io/inferenceservice label.
        - name: INFERENCESERVICE_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.labels['ome.io/inferenceservice']
      readinessProbe:
        httpGet:
          path: /readiness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10
      livenessProbe:
        httpGet:
          path: /liveness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10
      startupProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 5
        periodSeconds: 20
        timeoutSeconds: 10
        initialDelaySeconds: 30

  # Model formats this runtime accepts, and the size window it applies to.
  supportedModelFormats:
    - modelFramework:
        name: transformers
        version: "4.55.0"
      modelFormat:
        name: safetensors
        version: "1.0.0"
      modelArchitecture: GptOssForCausalLM
      autoSelect: true
      priority: 2
  modelSizeRange:
    min: 60B
    max: 125B
  protocolVersions:
    - openAI

  # Engine pod. Prometheus scrapes /metrics on 8080, the same port the vLLM
  # server is told to listen on (--port=8080).
  engineConfig:
    annotations:
      prometheus.io/scrape: "true"
      prometheus.io/port: "8080"
      prometheus.io/path: "/metrics"
    labels:
      logging-forward: enabled
    # Tolerate any nvidia.com/gpu taint with the NoSchedule effect.
    tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
    # Memory-backed emptyDir; the runner mounts it at /dev/shm.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
    runner:
      name: ome-container
      image: fra.ocir.io/idqj093njucb/official-vllm-openai:v0.22.2-hotfix-cuda12.9
      ports:
        - containerPort: 8080
          name: http1
          protocol: TCP
      command:
        - vllm
        - serve
      # --tensor-parallel-size=2 matches the two GPUs requested in resources.
      # NOTE(review): $(MODEL_PATH) is not defined in this manifest; it is
      # presumably injected by the OME controller - confirm.
      args:
        - $(MODEL_PATH)
        - --host=0.0.0.0
        - --port=8080
        - --max-log-len=0
        - --served-model-name=vllm-model
        - --max-model-len=131072
        - --tensor-parallel-size=2
        - --kv-cache-dtype=auto
        - --async-scheduling
        - --max-cudagraph-capture-size=256
        - --enable-auto-tool-choice
        - --tool-call-parser=openai
        - --reasoning-parser=openai_gptoss
      env:
        - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
          value: "120"
        - name: VLLM_WORKER_MULTIPROC_METHOD
          value: "spawn"
      volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      # Requests equal limits for CPU, memory and GPU.
      resources:
        requests:
          cpu: 25
          memory: 100Gi
          nvidia.com/gpu: 2
        limits:
          cpu: 25
          memory: 100Gi
          nvidia.com/gpu: 2
      # NOTE(review): timeoutSeconds (200) exceeds periodSeconds (60) -
      # confirm this is intended.
      readinessProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 3
        successThreshold: 1
        periodSeconds: 60
        timeoutSeconds: 200
      livenessProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 5
        successThreshold: 1
        periodSeconds: 60
        timeoutSeconds: 60
      # initialDelaySeconds 60 plus failureThreshold 150 x periodSeconds 6
      # gives the engine roughly 960 s to start answering /health.
      startupProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 150
        successThreshold: 1
        periodSeconds: 6
        initialDelaySeconds: 60
        timeoutSeconds: 30
172 changes: 172 additions & 0 deletions config/runtimes/vllm/openai/gpt-oss-20b-imported-rt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# ClusterServingRuntime "vllm-gpt-oss-20b-imported": a router container in
# front of a vLLM engine container for GptOssForCausalLM safetensors models
# whose size falls in the 10B-25B range declared below.
# NOTE(review): the nesting here was rebuilt from a whitespace-stripped diff
# view; confirm it against the committed file.
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
  name: vllm-gpt-oss-20b-imported
spec:
  disabled: false

  # Router pod. The annotations ask Prometheus to scrape /metrics on 29000.
  # NOTE(review): 29000 is not among the container ports declared below
  # (only 8080) - confirm the router image serves metrics on that port.
  routerConfig:
    annotations:
      prometheus.io/path: /metrics
      prometheus.io/port: "29000"
      prometheus.io/scrape: "true"
    labels:
      logging-forward: enabled
    runner:
      name: router
      image: fra.ocir.io/idqj093njucb/smg:v1.4.1.post2-hotfix
      ports:
        - containerPort: 8080
          name: http
      # Requests equal limits for both CPU and memory.
      resources:
        requests:
          cpu: "1"
          memory: 2Gi
        limits:
          cpu: "1"
          memory: 2Gi
      # Binds 0.0.0.0:8080 and discovers backends on port 8080 in $(NAMESPACE)
      # using the label selector; both variables are filled in by the
      # fieldRef env entries below.
      args:
        - --host
        - 0.0.0.0
        - --port
        - "8080"
        - --service-discovery
        - --service-discovery-namespace
        - $(NAMESPACE)
        - --service-discovery-port
        - "8080"
        - --selector
        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
        - --enable-igw
        - --request-id-headers
        - opc-request-id
        - --log-json
        - --disable-retries
        - --disable-circuit-breaker
        - --disable-tokenizer-autoload
      env:
        # Namespace of the pod itself.
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        # Value of the pod's own ome.io/inferenceservice label.
        - name: INFERENCESERVICE_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.labels['ome.io/inferenceservice']
      readinessProbe:
        httpGet:
          path: /readiness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10
      livenessProbe:
        httpGet:
          path: /liveness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10
      startupProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 5
        periodSeconds: 20
        timeoutSeconds: 10
        initialDelaySeconds: 30

  # Model formats this runtime accepts, and the size window it applies to.
  supportedModelFormats:
    - modelFramework:
        name: transformers
        version: "4.55.0"
      modelFormat:
        name: safetensors
        version: "1.0.0"
      modelArchitecture: GptOssForCausalLM
      autoSelect: true
      priority: 2
  modelSizeRange:
    min: 10B
    max: 25B
  protocolVersions:
    - openAI

  # Engine pod. Prometheus scrapes /metrics on 8080, the same port the vLLM
  # server is told to listen on (--port=8080).
  engineConfig:
    annotations:
      prometheus.io/scrape: "true"
      prometheus.io/port: "8080"
      prometheus.io/path: "/metrics"
    labels:
      logging-forward: enabled
    # Tolerate any nvidia.com/gpu taint with the NoSchedule effect.
    tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
    # Memory-backed emptyDir; the runner mounts it at /dev/shm.
    volumes:
      - name: dshm
        emptyDir:
          medium: Memory
    runner:
      name: ome-container
      image: fra.ocir.io/idqj093njucb/official-vllm-openai:v0.22.2-hotfix-cuda12.9
      ports:
        - containerPort: 8080
          name: http1
          protocol: TCP
      command:
        - vllm
        - serve
      # --tensor-parallel-size=1 matches the single GPU requested in resources.
      # NOTE(review): $(MODEL_PATH) is not defined in this manifest; it is
      # presumably injected by the OME controller - confirm.
      args:
        - $(MODEL_PATH)
        - --host=0.0.0.0
        - --port=8080
        - --max-log-len=0
        - --served-model-name=vllm-model
        - --max-model-len=131072
        - --tensor-parallel-size=1
        - --kv-cache-dtype=auto
        - --async-scheduling
        - --max-cudagraph-capture-size=256
        - --enable-auto-tool-choice
        - --tool-call-parser=openai
        - --reasoning-parser=openai_gptoss
      env:
        - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
          value: "120"
        - name: VLLM_WORKER_MULTIPROC_METHOD
          value: "spawn"
      volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      # Requests equal limits for CPU, memory and GPU.
      resources:
        requests:
          cpu: 10
          memory: 60Gi
          nvidia.com/gpu: 1
        limits:
          cpu: 10
          memory: 60Gi
          nvidia.com/gpu: 1
      # NOTE(review): timeoutSeconds (200) exceeds periodSeconds (60) -
      # confirm this is intended.
      readinessProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 3
        successThreshold: 1
        periodSeconds: 60
        timeoutSeconds: 200
      livenessProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 5
        successThreshold: 1
        periodSeconds: 60
        timeoutSeconds: 60
      # initialDelaySeconds 60 plus failureThreshold 150 x periodSeconds 6
      # gives the engine roughly 960 s to start answering /health.
      startupProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 150
        successThreshold: 1
        periodSeconds: 6
        initialDelaySeconds: 60
        timeoutSeconds: 30
18 changes: 18 additions & 0 deletions config/samples/isvc/openai/gpt-oss-120b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Sample InferenceService for the gpt-oss-120b model in the openai-test
# namespace, pinned to the vllm-gpt-oss-120b-imported runtime with exactly one
# engine replica and one router replica.
# NOTE(review): the nesting here was rebuilt from a whitespace-stripped diff
# view; confirm it against the committed file.
apiVersion: ome.io/v1beta1
kind: InferenceService
metadata:
  name: gpt-oss-120b
  namespace: openai-test
  annotations:
    ome.io/deploymentMode: RawDeployment
spec:
  model:
    name: gpt-oss-120b
  # min == max, so the engine replica count is fixed at 1.
  engine:
    minReplicas: 1
    maxReplicas: 1
  # Explicit runtime reference by name.
  runtime:
    name: vllm-gpt-oss-120b-imported
  # min == max, so the router replica count is fixed at 1.
  router:
    minReplicas: 1
    maxReplicas: 1
5 changes: 5 additions & 0 deletions config/samples/isvc/openai/gpt-oss-20b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,8 @@ spec:
engine:
minReplicas: 1
maxReplicas: 1
runtime:
name: vllm-gpt-oss-20b-imported
router:
minReplicas: 1
maxReplicas: 1