ome-projects · ankrovv · Jun 24, 2026
@@ -52,3 +52,5 @@ resources:
 - vllm/mixtral-8x7b-instruct-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
+- vllm/openai/gpt-oss-20b-imported-rt.yaml
+- vllm/openai/gpt-oss-120b-imported-rt.yaml
@@ -0,0 +1,172 @@
+# ClusterServingRuntime that pairs a router container with a vLLM engine
+# container. It matches safetensors GptOssForCausalLM models sized 60B-125B.
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-gpt-oss-120b-imported
+spec:
+  disabled: false
+  # Router: fronts the engine pods on port 8080.
+  routerConfig:
+    labels:
+      logging-forward: enabled
+    annotations:
+      prometheus.io/scrape: "true"
+      # NOTE(review): 29000 is not among the declared containerPorts (only
+      # 8080) - presumably the router's separate metrics listener; confirm.
+      prometheus.io/port: "29000"
+      prometheus.io/path: /metrics
+    runner:
+      name: router
+      image: fra.ocir.io/idqj093njucb/smg:v1.4.1.post2-hotfix
+      ports:
+        - name: http
+          containerPort: 8080
+      # Both variables are read from the pod's own metadata and are referenced
+      # in args below as $(NAMESPACE) and $(INFERENCESERVICE_NAME).
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
+      args:
+        # Listen address; the port matches containerPort above.
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        # Service-discovery flags. Port 8080 equals the engine containerPort;
+        # the selector value is passed as one argument containing a space.
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)
+        - --service-discovery-port
+        - "8080"
+        - --selector
+        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
+        - --enable-igw
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+        # Explicit --disable-* switches passed to the router binary.
+        - --disable-retries
+        - --disable-circuit-breaker
+        - --disable-tokenizer-autoload
+      # Requests equal limits for both CPU and memory.
+      resources:
+        limits:
+          cpu: "1"
+          memory: 2Gi
+        requests:
+          cpu: "1"
+          memory: 2Gi
+      # Each probe targets a different path on port 8080.
+      startupProbe:
+        httpGet:
+          port: 8080
+          path: /health
+        initialDelaySeconds: 30
+        periodSeconds: 20
+        timeoutSeconds: 10
+        failureThreshold: 5
+      readinessProbe:
+        httpGet:
+          port: 8080
+          path: /readiness
+        periodSeconds: 30
+        timeoutSeconds: 10
+        failureThreshold: 5
+      livenessProbe:
+        httpGet:
+          port: 8080
+          path: /liveness
+        periodSeconds: 30
+        timeoutSeconds: 10
+        failureThreshold: 5
+  # Model-matching criteria: format, architecture and parameter-count range.
+  supportedModelFormats:
+    - modelArchitecture: GptOssForCausalLM
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelFramework:
+        name: transformers
+        version: "4.55.0"
+      autoSelect: true
+      priority: 2
+  modelSizeRange:
+    min: 60B
+    max: 125B
+  protocolVersions: [openAI]
+  # Engine: runs `vllm serve` on two GPUs.
+  engineConfig:
+    labels:
+      logging-forward: enabled
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: /metrics
+    tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+    volumes:
+      # Memory-backed emptyDir; the runner mounts it at /dev/shm.
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    runner:
+      name: ome-container
+      image: fra.ocir.io/idqj093njucb/official-vllm-openai:v0.22.2-hotfix-cuda12.9
+      ports:
+        - name: http1
+          containerPort: 8080
+          protocol: TCP
+      command: [vllm, serve]
+      args:
+        # NOTE(review): MODEL_PATH is not set in this manifest's env -
+        # presumably injected by the OME controller; confirm.
+        - $(MODEL_PATH)
+        - --host=0.0.0.0
+        - --port=8080
+        - --max-log-len=0
+        - --served-model-name=vllm-model
+        - --max-model-len=131072
+        # Equals the nvidia.com/gpu count under resources below.
+        - --tensor-parallel-size=2
+        - --kv-cache-dtype=auto
+        - --async-scheduling
+        - --max-cudagraph-capture-size=256
+        - --enable-auto-tool-choice
+        - --tool-call-parser=openai
+        - --reasoning-parser=openai_gptoss
+      env:
+        - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
+          value: "120"
+        - name: VLLM_WORKER_MULTIPROC_METHOD
+          value: spawn
+      volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      # Requests equal limits for CPU, memory and GPU.
+      resources:
+        limits:
+          cpu: 25
+          memory: 100Gi
+          nvidia.com/gpu: 2
+        requests:
+          cpu: 25
+          memory: 100Gi
+          nvidia.com/gpu: 2
+      # All three probes hit /health on port 8080. Assuming standard
+      # Kubernetes probe semantics, startup allows a 60s delay plus up to
+      # 150 checks 6s apart (roughly 16 minutes) before giving up.
+      startupProbe:
+        httpGet:
+          port: 8080
+          path: /health
+        initialDelaySeconds: 60
+        periodSeconds: 6
+        timeoutSeconds: 30
+        successThreshold: 1
+        failureThreshold: 150
+      # NOTE(review): timeoutSeconds (200) exceeds periodSeconds (60) here;
+      # confirm this is intentional.
+      readinessProbe:
+        httpGet:
+          port: 8080
+          path: /health
+        periodSeconds: 60
+        timeoutSeconds: 200
+        successThreshold: 1
+        failureThreshold: 3
+      livenessProbe:
+        httpGet:
+          port: 8080
+          path: /health
+        periodSeconds: 60
+        timeoutSeconds: 60
+        successThreshold: 1
+        failureThreshold: 5
@@ -0,0 +1,172 @@
+# ClusterServingRuntime that pairs a router container with a vLLM engine
+# container. It matches safetensors GptOssForCausalLM models sized 10B-25B.
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-gpt-oss-20b-imported
+spec:
+  disabled: false
+  # Router: fronts the engine pods on port 8080.
+  routerConfig:
+    labels:
+      logging-forward: enabled
+    annotations:
+      prometheus.io/scrape: "true"
+      # NOTE(review): 29000 is not among the declared containerPorts (only
+      # 8080) - presumably the router's separate metrics listener; confirm.
+      prometheus.io/port: "29000"
+      prometheus.io/path: /metrics
+    runner:
+      name: router
+      image: fra.ocir.io/idqj093njucb/smg:v1.4.1.post2-hotfix
+      ports:
+        - name: http
+          containerPort: 8080
+      # Both variables are read from the pod's own metadata and are referenced
+      # in args below as $(NAMESPACE) and $(INFERENCESERVICE_NAME).
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
+      args:
+        # Listen address; the port matches containerPort above.
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        # Service-discovery flags. Port 8080 equals the engine containerPort;
+        # the selector value is passed as one argument containing a space.
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)
+        - --service-discovery-port
+        - "8080"
+        - --selector
+        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
+        - --enable-igw
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+        # Explicit --disable-* switches passed to the router binary.
+        - --disable-retries
+        - --disable-circuit-breaker
+        - --disable-tokenizer-autoload
+      # Requests equal limits for both CPU and memory.
+      resources:
+        limits:
+          cpu: "1"
+          memory: 2Gi
+        requests:
+          cpu: "1"
+          memory: 2Gi
+      # Each probe targets a different path on port 8080.
+      startupProbe:
+        httpGet:
+          port: 8080
+          path: /health
+        initialDelaySeconds: 30
+        periodSeconds: 20
+        timeoutSeconds: 10
+        failureThreshold: 5
+      readinessProbe:
+        httpGet:
+          port: 8080
+          path: /readiness
+        periodSeconds: 30
+        timeoutSeconds: 10
+        failureThreshold: 5
+      livenessProbe:
+        httpGet:
+          port: 8080
+          path: /liveness
+        periodSeconds: 30
+        timeoutSeconds: 10
+        failureThreshold: 5
+  # Model-matching criteria: format, architecture and parameter-count range.
+  supportedModelFormats:
+    - modelArchitecture: GptOssForCausalLM
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelFramework:
+        name: transformers
+        version: "4.55.0"
+      autoSelect: true
+      priority: 2
+  modelSizeRange:
+    min: 10B
+    max: 25B
+  protocolVersions: [openAI]
+  # Engine: runs `vllm serve` on a single GPU.
+  engineConfig:
+    labels:
+      logging-forward: enabled
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"
+      prometheus.io/path: /metrics
+    tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+    volumes:
+      # Memory-backed emptyDir; the runner mounts it at /dev/shm.
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    runner:
+      name: ome-container
+      image: fra.ocir.io/idqj093njucb/official-vllm-openai:v0.22.2-hotfix-cuda12.9
+      ports:
+        - name: http1
+          containerPort: 8080
+          protocol: TCP
+      command: [vllm, serve]
+      args:
+        # NOTE(review): MODEL_PATH is not set in this manifest's env -
+        # presumably injected by the OME controller; confirm.
+        - $(MODEL_PATH)
+        - --host=0.0.0.0
+        - --port=8080
+        - --max-log-len=0
+        - --served-model-name=vllm-model
+        - --max-model-len=131072
+        # Equals the nvidia.com/gpu count under resources below.
+        - --tensor-parallel-size=1
+        - --kv-cache-dtype=auto
+        - --async-scheduling
+        - --max-cudagraph-capture-size=256
+        - --enable-auto-tool-choice
+        - --tool-call-parser=openai
+        - --reasoning-parser=openai_gptoss
+      env:
+        - name: VLLM_ENGINE_ITERATION_TIMEOUT_S
+          value: "120"
+        - name: VLLM_WORKER_MULTIPROC_METHOD
+          value: spawn
+      volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      # Requests equal limits for CPU, memory and GPU.
+      resources:
+        limits:
+          cpu: 10
+          memory: 60Gi
+          nvidia.com/gpu: 1
+        requests:
+          cpu: 10
+          memory: 60Gi
+          nvidia.com/gpu: 1
+      # All three probes hit /health on port 8080. Assuming standard
+      # Kubernetes probe semantics, startup allows a 60s delay plus up to
+      # 150 checks 6s apart (roughly 16 minutes) before giving up.
+      startupProbe:
+        httpGet:
+          port: 8080
+          path: /health
+        initialDelaySeconds: 60
+        periodSeconds: 6
+        timeoutSeconds: 30
+        successThreshold: 1
+        failureThreshold: 150
+      # NOTE(review): timeoutSeconds (200) exceeds periodSeconds (60) here;
+      # confirm this is intentional.
+      readinessProbe:
+        httpGet:
+          port: 8080
+          path: /health
+        periodSeconds: 60
+        timeoutSeconds: 200
+        successThreshold: 1
+        failureThreshold: 3
+      livenessProbe:
+        httpGet:
+          port: 8080
+          path: /health
+        periodSeconds: 60
+        timeoutSeconds: 60
+        successThreshold: 1
+        failureThreshold: 5
@@ -0,0 +1,18 @@
+# InferenceService for the gpt-oss-120b model in the openai-test namespace,
+# pinned to the vllm-gpt-oss-120b-imported runtime by name.
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  namespace: openai-test
+  name: gpt-oss-120b
+  annotations:
+    ome.io/deploymentMode: RawDeployment
+spec:
+  model:
+    name: gpt-oss-120b
+  runtime:
+    name: vllm-gpt-oss-120b-imported
+  # Engine and router are each fixed at one replica (min equals max).
+  engine:
+    maxReplicas: 1
+    minReplicas: 1
+  router:
+    maxReplicas: 1
+    minReplicas: 1
@@ -11,3 +11,8 @@ spec:
   engine:
     minReplicas: 1
     maxReplicas: 1
+  runtime:
+    name: vllm-gpt-oss-20b-imported
+  router:
+    minReplicas: 1
+    maxReplicas: 1