ome-projects · TJ5 · May 12, 2026 · May 12, 2026 · May 14, 2026 · May 14, 2026
@@ -122,6 +122,7 @@ resources:
   - nvidia/Llama-3_1-Nemotron-Ultra-253B-v1.yaml
   - nvidia/Llama-3_3-Nemotron-Super-49B-v1.yaml
   - nvidia/Llama-3.1-Nemotron-Nano-8B-v1.yaml
+  - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml
   - nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16.yaml
   - nvidia/NVIDIA-Nemotron-Nano-9B-v2.yaml
 

@@ -0,0 +1,28 @@
+# ClusterBaseModel for the FP8 checkpoint of NVIDIA-Nemotron-3-Super-120B-A12B.
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: nvidia-nemotron-3-super-120b-a12b-fp8
+spec:
+  # Identity
+  displayName: nvidia.NVIDIA-Nemotron-3-Super-120B-A12B-FP8
+  vendor: nvidia
+  version: '1.0.0'
+  disabled: false
+  # Architecture, capabilities and on-disk format
+  modelArchitecture: NemotronHForCausalLM
+  modelCapabilities:
+    - TEXT_TO_TEXT
+  modelFramework:
+    name: transformers
+    version: '4.57.6'
+  modelFormat:
+    name: safetensors
+    version: '1.0.0'
+  # Weights come from the Hugging Face repo in storageUri.
+  # NOTE(review): path looks like the node-local download target and key
+  # like the name of the Hugging Face token secret - confirm.
+  storage:
+    storageUri: hf://nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
+    path: /raid/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
+    key: hf-token
@@ -52,3 +52,4 @@ resources:
 - vllm/mixtral-8x7b-instruct-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
+- vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
@@ -0,0 +1,237 @@
+# ClusterServingRuntime for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8:
+# a vLLM engine container plus a router container in front of it.
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-nvidia-nemotron-3-super-120b-a12b-fp8
+spec:
+  disabled: false
+  protocolVersions:
+    - openAI
+  modelSizeRange:
+    min: 115B
+    max: 125B
+  # The two entries are identical except that the first also sets
+  # quantization: fp8.
+  supportedModelFormats:
+    - modelArchitecture: NemotronHForCausalLM
+      quantization: fp8
+      modelFormat:
+        name: safetensors
+        version: '1.0.0'
+      modelFramework:
+        name: transformers
+        version: '4.57.6'
+      priority: 1
+      autoSelect: true
+    - modelArchitecture: NemotronHForCausalLM
+      modelFormat:
+        name: safetensors
+        version: '1.0.0'
+      modelFramework:
+        name: transformers
+        version: '4.57.6'
+      priority: 1
+      autoSelect: true
+  # Engine: vLLM server listening on port 8080.
+  engineConfig:
+    labels:
+      logging-forward: enabled
+    annotations:
+      prometheus.io/path: /metrics
+      prometheus.io/port: '8080'
+      prometheus.io/scrape: 'true'
+    # Restrict scheduling to BM.GPU.H100.8 nodes and tolerate the GPU taint.
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8
+    tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+    # Memory-backed emptyDir; the runner mounts it at /dev/shm.
+    volumes:
+      - name: dshm
+        emptyDir:
+          medium: Memory
+    runner:
+      name: ome-container
+      image: docker.io/vllm/vllm-openai:v0.20.0
+      ports:
+        - name: http1
+          containerPort: 8080
+          protocol: TCP
+      # Requests equal limits; the 4 GPUs match --tensor-parallel-size 4.
+      resources:
+        limits:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+        requests:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+      volumeMounts:
+        - name: dshm
+          mountPath: /dev/shm
+      env:
+        - name: VLLM_ENGINE_READY_TIMEOUT_S
+          value: '3600'
+        - name: VLLM_LOGGING_LEVEL
+          value: INFO
+        # Set together with the 1048576-token --max-model-len in args.
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: '1'
+      command:
+        - vllm
+        - serve
+      args:
+        # Model path and listen address
+        - $(MODEL_PATH)
+        - --host
+        - 0.0.0.0
+        - --port
+        - '8080'
+        # Scheduling and precision
+        - --async-scheduling
+        - --dtype
+        - auto
+        - --kv-cache-dtype
+        - fp8
+        # Parallelism
+        - --tensor-parallel-size
+        - '4'
+        - --pipeline-parallel-size
+        - '1'
+        - --data-parallel-size
+        - '1'
+        # Context length, memory and execution tuning
+        - --max-model-len
+        - '1048576'
+        - --enable-expert-parallel
+        - --trust-remote-code
+        - --gpu-memory-utilization
+        - '0.9'
+        - --max-cudagraph-capture-size
+        - '128'
+        - --enable-chunked-prefill
+        - --mamba-ssm-cache-dtype
+        - float32
+        # Served name, reasoning and tool-call parsing, chat template
+        - --served-model-name
+        - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
+        - --reasoning-parser
+        - nemotron_v3
+        - --enable-auto-tool-choice
+        - --tool-call-parser
+        - qwen3_coder
+        - --chat-template
+        - $(MODEL_PATH)/chat_template.jinja
+      # Startup allowance: 60 s initial delay + 190 failures x 6 s = 1200 s.
+      # NOTE(review): shorter than VLLM_ENGINE_READY_TIMEOUT_S (3600);
+      # confirm the two budgets are meant to differ.
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        initialDelaySeconds: 60
+        periodSeconds: 6
+        timeoutSeconds: 30
+        successThreshold: 1
+        failureThreshold: 190
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        periodSeconds: 90
+        timeoutSeconds: 60
+        successThreshold: 1
+        failureThreshold: 10
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        periodSeconds: 60
+        timeoutSeconds: 60
+        successThreshold: 1
+        failureThreshold: 5
+  # Router: discovers engine pods via service discovery; serves on port 8080.
+  routerConfig:
+    labels:
+      logging-forward: enabled
+    annotations:
+      prometheus.io/path: /metrics
+      # NOTE(review): 29000 is not among the containerPorts declared below;
+      # presumably the router's metrics listener - confirm.
+      prometheus.io/port: '29000'
+      prometheus.io/scrape: 'true'
+    runner:
+      name: router
+      image: docker.io/lightseekorg/smg:1.4.1
+      ports:
+        - name: http
+          containerPort: 8080
+      resources:
+        limits:
+          cpu: '8'
+          memory: 16Gi
+      env:
+        # NAMESPACE and INFERENCESERVICE_NAME are referenced from args.
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
+        # NOTE(review): vLLM-named variable on the router container;
+        # confirm the router image reads it.
+        - name: VLLM_LOGGING_LEVEL
+          value: INFO
+      args:
+        - launch
+        - --host
+        - 0.0.0.0
+        - --port
+        - '8080'
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)
+        - --service-discovery-port
+        - '8080'
+        # Selects pods labelled component=engine for this InferenceService.
+        - --selector
+        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+      # Startup allowance: 30 s initial delay + 5 failures x 20 s = 130 s.
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        initialDelaySeconds: 30
+        periodSeconds: 20
+        timeoutSeconds: 10
+        failureThreshold: 5
+      readinessProbe:
+        httpGet:
+          path: /readiness
+          port: 8080
+        periodSeconds: 30
+        timeoutSeconds: 10
+        failureThreshold: 10
+      livenessProbe:
+        httpGet:
+          path: /liveness
+          port: 8080
+        periodSeconds: 30
+        timeoutSeconds: 10
+        failureThreshold: 5
@@ -0,0 +1,19 @@
+# InferenceService that pairs the nvidia-nemotron-3-super-120b-a12b-fp8 model
+# with the vllm-nvidia-nemotron-3-super-120b-a12b-fp8 runtime.
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  namespace: nvidia-nemotron-3-super-120b-a12b-fp8
+  name: nvidia-nemotron-3-super-120b-a12b-fp8
+spec:
+  runtime:
+    name: vllm-nvidia-nemotron-3-super-120b-a12b-fp8
+  model:
+    name: nvidia-nemotron-3-super-120b-a12b-fp8
+  # min == max, so engine and router are each fixed at one replica.
+  router:
+    minReplicas: 1
+    maxReplicas: 1
+  engine:
+    minReplicas: 1
+    maxReplicas: 1