From 98c4c88a2410248ddf4a0540841e43d19c2d0598 Mon Sep 17 00:00:00 2001
From: Tejesh Anand <tejesh.anand@oracle.com>
Date: Tue, 12 May 2026 12:53:35 -0700
Subject: [PATCH 1/4] init: add NVIDIA Nemotron-3 Super 120B A12B FP8 model, vLLM runtime, and sample

---
 config/models/kustomization.yaml              |   1 +
 ...NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml |  22 ++
 config/runtimes/kustomization.yaml            |   1 +
 ...dia-nemotron-3-super-120b-a12b-fp8-rt.yaml | 209 ++++++++++++++++++
 ...nvidia-nemotron-3-super-120b-a12b-fp8.yaml |  16 ++
 5 files changed, 249 insertions(+)
 create mode 100644 config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml
 create mode 100644 config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
 create mode 100644 config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml

diff --git a/config/models/kustomization.yaml b/config/models/kustomization.yaml
index e884fc7c0..30e5ae54a 100644
--- a/config/models/kustomization.yaml
+++ b/config/models/kustomization.yaml
@@ -122,6 +122,7 @@ resources:
   - nvidia/Llama-3_1-Nemotron-Ultra-253B-v1.yaml
   - nvidia/Llama-3_3-Nemotron-Super-49B-v1.yaml
   - nvidia/Llama-3.1-Nemotron-Nano-8B-v1.yaml
+  - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml  # ClusterBaseModel manifest created by this patch
   - nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16.yaml
   - nvidia/NVIDIA-Nemotron-Nano-9B-v2.yaml
 
diff --git a/config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml b/config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml
new file mode 100644
index 000000000..75e70608e
--- /dev/null
+++ b/config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml
@@ -0,0 +1,22 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel  # model definition; the sample InferenceService in this patch selects it by name
+metadata:
+  name: nvidia-nemotron-3-super-120b-a12b-fp8  # same value as spec.model.name in the sample InferenceService
+spec:
+  modelCapabilities:
+    - TEXT_TO_TEXT
+  vendor: nvidia
+  disabled: false
+  version: "1.0.0"
+  displayName: nvidia.NVIDIA-Nemotron-3-Super-120B-A12B-FP8
+  modelFormat:  # format, framework and architecture below equal the runtime's supportedModelFormats values
+    name: safetensors
+    version: "1.0.0"
+  modelFramework:
+    name: transformers
+    version: "4.57.6"
+  modelArchitecture: NemotronHForCausalLM  # NOTE(review): no quantization is declared in this spec, while the runtime lists fp8 - confirm matching
+  storage:
+    storageUri: hf://nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8  # presumably a Hugging Face repo reference - confirm
+    path: /raid/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8  # presumably the node-local download path - confirm
+    key: "hf-token"  # presumably names the credential used for the download - confirm
diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml
index 0b78b5905..519e10337 100644
--- a/config/runtimes/kustomization.yaml
+++ b/config/runtimes/kustomization.yaml
@@ -52,3 +52,4 @@ resources:
 - vllm/mixtral-8x7b-instruct-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
+- vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml  # ClusterServingRuntime manifest created by this patch
diff --git a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
new file mode 100644
index 000000000..fcf65f068
--- /dev/null
+++ b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
@@ -0,0 +1,209 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime  # vLLM runtime: a router container plus an engine container (engineConfig further down)
+metadata:
+  name: vllm-nvidia-nemotron-3-super-120b-a12b-fp8  # same value as spec.runtime.name in the sample InferenceService
+spec:
+  disabled: false
+  routerConfig:
+    annotations:
+      prometheus.io/path: /metrics
+      prometheus.io/port: '29000'  # NOTE(review): the only containerPort declared below is 8080 - confirm metrics are served on 29000
+      prometheus.io/scrape: 'true'
+    labels:
+      logging-forward: enabled
+    runner:
+      name: router
+      image: docker.io/lightseekorg/smg:1.4.1  # router image, pinned by tag
+      ports:
+        - containerPort: 8080  # same port is passed via --port and targeted by all three probes below
+          name: http
+      resources:
+        limits:  # only limits are set for the router; no explicit requests
+          cpu: "8"
+          memory: 16Gi
+      args:
+        - launch
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)  # filled from the NAMESPACE env var defined under `env` below
+        - --service-discovery-port
+        - "8080"
+        - --selector
+        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
+        - --model-path
+        - $(MODEL_PATH)
+        - --chat-template
+        - $(MODEL_PATH)/chat_template.jinja
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+      env:
+        - name: NAMESPACE  # pod namespace, read through the downward API
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: INFERENCESERVICE_NAME  # read from the pod's ome.io/inferenceservice label
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']
+        - name: VLLM_LOGGING_LEVEL  # NOTE(review): vLLM-named variable on the router container - confirm the router reads it
+          value: 'INFO'
+      readinessProbe:
+        httpGet:
+          path: /readiness
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 30
+        timeoutSeconds: 10
+      livenessProbe:
+        httpGet:
+          path: /liveness
+          port: 8080
+        failureThreshold: 5  # 5 consecutive failures at a 30s period
+        periodSeconds: 30
+        timeoutSeconds: 10
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5  # with the values below: 30s initial delay + 5 x 20s = 130s startup budget
+        periodSeconds: 20
+        timeoutSeconds: 10
+        initialDelaySeconds: 30
+  supportedModelFormats:  # model descriptors this runtime declares support for
+    - modelFramework:
+        name: transformers
+        version: "4.57.6"  # same framework version as the ClusterBaseModel added in this patch
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: NemotronHForCausalLM  # same architecture string as the ClusterBaseModel added in this patch
+      quantization: fp8
+      autoSelect: true
+      priority: 1
+  modelSizeRange:
+    min: 115B
+    max: 125B
+  protocolVersions:
+    - openAI  # only protocol listed for this runtime
+  engineConfig:  # pod settings for the vLLM engine container
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"  # same port the engine container exposes and vLLM is told to listen on
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"  # tolerate any NoSchedule taint with this key, whatever its value
+        operator: "Exists"
+        effect: "NoSchedule"
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:  # hard scheduling requirement, not a preference
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8  # the only instance type this engine may be scheduled on
+    volumes:
+      - name: dshm  # mounted at /dev/shm by the runner's volumeMounts below
+        emptyDir:
+          medium: Memory  # RAM-backed volume; no sizeLimit is set here
+    runner:
+      name: ome-container
+      image: docker.io/vllm/vllm-openai:v0.20.0  # engine image, pinned by tag
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - python3
+      args:
+        - -m
+        - vllm.entrypoints.openai.api_server
+        - --model
+        - $(MODEL_PATH)
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        - --async-scheduling
+        - --dtype
+        - auto
+        - --kv-cache-dtype
+        - fp8
+        - --tensor-parallel-size
+        - "4"
+        - --pipeline-parallel-size
+        - "1"
+        - --data-parallel-size
+        - "1"
+        - --max-model-len
+        - "1048576"
+        - --enable-expert-parallel
+        - --trust-remote-code
+        - --gpu-memory-utilization
+        - "0.9"
+        - --max-cudagraph-capture-size
+        - "128"
+        - --enable-chunked-prefill
+        - --mamba-ssm-cache-dtype
+        - float32
+        - --served-model-name
+        - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8  # model name advertised to API clients
+        - --reasoning-parser
+        - nemotron_v3
+        - --enable-auto-tool-choice
+        - --tool-call-parser
+        - qwen3_coder  # NOTE(review): Qwen3-Coder-named parser on a Nemotron model - confirm this is intended
+        - --chat-template
+        - $(MODEL_PATH)/chat_template.jinja  # template file is expected inside the model directory
+      env:
+        - name: VLLM_ENGINE_READY_TIMEOUT_S
+          value: '3600'  # NOTE(review): 3600s is longer than the ~1200s startupProbe budget below - confirm which limit should win
+        - name: VLLM_LOGGING_LEVEL
+          value: 'INFO'
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: '1'  # presumably needed for the 1048576-token --max-model-len above - confirm
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:  # requests equal limits for cpu, memory and GPU
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4  # equals the --tensor-parallel-size passed above
+        limits:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 3
+        successThreshold: 1
+        periodSeconds: 90
+        timeoutSeconds: 60
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5  # 5 consecutive failures at a 60s period
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 190  # with the values below: 60s initial delay + 190 x 6s = 1200s startup budget
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30  # NOTE(review): timeout (30s) is longer than the probe period (6s) - confirm intended
diff --git a/config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml b/config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml
new file mode 100644
index 000000000..56a607e88
--- /dev/null
+++ b/config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml
@@ -0,0 +1,16 @@
+apiVersion: ome.io/v1beta1
+kind: InferenceService  # sample deployment wiring the model and runtime added in this patch
+metadata:
+  name: nvidia-nemotron-3-super-120b-a12b-fp8
+  namespace: nvidia-nemotron-3-super-120b-a12b-fp8  # NOTE(review): this patch does not create the namespace - confirm it exists before applying
+spec:
+  model:
+    name: nvidia-nemotron-3-super-120b-a12b-fp8  # metadata.name of the ClusterBaseModel added in this patch
+  runtime:
+    name: vllm-nvidia-nemotron-3-super-120b-a12b-fp8  # metadata.name of the ClusterServingRuntime added in this patch
+  engine:
+    minReplicas: 1  # min == max: a fixed single engine replica
+    maxReplicas: 1
+  router:
+    minReplicas: 1  # min == max: a fixed single router replica
+    maxReplicas: 1

From baa1df5fef6530cb013787ba2107eb1e94f9d0bf Mon Sep 17 00:00:00 2001
From: Tejesh Anand <tejesh.anand@oracle.com>
Date: Tue, 12 May 2026 14:58:17 -0700
Subject: [PATCH 2/4] feedback: simplify router args, use vllm serve, relax readiness probes

---
 ...dia-nemotron-3-super-120b-a12b-fp8-rt.yaml | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
index fcf65f068..e3834fdcc 100644
--- a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
+++ b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
@@ -34,10 +34,6 @@ spec:
         - "8080"
         - --selector
         - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
-        - --model-path
-        - $(MODEL_PATH)
-        - --chat-template
-        - $(MODEL_PATH)/chat_template.jinja
         - --request-id-headers
         - opc-request-id
         - --log-json
@@ -56,7 +52,7 @@ spec:
         httpGet:
           path: /readiness
           port: 8080
-        failureThreshold: 5
+        failureThreshold: 10  # raised from 5: 10 consecutive failures at the 30s period below
         periodSeconds: 30
         timeoutSeconds: 10
       livenessProbe:
@@ -85,6 +81,15 @@ spec:
       quantization: fp8
       autoSelect: true
       priority: 1
+    - modelFramework:  # second entry: same values as the entry above except that `quantization` is omitted
+        name: transformers
+        version: "4.57.6"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: NemotronHForCausalLM
+      autoSelect: true
+      priority: 1  # NOTE(review): same priority as the fp8 entry - confirm how ties are resolved
   modelSizeRange:
     min: 115B
     max: 125B
@@ -122,11 +127,9 @@ spec:
           name: http1
           protocol: TCP
       command:
-        - python3
+        - vllm  # run the `vllm serve` CLI in place of `python3 -m vllm.entrypoints.openai.api_server`
+        - serve  # $(MODEL_PATH), now the first arg, is passed positionally; the --model flag is dropped
       args:
-        - -m
-        - vllm.entrypoints.openai.api_server
-        - --model
         - $(MODEL_PATH)
         - --host
         - 0.0.0.0
@@ -186,7 +189,7 @@ spec:
         httpGet:
           path: /health
           port: 8080
-        failureThreshold: 3
+        failureThreshold: 10  # raised from 3: 10 consecutive failures at the 90s period below
         successThreshold: 1
         periodSeconds: 90
         timeoutSeconds: 60

From 0f5d64e33bd87cdf6b17f6563bc724ffc0185914 Mon Sep 17 00:00:00 2001
From: Tejesh Anand <tejesh.anand@oracle.com>
Date: Thu, 14 May 2026 11:27:01 -0700
Subject: [PATCH 3/4] use tp2 for fp8

---
 .../vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
index e3834fdcc..3bcda9c48 100644
--- a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
+++ b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
@@ -141,13 +141,15 @@ spec:
         - --kv-cache-dtype
         - fp8
         - --tensor-parallel-size
-        - "4"
+        - "2"
         - --pipeline-parallel-size
         - "1"
         - --data-parallel-size
         - "1"
         - --max-model-len
         - "1048576"
+        - --max-num-seqs
+        - "256"
         - --enable-expert-parallel
         - --trust-remote-code
         - --gpu-memory-utilization

From f9ad9e65d9c66a1c03125bd1b315fa6a472b016a Mon Sep 17 00:00:00 2001
From: Tejesh Anand <tejesh.anand@oracle.com>
Date: Thu, 14 May 2026 13:23:44 -0700
Subject: [PATCH 4/4] Revert "use tp2 for fp8"

This reverts commit 0f5d64e33bd87cdf6b17f6563bc724ffc0185914.
---
 .../vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
index 3bcda9c48..e3834fdcc 100644
--- a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
+++ b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
@@ -141,15 +141,13 @@ spec:
         - --kv-cache-dtype
         - fp8
         - --tensor-parallel-size
-        - "2"
+        - "4"  # restored to 4, the nvidia.com/gpu count requested for the engine container
         - --pipeline-parallel-size
         - "1"
         - --data-parallel-size
         - "1"
         - --max-model-len
         - "1048576"
-        - --max-num-seqs
-        - "256"
         - --enable-expert-parallel
         - --trust-remote-code
         - --gpu-memory-utilization