From 98c4c88a2410248ddf4a0540841e43d19c2d0598 Mon Sep 17 00:00:00 2001 From: Tejesh Anand Date: Tue, 12 May 2026 12:53:35 -0700 Subject: [PATCH 1/4] init --- config/models/kustomization.yaml | 1 + ...NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml | 22 ++ config/runtimes/kustomization.yaml | 1 + ...dia-nemotron-3-super-120b-a12b-fp8-rt.yaml | 209 ++++++++++++++++++ ...nvidia-nemotron-3-super-120b-a12b-fp8.yaml | 16 ++ 5 files changed, 249 insertions(+) create mode 100644 config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml create mode 100644 config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml create mode 100644 config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml diff --git a/config/models/kustomization.yaml b/config/models/kustomization.yaml index e884fc7c0..30e5ae54a 100644 --- a/config/models/kustomization.yaml +++ b/config/models/kustomization.yaml @@ -122,6 +122,7 @@ resources: - nvidia/Llama-3_1-Nemotron-Ultra-253B-v1.yaml - nvidia/Llama-3_3-Nemotron-Super-49B-v1.yaml - nvidia/Llama-3.1-Nemotron-Nano-8B-v1.yaml + - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml - nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16.yaml - nvidia/NVIDIA-Nemotron-Nano-9B-v2.yaml diff --git a/config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml b/config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml new file mode 100644 index 000000000..75e70608e --- /dev/null +++ b/config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml @@ -0,0 +1,22 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: nvidia-nemotron-3-super-120b-a12b-fp8 +spec: + modelCapabilities: + - TEXT_TO_TEXT + vendor: nvidia + disabled: false + version: "1.0.0" + displayName: nvidia.NVIDIA-Nemotron-3-Super-120B-A12B-FP8 + modelFormat: + name: safetensors + version: "1.0.0" + modelFramework: + name: transformers + version: "4.57.6" + modelArchitecture: NemotronHForCausalLM + storage: + storageUri: hf://nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8 + path: /raid/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8 + key: "hf-token" diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml index 0b78b5905..519e10337 100644 --- a/config/runtimes/kustomization.yaml +++ b/config/runtimes/kustomization.yaml @@ -52,3 +52,4 @@ resources: - vllm/mixtral-8x7b-instruct-rt.yaml - vllm/deepseek-ai/deepseek-v4-flash-rt.yaml - vllm/deepseek-ai/deepseek-v4-pro-rt.yaml +- vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml diff --git a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml new file mode 100644 index 000000000..fcf65f068 --- /dev/null +++ b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml @@ -0,0 +1,209 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-nvidia-nemotron-3-super-120b-a12b-fp8 +spec: + disabled: false + routerConfig: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: '29000' + prometheus.io/scrape: 'true' + labels: + logging-forward: enabled + runner: + name: router + image: docker.io/lightseekorg/smg:1.4.1 + ports: + - containerPort: 8080 + name: http + resources: + limits: + cpu: "8" + memory: 16Gi + args: + - launch + - --host + - 0.0.0.0 + - --port + - "8080" + - --service-discovery + - --service-discovery-namespace + - $(NAMESPACE) + - --service-discovery-port + - "8080" + - --selector + - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME) + - --model-path + - $(MODEL_PATH) + - --chat-template + - $(MODEL_PATH)/chat_template.jinja + - --request-id-headers + - opc-request-id + - --log-json + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] + - name: VLLM_LOGGING_LEVEL + value: 'INFO' + readinessProbe: + httpGet: + path: /readiness + port: 8080 + failureThreshold: 5 + periodSeconds: 30 + timeoutSeconds: 10 + livenessProbe: + httpGet: + path: /liveness + port: 8080 + failureThreshold: 5 + periodSeconds: 30 + timeoutSeconds: 10 + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + periodSeconds: 20 + timeoutSeconds: 10 + initialDelaySeconds: 30 + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.57.6" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: NemotronHForCausalLM + quantization: fp8 + autoSelect: true + priority: 1 + modelSizeRange: + min: 115B + max: 125B + protocolVersions: + - openAI + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node.kubernetes.io/instance-type + operator: In + values: + - BM.GPU.H100.8 + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/vllm/vllm-openai:v0.20.0 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - python3 + args: + - -m + - vllm.entrypoints.openai.api_server + - --model + - $(MODEL_PATH) + - --host + - 0.0.0.0 + - --port + - "8080" + - --async-scheduling + - --dtype + - auto + - --kv-cache-dtype + - fp8 + - --tensor-parallel-size + - "4" + - --pipeline-parallel-size + - "1" + - --data-parallel-size + - "1" + - --max-model-len + - "1048576" + - --enable-expert-parallel + - --trust-remote-code + - --gpu-memory-utilization + - "0.9" + - --max-cudagraph-capture-size + - "128" + - --enable-chunked-prefill + - --mamba-ssm-cache-dtype + - float32 + - --served-model-name + - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8 + - --reasoning-parser + - nemotron_v3 + - --enable-auto-tool-choice + - --tool-call-parser + - qwen3_coder + - --chat-template + - $(MODEL_PATH)/chat_template.jinja + env: + - name: VLLM_ENGINE_READY_TIMEOUT_S + value: '3600' + - name: VLLM_LOGGING_LEVEL + value: 'INFO' + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: '1' + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 64 + memory: 256Gi + nvidia.com/gpu: 4 + limits: + cpu: 64 + memory: 256Gi + nvidia.com/gpu: 4 + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 90 + timeoutSeconds: 60 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 190 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 diff --git a/config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml b/config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml new file mode 100644 index 000000000..56a607e88 --- /dev/null +++ b/config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml @@ -0,0 +1,16 @@ +apiVersion: ome.io/v1beta1 +kind: InferenceService +metadata: + name: nvidia-nemotron-3-super-120b-a12b-fp8 + namespace: nvidia-nemotron-3-super-120b-a12b-fp8 +spec: + model: + name: nvidia-nemotron-3-super-120b-a12b-fp8 + runtime: + name: vllm-nvidia-nemotron-3-super-120b-a12b-fp8 + engine: + minReplicas: 1 + maxReplicas: 1 + router: + minReplicas: 1 + maxReplicas: 1 From baa1df5fef6530cb013787ba2107eb1e94f9d0bf Mon Sep 17 00:00:00 2001 From: Tejesh Anand Date: Tue, 12 May 2026 14:58:17 -0700 Subject: [PATCH 2/4] feedback --- ...dia-nemotron-3-super-120b-a12b-fp8-rt.yaml | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml index fcf65f068..e3834fdcc 100644 --- a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml +++ b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml @@ -34,10 +34,6 @@ spec: - "8080" - --selector - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME) - - --model-path - - $(MODEL_PATH) - - --chat-template - - $(MODEL_PATH)/chat_template.jinja - --request-id-headers - opc-request-id - --log-json @@ -56,7 +52,7 @@ spec: httpGet: path: /readiness port: 8080 - failureThreshold: 5 + failureThreshold: 10 periodSeconds: 30 timeoutSeconds: 10 livenessProbe: @@ -85,6 +81,15 @@ spec: quantization: fp8 autoSelect: true priority: 1 + - modelFramework: + name: transformers + version: "4.57.6" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: NemotronHForCausalLM + autoSelect: true + priority: 1 modelSizeRange: min: 115B max: 125B @@ -122,11 +127,9 @@ spec: name: http1 protocol: TCP command: - - python3 + - vllm + - serve args: - - -m - - vllm.entrypoints.openai.api_server - - --model - $(MODEL_PATH) - --host - 0.0.0.0 @@ -186,7 +189,7 @@ spec: httpGet: path: /health port: 8080 - failureThreshold: 3 + failureThreshold: 10 successThreshold: 1 periodSeconds: 90 timeoutSeconds: 60 From 0f5d64e33bd87cdf6b17f6563bc724ffc0185914 Mon Sep 17 00:00:00 2001 From: Tejesh Anand Date: Thu, 14 May 2026 11:27:01 -0700 Subject: [PATCH 3/4] use tp2 for fp8 --- .../vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml index e3834fdcc..3bcda9c48 100644 --- a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml +++ b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml @@ -141,13 +141,15 @@ spec: - --kv-cache-dtype - fp8 - --tensor-parallel-size - - "4" + - "2" - --pipeline-parallel-size - "1" - --data-parallel-size - "1" - --max-model-len - "1048576" + - --max-num-seqs + - "256" - --enable-expert-parallel - --trust-remote-code - --gpu-memory-utilization From f9ad9e65d9c66a1c03125bd1b315fa6a472b016a Mon Sep 17 00:00:00 2001 From: Tejesh Anand Date: Thu, 14 May 2026 13:23:44 -0700 Subject: [PATCH 4/4] Revert "use tp2 for fp8" This reverts commit 0f5d64e33bd87cdf6b17f6563bc724ffc0185914. --- .../vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml index 3bcda9c48..e3834fdcc 100644 --- a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml +++ b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml @@ -141,15 +141,13 @@ spec: - --kv-cache-dtype - fp8 - --tensor-parallel-size - - "2" + - "4" - --pipeline-parallel-size - "1" - --data-parallel-size - "1" - --max-model-len - "1048576" - - --max-num-seqs - - "256" - --enable-expert-parallel - --trust-remote-code - --gpu-memory-utilization