Add NVIDIA-Nemotron-3-Super-120B-A12B-FP8 to OME: a ClusterBaseModel, a vLLM
ClusterServingRuntime (router + engine), a sample InferenceService, and the two
kustomization entries that register the new manifests.

NOTE(review): line breaks and YAML indentation were reconstructed from a
whitespace-collapsed copy of this patch. Added-line counts match every hunk
header, but context-line indentation in config/models/kustomization.yaml is
inferred -- verify against the tree before applying. Inline "#" comments were
added to "+" lines only, so the blob hashes on the "index" lines no longer
describe the resulting file contents.

diff --git a/config/models/kustomization.yaml b/config/models/kustomization.yaml
index e884fc7c0..30e5ae54a 100644
--- a/config/models/kustomization.yaml
+++ b/config/models/kustomization.yaml
@@ -122,6 +122,7 @@ resources:
   - nvidia/Llama-3_1-Nemotron-Ultra-253B-v1.yaml
   - nvidia/Llama-3_3-Nemotron-Super-49B-v1.yaml
   - nvidia/Llama-3.1-Nemotron-Nano-8B-v1.yaml
+  - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml  # ClusterBaseModel manifest added by this patch
   - nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16.yaml
   - nvidia/NVIDIA-Nemotron-Nano-9B-v2.yaml
 
diff --git a/config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml b/config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml
new file mode 100644
index 000000000..75e70608e
--- /dev/null
+++ b/config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml
@@ -0,0 +1,22 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterBaseModel
+metadata:
+  name: nvidia-nemotron-3-super-120b-a12b-fp8  # referenced by spec.model.name in the sample InferenceService
+spec:
+  modelCapabilities:
+    - TEXT_TO_TEXT
+  vendor: nvidia
+  disabled: false
+  version: "1.0.0"
+  displayName: nvidia.NVIDIA-Nemotron-3-Super-120B-A12B-FP8
+  modelFormat:
+    name: safetensors
+    version: "1.0.0"
+  modelFramework:
+    name: transformers
+    version: "4.57.6"  # same framework version is declared in the runtime's supportedModelFormats
+  modelArchitecture: NemotronHForCausalLM
+  storage:
+    storageUri: hf://nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
+    path: /raid/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
+    key: "hf-token"  # NOTE(review): presumably names the credential used for the hf:// download -- confirm
diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml
index 0b78b5905..519e10337 100644
--- a/config/runtimes/kustomization.yaml
+++ b/config/runtimes/kustomization.yaml
@@ -52,3 +52,4 @@ resources:
 - vllm/mixtral-8x7b-instruct-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
 - vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
+- vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml  # ClusterServingRuntime manifest added by this patch
diff --git a/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
new file mode 100644
index 000000000..e3834fdcc
--- /dev/null
+++ b/config/runtimes/vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
@@ -0,0 +1,212 @@
+apiVersion: ome.io/v1beta1
+kind: ClusterServingRuntime
+metadata:
+  name: vllm-nvidia-nemotron-3-super-120b-a12b-fp8  # referenced by spec.runtime.name in the sample InferenceService
+spec:
+  disabled: false
+  routerConfig:
+    annotations:
+      prometheus.io/path: /metrics
+      prometheus.io/port: '29000'  # NOTE(review): differs from containerPort 8080 below -- confirm the router exposes metrics here
+      prometheus.io/scrape: 'true'
+    labels:
+      logging-forward: enabled
+    runner:
+      name: router
+      image: docker.io/lightseekorg/smg:1.4.1
+      ports:
+        - containerPort: 8080
+          name: http
+      resources:
+        limits:
+          cpu: "8"
+          memory: 16Gi
+      args:
+        - launch
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        - --service-discovery
+        - --service-discovery-namespace
+        - $(NAMESPACE)  # expanded from the NAMESPACE env var defined below
+        - --service-discovery-port
+        - "8080"  # same port the engine container listens on
+        - --selector
+        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)  # two label terms passed as a single argument
+        - --request-id-headers
+        - opc-request-id
+        - --log-json
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace  # the router pod's own namespace
+        - name: INFERENCESERVICE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.labels['ome.io/inferenceservice']  # read from the router pod's own label
+        - name: VLLM_LOGGING_LEVEL  # NOTE(review): vLLM-named variable on the router container -- confirm it is needed here
+          value: 'INFO'
+      readinessProbe:
+        httpGet:
+          path: /readiness
+          port: 8080
+        failureThreshold: 10
+        periodSeconds: 30
+        timeoutSeconds: 10
+      livenessProbe:
+        httpGet:
+          path: /liveness
+          port: 8080
+        failureThreshold: 5
+        periodSeconds: 30
+        timeoutSeconds: 10
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5  # 5 failures x 20s period = 100s after the 30s initial delay
+        periodSeconds: 20
+        timeoutSeconds: 10
+        initialDelaySeconds: 30
+  supportedModelFormats:
+    - modelFramework:  # entry 1: same architecture/format, restricted to fp8 quantization
+        name: transformers
+        version: "4.57.6"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: NemotronHForCausalLM
+      quantization: fp8
+      autoSelect: true
+      priority: 1
+    - modelFramework:  # entry 2: identical to entry 1 but with no quantization field
+        name: transformers
+        version: "4.57.6"
+      modelFormat:
+        name: safetensors
+        version: "1.0.0"
+      modelArchitecture: NemotronHForCausalLM
+      autoSelect: true
+      priority: 1
+  modelSizeRange:  # NOTE(review): nesting level inferred (spec level); original indentation was lost -- confirm
+    min: 115B
+    max: 125B
+  protocolVersions:
+    - openAI
+  engineConfig:
+    annotations:
+      prometheus.io/scrape: "true"
+      prometheus.io/port: "8080"  # same port as the engine's containerPort below
+      prometheus.io/path: "/metrics"
+    labels:
+      logging-forward: enabled
+    tolerations:
+      - key: "nvidia.com/gpu"
+        operator: "Exists"
+        effect: "NoSchedule"
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+            - matchExpressions:
+                - key: node.kubernetes.io/instance-type
+                  operator: In
+                  values:
+                    - BM.GPU.H100.8  # hard requirement: engine pods schedule only on this instance type
+    volumes:
+      - name: dshm  # memory-backed emptyDir, mounted at /dev/shm by the runner below
+        emptyDir:
+          medium: Memory
+    runner:
+      name: ome-container
+      image: docker.io/vllm/vllm-openai:v0.20.0
+      ports:
+        - containerPort: 8080
+          name: http1
+          protocol: TCP
+      command:
+        - vllm
+        - serve
+      args:
+        - $(MODEL_PATH)  # NOTE(review): not defined in this manifest -- presumably injected by OME; confirm
+        - --host
+        - 0.0.0.0
+        - --port
+        - "8080"
+        - --async-scheduling
+        - --dtype
+        - auto
+        - --kv-cache-dtype
+        - fp8
+        - --tensor-parallel-size
+        - "4"  # equals the nvidia.com/gpu count requested under resources
+        - --pipeline-parallel-size
+        - "1"
+        - --data-parallel-size
+        - "1"
+        - --max-model-len
+        - "1048576"  # 2^20 tokens; VLLM_ALLOW_LONG_MAX_MODEL_LEN is set in env below
+        - --enable-expert-parallel
+        - --trust-remote-code
+        - --gpu-memory-utilization
+        - "0.9"
+        - --max-cudagraph-capture-size
+        - "128"
+        - --enable-chunked-prefill
+        - --mamba-ssm-cache-dtype
+        - float32
+        - --served-model-name
+        - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8  # NOTE(review): fixed name while autoSelect is true above -- confirm no other model should match this runtime
+        - --reasoning-parser
+        - nemotron_v3
+        - --enable-auto-tool-choice
+        - --tool-call-parser
+        - qwen3_coder
+        - --chat-template
+        - $(MODEL_PATH)/chat_template.jinja  # template file expected inside the model directory
+      env:
+        - name: VLLM_ENGINE_READY_TIMEOUT_S
+          value: '3600'
+        - name: VLLM_LOGGING_LEVEL
+          value: 'INFO'
+        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+          value: '1'
+      volumeMounts:
+        - mountPath: /dev/shm
+          name: dshm
+      resources:
+        requests:  # requests equal limits for cpu, memory and GPU
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+        limits:
+          cpu: 64
+          memory: 256Gi
+          nvidia.com/gpu: 4
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 10
+        successThreshold: 1
+        periodSeconds: 90
+        timeoutSeconds: 60
+      livenessProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 5
+        successThreshold: 1
+        periodSeconds: 60
+        timeoutSeconds: 60
+      startupProbe:
+        httpGet:
+          path: /health
+          port: 8080
+        failureThreshold: 190  # 190 failures x 6s period = 1140s (19 min) after the 60s initial delay
+        successThreshold: 1
+        periodSeconds: 6
+        initialDelaySeconds: 60
+        timeoutSeconds: 30  # NOTE(review): longer than periodSeconds (6) -- confirm this is intended
diff --git a/config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml b/config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml
new file mode 100644
index 000000000..56a607e88
--- /dev/null
+++ b/config/samples/isvc/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8.yaml
@@ -0,0 +1,16 @@
+apiVersion: ome.io/v1beta1
+kind: InferenceService
+metadata:
+  name: nvidia-nemotron-3-super-120b-a12b-fp8
+  namespace: nvidia-nemotron-3-super-120b-a12b-fp8  # NOTE(review): this sample ships no Namespace manifest -- it must already exist
+spec:
+  model:
+    name: nvidia-nemotron-3-super-120b-a12b-fp8  # the ClusterBaseModel added by this patch
+  runtime:
+    name: vllm-nvidia-nemotron-3-super-120b-a12b-fp8  # the ClusterServingRuntime added by this patch
+  engine:
+    minReplicas: 1
+    maxReplicas: 1
+  router:
+    minReplicas: 1
+    maxReplicas: 1