Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/models/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ resources:
- nvidia/Llama-3_1-Nemotron-Ultra-253B-v1.yaml
- nvidia/Llama-3_3-Nemotron-Super-49B-v1.yaml
- nvidia/Llama-3.1-Nemotron-Nano-8B-v1.yaml
- nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml
- nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16.yaml
- nvidia/NVIDIA-Nemotron-Nano-9B-v2.yaml

Expand Down
22 changes: 22 additions & 0 deletions config/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
---
# Cluster-scoped base-model registration for the FP8 checkpoint of
# NVIDIA-Nemotron-3-Super-120B-A12B.
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
  name: nvidia-nemotron-3-super-120b-a12b-fp8
spec:
  modelCapabilities:
    - TEXT_TO_TEXT
  vendor: nvidia
  disabled: false
  version: "1.0.0"
  displayName: nvidia.NVIDIA-Nemotron-3-Super-120B-A12B-FP8

  # Format / framework / architecture triple; the serving runtime for this
  # model lists the same values under its supportedModelFormats.
  modelFormat:
    name: safetensors
    version: "1.0.0"
  modelFramework:
    name: transformers
    version: "4.57.6"
  modelArchitecture: NemotronHForCausalLM

  storage:
    # Source repository (hf:// scheme) and the local path the weights are
    # stored under.
    storageUri: hf://nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
    path: /raid/models/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
    # NOTE(review): presumably the name of the secret holding the Hugging Face
    # access token - confirm it exists in the target cluster.
    key: "hf-token"
1 change: 1 addition & 0 deletions config/runtimes/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ resources:
- vllm/mixtral-8x7b-instruct-rt.yaml
- vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
- vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
- vllm/nvidia/nvidia-nemotron-3-super-120b-a12b-fp8-rt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
---
# Serving runtime that pairs a router container with a vLLM engine container
# for NemotronHForCausalLM safetensors models.
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
  name: vllm-nvidia-nemotron-3-super-120b-a12b-fp8
spec:
  disabled: false

  # ---------------------------------------------------------------- router --
  routerConfig:
    annotations:
      prometheus.io/path: /metrics
      # NOTE(review): 29000 is not among the containerPorts declared below
      # (only 8080 is); presumably the router's own metrics listener - confirm.
      prometheus.io/port: "29000"
      prometheus.io/scrape: "true"
    labels:
      logging-forward: enabled
    runner:
      name: router
      image: docker.io/lightseekorg/smg:1.4.1
      ports:
        - containerPort: 8080
          name: http
      resources:
        limits:
          cpu: "8"
          memory: 16Gi
      args:
        - launch
        - --host
        - 0.0.0.0
        - --port
        - "8080"
        # Service discovery: $(NAMESPACE) and $(INFERENCESERVICE_NAME) refer
        # to the env entries defined under `env` below.
        - --service-discovery
        - --service-discovery-namespace
        - $(NAMESPACE)
        - --service-discovery-port
        - "8080"
        - --selector
        - component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
        - --request-id-headers
        - opc-request-id
        - --log-json
      env:
        # Both values come from the pod's own metadata (downward API).
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: INFERENCESERVICE_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.labels['ome.io/inferenceservice']
        # NOTE(review): a vLLM-named variable on the router container; it may
        # be a carry-over from the engine section - confirm it is needed here.
        - name: VLLM_LOGGING_LEVEL
          value: "INFO"
      readinessProbe:
        httpGet:
          path: /readiness
          port: 8080
        failureThreshold: 10
        periodSeconds: 30
        timeoutSeconds: 10
      livenessProbe:
        httpGet:
          path: /liveness
          port: 8080
        failureThreshold: 5
        periodSeconds: 30
        timeoutSeconds: 10
      startupProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 5
        periodSeconds: 20
        timeoutSeconds: 10
        initialDelaySeconds: 30

  # ------------------------------------------------------- model matching --
  # (A PR review thread attached at this point was marked resolved by TJ5.)
  # Two entries for the same framework/format/architecture: the first also
  # pins quantization to fp8, the second leaves quantization unspecified.
  supportedModelFormats:
    - modelFramework:
        name: transformers
        version: "4.57.6"
      modelFormat:
        name: safetensors
        version: "1.0.0"
      modelArchitecture: NemotronHForCausalLM
      quantization: fp8
      autoSelect: true
      priority: 1
    - modelFramework:
        name: transformers
        version: "4.57.6"
      modelFormat:
        name: safetensors
        version: "1.0.0"
      modelArchitecture: NemotronHForCausalLM
      autoSelect: true
      priority: 1
  modelSizeRange:
    min: 115B
    max: 125B
  protocolVersions:
    - openAI

  # ---------------------------------------------------------------- engine --
  engineConfig:
    annotations:
      prometheus.io/scrape: "true"
      prometheus.io/port: "8080"
      prometheus.io/path: "/metrics"
    labels:
      logging-forward: enabled
    tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
    # Hard scheduling requirement: only nodes whose instance-type label is
    # BM.GPU.H100.8 are eligible.
    affinity:
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: node.kubernetes.io/instance-type
                  operator: In
                  values:
                    - BM.GPU.H100.8
    volumes:
      # Memory-backed emptyDir, mounted at /dev/shm by the runner below.
      - name: dshm
        emptyDir:
          medium: Memory
    runner:
      name: ome-container
      image: docker.io/vllm/vllm-openai:v0.20.0
      ports:
        - containerPort: 8080
          name: http1
          protocol: TCP
      command:
        - vllm
        - serve
      args:
        - $(MODEL_PATH)
        - --host
        - 0.0.0.0
        - --port
        - "8080"
        - --async-scheduling
        - --dtype
        - auto
        - --kv-cache-dtype
        - fp8
        # Tensor-parallel degree 4 equals the nvidia.com/gpu count requested
        # in `resources` below; pipeline and data parallelism are both 1.
        - --tensor-parallel-size
        - "4"
        - --pipeline-parallel-size
        - "1"
        - --data-parallel-size
        - "1"
        # (A PR review thread attached to this value was marked resolved by
        # TJ5.)
        - --max-model-len
        - "1048576"
        - --enable-expert-parallel
        - --trust-remote-code
        - --gpu-memory-utilization
        - "0.9"
        - --max-cudagraph-capture-size
        - "128"
        - --enable-chunked-prefill
        - --mamba-ssm-cache-dtype
        - float32
        - --served-model-name
        - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8
        - --reasoning-parser
        - nemotron_v3
        - --enable-auto-tool-choice
        - --tool-call-parser
        - qwen3_coder
        - --chat-template
        - $(MODEL_PATH)/chat_template.jinja
      env:
        - name: VLLM_ENGINE_READY_TIMEOUT_S
          value: "3600"
        - name: VLLM_LOGGING_LEVEL
          value: "INFO"
        # NOTE(review): presumably needed for vLLM to accept the
        # --max-model-len value above - confirm.
        - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
          value: "1"
      volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      # Requests equal limits for every resource.
      resources:
        requests:
          cpu: 64
          memory: 256Gi
          nvidia.com/gpu: 4
        limits:
          cpu: 64
          memory: 256Gi
          nvidia.com/gpu: 4
      readinessProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 10
        successThreshold: 1
        periodSeconds: 90
        timeoutSeconds: 60
      livenessProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 5
        successThreshold: 1
        periodSeconds: 60
        timeoutSeconds: 60
      # Startup budget: 60 s initial delay + 190 failures x 6 s period, i.e.
      # roughly 20 minutes.
      # NOTE(review): that is shorter than VLLM_ENGINE_READY_TIMEOUT_S (3600 s)
      # set above - confirm the probe budget covers worst-case model load.
      startupProbe:
        httpGet:
          path: /health
          port: 8080
        failureThreshold: 190
        successThreshold: 1
        periodSeconds: 6
        initialDelaySeconds: 60
        timeoutSeconds: 30
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
# InferenceService binding the base model to its vLLM serving runtime, with
# one engine replica and one router replica (min == max for both).
apiVersion: ome.io/v1beta1
kind: InferenceService
metadata:
  name: nvidia-nemotron-3-super-120b-a12b-fp8
  namespace: nvidia-nemotron-3-super-120b-a12b-fp8
spec:
  # Names of the model and runtime resources this service uses.
  model:
    name: nvidia-nemotron-3-super-120b-a12b-fp8
  runtime:
    name: vllm-nvidia-nemotron-3-super-120b-a12b-fp8

  engine:
    minReplicas: 1
    maxReplicas: 1
  router:
    minReplicas: 1
    maxReplicas: 1