Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions config/models/google/gemma-4-26B-A4B-it.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# ClusterBaseModel for the model stored at hf://google/gemma-4-26B-A4B-it.
# Nesting is explicit: `name` belongs to `metadata`, the capability list and
# vendor/version fields belong to `spec`, and the URI/path pair to `storage`.
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
  name: gemma-4-26b-a4b-it
spec:
  # Capabilities declared for this model.
  modelCapabilities:
    - TEXT_TO_TEXT
    - IMAGE_TEXT_TO_TEXT
    - VIDEO_TEXT_TO_TEXT
  vendor: google
  disabled: false
  # Quoted so the version stays a string.
  version: "1.0.0"
  displayName: google.gemma-4-26b-a4b-it
  storage:
    storageUri: hf://google/gemma-4-26B-A4B-it
    path: /raid/models/google/gemma-4-26B-A4B-it
16 changes: 16 additions & 0 deletions config/models/google/gemma-4-31B-it.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# ClusterBaseModel for the model stored at hf://google/gemma-4-31B-it.
# Nesting is explicit: `name` belongs to `metadata`, the capability list and
# vendor/version fields belong to `spec`, and the URI/path pair to `storage`.
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
  name: gemma-4-31b-it
spec:
  # Capabilities declared for this model.
  modelCapabilities:
    - TEXT_TO_TEXT
    - IMAGE_TEXT_TO_TEXT
    - VIDEO_TEXT_TO_TEXT
  vendor: google
  disabled: false
  # Quoted so the version stays a string.
  version: "1.0.0"
  displayName: google.gemma-4-31b-it
  storage:
    storageUri: hf://google/gemma-4-31B-it
    path: /raid/models/google/gemma-4-31B-it
17 changes: 17 additions & 0 deletions config/models/google/gemma-4-E2B-it.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# ClusterBaseModel for the model stored at hf://google/gemma-4-E2B-it.
# Nesting is explicit: `name` belongs to `metadata`, the capability list and
# vendor/version fields belong to `spec`, and the URI/path pair to `storage`.
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
  name: gemma-4-e2b-it
spec:
  # Capabilities declared for this model, including audio input.
  modelCapabilities:
    - TEXT_TO_TEXT
    - IMAGE_TEXT_TO_TEXT
    - VIDEO_TEXT_TO_TEXT
    - AUDIO_TEXT_TO_TEXT
  vendor: google
  disabled: false
  # Quoted so the version stays a string.
  version: "1.0.0"
  displayName: google.gemma-4-e2b-it
  storage:
    storageUri: hf://google/gemma-4-E2B-it
    path: /raid/models/google/gemma-4-E2B-it
17 changes: 17 additions & 0 deletions config/models/google/gemma-4-E4B-it.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# ClusterBaseModel for the model stored at hf://google/gemma-4-E4B-it.
# Nesting is explicit: `name` belongs to `metadata`, the capability list and
# vendor/version fields belong to `spec`, and the URI/path pair to `storage`.
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
  name: gemma-4-e4b-it
spec:
  # Capabilities declared for this model, including audio input.
  modelCapabilities:
    - TEXT_TO_TEXT
    - IMAGE_TEXT_TO_TEXT
    - VIDEO_TEXT_TO_TEXT
    - AUDIO_TEXT_TO_TEXT
  vendor: google
  disabled: false
  # Quoted so the version stays a string.
  version: "1.0.0"
  displayName: google.gemma-4-e4b-it
  storage:
    storageUri: hf://google/gemma-4-E4B-it
    path: /raid/models/google/gemma-4-E4B-it
4 changes: 4 additions & 0 deletions config/models/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ resources:
# google
- google/gemma-3-1b-it.yaml
- google/gemma-3-4b-it.yaml
- google/gemma-4-E2B-it.yaml
- google/gemma-4-E4B-it.yaml
- google/gemma-4-26B-A4B-it.yaml
- google/gemma-4-31B-it.yaml

# HuggingFaceTB
- HuggingFaceTB/SmolLM-1.7B.yaml
Expand Down
2 changes: 2 additions & 0 deletions config/runtimes/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,5 @@ resources:
- vllm/mixtral-8x7b-instruct-rt.yaml
- vllm/deepseek-ai/deepseek-v4-flash-rt.yaml
- vllm/deepseek-ai/deepseek-v4-pro-rt.yaml
- vllm/gemma-4-tp1-rt.yaml
- vllm/gemma-4-tp2-rt.yaml
256 changes: 256 additions & 0 deletions config/runtimes/vllm/gemma-4-tp1-rt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
# ClusterServingRuntime `vllm-gemma-4-tp1` -- first part of the manifest:
# accelerator classes, the supported model format, per-class tensor-parallel
# overrides and the model size range.
# NOTE(review): indentation was lost in this copy (every key is at column 0).
# Nesting must be restored against the ome.io/v1beta1 schema before this can
# be applied.
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
name: vllm-gemma-4-tp1
spec:
disabled: false
# Accelerator classes listed for this runtime: H100, A100-80GB, H200 and B200,
# each in 1/2/4/8 variants.
acceleratorRequirements:
acceleratorClasses:
- nvidia-h100-1
- nvidia-h100-2
- nvidia-h100-4
- nvidia-h100-8
- nvidia-a100-80gb-1
- nvidia-a100-80gb-2
- nvidia-a100-80gb-4
- nvidia-a100-80gb-8
- nvidia-h200-1
- nvidia-h200-2
- nvidia-h200-4
- nvidia-h200-8
- nvidia-b200-1
- nvidia-b200-2
- nvidia-b200-4
- nvidia-b200-8
# Matches safetensors models whose architecture is
# Gemma4ForConditionalGeneration; autoSelect is on with priority 1.
supportedModelFormats:
- modelFramework:
name: transformers
version: "5.5.0.dev0"
modelFormat:
name: safetensors
version: "1.0.0"
modelArchitecture: Gemma4ForConditionalGeneration
autoSelect: true
priority: 1
version: "1.0.0"
# One entry per accelerator class listed above; tensorParallelSize equals the
# numeric suffix of the class name (1, 2, 4 or 8).
# NOTE(review): the engine command later in this manifest hard-codes
# --tensor-parallel-size=1 and requests a single GPU. Presumably these
# overrides replace that value per class -- confirm, and confirm whether the
# "tp1" runtime name is still accurate for the 2/4/8 classes.
acceleratorConfig:
nvidia-h100-1:
tensorParallelismOverride:
tensorParallelSize: 1
nvidia-h100-2:
tensorParallelismOverride:
tensorParallelSize: 2
nvidia-h100-4:
tensorParallelismOverride:
tensorParallelSize: 4
nvidia-h100-8:
tensorParallelismOverride:
tensorParallelSize: 8
nvidia-a100-80gb-1:
tensorParallelismOverride:
tensorParallelSize: 1
nvidia-a100-80gb-2:
tensorParallelismOverride:
tensorParallelSize: 2
nvidia-a100-80gb-4:
tensorParallelismOverride:
tensorParallelSize: 4
nvidia-a100-80gb-8:
tensorParallelismOverride:
tensorParallelSize: 8
nvidia-h200-1:
tensorParallelismOverride:
tensorParallelSize: 1
nvidia-h200-2:
tensorParallelismOverride:
tensorParallelSize: 2
nvidia-h200-4:
tensorParallelismOverride:
tensorParallelSize: 4
nvidia-h200-8:
tensorParallelismOverride:
tensorParallelSize: 8
nvidia-b200-1:
tensorParallelismOverride:
tensorParallelSize: 1
nvidia-b200-2:
tensorParallelismOverride:
tensorParallelSize: 2
nvidia-b200-4:
tensorParallelismOverride:
tensorParallelSize: 4
nvidia-b200-8:
tensorParallelismOverride:
tensorParallelSize: 8
# Size bounds for models served by this runtime: 4.6B to 27.7B.
modelSizeRange:
min: 4.6B
max: 27.7B

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change doesn't follow the existing pattern of creating one runtime per model. Why are we switching to a single runtime shared by multiple models?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All the Gemma 4 models use the same underlying model architecture, so creating one runtime per model would be redundant. Consolidating by TP size also leaves room for future expansion and support if Google releases new Gemma 4 models that use the same architecture. cc @YouNeedCryDear for more context

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct. We are moving away from one runtime per model because it does not scale well. If multiple models in the same family share the same architecture, the same format, and essentially the same engine config, we combine them into a single runtime. Parallelism and engine-argument overrides will be controlled at the AcceleratorClass level. Please let me know if you have any concerns about this, @XinyueZhang369

# Continuation of the `vllm-gemma-4-tp1` runtime manifest: protocol list and
# the start of the router configuration.
protocolVersions:
- openAI
# Router pod settings.
# NOTE(review): the Prometheus annotations point at port 29000, but the only
# containerPort declared for the router below is 8080 -- confirm the router
# actually serves /metrics on 29000.
routerConfig:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "29000"
prometheus.io/path: "/metrics"
labels:
logging-forward: enabled
# Router container: image docker.io/lightseekorg/smg:1.4.1, HTTP on 8080.
runner:
name: router
image: docker.io/lightseekorg/smg:1.4.1
ports:
- containerPort: 8080
name: http
# Limits for the router; the cpu value is quoted so it stays a string.
resources:
limits:
cpu: "1"
Comment thread
ankrovv marked this conversation as resolved.
# Continuation of the router's resource limits (memory), followed by the rest
# of the router container and the engine configuration.
memory: 2Gi
# Router command-line arguments. Host/port are 0.0.0.0:8080; service discovery
# uses $(NAMESPACE), discovery port 8080 (the same number as the engine
# containerPort declared further down) and a selector built from
# $(INFERENCESERVICE_NAME).
args:
- --host
- 0.0.0.0
- --port
- "8080"
- --service-discovery
- --service-discovery-namespace
- $(NAMESPACE)
- --service-discovery-port
- "8080"
- --selector
- component=engine ome.io/inferenceservice=$(INFERENCESERVICE_NAME)
- --enable-igw
- --request-id-headers
- opc-request-id
- --log-json
- --disable-retries
- --disable-circuit-breaker
- --disable-tokenizer-autoload
# NAMESPACE and INFERENCESERVICE_NAME are populated via fieldRef from the
# pod's namespace and its `ome.io/inferenceservice` label; both are referenced
# as $(...) in the args above.
env:
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: INFERENCESERVICE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.labels['ome.io/inferenceservice']
# Router probes, all on port 8080: /readiness, /liveness and /health.
readinessProbe:
httpGet:
path: /readiness
port: 8080
failureThreshold: 5
periodSeconds: 30
timeoutSeconds: 10
livenessProbe:
httpGet:
path: /liveness
port: 8080
failureThreshold: 5
periodSeconds: 30
timeoutSeconds: 10
# Startup probe: failureThreshold 10 with a 20 s period.
startupProbe:
httpGet:
path: /health
port: 8080
failureThreshold: 10
periodSeconds: 20
timeoutSeconds: 10
# Engine pod settings. Prometheus annotations use port 8080, the same port the
# `vllm serve` command below is given via --port.
engineConfig:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
labels:
logging-forward: enabled
# Tolerates any `nvidia.com/gpu` taint with effect NoSchedule.
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
# Memory-backed emptyDir named dshm; mounted at /dev/shm by the engine
# container below.
volumes:
- name: dshm
emptyDir:
medium: Memory
# Hard node affinity on node.kubernetes.io/instance-type.
# NOTE(review): the accelerator class list earlier in this manifest includes
# nvidia-b200-* classes, but no B200 instance type appears in the values
# below -- confirm whether B200 nodes are meant to be schedulable.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node.kubernetes.io/instance-type
operator: In
values:
- BM.GPU.A100-v2.8
- BM.GPU.H100.8
- BM.GPU.H200-NC.8
- BM.GPU.H200.8
# Engine container: serves on 8080 and is started through `/bin/bash -lc --`
# with one literal block-scalar script that runs `vllm serve`. The script is
# followed by VLLM_* env settings (quoted string values) and the /dev/shm
# mount of the dshm volume.
# NOTE(review): the script passes --tensor-parallel-size=1 while the
# acceleratorConfig section earlier in this manifest lists sizes up to 8 --
# confirm the override mechanism rewrites this flag.
runner:
name: ome-container
image: fra.ocir.io/idqj093njucb/official-vllm-openai:v0.19.1-nightly-gemma4
ports:
- containerPort: 8080
name: http1
protocol: TCP
command:
- /bin/bash
- '-lc'
- --
args:
- |
vllm serve \
--port=8080 \
--model="$MODEL_PATH" \
--max-log-len=0 \
--served-model-name=vllm-model \
--tensor-parallel-size=1 \
--max-model-len=-1 \
--gpu-memory-utilization=0.9 \
--enable-auto-tool-choice \
--tool-call-parser=gemma4 \
--reasoning-parser=gemma4 \
--async-scheduling \
--no-scheduler-reserve-full-isl \
--limit-mm-per-prompt '{"image": 10, "audio": 1, "video": 1}'
env:
- name: VLLM_LOGGING_LEVEL
value: "INFO"
- name: VLLM_RPC_TIMEOUT
value: '30000'
- name: VLLM_ENGINE_ITERATION_TIMEOUT_S
value: '120'
volumeMounts:
- mountPath: /dev/shm
name: dshm
# Requests equal limits: 10 CPU, 80Gi memory, 1 GPU.
# NOTE(review): the GPU count is fixed at 1 here -- confirm how it is raised
# for the 2/4/8-GPU accelerator classes.
resources:
requests:
cpu: 10
memory: 80Gi
nvidia.com/gpu: 1
limits:
cpu: 10
memory: 80Gi
nvidia.com/gpu: 1

# Engine probes all call /health on port 8080.
# NOTE(review): readiness timeoutSeconds (200) is larger than periodSeconds
# (60) -- confirm this is intentional.
readinessProbe:
httpGet:
path: /health
port: 8080
failureThreshold: 3
successThreshold: 1
periodSeconds: 60
timeoutSeconds: 200

livenessProbe:
httpGet:
path: /health
port: 8080
failureThreshold: 5
successThreshold: 1
periodSeconds: 60
timeoutSeconds: 60

# Startup probe: 60 s initial delay, then failureThreshold 150 with a 6 s
# period.
startupProbe:
httpGet:
path: /health
port: 8080
failureThreshold: 150
successThreshold: 1
periodSeconds: 6
initialDelaySeconds: 60
timeoutSeconds: 30
Loading