Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions examples/online_serving/qwen3_omni/qwen3_omni_moe_thinking.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Stage config for running Qwen3-Omni-MoE-Thinking (text-only output)
# This config is for models like Qwen3-Omni-30B-A3B-Thinking that only have the
# thinker component and do not support audio output.
#
# Single stage: Thinker (multimodal understanding + text generation)

# The following config has been verified on 2x H100-80G GPUs.
stage_args:
- stage_id: 0
runtime:
devices: "0,1"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
scheduler_cls: vllm_omni.core.sched.scheduler.OmniScheduler
gpu_memory_utilization: 0.9
enforce_eager: true
trust_remote_code: true
engine_output_type: text
distributed_executor_backend: "mp"
enable_prefix_caching: false
hf_config_name: thinker_config
tensor_parallel_size: 2
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 2048
seed: 42
detokenize: true
repetition_penalty: 1.05