Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions examples/online_serving/qwen3_omni/qwen3_omni_moe_thinking.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Stage config for running Qwen3-Omni-MoE-Thinking (text-only output)
# This config is for models like Qwen3-Omni-30B-A3B-Thinking that only have the
# thinker component and do not support audio output.
#
# Single stage: Thinker (multimodal understanding + text generation)

# The following config has been verified on 2x H100-80G GPUs.
stage_args:
- stage_id: 0
runtime:
devices: "0,1"
max_batch_size: 1
engine_args:
model_stage: thinker
model_arch: Qwen3OmniMoeForConditionalGeneration
worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
scheduler_cls: vllm_omni.core.sched.scheduler.OmniScheduler
gpu_memory_utilization: 0.9
enforce_eager: true
trust_remote_code: true
engine_output_type: text
distributed_executor_backend: "mp"
enable_prefix_caching: false
hf_config_name: thinker_config
tensor_parallel_size: 2
final_output: true
final_output_type: text
is_comprehension: true
default_sampling_params:
temperature: 0.4
top_p: 0.9
top_k: 1
max_tokens: 2048
seed: 42
detokenize: true
repetition_penalty: 1.05