From 6003d666ddaf62d74e7c813bc0d14cc3a31bc03f Mon Sep 17 00:00:00 2001 From: Huamin Li <3ericli@gmail.com> Date: Fri, 17 Oct 2025 10:15:51 -0700 Subject: [PATCH] add eval config for Qwen3-235B-A22B-Thinking-2507-FP8 Signed-off-by: Huamin Li <3ericli@gmail.com> --- .../configs/Qwen3-235B-A22B-Thinking-2507-FP8.yaml | 10 ++++++++++ .buildkite/lm-eval-harness/configs/Qwen3-8B.yaml | 10 ++++++++++ .../lm-eval-harness/configs/models-large-h100.txt | 1 + .buildkite/lm-eval-harness/configs/models-small.txt | 2 +- .../lm-eval-harness/test_lm_eval_correctness.py | 1 + .buildkite/test-pipeline.yaml | 13 ++++++++++++- 6 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Thinking-2507-FP8.yaml create mode 100644 .buildkite/lm-eval-harness/configs/Qwen3-8B.yaml diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Thinking-2507-FP8.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Thinking-2507-FP8.yaml new file mode 100644 index 000000000000..42a20536d811 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Thinking-2507-FP8.yaml @@ -0,0 +1,10 @@ +model_name: "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8" +tasks: + - name: "mmlu_pro" + metrics: + - name: "exact_match,custom-extract" + value: 0.77 +num_fewshot: 5 +limit: 250 # will run on 250 * 14 subjects = 3500 samples +max_model_len: 8096 +gen_kwargs: "top_p=1,top_k=0,max_gen_toks=1536" diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-8B.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-8B.yaml new file mode 100644 index 000000000000..7a7b128e0b75 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Qwen3-8B.yaml @@ -0,0 +1,10 @@ +model_name: "Qwen/Qwen3-8B" +tasks: + - name: "mmlu_pro" + metrics: + - name: "exact_match,custom-extract" + value: 0.60 +num_fewshot: 5 +limit: 250 # will run on 250 * 14 subjects = 3500 samples +max_model_len: 8096 +gen_kwargs: "top_p=1,top_k=0,max_gen_toks=1536" diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt index 4fb0b84bc4d8..81381e47134e 100644 --- a/.buildkite/lm-eval-harness/configs/models-large-h100.txt +++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt @@ -1 +1,2 @@ Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml +Qwen3-235B-A22B-Thinking-2507-FP8.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 36e0543879b3..5b87af9fe18f 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -1,6 +1,6 @@ -Qwen2.5-1.5B-Instruct.yaml Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml Qwen1.5-MoE-W4A16-compressed-tensors.yaml +Qwen3-8B.yaml diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index f10de82b1d8e..d2ffcb5a2039 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -40,6 +40,7 @@ def launch_lm_eval(eval_config, tp_size): # existing text models in CI, so only apply it for mm. apply_chat_template=backend == "vllm-vlm", batch_size=batch_size, + gen_kwargs=eval_config.get("gen_kwargs", None), ) return results diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a28e333eac69..330a78f7cd9e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1089,7 +1089,7 @@ steps: - tests/weight_loading commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - + - label: NixlConnector PD accuracy tests (Distributed) # 30min timeout_in_minutes: 30 working_dir: "/vllm-workspace/tests" @@ -1145,6 +1145,17 @@ steps: - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 +- label: LM Eval Large Models (H200) # optional + gpu: h200 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-h100.txt --tp-size=4 + ##### B200 test ##### - label: Distributed Tests (B200) # optional gpu: b200