diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Thinking-2507-FP8.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Thinking-2507-FP8.yaml
new file mode 100644
index 000000000000..42a20536d811
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Thinking-2507-FP8.yaml
@@ -0,0 +1,10 @@
+model_name: "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.77
+num_fewshot: 5
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+max_model_len: 8096
+gen_kwargs: "top_p=1,top_k=0,max_gen_toks=1536"
diff --git a/.buildkite/lm-eval-harness/configs/Qwen3-8B.yaml b/.buildkite/lm-eval-harness/configs/Qwen3-8B.yaml
new file mode 100644
index 000000000000..7a7b128e0b75
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen3-8B.yaml
@@ -0,0 +1,10 @@
+model_name: "Qwen/Qwen3-8B"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.60
+num_fewshot: 5
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+max_model_len: 8096
+gen_kwargs: "top_p=1,top_k=0,max_gen_toks=1536"
diff --git a/.buildkite/lm-eval-harness/configs/models-large-h100.txt b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
index 4fb0b84bc4d8..81381e47134e 100644
--- a/.buildkite/lm-eval-harness/configs/models-large-h100.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large-h100.txt
@@ -1 +1,2 @@
 Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+Qwen3-235B-A22B-Thinking-2507-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt
index 36e0543879b3..5b87af9fe18f 100644
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -1,6 +1,6 @@
-Qwen2.5-1.5B-Instruct.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
 Qwen1.5-MoE-W4A16-compressed-tensors.yaml
+Qwen3-8B.yaml
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index f10de82b1d8e..d2ffcb5a2039 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -40,6 +40,7 @@ def launch_lm_eval(eval_config, tp_size):
         # existing text models in CI, so only apply it for mm.
         apply_chat_template=backend == "vllm-vlm",
         batch_size=batch_size,
+        gen_kwargs=eval_config.get("gen_kwargs", None),
     )
     return results
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 984e2108f88e..9b9a276eff03 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1096,7 +1096,7 @@ steps:
   - tests/weight_loading
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
-
+
 - label: NixlConnector PD accuracy tests (Distributed) # 30min
   timeout_in_minutes: 30
   working_dir: "/vllm-workspace/tests"
@@ -1152,6 +1152,17 @@ steps:
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

+- label: LM Eval Large Models (H200) # optional
+  gpu: h200
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-h100.txt --tp-size=4
+
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
   gpu: b200
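
For reference, a minimal sketch of how the new `gen_kwargs` key travels from one of the YAML configs above into lm-eval. This is an illustration under assumptions, not the harness itself: the real `launch_lm_eval` in test_lm_eval_correctness.py builds a richer `model_args` string (tensor parallel size, dtype, backend selection) than shown here.

```python
# Sketch only: mirrors the gen_kwargs plumbing added in this diff.
# The model_args string is simplified relative to launch_lm_eval.
import lm_eval
import yaml

with open(".buildkite/lm-eval-harness/configs/Qwen3-8B.yaml") as f:
    eval_config = yaml.safe_load(f)

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        f"pretrained={eval_config['model_name']},"
        f"max_model_len={eval_config['max_model_len']}"
    ),
    tasks=[task["name"] for task in eval_config["tasks"]],
    num_fewshot=eval_config["num_fewshot"],
    limit=eval_config["limit"],
    # The new line from this diff: configs without a gen_kwargs key
    # fall back to None, so existing YAMLs are unaffected.
    gen_kwargs=eval_config.get("gen_kwargs", None),
)
```

`gen_kwargs` is a comma-separated string ("top_p=1,top_k=0,max_gen_toks=1536") that lm-eval parses into per-request generation parameters; capping `max_gen_toks` bounds how long each MMLU-Pro answer can run, which matters for a thinking model.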
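And a sketch of how the `value` thresholds in the new configs (0.77 for the 235B FP8 model, 0.60 for Qwen3-8B) are consumed: the test compares each measured metric against the expected value with a relative tolerance. The RTOL below is an assumption for illustration; the actual constant lives in test_lm_eval_correctness.py.

```python
import numpy as np

RTOL = 0.08  # assumed; see the RTOL constant in test_lm_eval_correctness.py

def check_task_metrics(eval_config: dict, results: dict) -> None:
    # results is the dict returned by lm_eval.simple_evaluate above.
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            measured = results["results"][task["name"]][metric["name"]]
            expected = metric["value"]
            assert np.isclose(expected, measured, rtol=RTOL), (
                f"{task['name']}/{metric['name']}: "
                f"measured {measured:.3f}, expected {expected:.3f}"
            )
```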