
Commit b6afec7

[Test] Add accuracy nightly test for new models (#4262)
### What this PR does / why we need it?
Add accuracy nightly test for new models:
- PaddlePaddle/ERNIE-4.5-21B-A3B-PT
- LLM-Research/Molmo-7B-D-0924
- LLM-Research/gemma-2-9b-it
- LLM-Research/gemma-3-4b-it
- Shanghai_AI_Laboratory/internlm-7b
- llava-hf/llava-1.5-7b-hf

vLLM version: v0.11.2

Signed-off-by: hfadzxy <[email protected]>
1 parent 8e7f5cf commit b6afec7

11 files changed (+97, -4 lines)

.github/workflows/_e2e_nightly_single_node_models.yaml

Lines changed: 8 additions & 1 deletion
```diff
@@ -59,7 +59,7 @@ jobs:
     name: ${{inputs.model_list}} accuracy test
     runs-on: ${{ inputs.runner }}
     container:
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
+      image: "${{ inputs.image }}"
       env:
         VLLM_USE_MODELSCOPE: True
         GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }}
@@ -111,6 +111,12 @@ jobs:
           . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
           python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
 
+      - name: Install tensorflow (for Molmo-7B-D-0924)
+        if: ${{ inputs.runner == 'linux-aarch64-a2-1' && contains(inputs.model_list, 'Molmo-7B-D-0924') }}
+        shell: bash -l {0}
+        run: |
+          pip install tensorflow --no-cache-dir
+
       - name: Resolve vllm-ascend version
         run: |
           VERSION_INPUT="${{ inputs.vllm-ascend }}"
@@ -172,6 +178,7 @@ jobs:
         id: report
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_DATASETS_OFFLINE: True
           VLLM_USE_MODELSCOPE: True
           VLLM_CI_RUNNER: ${{ inputs.runner }}
           VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
```
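
The report step now runs with HF_DATASETS_OFFLINE set, so the benchmark datasets have to be in the runner's local Hugging Face cache already; offline mode only reads what is cached. As a minimal sketch (the dataset ID below is an assumption for the gsm8k task, not something this diff pins down), warming that cache could look like:

```bash
# Illustrative only: cache the task data while the machine still has network
# access, then flip to offline mode the way the workflow env does.
python3 -c 'from datasets import load_dataset; load_dataset("gsm8k", "main")'
export HF_DATASETS_OFFLINE=1   # later load_dataset() calls read only the local cache
```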

.github/workflows/vllm_ascend_test_nightly_a2.yaml

Lines changed: 10 additions & 1 deletion
```diff
@@ -114,6 +114,15 @@ jobs:
             - Qwen3-VL-8B-Instruct
             - Qwen2.5-Omni-7B
             - Meta-Llama-3.1-8B-Instruct
+        - os: linux-aarch64-a2-1
+          model_list:
+            - ERNIE-4.5-21B-A3B-PT
+            - gemma-2-9b-it
+            - gemma-3-4b-it
+            - internlm-7b
+            - InternVL3_5-8B-hf
+            - llava-1.5-7b-hf
+            - Molmo-7B-D-0924
         - os: linux-aarch64-a2-2
           model_list:
             - Qwen3-30B-A3B
@@ -128,5 +137,5 @@ jobs:
       vllm: v0.11.2
       runner: ${{ matrix.test_config.os }}
       model_list: ${{ toJson(matrix.test_config.model_list) }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
       upload: false
```

tests/e2e/models/configs/ERNIE-4.5-21B-A3B-PT.yaml

Lines changed: 9 additions & 0 deletions (new file)

```yaml
model_name: "PaddlePaddle/ERNIE-4.5-21B-A3B-PT"
hardware: "Atlas A2 Series"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,flexible-extract"
        value: 0.71
num_fewshot: 5
trust_remote_code: True
```
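
The fields in these per-model YAML configs mirror lm-evaluation-harness options (tasks, num_fewshot, trust_remote_code, plus the vLLM engine arguments further below). The harness entry point used by the nightly job is not part of this diff; as a hedged illustration only, a roughly equivalent manual run on lm-eval's vLLM backend might look like:

```bash
# Illustrative sketch, not the CI harness invocation: evaluate the ERNIE config
# by hand with lm-evaluation-harness on the vLLM backend. The model ID is the
# one from the config (ModelScope namespace; the workflow sets VLLM_USE_MODELSCOPE).
lm_eval --model vllm \
  --model_args pretrained=PaddlePaddle/ERNIE-4.5-21B-A3B-PT,trust_remote_code=True \
  --tasks gsm8k \
  --num_fewshot 5 \
  --batch_size auto
```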

tests/e2e/models/configs/Molmo-7B-D-0924.yaml

Lines changed: 13 additions & 0 deletions (new file)

```yaml
model_name: "LLM-Research/Molmo-7B-D-0924"
hardware: "Atlas A2 Series"
model: "vllm-vlm"
tasks:
  - name: "ceval-valid"
    metrics:
      - name: "acc,none"
        value: 0.71
max_model_len: 4096
trust_remote_code: True
apply_chat_template: False
fewshot_as_multiturn: False
gpu_memory_utilization: 0.8
```
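
The model: "vllm-vlm" field in the Molmo and llava configs matches the name of lm-eval's multimodal vLLM backend, with max_model_len and gpu_memory_utilization passed through as engine arguments. A comparable hand-run sketch, again an assumption about how the config maps rather than the harness's actual command:

```bash
# Illustrative sketch mirroring the Molmo-7B-D-0924 config on lm-eval's
# multimodal vLLM backend ("vllm-vlm").
lm_eval --model vllm-vlm \
  --model_args pretrained=LLM-Research/Molmo-7B-D-0924,trust_remote_code=True,max_model_len=4096,gpu_memory_utilization=0.8 \
  --tasks ceval-valid \
  --batch_size 1
```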

tests/e2e/models/configs/accuracy.txt

Lines changed: 7 additions & 1 deletion
```diff
@@ -9,4 +9,10 @@ Qwen3-VL-30B-A3B-Instruct.yaml
 Qwen3-VL-8B-Instruct.yaml
 Qwen2.5-Omni-7B.yaml
 Meta-Llama-3.1-8B-Instruct.yaml
-InternVL3_5-8B.yaml
+InternVL3_5-8B.yaml
+ERNIE-4.5-21B-A3B-PT.yaml
+gemma-2-9b-it.yaml
+gemma-3-4b-it.yaml
+internlm-7b.yaml
+Molmo-7B-D-0924.yaml
+llava-1.5-7b-hf.yaml
```
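
accuracy.txt lists the per-model config files for the accuracy suite, so every name added here should have a matching YAML file in the same directory (the files added in this commit). A small hypothetical consistency check, not part of this PR:

```bash
# Hypothetical check: flag any config listed in accuracy.txt that is missing
# from tests/e2e/models/configs/.
cd tests/e2e/models/configs
while IFS= read -r cfg; do
  [ -f "$cfg" ] || echo "missing config: $cfg"
done < accuracy.txt
```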

tests/e2e/models/configs/gemma-2-9b-it.yaml

Lines changed: 11 additions & 0 deletions (new file)

```yaml
model_name: "LLM-Research/gemma-2-9b-it"
hardware: "Atlas A2 Series"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.46
      - name: "exact_match,flexible-extract"
        value: 0.79
num_fewshot: 5
gpu_memory_utilization: 0.8
```

tests/e2e/models/configs/gemma-3-4b-it.yaml

Lines changed: 13 additions & 0 deletions (new file)

```yaml
model_name: "LLM-Research/gemma-3-4b-it"
hardware: "Atlas A2 Series"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.59
      - name: "exact_match,flexible-extract"
        value: 0.59
num_fewshot: 5
apply_chat_template: False
fewshot_as_multiturn: False
gpu_memory_utilization: 0.7
```

tests/e2e/models/configs/internlm-7b.yaml

Lines changed: 13 additions & 0 deletions (new file)

```yaml
model_name: "Shanghai_AI_Laboratory/internlm-7b"
hardware: "Atlas A2 Series"
tasks:
  - name: "ceval-valid"
    metrics:
      - name: "acc,none"
        value: 0.42
num_fewshot: 5
max_model_len: 2048
trust_remote_code: True
dtype: "bfloat16"
apply_chat_template: False
fewshot_as_multiturn: False
```

tests/e2e/models/configs/llava-1.5-7b-hf.yaml

Lines changed: 11 additions & 0 deletions (new file)

```yaml
model_name: "llava-hf/llava-1.5-7b-hf"
hardware: "Atlas A2 Series"
model: "vllm-vlm"
tasks:
  - name: "ceval-valid"
    metrics:
      - name: "acc,none"
        value: 0.30
trust_remote_code: True
gpu_memory_utilization: 0.8
dtype: "bfloat16"
```
