
Commit b6afec7

[Test] Add accuracy nightly test for new models (#4262)
### What this PR does / why we need it?
Add accuracy nightly test for new models:
- PaddlePaddle/ERNIE-4.5-21B-A3B-PT
- LLM-Research/Molmo-7B-D-0924
- LLM-Research/gemma-2-9b-it
- LLM-Research/gemma-3-4b-it
- Shanghai_AI_Laboratory/internlm-7b
- llava-hf/llava-1.5-7b-hf

vLLM version: v0.11.2

Signed-off-by: hfadzxy <[email protected]>
1 parent 8e7f5cf commit b6afec7

11 files changed (+97, -4 lines)

.github/workflows/_e2e_nightly_single_node_models.yaml

Lines changed: 8 additions & 1 deletion
```diff
@@ -59,7 +59,7 @@ jobs:
     name: ${{inputs.model_list}} accuracy test
     runs-on: ${{ inputs.runner }}
     container:
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11
+      image: "${{ inputs.image }}"
       env:
         VLLM_USE_MODELSCOPE: True
         GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }}
@@ -111,6 +111,12 @@ jobs:
           . /usr/local/Ascend/ascend-toolkit/8.3.RC2/bisheng_toolkit/set_env.sh
           python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl"
 
+      - name: Install tensorflow (for Molmo-7B-D-0924)
+        if: ${{ inputs.runner == 'linux-aarch64-a2-1' && contains(inputs.model_list, 'Molmo-7B-D-0924') }}
+        shell: bash -l {0}
+        run: |
+          pip install tensorflow --no-cache-dir
+
       - name: Resolve vllm-ascend version
         run: |
           VERSION_INPUT="${{ inputs.vllm-ascend }}"
@@ -172,6 +178,7 @@ jobs:
         id: report
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
+          HF_DATASETS_OFFLINE: True
           VLLM_USE_MODELSCOPE: True
           VLLM_CI_RUNNER: ${{ inputs.runner }}
           VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
```
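
The report step now runs with HF_DATASETS_OFFLINE set, so the benchmark datasets have to be in the runner's local Hugging Face cache already; offline mode only reads what is cached. As a minimal sketch (the dataset ID below is an assumption for the gsm8k task, not something this diff pins down), warming that cache could look like:

```bash
# Illustrative only: cache the task data while the machine still has network
# access, then flip to offline mode the way the workflow env does.
python3 -c 'from datasets import load_dataset; load_dataset("gsm8k", "main")'
export HF_DATASETS_OFFLINE=1   # later load_dataset() calls read only the local cache
```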

.github/workflows/vllm_ascend_test_nightly_a2.yaml

Lines changed: 10 additions & 1 deletion
```diff
@@ -114,6 +114,15 @@ jobs:
             - Qwen3-VL-8B-Instruct
             - Qwen2.5-Omni-7B
             - Meta-Llama-3.1-8B-Instruct
+        - os: linux-aarch64-a2-1
+          model_list:
+            - ERNIE-4.5-21B-A3B-PT
+            - gemma-2-9b-it
+            - gemma-3-4b-it
+            - internlm-7b
+            - InternVL3_5-8B-hf
+            - llava-1.5-7b-hf
+            - Molmo-7B-D-0924
         - os: linux-aarch64-a2-2
           model_list:
             - Qwen3-30B-A3B
@@ -128,5 +137,5 @@ jobs:
       vllm: v0.11.2
       runner: ${{ matrix.test_config.os }}
       model_list: ${{ toJson(matrix.test_config.model_list) }}
-      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-910b-ubuntu22.04-py3.11'
       upload: false
```

tests/e2e/models/configs/ERNIE-4.5-21B-A3B-PT.yaml

Lines changed: 9 additions & 0 deletions (new file)

```yaml
model_name: "PaddlePaddle/ERNIE-4.5-21B-A3B-PT"
hardware: "Atlas A2 Series"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,flexible-extract"
        value: 0.71
num_fewshot: 5
trust_remote_code: True
```
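
The fields in these per-model YAML configs mirror lm-evaluation-harness options (tasks, num_fewshot, trust_remote_code, plus the vLLM engine arguments further below). The harness entry point used by the nightly job is not part of this diff; as a hedged illustration only, a roughly equivalent manual run on lm-eval's vLLM backend might look like:

```bash
# Illustrative sketch, not the CI harness invocation: evaluate the ERNIE config
# by hand with lm-evaluation-harness on the vLLM backend. The model ID is the
# one from the config (ModelScope namespace; the workflow sets VLLM_USE_MODELSCOPE).
lm_eval --model vllm \
  --model_args pretrained=PaddlePaddle/ERNIE-4.5-21B-A3B-PT,trust_remote_code=True \
  --tasks gsm8k \
  --num_fewshot 5 \
  --batch_size auto
```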

tests/e2e/models/configs/Molmo-7B-D-0924.yaml

Lines changed: 13 additions & 0 deletions (new file)

```yaml
model_name: "LLM-Research/Molmo-7B-D-0924"
hardware: "Atlas A2 Series"
model: "vllm-vlm"
tasks:
  - name: "ceval-valid"
    metrics:
      - name: "acc,none"
        value: 0.71
max_model_len: 4096
trust_remote_code: True
apply_chat_template: False
fewshot_as_multiturn: False
gpu_memory_utilization: 0.8
```
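
The model: "vllm-vlm" field in the Molmo and llava configs matches the name of lm-eval's multimodal vLLM backend, with max_model_len and gpu_memory_utilization passed through as engine arguments. A comparable hand-run sketch, again an assumption about how the config maps rather than the harness's actual command:

```bash
# Illustrative sketch mirroring the Molmo-7B-D-0924 config on lm-eval's
# multimodal vLLM backend ("vllm-vlm").
lm_eval --model vllm-vlm \
  --model_args pretrained=LLM-Research/Molmo-7B-D-0924,trust_remote_code=True,max_model_len=4096,gpu_memory_utilization=0.8 \
  --tasks ceval-valid \
  --batch_size 1
```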

tests/e2e/models/configs/accuracy.txt

Lines changed: 7 additions & 1 deletion
```diff
@@ -9,4 +9,10 @@ Qwen3-VL-30B-A3B-Instruct.yaml
 Qwen3-VL-8B-Instruct.yaml
 Qwen2.5-Omni-7B.yaml
 Meta-Llama-3.1-8B-Instruct.yaml
-InternVL3_5-8B.yaml
+InternVL3_5-8B.yaml
+ERNIE-4.5-21B-A3B-PT.yaml
+gemma-2-9b-it.yaml
+gemma-3-4b-it.yaml
+internlm-7b.yaml
+Molmo-7B-D-0924.yaml
+llava-1.5-7b-hf.yaml
```
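
accuracy.txt lists the per-model config files for the accuracy suite, so every name added here should have a matching YAML file in the same directory (the files added in this commit). A small hypothetical consistency check, not part of this PR:

```bash
# Hypothetical check: flag any config listed in accuracy.txt that is missing
# from tests/e2e/models/configs/.
cd tests/e2e/models/configs
while IFS= read -r cfg; do
  [ -f "$cfg" ] || echo "missing config: $cfg"
done < accuracy.txt
```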

tests/e2e/models/configs/gemma-2-9b-it.yaml

Lines changed: 11 additions & 0 deletions (new file)

```yaml
model_name: "LLM-Research/gemma-2-9b-it"
hardware: "Atlas A2 Series"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.46
      - name: "exact_match,flexible-extract"
        value: 0.79
num_fewshot: 5
gpu_memory_utilization: 0.8
```

tests/e2e/models/configs/gemma-3-4b-it.yaml

Lines changed: 13 additions & 0 deletions (new file)

```yaml
model_name: "LLM-Research/gemma-3-4b-it"
hardware: "Atlas A2 Series"
tasks:
  - name: "gsm8k"
    metrics:
      - name: "exact_match,strict-match"
        value: 0.59
      - name: "exact_match,flexible-extract"
        value: 0.59
num_fewshot: 5
apply_chat_template: False
fewshot_as_multiturn: False
gpu_memory_utilization: 0.7
```

tests/e2e/models/configs/internlm-7b.yaml

Lines changed: 13 additions & 0 deletions (new file)

```yaml
model_name: "Shanghai_AI_Laboratory/internlm-7b"
hardware: "Atlas A2 Series"
tasks:
  - name: "ceval-valid"
    metrics:
      - name: "acc,none"
        value: 0.42
num_fewshot: 5
max_model_len: 2048
trust_remote_code: True
dtype: "bfloat16"
apply_chat_template: False
fewshot_as_multiturn: False
```

tests/e2e/models/configs/llava-1.5-7b-hf.yaml

Lines changed: 11 additions & 0 deletions (new file)

```yaml
model_name: "llava-hf/llava-1.5-7b-hf"
hardware: "Atlas A2 Series"
model: "vllm-vlm"
tasks:
  - name: "ceval-valid"
    metrics:
      - name: "acc,none"
        value: 0.30
trust_remote_code: True
gpu_memory_utilization: 0.8
dtype: "bfloat16"
```
