From 57e50b1885a476102ac2e3177c8fa1767e4b9ab9 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <bdellabe@redhat.com>
Date: Mon, 20 Oct 2025 18:22:49 +0000
Subject: [PATCH 1/5] qwen 3 vl with apply_chat_template true

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
---
 tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml    | 11 +++++------
 .../configs/vl_int8_w8a8_dynamic_per_token.yaml       | 11 +++++------
 tests/lmeval/configs/vl_w4a16_actorder_weight.yaml    | 11 +++++------
 tests/lmeval/test_lmeval.py                           |  3 +++
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
index 3fde9d4c65..45c4d2e404 100644
--- a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
+++ b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -1,18 +1,17 @@
 cadence: weekly
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: FP8_DYNAMIC
 recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
   task: mmmu_val_literature
+  apply_chat_template: True
   num_fewshot: 0
   batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  # dense model achieves accuracy of 0.833
   metrics:
-    acc,none: 0.8333
-    acc_stderr,none: 0.0557
+    acc,none: 0.833
diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
index 86c2f0e12f..df4d54e036 100644
--- a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
+++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
@@ -1,6 +1,6 @@
 cadence: "weekly"
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: INT8_dyn_per_token
 recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
 dataset_id: lmms-lab/flickr30k
@@ -9,12 +9,11 @@ lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
   task: mmmu_val_literature
+  apply_chat_template: True
   num_fewshot: 0
   batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  # dense model achieves accuracy of 0.833
   metrics:
-    acc,none: 0.833
-    acc_stderr,none: 0.0557
\ No newline at end of file
+    acc,none: 0.833
\ No newline at end of file
diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
index 37b162b37d..be79fca2f8 100644
--- a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
+++ b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
@@ -1,6 +1,6 @@
 cadence: "weekly"
-model: Qwen/Qwen2.5-VL-7B-Instruct
-model_class: Qwen2_5_VLForConditionalGeneration
+model: Qwen/Qwen3-VL-8B-Instruct
+model_class: Qwen3VLForConditionalGeneration
 scheme: W4A16_actorder_weight
 recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
 dataset_id: lmms-lab/flickr30k
@@ -9,12 +9,11 @@ lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
-    add_bos_token: True
     convert_img_format: True
   task: mmmu_val_literature
+  apply_chat_template: True
   num_fewshot: 0
   batch_size: 8
-  # dense model achieves accuracy of 0.9 +/ 0.0557
+  # dense model achieves accuracy of 0.8333
   metrics:
-    acc,none: 0.8333
-    acc_stderr,none: 0.0557
\ No newline at end of file
+    acc,none: 0.800
\ No newline at end of file
diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py
index a44cd042ff..da99782c68 100644
--- a/tests/lmeval/test_lmeval.py
+++ b/tests/lmeval/test_lmeval.py
@@ -25,6 +25,7 @@ class LmEvalConfig(BaseModel):
     num_fewshot: int = 5
     limit: int = 1000
     batch_size: int = 100
+    apply_chat_template: bool = False
     # Recovery testing (default): compare against base model performance
     # Default threshold is 0.95 (retain ≥95% of base), can be overridden
     recovery_threshold: Union[float, dict] = 0.95
@@ -160,6 +161,7 @@ def _eval_base_model(self):
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
             device="cuda:0",
+            apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
         )
 
@@ -190,6 +192,7 @@ def _run_lm_eval(self):
             num_fewshot=self.lmeval.num_fewshot,
             limit=self.lmeval.limit,
             device="cuda:0",
+            apply_chat_template=self.lmeval.apply_chat_template,
             batch_size=self.lmeval.batch_size,
         )
 

From 1e013537fde1f4d6ce94c77873227b7621244879 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <bdellabe@redhat.com>
Date: Tue, 21 Oct 2025 22:46:56 +0000
Subject: [PATCH 2/5] chartqa p1

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
---
 tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
index 45c4d2e404..cc47857bab 100644
--- a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
+++ b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -8,10 +8,16 @@ lmeval:
   model_args:
     dtype: bfloat16
     convert_img_format: True
-  task: mmmu_val_literature
+  task: chartqa
   apply_chat_template: True
   num_fewshot: 0
-  batch_size: 8
-  # dense model achieves accuracy of 0.833
+  batch_size: 100
+  limit: 100
+  # test runs in 26m
+  # dense model achieves exact_match accuracy of 0.530
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.800
   metrics:
-    acc,none: 0.833
+    exact_match,none: 0.530
+    relaxed_accuracy,none: 0.780
+    anywhere_accuracy,none: 0.810

From 2fc4001de5a978d32bc16004553efb444097c03e Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <bdellabe@redhat.com>
Date: Wed, 22 Oct 2025 15:24:05 +0000
Subject: [PATCH 3/5] broken test

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
---
 .../configs/vl_int8_w8a8_dynamic_per_token.yaml    | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
index df4d54e036..07bd0e33b8 100644
--- a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
+++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
@@ -10,10 +10,16 @@ lmeval:
   model_args:
     dtype: bfloat16
     convert_img_format: True
-  task: mmmu_val_literature
+  task: chartqa
   apply_chat_template: True
   num_fewshot: 0
-  batch_size: 8
-  # dense model achieves accuracy of 0.833
+  batch_size: 100
+  limit: 100
+  # test runs in m
+  # dense model achieves exact_match accuracy of 0.
+  # dense model achieves relaxed_accuracy of 0.
+  # dense model achieves anywhere_accuracy of 0.
   metrics:
-    acc,none: 0.833
\ No newline at end of file
+    exact_match,none: 0.
+    relaxed_accuracy,none: 0.
+    anywhere_accuracy,none: 0.
\ No newline at end of file

From 73f11ed2ca6f2628e9583c7b03c6095cac2f9e60 Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <bdellabe@redhat.com>
Date: Wed, 22 Oct 2025 20:19:22 +0000
Subject: [PATCH 4/5] neuralmagic/calibration dataset

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
---
 tests/e2e/e2e_utils.py                        | 15 +++++++++++
 .../configs/vl_fp8_dynamic_per_token.yaml     |  1 -
 .../vl_int8_w8a8_dynamic_per_token.yaml       | 18 ++++++-------
 .../configs/vl_w4a16_actorder_weight.yaml     | 18 ++++++++-----
 tests/testing_utils.py                        | 25 +++++++++++++++++++
 5 files changed, 61 insertions(+), 16 deletions(-)

diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py
index a800c3770b..f052a7ca73 100644
--- a/tests/e2e/e2e_utils.py
+++ b/tests/e2e/e2e_utils.py
@@ -62,6 +62,21 @@ def data_collator(batch):
 
             oneshot_kwargs["data_collator"] = data_collator
 
+        elif "calibration" in dataset_id:
+
+            def data_collator(batch):
+                assert len(batch) == 1
+                return {
+                    key: (
+                        torch.tensor(value)
+                        if key != "pixel_values"
+                        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+                    )
+                    for key, value in batch[0].items()
+                }
+
+            oneshot_kwargs["data_collator"] = data_collator
+
     oneshot_kwargs["model"] = loaded_model
     if recipe:
         oneshot_kwargs["recipe"] = recipe
diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
index cc47857bab..9199dd978f 100644
--- a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
+++ b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -13,7 +13,6 @@ lmeval:
   num_fewshot: 0
   batch_size: 100
   limit: 100
-  # test runs in 26m
   # dense model achieves exact_match accuracy of 0.530
   # dense model achieves relaxed_accuracy of 0.780
   # dense model achieves anywhere_accuracy of 0.800
diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
index 07bd0e33b8..89145844e6 100644
--- a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
+++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
@@ -3,8 +3,9 @@ model: Qwen/Qwen3-VL-8B-Instruct
 model_class: Qwen3VLForConditionalGeneration
 scheme: INT8_dyn_per_token
 recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml
-dataset_id: lmms-lab/flickr30k
-dataset_split: "test[:512]"
+dataset_id: neuralmagic/calibration
+dataset_config: LLM
+dataset_split: "train[:512]"
 lmeval:
   model: "hf-multimodal"
   model_args:
@@ -15,11 +16,10 @@ lmeval:
   num_fewshot: 0
   batch_size: 100
   limit: 100
-  # test runs in m
-  # dense model achieves exact_match accuracy of 0.
-  # dense model achieves relaxed_accuracy of 0.
-  # dense model achieves anywhere_accuracy of 0.
+  # dense model achieves exact_match accuracy of 0.520
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.800
   metrics:
-    exact_match,none: 0.
-    relaxed_accuracy,none: 0.
-    anywhere_accuracy,none: 0.
\ No newline at end of file
+    exact_match,none: 0.550
+    relaxed_accuracy,none: 0.770
+    anywhere_accuracy,none: 0.770
\ No newline at end of file
diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
index be79fca2f8..121cc14bc8 100644
--- a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
+++ b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
@@ -3,17 +3,23 @@ model: Qwen/Qwen3-VL-8B-Instruct
 model_class: Qwen3VLForConditionalGeneration
 scheme: W4A16_actorder_weight
 recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml
-dataset_id: lmms-lab/flickr30k
-dataset_split: "test[:512]"
+dataset_id: neuralmagic/calibration
+dataset_config: LLM
+dataset_split: "train[:512]"
 lmeval:
   model: "hf-multimodal"
   model_args:
     dtype: bfloat16
     convert_img_format: True
-  task: mmmu_val_literature
+  task: chartqa
   apply_chat_template: True
   num_fewshot: 0
-  batch_size: 8
-  # dense model achieves accuracy of 0.8333
+  batch_size: 100
+  limit: 100
+  # dense model achieves exact_match accuracy of 0.520
+  # dense model achieves relaxed_accuracy of 0.780
+  # dense model achieves anywhere_accuracy of 0.800
   metrics:
-    acc,none: 0.800
\ No newline at end of file
+    exact_match,none: 0.540
+    relaxed_accuracy,none: 0.780
+    anywhere_accuracy,none: 0.800
\ No newline at end of file
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 4ce6a5de69..2cf69720be 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -285,6 +285,31 @@ def process(sample):
                 "images": sample["image"],
             }
 
+    # "neuralmagic/calibration"
+    elif ds_name == "calibration":
+
+        def process(example):
+            messages = []
+            for message in example["messages"]:
+                messages.append(
+                    {
+                        "role": message["role"],
+                        "content": [{"type": "text", "text": message["content"]}],
+                    }
+                )
+
+            return processor.apply_chat_template(
+                messages,
+                return_tensors="pt",
+                padding=False,
+                truncation=True,
+                max_length=max_seq_length,
+                tokenize=True,
+                add_special_tokens=False,
+                return_dict=True,
+                add_generation_prompt=False,
+            )
+
     else:
         raise NotImplementedError(f"Cannot preprocess dataset {ds.info.dataset_name}")
 

From 6c51e0333180a0015c313a6a77b6cc256bc24f8a Mon Sep 17 00:00:00 2001
From: Brian Dellabetta <bdellabe@redhat.com>
Date: Tue, 28 Oct 2025 17:56:52 +0000
Subject: [PATCH 5/5] prune unused datasets, up to 500 samples

Signed-off-by: Brian Dellabetta <bdellabe@redhat.com>
---
 .../configs/vl_fp8_dynamic_per_token.yaml     | 10 +++---
 .../vl_int8_w8a8_dynamic_per_token.yaml       | 12 +++----
 .../configs/vl_w4a16_actorder_weight.yaml     | 12 +++----
 tests/testing_utils.py                        | 33 -------------------
 4 files changed, 17 insertions(+), 50 deletions(-)

diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
index 9199dd978f..da2b2d9869 100644
--- a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
+++ b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml
@@ -12,11 +12,11 @@ lmeval:
   apply_chat_template: True
   num_fewshot: 0
   batch_size: 100
-  limit: 100
-  # dense model achieves exact_match accuracy of 0.530
+  limit: 500
+  # dense model achieves exact_match accuracy of 0.576
   # dense model achieves relaxed_accuracy of 0.780
-  # dense model achieves anywhere_accuracy of 0.800
+  # dense model achieves anywhere_accuracy of 0.806
   metrics:
-    exact_match,none: 0.530
-    relaxed_accuracy,none: 0.780
+    exact_match,none: 0.596
+    relaxed_accuracy,none: 0.784
     anywhere_accuracy,none: 0.810
diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
index 89145844e6..7271ccc4fc 100644
--- a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
+++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml
@@ -15,11 +15,11 @@ lmeval:
   apply_chat_template: True
   num_fewshot: 0
   batch_size: 100
-  limit: 100
-  # dense model achieves exact_match accuracy of 0.520
+  limit: 500
+  # dense model achieves exact_match accuracy of 0.576
   # dense model achieves relaxed_accuracy of 0.780
-  # dense model achieves anywhere_accuracy of 0.800
+  # dense model achieves anywhere_accuracy of 0.806
   metrics:
-    exact_match,none: 0.550
-    relaxed_accuracy,none: 0.770
-    anywhere_accuracy,none: 0.770
\ No newline at end of file
+    exact_match,none: 0.608
+    relaxed_accuracy,none: 0.806
+    anywhere_accuracy,none: 0.824
\ No newline at end of file
diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
index 121cc14bc8..8b07f5c2cb 100644
--- a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
+++ b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml
@@ -15,11 +15,11 @@ lmeval:
   apply_chat_template: True
   num_fewshot: 0
   batch_size: 100
-  limit: 100
-  # dense model achieves exact_match accuracy of 0.520
+  limit: 500
+  # dense model achieves exact_match accuracy of 0.576
   # dense model achieves relaxed_accuracy of 0.780
-  # dense model achieves anywhere_accuracy of 0.800
+  # dense model achieves anywhere_accuracy of 0.806
   metrics:
-    exact_match,none: 0.540
-    relaxed_accuracy,none: 0.780
-    anywhere_accuracy,none: 0.800
\ No newline at end of file
+    exact_match,none: 0.588
+    relaxed_accuracy,none: 0.782
+    anywhere_accuracy,none: 0.808
\ No newline at end of file
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
index 2cf69720be..0cd1963036 100644
--- a/tests/testing_utils.py
+++ b/tests/testing_utils.py
@@ -218,20 +218,6 @@ def process(sample):
                 add_special_tokens=False,
             )
 
-    elif ds_name == "llm_compression_calibration":
-
-        def process(sample):
-            return processor(
-                processor.apply_chat_template(
-                    sample["text"],
-                    tokenize=False,
-                ),
-                padding=False,
-                max_length=max_seq_length,
-                truncation=True,
-                add_special_tokens=False,
-            )
-
     elif ds_name == "open-platypus":
         # use the output rather than the instruction
         def process(sample):
@@ -246,25 +232,6 @@ def process(sample):
                 add_special_tokens=False,
             )
 
-    elif ds_name == "slimorca-deduped-cleaned-corrected":
-        # find the first element corresponding to a message from a human
-        def process(sample):
-            conversation_idx = 0
-            for idx, conversation in enumerate(sample["conversations"]):
-                if conversation["from"] == "human":
-                    conversation_idx = idx
-                    break
-            return processor(
-                processor.apply_chat_template(
-                    sample["conversations"][conversation_idx]["value"],
-                    tokenize=False,
-                ),
-                padding=False,
-                max_length=max_seq_length,
-                truncation=True,
-                add_special_tokens=False,
-            )
-
     elif ds_name == "flickr30k":
 
         def process(sample):