From 57e50b1885a476102ac2e3177c8fa1767e4b9ab9 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Mon, 20 Oct 2025 18:22:49 +0000 Subject: [PATCH 1/5] qwen 3 vl with apply_chat_template true Signed-off-by: Brian Dellabetta --- tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml | 11 +++++------ .../configs/vl_int8_w8a8_dynamic_per_token.yaml | 11 +++++------ tests/lmeval/configs/vl_w4a16_actorder_weight.yaml | 11 +++++------ tests/lmeval/test_lmeval.py | 3 +++ 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml index 3fde9d4c65..45c4d2e404 100644 --- a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml +++ b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml @@ -1,18 +1,17 @@ cadence: weekly -model: Qwen/Qwen2.5-VL-7B-Instruct -model_class: Qwen2_5_VLForConditionalGeneration +model: Qwen/Qwen3-VL-8B-Instruct +model_class: Qwen3VLForConditionalGeneration scheme: FP8_DYNAMIC recipe: tests/e2e/vLLM/recipes/FP8/recipe_fp8_dynamic.yaml lmeval: model: "hf-multimodal" model_args: dtype: bfloat16 - add_bos_token: True convert_img_format: True task: mmmu_val_literature + apply_chat_template: True num_fewshot: 0 batch_size: 8 - # dense model achieves accuracy of 0.9 +/ 0.0557 + # dense model achieves accuracy of 0.833 metrics: - acc,none: 0.8333 - acc_stderr,none: 0.0557 + acc,none: 0.833 diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml index 86c2f0e12f..df4d54e036 100644 --- a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml +++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml @@ -1,6 +1,6 @@ cadence: "weekly" -model: Qwen/Qwen2.5-VL-7B-Instruct -model_class: Qwen2_5_VLForConditionalGeneration +model: Qwen/Qwen3-VL-8B-Instruct +model_class: Qwen3VLForConditionalGeneration scheme: INT8_dyn_per_token recipe: 
tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml dataset_id: lmms-lab/flickr30k @@ -9,12 +9,11 @@ lmeval: model: "hf-multimodal" model_args: dtype: bfloat16 - add_bos_token: True convert_img_format: True task: mmmu_val_literature + apply_chat_template: True num_fewshot: 0 batch_size: 8 - # dense model achieves accuracy of 0.9 +/ 0.0557 + # dense model achieves accuracy of 0.833 metrics: - acc,none: 0.833 - acc_stderr,none: 0.0557 \ No newline at end of file + acc,none: 0.833 \ No newline at end of file diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml index 37b162b37d..be79fca2f8 100644 --- a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml +++ b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml @@ -1,6 +1,6 @@ cadence: "weekly" -model: Qwen/Qwen2.5-VL-7B-Instruct -model_class: Qwen2_5_VLForConditionalGeneration +model: Qwen/Qwen3-VL-8B-Instruct +model_class: Qwen3VLForConditionalGeneration scheme: W4A16_actorder_weight recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml dataset_id: lmms-lab/flickr30k @@ -9,12 +9,11 @@ lmeval: model: "hf-multimodal" model_args: dtype: bfloat16 - add_bos_token: True convert_img_format: True task: mmmu_val_literature + apply_chat_template: True num_fewshot: 0 batch_size: 8 - # dense model achieves accuracy of 0.9 +/ 0.0557 + # dense model achieves accuracy of 0.8333 metrics: - acc,none: 0.8333 - acc_stderr,none: 0.0557 \ No newline at end of file + acc,none: 0.800 \ No newline at end of file diff --git a/tests/lmeval/test_lmeval.py b/tests/lmeval/test_lmeval.py index a44cd042ff..da99782c68 100644 --- a/tests/lmeval/test_lmeval.py +++ b/tests/lmeval/test_lmeval.py @@ -25,6 +25,7 @@ class LmEvalConfig(BaseModel): num_fewshot: int = 5 limit: int = 1000 batch_size: int = 100 + apply_chat_template: bool = False # Recovery testing (default): compare against base model performance # Default threshold is 0.95 (retain ≥95% 
of base), can be overridden recovery_threshold: Union[float, dict] = 0.95 @@ -160,6 +161,7 @@ def _eval_base_model(self): num_fewshot=self.lmeval.num_fewshot, limit=self.lmeval.limit, device="cuda:0", + apply_chat_template=self.lmeval.apply_chat_template, batch_size=self.lmeval.batch_size, ) @@ -190,6 +192,7 @@ def _run_lm_eval(self): num_fewshot=self.lmeval.num_fewshot, limit=self.lmeval.limit, device="cuda:0", + apply_chat_template=self.lmeval.apply_chat_template, batch_size=self.lmeval.batch_size, ) From 1e013537fde1f4d6ce94c77873227b7621244879 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Tue, 21 Oct 2025 22:46:56 +0000 Subject: [PATCH 2/5] chartqa p1 Signed-off-by: Brian Dellabetta --- tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml index 45c4d2e404..cc47857bab 100644 --- a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml +++ b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml @@ -8,10 +8,16 @@ lmeval: model_args: dtype: bfloat16 convert_img_format: True - task: mmmu_val_literature + task: chartqa apply_chat_template: True num_fewshot: 0 - batch_size: 8 - # dense model achieves accuracy of 0.833 + batch_size: 100 + limit: 100 + # test runs in 26m + # dense model achieves exact_match accuracy of 0.530 + # dense model achieves relaxed_accuracy of 0.780 + # dense model achieves anywhere_accuracy of 0.800 metrics: - acc,none: 0.833 + exact_match,none: 0.530 + relaxed_accuracy,none: 0.780 + anywhere_accuracy,none: 0.810 From 2fc4001de5a978d32bc16004553efb444097c03e Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Wed, 22 Oct 2025 15:24:05 +0000 Subject: [PATCH 3/5] broken test Signed-off-by: Brian Dellabetta --- .../configs/vl_int8_w8a8_dynamic_per_token.yaml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git 
a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml index df4d54e036..07bd0e33b8 100644 --- a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml +++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml @@ -10,10 +10,16 @@ lmeval: model_args: dtype: bfloat16 convert_img_format: True - task: mmmu_val_literature + task: chartqa apply_chat_template: True num_fewshot: 0 - batch_size: 8 - # dense model achieves accuracy of 0.833 + batch_size: 100 + limit: 100 + # test runs in m + # dense model achieves exact_match accuracy of 0. + # dense model achieves relaxed_accuracy of 0. + # dense model achieves anywhere_accuracy of 0. metrics: - acc,none: 0.833 \ No newline at end of file + exact_match,none: 0. + relaxed_accuracy,none: 0. + anywhere_accuracy,none: 0. \ No newline at end of file From 73f11ed2ca6f2628e9583c7b03c6095cac2f9e60 Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Wed, 22 Oct 2025 20:19:22 +0000 Subject: [PATCH 4/5] neuralmagic/calibration dataset Signed-off-by: Brian Dellabetta --- tests/e2e/e2e_utils.py | 15 +++++++++++ .../configs/vl_fp8_dynamic_per_token.yaml | 1 - .../vl_int8_w8a8_dynamic_per_token.yaml | 18 ++++++------- .../configs/vl_w4a16_actorder_weight.yaml | 18 ++++++++----- tests/testing_utils.py | 25 +++++++++++++++++++ 5 files changed, 61 insertions(+), 16 deletions(-) diff --git a/tests/e2e/e2e_utils.py b/tests/e2e/e2e_utils.py index a800c3770b..f052a7ca73 100644 --- a/tests/e2e/e2e_utils.py +++ b/tests/e2e/e2e_utils.py @@ -62,6 +62,21 @@ def data_collator(batch): oneshot_kwargs["data_collator"] = data_collator + elif "calibration" in dataset_id: + + def data_collator(batch): + assert len(batch) == 1 + return { + key: ( + torch.tensor(value) + if key != "pixel_values" + else torch.tensor(value, dtype=torch.bfloat16).squeeze(0) + ) + for key, value in batch[0].items() + } + + oneshot_kwargs["data_collator"] = data_collator + oneshot_kwargs["model"] = 
loaded_model if recipe: oneshot_kwargs["recipe"] = recipe diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml index cc47857bab..9199dd978f 100644 --- a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml +++ b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml @@ -13,7 +13,6 @@ lmeval: num_fewshot: 0 batch_size: 100 limit: 100 - # test runs in 26m # dense model achieves exact_match accuracy of 0.530 # dense model achieves relaxed_accuracy of 0.780 # dense model achieves anywhere_accuracy of 0.800 diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml index 07bd0e33b8..89145844e6 100644 --- a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml +++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml @@ -3,8 +3,9 @@ model: Qwen/Qwen3-VL-8B-Instruct model_class: Qwen3VLForConditionalGeneration scheme: INT8_dyn_per_token recipe: tests/e2e/vLLM/recipes/INT8/recipe_int8_channel_weight_dynamic_per_token.yaml -dataset_id: lmms-lab/flickr30k -dataset_split: "test[:512]" +dataset_id: neuralmagic/calibration +dataset_config: LLM +dataset_split: "train[:512]" lmeval: model: "hf-multimodal" model_args: @@ -15,11 +16,10 @@ lmeval: num_fewshot: 0 batch_size: 100 limit: 100 - # test runs in m - # dense model achieves exact_match accuracy of 0. - # dense model achieves relaxed_accuracy of 0. - # dense model achieves anywhere_accuracy of 0. + # dense model achieves exact_match accuracy of 0.520 + # dense model achieves relaxed_accuracy of 0.780 + # dense model achieves anywhere_accuracy of 0.800 metrics: - exact_match,none: 0. - relaxed_accuracy,none: 0. - anywhere_accuracy,none: 0. 
\ No newline at end of file + exact_match,none: 0.550 + relaxed_accuracy,none: 0.770 + anywhere_accuracy,none: 0.770 \ No newline at end of file diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml index be79fca2f8..121cc14bc8 100644 --- a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml +++ b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml @@ -3,17 +3,23 @@ model: Qwen/Qwen3-VL-8B-Instruct model_class: Qwen3VLForConditionalGeneration scheme: W4A16_actorder_weight recipe: tests/e2e/vLLM/recipes/actorder/recipe_w4a16_actorder_weight.yaml -dataset_id: lmms-lab/flickr30k -dataset_split: "test[:512]" +dataset_id: neuralmagic/calibration +dataset_config: LLM +dataset_split: "train[:512]" lmeval: model: "hf-multimodal" model_args: dtype: bfloat16 convert_img_format: True - task: mmmu_val_literature + task: chartqa apply_chat_template: True num_fewshot: 0 - batch_size: 8 - # dense model achieves accuracy of 0.8333 + batch_size: 100 + limit: 100 + # dense model achieves exact_match accuracy of 0.520 + # dense model achieves relaxed_accuracy of 0.780 + # dense model achieves anywhere_accuracy of 0.800 metrics: - acc,none: 0.800 \ No newline at end of file + exact_match,none: 0.540 + relaxed_accuracy,none: 0.780 + anywhere_accuracy,none: 0.800 \ No newline at end of file diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 4ce6a5de69..2cf69720be 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -285,6 +285,31 @@ def process(sample): "images": sample["image"], } + # "neuralmagic/calibration" + elif ds_name == "calibration": + + def process(example): + messages = [] + for message in example["messages"]: + messages.append( + { + "role": message["role"], + "content": [{"type": "text", "text": message["content"]}], + } + ) + + return processor.apply_chat_template( + messages, + return_tensors="pt", + padding=False, + truncation=True, + max_length=max_seq_length, + tokenize=True, + 
add_special_tokens=False, + return_dict=True, + add_generation_prompt=False, + ) + else: raise NotImplementedError(f"Cannot preprocess dataset {ds.info.dataset_name}") From 6c51e0333180a0015c313a6a77b6cc256bc24f8a Mon Sep 17 00:00:00 2001 From: Brian Dellabetta Date: Tue, 28 Oct 2025 17:56:52 +0000 Subject: [PATCH 5/5] prune unused datasets, up to 500 samples Signed-off-by: Brian Dellabetta --- .../configs/vl_fp8_dynamic_per_token.yaml | 10 +++--- .../vl_int8_w8a8_dynamic_per_token.yaml | 12 +++---- .../configs/vl_w4a16_actorder_weight.yaml | 12 +++---- tests/testing_utils.py | 33 ------------------- 4 files changed, 17 insertions(+), 50 deletions(-) diff --git a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml index 9199dd978f..da2b2d9869 100644 --- a/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml +++ b/tests/lmeval/configs/vl_fp8_dynamic_per_token.yaml @@ -12,11 +12,11 @@ lmeval: apply_chat_template: True num_fewshot: 0 batch_size: 100 - limit: 100 - # dense model achieves exact_match accuracy of 0.530 + limit: 500 + # dense model achieves exact_match accuracy of 0.576 # dense model achieves relaxed_accuracy of 0.780 - # dense model achieves anywhere_accuracy of 0.800 + # dense model achieves anywhere_accuracy of 0.806 metrics: - exact_match,none: 0.530 - relaxed_accuracy,none: 0.780 + exact_match,none: 0.596 + relaxed_accuracy,none: 0.784 anywhere_accuracy,none: 0.810 diff --git a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml index 89145844e6..7271ccc4fc 100644 --- a/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml +++ b/tests/lmeval/configs/vl_int8_w8a8_dynamic_per_token.yaml @@ -15,11 +15,11 @@ lmeval: apply_chat_template: True num_fewshot: 0 batch_size: 100 - limit: 100 - # dense model achieves exact_match accuracy of 0.520 + limit: 500 + # dense model achieves exact_match accuracy of 0.576 # dense model achieves 
relaxed_accuracy of 0.780 - # dense model achieves anywhere_accuracy of 0.800 + # dense model achieves anywhere_accuracy of 0.806 metrics: - exact_match,none: 0.550 - relaxed_accuracy,none: 0.770 - anywhere_accuracy,none: 0.770 \ No newline at end of file + exact_match,none: 0.608 + relaxed_accuracy,none: 0.806 + anywhere_accuracy,none: 0.824 \ No newline at end of file diff --git a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml index 121cc14bc8..8b07f5c2cb 100644 --- a/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml +++ b/tests/lmeval/configs/vl_w4a16_actorder_weight.yaml @@ -15,11 +15,11 @@ lmeval: apply_chat_template: True num_fewshot: 0 batch_size: 100 - limit: 100 - # dense model achieves exact_match accuracy of 0.520 + limit: 500 + # dense model achieves exact_match accuracy of 0.576 # dense model achieves relaxed_accuracy of 0.780 - # dense model achieves anywhere_accuracy of 0.800 + # dense model achieves anywhere_accuracy of 0.806 metrics: - exact_match,none: 0.540 - relaxed_accuracy,none: 0.780 - anywhere_accuracy,none: 0.800 \ No newline at end of file + exact_match,none: 0.588 + relaxed_accuracy,none: 0.782 + anywhere_accuracy,none: 0.808 \ No newline at end of file diff --git a/tests/testing_utils.py b/tests/testing_utils.py index 2cf69720be..0cd1963036 100644 --- a/tests/testing_utils.py +++ b/tests/testing_utils.py @@ -218,20 +218,6 @@ def process(sample): add_special_tokens=False, ) - elif ds_name == "llm_compression_calibration": - - def process(sample): - return processor( - processor.apply_chat_template( - sample["text"], - tokenize=False, - ), - padding=False, - max_length=max_seq_length, - truncation=True, - add_special_tokens=False, - ) - elif ds_name == "open-platypus": # use the output rather than the instruction def process(sample): @@ -246,25 +232,6 @@ def process(sample): add_special_tokens=False, ) - elif ds_name == "slimorca-deduped-cleaned-corrected": - # find the first 
element corresponding to a message from a human - def process(sample): - conversation_idx = 0 - for idx, conversation in enumerate(sample["conversations"]): - if conversation["from"] == "human": - conversation_idx = idx - break - return processor( - processor.apply_chat_template( - sample["conversations"][conversation_idx]["value"], - tokenize=False, - ), - padding=False, - max_length=max_seq_length, - truncation=True, - add_special_tokens=False, - ) - elif ds_name == "flickr30k": def process(sample):