UbiquitousLearning · twlddd · May 18, 2026 · May 18, 2026 · May 18, 2026 · May 21, 2026
diff --git a/.gitignore b/.gitignore
@@ -38,5 +38,6 @@ autotuner.log
 
 # Downloaded models and build artifacts (root-only)
 /models/
+/logs/
 # Keep source model adapters tracked
 !tools/mllm-llm-benchmark/models/
diff --git a/README.md b/README.md
@@ -80,7 +80,7 @@ The mllm framework integrates seamlessly with popular community frameworks' chec
 | [Qwen3-4B](https://github.com/QwenLM/Qwen3)                      | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-4B-w4a8-i8mm-kai)  |  | |
 | [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR)       | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/DeepSeek-OCR-w4a8-i8mm-kai)  |  | |
 | [SmolLM3](https://huggingface.co/blog/smollm3)| [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/SmolLM3-3B-w4a8-i8mm-kai)  |  | |
-| [Qwen2-VL-2B-Instruct](https://qwenlm.github.io/zh/blog/qwen2-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2-VL-2B-Instruct-w4a32kai) || |
+| [Qwen2-VL-2B-Instruct](https://qwenlm.github.io/zh/blog/qwen2-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2-VL-2B-Instruct-w4a32kai) | [W4A16-SM8650](https://www.modelscope.cn/models/twlddd/Qwen2-VL-2B-Instruct-Full-QNN-AOT-for-mllm/) | |
 | [Qwen2-VL-7B-Instruct](https://qwenlm.github.io/zh/blog/qwen2-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2-VL-7B-Instruct-w4a32kai)|| |
 | [Qwen2.5-VL-3B-Instruct](https://qwenlm.github.io/blog/qwen2.5-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2.5-VL-3B-Instruct-w4a32kai)|| |
 | [Qwen2.5-VL-7B-Instruct](https://qwenlm.github.io/blog/qwen2.5-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2.5-VL-7B-Instruct-w4a32kai)|| |

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -25,5 +25,6 @@ endif()
 if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE OR MLLM_BUILD_QNN_BACKEND)
   add_subdirectory(qwen3_qnn_aot)
   add_subdirectory(qwen2_qnn_aot)
+  add_subdirectory(qwen2vl_qnn_aot)
   add_subdirectory(llama_qnn_aot)
 endif()
diff --git a/examples/qwen2vl/config_2B_qnn_lpbq.json b/examples/qwen2vl/config_2B_qnn_lpbq.json
@@ -0,0 +1,33 @@
+{
+    "architectures": [
+        "Qwen2VLForConditionalGeneration"
+    ],
+    "visual_in_chans": 3,
+    "visual_embed_dim": 1280,
+    "visual_patch_size": 14,
+    "visual_temporal_patch_size": 2,
+    "visual_spatial_merge_size": 2,
+    "visual_mlp_ratio": 4,
+    "visual_num_heads": 16,
+    "visual_depth": 32,
+    "hidden_size": 1536,
+    "intermediate_size": 8960,
+    "num_attention_heads": 12,
+    "num_key_value_heads": 2,
+    "num_hidden_layers": 28,
+    "max_position_embeddings": 32768,
+    "rms_norm_eps": 1e-06,
+    "vocab_size": 151936,
+    "max_cache_length": 2048,
+    "mrope_section": [
+        16,
+        24,
+        24
+    ],
+    "vision_token_id": 151654,
+    "eos_token_id": 151645,
+    "end_of_text_token_id": 151643,
+    "rope_theta": 1000000.0,
+    "tie_word_embeddings": true,
+    "linear_impl_type": "QNN_LPBQ_w4a16o16_G32"
+}
diff --git a/examples/qwen2vl/main.cpp b/examples/qwen2vl/main.cpp
@@ -65,6 +65,7 @@ MLLM_MAIN({
 
       // Use for loop
       for (auto& step : qwen2vl.chat(inputs)) { std::wcout << qwen2vl_tokenizer.detokenize(step.cur_token_id) << std::flush; }
+      qwen2vl.perfSummary();
 
       // OR
       // Steam it!

diff --git a/examples/qwen2vl/qnn_aot_cfg_2B.json b/examples/qwen2vl/qnn_aot_cfg_2B.json
@@ -0,0 +1,51 @@
+{
+    "target_machine": {
+        "htp_arch": "V75",
+        "htp_chipset": "SM8650",
+        "htp_try_best_performance": "HtpBurst",
+        "htp_security_pd_session": "HtpSignedPd",
+        "htp_vtcm_capability_in_mb": 8
+    },
+    "graph_on_qnn": [
+        "model"
+    ],
+    "op_on_qnn": [
+        "lm_head"
+    ],
+    "split_graph": 1,
+    "quant_recipe": {
+        "llm_recipe": true,
+        "layers": 28,
+        "builtin_llm_pass": {
+            "model": "qwen2",
+            "lm_head": {
+                "fallback": {
+                    "method": "LPBQ",
+                    "sym": true,
+                    "precision": "w4a16",
+                    "block_size": 32
+                }
+            },
+            "linear": {
+                "fallback": {
+                    "method": "LPBQ",
+                    "sym": true,
+                    "precision": "w4a16",
+                    "block_size": 32
+                }
+            },
+            "kv_cache": {
+                "key": {
+                    "method": "per-tensor",
+                    "sym": true,
+                    "precision": "w8a8"
+                },
+                "value": {
+                    "method": "per-tensor",
+                    "sym": true,
+                    "precision": "w8a8"
+                }
+            }
+        }
+    }
+}
diff --git a/examples/qwen2vl/qnn_aot_cfg_2B_lpbq_vprojg16_unsignedpd.json b/examples/qwen2vl/qnn_aot_cfg_2B_lpbq_vprojg16_unsignedpd.json
@@ -0,0 +1,57 @@
+{
+    "target_machine": {
+        "htp_arch": "V75",
+        "htp_chipset": "SM8650",
+        "htp_try_best_performance": "HtpBurst",
+        "htp_security_pd_session": "HtpUnsignedPd",
+        "htp_vtcm_capability_in_mb": 8
+    },
+    "graph_on_qnn": [
+        "model"
+    ],
+    "op_on_qnn": [
+        "lm_head"
+    ],
+    "split_graph": 1,
+    "quant_recipe": {
+        "llm_recipe": true,
+        "layers": 28,
+        "builtin_llm_pass": {
+            "model": "qwen2",
+            "lm_head": {
+                "fallback": {
+                    "method": "LPBQ",
+                    "sym": true,
+                    "precision": "w4a16",
+                    "block_size": 32
+                }
+            },
+            "linear": {
+                "fallback": {
+                    "method": "LPBQ",
+                    "sym": true,
+                    "precision": "w4a16",
+                    "block_size": 32
+                },
+                "model\\.layers\\.[0-9]+\\.self_attn\\.v_proj": {
+                    "method": "LPBQ",
+                    "sym": true,
+                    "precision": "w4a16",
+                    "block_size": 16
+                }
+            },
+            "kv_cache": {
+                "key": {
+                    "method": "per-tensor",
+                    "sym": true,
+                    "precision": "w8a8"
+                },
+                "value": {
+                    "method": "per-tensor",
+                    "sym": true,
+                    "precision": "w8a8"
+                }
+            }
+        }
+    }
+}
diff --git a/examples/qwen2vl/qnn_aot_cfg_2B_visual.json b/examples/qwen2vl/qnn_aot_cfg_2B_visual.json
@@ -0,0 +1,30 @@
+{
+    "target_machine": {
+        "htp_arch": "V75",
+        "htp_chipset": "SM8650",
+        "htp_try_best_performance": "HtpBurst",
+        "htp_security_pd_session": "HtpUnsignedPd",
+        "htp_vtcm_capability_in_mb": 8
+    },
+    "graph_on_qnn": [
+        "visual"
+    ],
+    "op_on_qnn": [],
+    "split_graph": 1,
+    "quant_recipe": {
+        "llm_recipe": true,
+        "layers": 32,
+        "builtin_llm_pass": {
+            "model": "qwen2vl_visual",
+            "linear": {
+                "fallback": {
+                    "method": "LPBQ",
+                    "sym": true,
+                    "precision": "w4a16",
+                    "block_size": 32,
+                    "allow_raw_float_linear": true
+                }
+            }
+        }
+    }
+}
diff --git a/examples/qwen2vl/qnn_aot_cfg_2B_visual_fp16.json b/examples/qwen2vl/qnn_aot_cfg_2B_visual_fp16.json
@@ -0,0 +1,30 @@
+{
+    "target_machine": {
+        "htp_arch": "V75",
+        "htp_chipset": "SM8650",
+        "htp_try_best_performance": "HtpBurst",
+        "htp_security_pd_session": "HtpUnsignedPd",
+        "htp_vtcm_capability_in_mb": 8
+    },
+    "graph_on_qnn": [
+        "visual"
+    ],
+    "op_on_qnn": [],
+    "split_graph": 1,
+    "quant_recipe": {
+        "llm_recipe": true,
+        "layers": 32,
+        "builtin_llm_pass": {
+            "model": "qwen2vl_visual",
+            "linear": {
+                "fallback": {
+                    "method": "LPBQ",
+                    "sym": true,
+                    "precision": "w4a16",
+                    "block_size": 32,
+                    "allow_raw_float_linear": true
+                }
+            }
+        }
+    }
+}
diff --git a/examples/qwen2vl_qnn_aot/CMakeLists.txt b/examples/qwen2vl_qnn_aot/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Qwen2-VL QNN AOT example executables.
+#
+# Each executable is built from one source file and links the same three
+# items, so one helper declares them all.
+#
+# mllm_qwen2vl_add_tool(<tool_name> <tool_source>)
+#   tool_name   - name of the executable target to create
+#   tool_source - the target's single source file
+function(mllm_qwen2vl_add_tool tool_name tool_source)
+  add_executable(${tool_name} ${tool_source})
+  target_link_libraries(${tool_name} PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
+  target_include_directories(${tool_name} PRIVATE ${MLLM_INCLUDE_DIR})
+endfunction()
+
+# AOT targets run on x86 and generate QNN context binaries for Android.
+if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
+  mllm_qwen2vl_add_tool(mllm-qwen2vl-aot-c compile.cpp)
+  mllm_qwen2vl_add_tool(mllm-qwen2vl-visual-aot-diag compile_visual.cpp)
+  mllm_qwen2vl_add_tool(mllm-qwen2vl-visual-padding-diag visual_padding_diag.cpp)
+endif()
+
+# The remaining executables are declared unconditionally, i.e. whenever this
+# directory is added to the build.
+mllm_qwen2vl_add_tool(mllm-qwen2vl-aot-runner aot_run.cpp)
+mllm_qwen2vl_add_tool(mllm-qwen2vl-visual-aot-runner visual_aot_run.cpp)
+mllm_qwen2vl_add_tool(mllm-qwen2vl-visual-padding-ab-diag visual_padding_ab_diag.cpp)