Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,6 @@ autotuner.log

# Downloaded models and build artifacts (root-only)
/models/
/logs/
# Keep source model adapters tracked
!tools/mllm-llm-benchmark/models/
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ The mllm framework integrates seamlessly with popular community frameworks' chec
| [Qwen3-4B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-4B-w4a8-i8mm-kai) | | |
| [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/DeepSeek-OCR-w4a8-i8mm-kai) | | |
| [SmolLM3](https://huggingface.co/blog/smollm3)| [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/SmolLM3-3B-w4a8-i8mm-kai) | | |
| [Qwen2-VL-2B-Instruct](https://qwenlm.github.io/zh/blog/qwen2-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2-VL-2B-Instruct-w4a32kai) || |
| [Qwen2-VL-2B-Instruct](https://qwenlm.github.io/zh/blog/qwen2-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2-VL-2B-Instruct-w4a32kai) | [W4A16-SM8650](https://www.modelscope.cn/models/twlddd/Qwen2-VL-2B-Instruct-Full-QNN-AOT-for-mllm/) | |
| [Qwen2-VL-7B-Instruct](https://qwenlm.github.io/zh/blog/qwen2-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2-VL-7B-Instruct-w4a32kai)|| |
| [Qwen2.5-VL-3B-Instruct](https://qwenlm.github.io/blog/qwen2.5-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2.5-VL-3B-Instruct-w4a32kai)|| |
| [Qwen2.5-VL-7B-Instruct](https://qwenlm.github.io/blog/qwen2.5-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2.5-VL-7B-Instruct-w4a32kai)|| |
Expand Down
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,6 @@ endif()
# QNN AOT example directories. They are added, in the order listed, when
# MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE or MLLM_BUILD_QNN_BACKEND is true.
if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE OR MLLM_BUILD_QNN_BACKEND)
  foreach(qnn_aot_example_dir IN ITEMS
      qwen3_qnn_aot
      qwen2_qnn_aot
      qwen2vl_qnn_aot
      llama_qnn_aot)
    add_subdirectory(${qnn_aot_example_dir})
  endforeach()
endif()
33 changes: 33 additions & 0 deletions examples/qwen2vl/config_2B_qnn_lpbq.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"architectures": [
"Qwen2VLForConditionalGeneration"
],
"visual_in_chans": 3,
"visual_embed_dim": 1280,
"visual_patch_size": 14,
"visual_temporal_patch_size": 2,
"visual_spatial_merge_size": 2,
"visual_mlp_ratio": 4,
"visual_num_heads": 16,
"visual_depth": 32,
"hidden_size": 1536,
"intermediate_size": 8960,
"num_attention_heads": 12,
"num_key_value_heads": 2,
"num_hidden_layers": 28,
"max_position_embeddings": 32768,
"rms_norm_eps": 1e-06,
"vocab_size": 151936,
"max_cache_length": 2048,
"mrope_section": [
16,
24,
24
],
"vision_token_id": 151654,
"eos_token_id": 151645,
"end_of_text_token_id": 151643,
"rope_theta": 1000000.0,
"tie_word_embeddings": true,
"linear_impl_type": "QNN_LPBQ_w4a16o16_G32"
}
1 change: 1 addition & 0 deletions examples/qwen2vl/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ MLLM_MAIN({

// Use for loop
for (auto& step : qwen2vl.chat(inputs)) { std::wcout << qwen2vl_tokenizer.detokenize(step.cur_token_id) << std::flush; }
qwen2vl.perfSummary();

// OR
// Stream it!
Expand Down
51 changes: 51 additions & 0 deletions examples/qwen2vl/qnn_aot_cfg_2B.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
{
"target_machine": {
"htp_arch": "V75",
"htp_chipset": "SM8650",
"htp_try_best_performance": "HtpBurst",
"htp_security_pd_session": "HtpSignedPd",
"htp_vtcm_capability_in_mb": 8
},
"graph_on_qnn": [
"model"
],
"op_on_qnn": [
"lm_head"
],
"split_graph": 1,
"quant_recipe": {
"llm_recipe": true,
"layers": 28,
"builtin_llm_pass": {
"model": "qwen2",
"lm_head": {
"fallback": {
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 32
}
},
"linear": {
"fallback": {
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 32
}
},
"kv_cache": {
"key": {
"method": "per-tensor",
"sym": true,
"precision": "w8a8"
},
"value": {
"method": "per-tensor",
"sym": true,
"precision": "w8a8"
}
}
}
}
}
57 changes: 57 additions & 0 deletions examples/qwen2vl/qnn_aot_cfg_2B_lpbq_vprojg16_unsignedpd.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
"target_machine": {
"htp_arch": "V75",
"htp_chipset": "SM8650",
"htp_try_best_performance": "HtpBurst",
"htp_security_pd_session": "HtpUnsignedPd",
"htp_vtcm_capability_in_mb": 8
},
"graph_on_qnn": [
"model"
],
"op_on_qnn": [
"lm_head"
],
"split_graph": 1,
"quant_recipe": {
"llm_recipe": true,
"layers": 28,
"builtin_llm_pass": {
"model": "qwen2",
"lm_head": {
"fallback": {
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 32
}
},
"linear": {
"fallback": {
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 32
},
"model\\.layers\\.[0-9]+\\.self_attn\\.v_proj": {
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 16
}
},
"kv_cache": {
"key": {
"method": "per-tensor",
"sym": true,
"precision": "w8a8"
},
"value": {
"method": "per-tensor",
"sym": true,
"precision": "w8a8"
}
}
}
}
}
30 changes: 30 additions & 0 deletions examples/qwen2vl/qnn_aot_cfg_2B_visual.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"target_machine": {
"htp_arch": "V75",
"htp_chipset": "SM8650",
"htp_try_best_performance": "HtpBurst",
"htp_security_pd_session": "HtpUnsignedPd",
"htp_vtcm_capability_in_mb": 8
},
"graph_on_qnn": [
"visual"
],
"op_on_qnn": [],
"split_graph": 1,
"quant_recipe": {
"llm_recipe": true,
"layers": 32,
"builtin_llm_pass": {
"model": "qwen2vl_visual",
"linear": {
"fallback": {
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 32,
"allow_raw_float_linear": true
}
}
}
}
}
30 changes: 30 additions & 0 deletions examples/qwen2vl/qnn_aot_cfg_2B_visual_fp16.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"target_machine": {
"htp_arch": "V75",
"htp_chipset": "SM8650",
"htp_try_best_performance": "HtpBurst",
"htp_security_pd_session": "HtpUnsignedPd",
"htp_vtcm_capability_in_mb": 8
},
"graph_on_qnn": [
"visual"
],
"op_on_qnn": [],
"split_graph": 1,
"quant_recipe": {
"llm_recipe": true,
"layers": 32,
"builtin_llm_pass": {
"model": "qwen2vl_visual",
"linear": {
"fallback": {
"method": "LPBQ",
"sym": true,
"precision": "w4a16",
"block_size": 32,
"allow_raw_float_linear": true
}
}
}
}
}
26 changes: 26 additions & 0 deletions examples/qwen2vl_qnn_aot/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Qwen2-VL QNN AOT example programs.
#
# Each program is built from one source file, links MllmRT, MllmCPUBackend and
# MllmQNNBackend, and compiles against ${MLLM_INCLUDE_DIR}. The helper below
# applies that shared setup so each program is declared on a single line.

# mllm_qwen2vl_qnn_example(<exe_name> <source_file>)
#   exe_name    - name of the executable target to create.
#   source_file - the single source file the executable is built from.
# Example:
#   mllm_qwen2vl_qnn_example(mllm-qwen2vl-aot-runner aot_run.cpp)
function(mllm_qwen2vl_qnn_example exe_name source_file)
  add_executable(${exe_name} ${source_file})
  target_link_libraries(${exe_name} PRIVATE MllmRT MllmCPUBackend MllmQNNBackend)
  target_include_directories(${exe_name} PRIVATE ${MLLM_INCLUDE_DIR})
endfunction()

# AOT targets run on x86 and generate QNN context binaries for Android, so
# they are declared only when MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE is true.
if(MLLM_QUALCOMM_QNN_AOT_ON_X86_ENABLE)
  mllm_qwen2vl_qnn_example(mllm-qwen2vl-aot-c compile.cpp)
  mllm_qwen2vl_qnn_example(mllm-qwen2vl-visual-aot-diag compile_visual.cpp)
  mllm_qwen2vl_qnn_example(mllm-qwen2vl-visual-padding-diag visual_padding_diag.cpp)
endif()

# These targets are declared whenever this directory is configured.
mllm_qwen2vl_qnn_example(mllm-qwen2vl-aot-runner aot_run.cpp)
mllm_qwen2vl_qnn_example(mllm-qwen2vl-visual-aot-runner visual_aot_run.cpp)
mllm_qwen2vl_qnn_example(mllm-qwen2vl-visual-padding-ab-diag visual_padding_ab_diag.cpp)
Loading