From 4745c30e98bfd7acef9017fad9120f2c9db58a53 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <vivektrivedi@meta.com>
Date: Fri, 31 Oct 2025 17:06:34 -0700
Subject: [PATCH] Slightly improve 4-bit mat mul performance by better
 engaging ALU pipes via splitting the uint-to-float conversion. (#15447)

Summary:

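This diff slightly improves the performance of 4-bit matrix multiplication by making better use of the ALU pipes. This is achieved by splitting the `uint`-to-`float` conversion operation: instead of converting a whole unpacked weight vector with a single `VEC4_T(...)` cast, the shader keeps the unpacked weights in an integer vector and converts each component to `T` individually as it is stored into `qmat2`. The non-4-bit path is updated to the same per-component conversion, holding its weights in an `ivec4`.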

Reviewed By: SS-JIA

Differential Revision: D85779855
---
 .../graph/ops/glsl/linear_qcsnw_tiled.glsl    | 35 ++++++++++---------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl
index 204352656c9..c364e70bc9f 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl
@@ -106,8 +106,10 @@ void main() {
     // Preload weight tensor
     for (int r = 0; r < 4; r++) {
       T qmat2[TILE_TXCOLS * 4];
-      VEC4_T qmat2_vec4;
-      uvec4 packed_weight_tex;
+      $if QUANT_NBITS == 4:
+        uvec4 packed_weight_tex;
+      $else:
+        ivec4 packed_weight_tex;
 
       $if QUANT_NBITS == 4:
         $for c in range(0, TILE_TXCOLS, 2):
@@ -119,28 +121,27 @@ void main() {
             packed_weight_tex = texelFetch(
               t_weight, ivec2(weight_txcol + ${c}, pos + r), 0);
 
-          qmat2_vec4 = VEC4_T(packed_weight_tex >> 4);
-          qmat2[${c} * 4 * TILE_TXCOLS + 0] = qmat2_vec4.x;
-          qmat2[${c} * 4 * TILE_TXCOLS + 1] = qmat2_vec4.y;
-          qmat2[${c} * 4 * TILE_TXCOLS + 2] = qmat2_vec4.z;
-          qmat2[${c} * 4 * TILE_TXCOLS + 3] = qmat2_vec4.w;
-
-          qmat2_vec4 = VEC4_T(packed_weight_tex & 0x0F);
-          qmat2[${c} * 4 * TILE_TXCOLS + 4] = qmat2_vec4.x;
-          qmat2[${c} * 4 * TILE_TXCOLS + 5] = qmat2_vec4.y;
-          qmat2[${c} * 4 * TILE_TXCOLS + 6] = qmat2_vec4.z;
-          qmat2[${c} * 4 * TILE_TXCOLS + 7] = qmat2_vec4.w;
+          const uvec4 tmp1 = packed_weight_tex >> 4;
+          qmat2[${c} * 4 * TILE_TXCOLS + 0] = T(tmp1.x);
+          qmat2[${c} * 4 * TILE_TXCOLS + 1] = T(tmp1.y);
+          qmat2[${c} * 4 * TILE_TXCOLS + 2] = T(tmp1.z);
+          qmat2[${c} * 4 * TILE_TXCOLS + 3] = T(tmp1.w);
+
+          const uvec4 tmp2 = packed_weight_tex & 0x0F;
+          qmat2[${c} * 4 * TILE_TXCOLS + 4] = T(tmp2.x);
+          qmat2[${c} * 4 * TILE_TXCOLS + 5] = T(tmp2.y);
+          qmat2[${c} * 4 * TILE_TXCOLS + 6] = T(tmp2.z);
+          qmat2[${c} * 4 * TILE_TXCOLS + 7] = T(tmp2.w);
       $else:
         $for c in range(TILE_TXCOLS):
           $if WEIGHT_STORAGE == "buffer":
             qmat2_bufi = (pos + r) * weight_row_txstride + out_txcol;
             encoded_weight = t_weight[qmat2_bufi + ${c}];
-            packed_weight_tex = uvec4(encoded_weight & 0xFF, (encoded_weight >> 8) & 0xFF, (encoded_weight >> 16) & 0xFF, encoded_weight >> 24);
-            qmat2_vec4 = VEC4_T(packed_weight_tex);
+            packed_weight_tex = ivec4(encoded_weight & 0xFF, (encoded_weight >> 8) & 0xFF, (encoded_weight >> 16) & 0xFF, encoded_weight >> 24);
           $else:
-            qmat2_vec4 = VEC4_T(texelFetch(t_weight, ivec2(out_txcol + ${c}, pos + r), 0));
+            packed_weight_tex = ivec4(texelFetch(t_weight, ivec2(out_txcol + ${c}, pos + r), 0));
           $for j in range(4):
-            qmat2[${c} * 4 + ${j}] = qmat2_vec4[${j}];
+            qmat2[${c} * 4 + ${j}] = T(packed_weight_tex[${j}]);
 
       for (int tr = 0; tr < TILE_ROWS; ++tr) {
         $for c in range(TILE_TXCOLS):