Slightly improve 4-bit matmul performance by better engaging the ALU pipes through splitting the uint-to-float conversion operation. (#15447)

trivedivivek · facebook-github-bot · commit 35be9f280485 · 2025-10-29T20:04:32.000-07:00
Summary:

This diff slightly improves the performance of 4-bit matrix multiplication by making better use of the ALU pipes. It does so by splitting the vectorized `uint`-to-`float` conversion of the unpacked weights into separate per-component scalar conversions.

Differential Revision: D85779855
diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl
@@ -106,7 +106,6 @@ void main() {
     // Preload weight tensor
     for (int r = 0; r < 4; r++) {
       T qmat2[TILE_TXCOLS * 4];
-      VEC4_T qmat2_vec4;
       uvec4 packed_weight_tex;
 
       $if QUANT_NBITS == 4:
@@ -119,28 +118,27 @@ void main() {
             packed_weight_tex = texelFetch(
               t_weight, ivec2(weight_txcol + ${c}, pos + r), 0);
 
-          qmat2_vec4 = VEC4_T(packed_weight_tex >> 4);
-          qmat2[${c} * 4 * TILE_TXCOLS + 0] = qmat2_vec4.x;
-          qmat2[${c} * 4 * TILE_TXCOLS + 1] = qmat2_vec4.y;
-          qmat2[${c} * 4 * TILE_TXCOLS + 2] = qmat2_vec4.z;
-          qmat2[${c} * 4 * TILE_TXCOLS + 3] = qmat2_vec4.w;
-
-          qmat2_vec4 = VEC4_T(packed_weight_tex & 0x0F);
-          qmat2[${c} * 4 * TILE_TXCOLS + 4] = qmat2_vec4.x;
-          qmat2[${c} * 4 * TILE_TXCOLS + 5] = qmat2_vec4.y;
-          qmat2[${c} * 4 * TILE_TXCOLS + 6] = qmat2_vec4.z;
-          qmat2[${c} * 4 * TILE_TXCOLS + 7] = qmat2_vec4.w;
+          const uvec4 tmp1 = packed_weight_tex >> 4;
+          qmat2[${c} * 4 * TILE_TXCOLS + 0] = T(tmp1.x);
+          qmat2[${c} * 4 * TILE_TXCOLS + 1] = T(tmp1.y);
+          qmat2[${c} * 4 * TILE_TXCOLS + 2] = T(tmp1.z);
+          qmat2[${c} * 4 * TILE_TXCOLS + 3] = T(tmp1.w);
+
+          const uvec4 tmp2 = packed_weight_tex & 0x0F;
+          qmat2[${c} * 4 * TILE_TXCOLS + 4] = T(tmp2.x);
+          qmat2[${c} * 4 * TILE_TXCOLS + 5] = T(tmp2.y);
+          qmat2[${c} * 4 * TILE_TXCOLS + 6] = T(tmp2.z);
+          qmat2[${c} * 4 * TILE_TXCOLS + 7] = T(tmp2.w);
       $else:
         $for c in range(TILE_TXCOLS):
           $if WEIGHT_STORAGE == "buffer":
             qmat2_bufi = (pos + r) * weight_row_txstride + out_txcol;
             encoded_weight = t_weight[qmat2_bufi + ${c}];
             packed_weight_tex = uvec4(encoded_weight & 0xFF, (encoded_weight >> 8) & 0xFF, (encoded_weight >> 16) & 0xFF, encoded_weight >> 24);
-            qmat2_vec4 = VEC4_T(packed_weight_tex);
           $else:
-            qmat2_vec4 = VEC4_T(texelFetch(t_weight, ivec2(out_txcol + ${c}, pos + r), 0));
+            packed_weight_tex = uvec4(texelFetch(t_weight, ivec2(out_txcol + ${c}, pos + r), 0));
           $for j in range(4):
-            qmat2[${c} * 4 + ${j}] = qmat2_vec4[${j}];
+            qmat2[${c} * 4 + ${j}] = T(packed_weight_tex[${j}]);
 
       for (int tr = 0; tr < TILE_ROWS; ++tr) {
         $for c in range(TILE_TXCOLS):