Skip to content

Commit 35be9f2

Browse files
trivedivivek and facebook-github-bot
authored and committed
Slightly improve 4-bit mat mul performance by better engaging the ALU pipes through splitting the uint-to-float conversion operation. (#15447)
Summary: This diff makes a slight improvement to the performance of 4-bit matrix multiplication by better utilizing the ALU pipes. This is achieved by splitting the `uint` to `float` conversion operation. Differential Revision: D85779855
1 parent fc22dec commit 35be9f2

File tree

1 file changed

+13
-15
lines changed

1 file changed

+13
-15
lines changed

backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl

Lines changed: 13 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -106,7 +106,6 @@ void main() {
106106
// Preload weight tensor
107107
for (int r = 0; r < 4; r++) {
108108
T qmat2[TILE_TXCOLS * 4];
109-
VEC4_T qmat2_vec4;
110109
uvec4 packed_weight_tex;
111110

112111
$if QUANT_NBITS == 4:
@@ -119,28 +118,27 @@ void main() {
119118
packed_weight_tex = texelFetch(
120119
t_weight, ivec2(weight_txcol + ${c}, pos + r), 0);
121120

122-
qmat2_vec4 = VEC4_T(packed_weight_tex >> 4);
123-
qmat2[${c} * 4 * TILE_TXCOLS + 0] = qmat2_vec4.x;
124-
qmat2[${c} * 4 * TILE_TXCOLS + 1] = qmat2_vec4.y;
125-
qmat2[${c} * 4 * TILE_TXCOLS + 2] = qmat2_vec4.z;
126-
qmat2[${c} * 4 * TILE_TXCOLS + 3] = qmat2_vec4.w;
127-
128-
qmat2_vec4 = VEC4_T(packed_weight_tex & 0x0F);
129-
qmat2[${c} * 4 * TILE_TXCOLS + 4] = qmat2_vec4.x;
130-
qmat2[${c} * 4 * TILE_TXCOLS + 5] = qmat2_vec4.y;
131-
qmat2[${c} * 4 * TILE_TXCOLS + 6] = qmat2_vec4.z;
132-
qmat2[${c} * 4 * TILE_TXCOLS + 7] = qmat2_vec4.w;
121+
const uvec4 tmp1 = packed_weight_tex >> 4;
122+
qmat2[${c} * 4 * TILE_TXCOLS + 0] = T(tmp1.x);
123+
qmat2[${c} * 4 * TILE_TXCOLS + 1] = T(tmp1.y);
124+
qmat2[${c} * 4 * TILE_TXCOLS + 2] = T(tmp1.z);
125+
qmat2[${c} * 4 * TILE_TXCOLS + 3] = T(tmp1.w);
126+
127+
const uvec4 tmp2 = packed_weight_tex & 0x0F;
128+
qmat2[${c} * 4 * TILE_TXCOLS + 4] = T(tmp2.x);
129+
qmat2[${c} * 4 * TILE_TXCOLS + 5] = T(tmp2.y);
130+
qmat2[${c} * 4 * TILE_TXCOLS + 6] = T(tmp2.z);
131+
qmat2[${c} * 4 * TILE_TXCOLS + 7] = T(tmp2.w);
133132
$else:
134133
$for c in range(TILE_TXCOLS):
135134
$if WEIGHT_STORAGE == "buffer":
136135
qmat2_bufi = (pos + r) * weight_row_txstride + out_txcol;
137136
encoded_weight = t_weight[qmat2_bufi + ${c}];
138137
packed_weight_tex = uvec4(encoded_weight & 0xFF, (encoded_weight >> 8) & 0xFF, (encoded_weight >> 16) & 0xFF, encoded_weight >> 24);
139-
qmat2_vec4 = VEC4_T(packed_weight_tex);
140138
$else:
141-
qmat2_vec4 = VEC4_T(texelFetch(t_weight, ivec2(out_txcol + ${c}, pos + r), 0));
139+
packed_weight_tex = uvec4(texelFetch(t_weight, ivec2(out_txcol + ${c}, pos + r), 0));
142140
$for j in range(4):
143-
qmat2[${c} * 4 + ${j}] = qmat2_vec4[${j}];
141+
qmat2[${c} * 4 + ${j}] = T(packed_weight_tex[${j}]);
144142

145143
for (int tr = 0; tr < TILE_ROWS; ++tr) {
146144
$for c in range(TILE_TXCOLS):

0 commit comments

Comments
 (0)