@@ -287,12 +287,29 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src,
287287 }
288288
289289 GEMMLowpOutputStageInfo output_info;
290- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
291- output_info.gemmlowp_offset = uoqinfo.offset ;
292- output_info.gemmlowp_min_bound = min_activation;
293- output_info.gemmlowp_max_bound = max_activation;
294- output_info.is_quantized_per_channel = (tmp_weights.data_type () == DataType::QSYMM8_PER_CHANNEL);
295- quantization::calculate_quantized_multipliers (iqinfo, wqinfo, oqinfo, output_info);
290+
291+ // F32 dequantization path (quantized input, float output): taken when dst is F32.
292+ const bool dequantize_f32 = (dst->data_type () == DataType::F32);
293+
294+ if (dequantize_f32)
295+ {
296+ // No requant stage; offsets are handled via offset-contribution on int32
297+ output_info.type = GEMMLowpOutputStageType::NONE;
298+ output_info.gemmlowp_offset = 0 ;
299+ output_info.gemmlowp_min_bound = 0 ;
300+ output_info.gemmlowp_max_bound = 0 ;
301+ output_info.is_quantized_per_channel = false ; // irrelevant when NONE
302+ }
303+ else
304+ {
305+ // Existing Q->Q path
306+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
307+ output_info.gemmlowp_offset = uoqinfo.offset ;
308+ output_info.gemmlowp_min_bound = min_activation;
309+ output_info.gemmlowp_max_bound = max_activation;
310+ output_info.is_quantized_per_channel = (tmp_weights.data_type () == DataType::QSYMM8_PER_CHANNEL);
311+ quantization::calculate_quantized_multipliers (iqinfo, wqinfo, oqinfo, output_info);
312+ }
296313
297314 const GEMMInfo gemm_info =
298315 GEMMInfo (false /* is_a_reshaped */ , false /* is_b_reshaped */ , true /* reshape_b_only_on_first_run */ ,
@@ -367,14 +384,30 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src,
367384 {
368385 std::tie (min_activation, max_activation) = get_quantized_activation_min_max (act_info, data_type, uoqinfo);
369386 }
370-
387+ // F32 dequantization path (quantized input, float output): taken when dst is F32.
388+ const bool dequantize_f32 = (dst->data_type () == DataType::F32);
371389 GEMMLowpOutputStageInfo output_info;
372- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
373- output_info.gemmlowp_offset = uoqinfo.offset ;
374- output_info.gemmlowp_min_bound = min_activation;
375- output_info.gemmlowp_max_bound = max_activation;
376- output_info.is_quantized_per_channel = (weights->data_type () == DataType::QSYMM8_PER_CHANNEL);
377- ARM_COMPUTE_RETURN_ON_ERROR (quantization::calculate_quantized_multipliers (iqinfo, wqinfo, oqinfo, output_info));
390+
391+ if (dequantize_f32)
392+ {
393+ // No requant stage; offsets are handled via offset-contribution on int32
394+ output_info.type = GEMMLowpOutputStageType::NONE;
395+ output_info.gemmlowp_offset = 0 ;
396+ output_info.gemmlowp_min_bound = 0 ;
397+ output_info.gemmlowp_max_bound = 0 ;
398+ output_info.is_quantized_per_channel = false ; // irrelevant when NONE
399+ }
400+ else
401+ {
402+ // Existing Q->Q path
403+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
404+ output_info.gemmlowp_offset = uoqinfo.offset ;
405+ output_info.gemmlowp_min_bound = min_activation;
406+ output_info.gemmlowp_max_bound = max_activation;
407+ output_info.is_quantized_per_channel = (weights->data_type () == DataType::QSYMM8_PER_CHANNEL);
408+ ARM_COMPUTE_RETURN_ON_ERROR (
409+ quantization::calculate_quantized_multipliers (iqinfo, wqinfo, oqinfo, output_info));
410+ }
378411
379412 // Perform validation step on GEMMLowp
380413 std::unique_ptr<ITensorInfo> input_qa = src->clone ();
@@ -504,9 +537,11 @@ void CpuGemmConv2d::configure(const ITensorInfo *src,
504537 }
505538
506539 const unsigned int mat_weights_cols = weights->dimension (idx_kernels);
540+ const bool dequantize_f32 = is_data_type_quantized (data_type) && dst->data_type () == DataType::F32;
507541
508542 // Create temporary GEMM output tensor in case we cannot skip col2im
509- const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
543+ const DataType output_data_type = data_type == DataType::BFLOAT16 || dequantize_f32 ? DataType::F32 : data_type;
544+
510545 if (!_skip_col2im)
511546 {
512547 TensorShape shape_gemm;
@@ -725,7 +760,14 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src,
725760 {
726761 if (is_quantized)
727762 {
728- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN (biases, 1 , DataType::S32);
763+ if (data_type == DataType::QASYMM8_SIGNED && dst->data_type () == DataType::F32)
764+ {
765+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN (biases, 1 , DataType::F32);
766+ }
767+ else
768+ {
769+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN (biases, 1 , DataType::S32);
770+ }
729771 }
730772 else if (is_bf16)
731773 {
@@ -776,8 +818,9 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src,
776818 gemm_input_to_use = &im2col_reshaped_info;
777819 }
778820
821+ const bool dequantize_f32 = is_data_type_quantized (data_type) && dst->data_type () == DataType::F32;
779822 // Create temporary GEMM output tensor in case we cannot skip col2im
780- const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
823+ const DataType output_data_type = data_type == DataType::BFLOAT16 || dequantize_f32 ? DataType::F32 : data_type;
781824 if (!skip_col2im)
782825 {
783826 TensorShape shape_gemm = gemm_input_to_use->tensor_shape ();
0 commit comments