
Commit 8775379

gunes-arm authored and morgolock committed

fix: Handle padding updates after configure() in CpuActivation

Partially Resolves: ARMCL-1199
Signed-off-by: Gunes Bayir <[email protected]>
Change-Id: I0a978407419cac86eb4b0499aa0234b316cf26b6

1 parent 1a189fb · commit 8775379
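
Note on the scenario this commit targets: CpuActivationKernel previously computed its execution window once, inside configure(). If another layer later extends the padding of a shared tensor, that stored window (and potentially the selected micro-kernel) no longer matches the actual buffer layout at run time. A minimal repro sketch, assuming the public NEON runtime API; the shape, padding values, and RELU choice are illustrative, not taken from the commit:

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(7U, 7U, 17U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(7U, 7U, 17U, 2U), 1, DataType::F32));

    NEActivationLayer act;
    act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    // Padding is extended *after* configure(), e.g. by a neighbouring kernel
    // that shares these tensors; the window computed in configure() is now stale.
    src.info()->extend_padding(PaddingSize(0, 1, 0, 0));
    dst.info()->extend_padding(PaddingSize(0, 1, 0, 0));

    src.allocator()->allocate();
    dst.allocator()->allocate();

    act.run(); // with this fix, prepare() detects the padding change on first run
    return 0;
}
```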

File tree

7 files changed, +216 -20 lines changed

src/cpu/kernels/CpuActivationKernel.cpp

Lines changed: 49 additions & 2 deletions

@@ -201,17 +201,19 @@ void init_lut(ActivationLayerInfo::ActivationFunction act_func,
 void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info)
 {
     ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuActivationKernel::configure");
-    ARM_COMPUTE_UNUSED(dst);
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_ERROR_THROW_ON(CpuActivationKernel::validate(src, dst, activation_info));
 
     heuristics::CpuActivationKernelHeuristics heuristics(src, dst, activation_info);
     _heuristics = std::move(heuristics);
 
-    if (dst != nullptr)
+    _src_padding = src->padding();
+    _inplace     = (dst == nullptr);
+    if (!_inplace)
     {
         // dst auto inizialitation if not yet initialized
         auto_init_if_empty(*dst, *src->clone());
+        _dst_padding = dst->padding();
     }
 
     const auto *uk = _heuristics.kernel();
@@ -234,6 +236,7 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac
         activation_info.setLookupTable256(tmp_lut);
     }
 
+    // Kernel specific logic should be mirrored in prepare()
     if (std::string(uk->name) == "sve_fp16_activation_lut")
     {
         // Create info using init list.
@@ -264,6 +267,50 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count
     return _heuristics.mws();
 }
 
+void CpuActivationKernel::prepare(ITensorPack &tensors)
+{
+    const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    ITensor       *dst = tensors.get_tensor(TensorType::ACL_DST);
+
+    const ITensorInfo *src_info = src->info();
+    const ITensorInfo *dst_info = dst->info();
+
+    const bool src_padding_changed = (src_info->padding() != _src_padding);
+    const bool dst_padding_changed = (!_inplace && dst_info->padding() != _dst_padding);
+
+    if (src_padding_changed || dst_padding_changed)
+    {
+        // If padding has changed after configuration, recalculate the heuristics
+        const auto kernel_before_padding_change = _heuristics.kernel();
+        heuristics::CpuActivationKernelHeuristics heuristics(src_info, dst_info, _act_info);
+        _heuristics = std::move(heuristics);
+        const auto kernel_after_padding_change = _heuristics.kernel();
+
+        if (kernel_before_padding_change != kernel_after_padding_change)
+        {
+            // Kernel specific logic in configure must be repeated
+            const auto *uk = _heuristics.kernel();
+            ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
+
+            _name = std::string("CpuActivationKernel").append("/").append(uk->name);
+
+#ifdef __aarch64__
+            if (std::string(uk->name) == "sve_fp16_activation_lut")
+            {
+                // Create info using init list.
+                LUTManager   &lut_manager = LUTManager::get_instance();
+                const LUTInfo info = {_act_info.activation(), _act_info.a(), _act_info.b(), src_info->data_type(),
+                                      src_info->quantization_info().uniform()};
+                _act_info.setLookupTable65536((lut_manager.get_lut_table<LookupTable65536>(info)));
+            }
+#endif // __aarch64__
+        }
+
+        // Re-register the window
+        ICPPKernel::configure(_heuristics.window());
+    }
+}
+
 void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuActivationKernel::run_op");
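
Why prepare() re-derives the heuristics rather than simply re-registering the old window: the window comes from calculate_squashed_or_max_window(), and squashing (collapsing a dense tensor into fewer, longer loop dimensions) is only valid while the buffer is contiguous. A simplified illustration of that dependency; this is a toy model, not ACL's implementation:

```cpp
#include <cstddef>

// With no padding between rows, a whole plane can be walked as one contiguous
// run; once right-padding is added, iteration must stop at each row boundary,
// so a previously "squashed" 1D window becomes invalid.
std::size_t contiguous_run_length(std::size_t width, std::size_t height, std::size_t right_padding)
{
    return (right_padding == 0) ? width * height // squash rows into one 1D run
                                : width;         // padded: one row at a time
}
```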

src/cpu/kernels/CpuActivationKernel.h

Lines changed: 12 additions & 1 deletion

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024 Arm Limited.
+ * Copyright (c) 2017-2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H
 #define ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H
 
+#include "arm_compute/core/Types.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 
 #include "src/core/common/Macros.h"
@@ -86,10 +87,20 @@ class CpuActivationKernel : public ICPPKernel
         return _heuristics.scheduler_hint().split_dimension();
     }
 
+    /** Prepare the activation kernel for execution (Only executed once)
+     *
+     * @param[in] tensors Pack of input and output tensors
+     *
+     */
+    void prepare(ITensorPack &tensors);
+
 private:
     ActivationLayerInfo                       _act_info{};
     std::string                               _name{};
     heuristics::CpuActivationKernelHeuristics _heuristics{};
+    PaddingSize                               _src_padding{};
+    PaddingSize                               _dst_padding{};
+    bool                                      _inplace{};
 };
 } // namespace kernels
 } // namespace cpu

src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp

Lines changed: 3 additions & 4 deletions

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2024 Arm Limited.
+ * Copyright (c) 2017-2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -318,8 +318,6 @@ CpuActivationKernelHeuristics::CpuActivationKernelHeuristics(const ITensorInfo
                                                              const ITensorInfo         *dst,
                                                              const ActivationLayerInfo &activation_info)
 {
-    ARM_COMPUTE_UNUSED(dst);
-
     // Set kernel
     const DataType dtype = src->data_type();
     ActivationDataTypeISASelectorData selector{dtype, CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(),
@@ -329,7 +327,8 @@ CpuActivationKernelHeuristics::CpuActivationKernelHeuristics(const ITensorInfo
 
     // Set window and scheduling hint
     int split_dim;
-    std::tie(_window, split_dim) = calculate_squashed_or_max_window(*src);
+    std::tie(_window, split_dim) =
+        dst == nullptr ? calculate_squashed_or_max_window(*src) : calculate_squashed_or_max_window(*src, *dst);
 
     // Collapse window with SME kernels in Y-Dim
     if (std::string(_kernel->name) == "sme2_fp32_logistic")
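
With this change the window is derived from src alone only for the in-place case (dst == nullptr); otherwise it must be valid for both tensors, whose paddings can differ. A toy sketch of the underlying rule, with assumed semantics rather than ACL's real window logic:

```cpp
// Squashing is only legal when every tensor involved is unpadded: a padded dst
// forbids it even if src is dense, and vice versa.
bool can_squash_window(bool src_has_padding, bool in_place, bool dst_has_padding)
{
    const bool dst_ok = in_place || !dst_has_padding;
    return !src_has_padding && dst_ok;
}
```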

src/cpu/operators/CpuActivation.cpp

Lines changed: 12 additions & 2 deletions

@@ -40,7 +40,9 @@ void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, con
 {
     ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuActivation::configure");
     ARM_COMPUTE_LOG_PARAMS(input, output, activation_info);
-    auto k = std::make_unique<kernels::CpuActivationKernel>();
+
+    _is_prepared = false;
+    auto k       = std::make_unique<kernels::CpuActivationKernel>();
     k->configure(input, output, activation_info);
     _kernel = std::move(k);
 }
@@ -56,7 +58,15 @@ void CpuActivation::run(ITensorPack &tensors)
 {
     ARM_COMPUTE_TRACE_EVENT(ARM_COMPUTE_PROF_CAT_CPU, ARM_COMPUTE_PROF_LVL_CPU, "CpuActivation::run");
     ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-    auto split_dimension = static_cast<kernels::CpuActivationKernel *>(_kernel.get())->get_split_dimension_hint();
+
+    auto kernel_casted = static_cast<kernels::CpuActivationKernel *>(_kernel.get());
+    if (!_is_prepared)
+    {
+        kernel_casted->prepare(tensors);
+        _is_prepared = true;
+    }
+
+    const size_t split_dimension = kernel_casted->get_split_dimension_hint();
     NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
 }
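
The run() change is a lazy, once-per-configure prepare step: validation against final tensor metadata happens on the first run() only, and configure() resets the latch so a reconfigured operator prepares again. A minimal generic sketch of the idiom, with names assumed and not tied to ACL types:

```cpp
class LazilyPreparedOp
{
public:
    void configure()
    {
        _is_prepared = false; // any reconfigure forces a fresh prepare on next run
        // ... record configure-time tensor metadata here ...
    }

    void run()
    {
        if (!_is_prepared)
        {
            prepare();           // re-check metadata (e.g. padding) exactly once
            _is_prepared = true; // later runs skip straight to scheduling
        }
        // ... schedule the kernel ...
    }

private:
    void prepare()
    {
        // Recompute whatever configure()-time state may have gone stale.
    }

    bool _is_prepared{false};
};
```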

src/cpu/operators/CpuActivation.h

Lines changed: 7 additions & 4 deletions

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023 Arm Limited.
+ * Copyright (c) 2021-2023, 2025 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_ACTIVATION_H
-#define ARM_COMPUTE_CPU_ACTIVATION_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUACTIVATION_H
+#define ACL_SRC_CPU_OPERATORS_CPUACTIVATION_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 
@@ -53,7 +53,10 @@ class CpuActivation : public ICpuOperator
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
+
+private:
+    bool _is_prepared{};
 };
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUACTIVATION_H

tests/validation/NEON/ActivationLayer.cpp

Lines changed: 96 additions & 4 deletions

@@ -50,6 +50,7 @@ namespace test
 {
 namespace validation
 {
+using framework::dataset::make;
 namespace
 {
 
@@ -62,6 +63,11 @@ const auto NeonActivationFunctionsDataset = concat(datasets::ActivationFunctions
 
 /** Input data sets. */
 const auto ActivationDataset = combine(combine(framework::dataset::make("InPlace", { false, true }), NeonActivationFunctionsDataset), framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
+const auto ActivationDatasetForPaddingAfterConfigure = combine(
+    make("InPlace", { false, true }),
+    NeonActivationFunctionsDataset,
+    make("AlphaBeta", { 0.5f })
+);
 
 template <typename T, ARM_COMPUTE_REQUIRES_TA(arm_compute::utils::traits::is_floating_point<T>::value)>
 void test_float_sqrt_boundary_value()
@@ -181,6 +187,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
 
 template <typename T>
 using NEActivationLayerFixture = ActivationValidationFixture<Tensor, Accessor, NEActivationLayer, T>;
+template <typename T>
+using NEActivationLayerWithPaddingFixture = ActivationWithPaddingValidationFixture<Tensor, Accessor, NEActivationLayer, T>;
 
 TEST_SUITE(Float)
 #ifdef ARM_COMPUTE_ENABLE_FP16
@@ -204,6 +212,25 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<half>, framework::Data
         framework::ARM_COMPUTE_PRINT_INFO();
     }
 }
+
+FIXTURE_DATA_TEST_CASE(PaddingAfterConfigure, NEActivationLayerWithPaddingFixture<half>, framework::DatasetMode::ALL,
+    combine(
+        make("Shape", TensorShape{ 7U, 7U, 17U, 2U }),
+        ActivationDatasetForPaddingAfterConfigure,
+        make("DataType", DataType::F16))
+)
+{
+    if(CPUInfo::get().has_fp16())
+    {
+        // Validate output
+        validate(Accessor(_target), _reference, helper::relative_tolerance(_data_type, _function), 0.f, helper::absolute_tolerance(_data_type, _function));
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. Test SKIPPED.");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
 TEST_SUITE_END() // FP16
 #endif /* ARM_COMPUTE_ENABLE_FP16 */
 
@@ -212,28 +239,45 @@ TEST_CASE(SqrtBoundaryValue, framework::DatasetMode::ALL)
 {
     test_float_sqrt_boundary_value<float>();
 }
-FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ActivationDataset), framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ActivationDataset), make("DataType",
                                                                                                        DataType::F32)))
 
 {
     // Validate output
     validate(Accessor(_target), _reference, helper::relative_tolerance(_data_type, _function), 0.f, helper::absolute_tolerance(_data_type, _function));
 }
+
+FIXTURE_DATA_TEST_CASE(PaddingAfterConfigure, NEActivationLayerWithPaddingFixture<float>, framework::DatasetMode::ALL,
+    combine(
+        make("Shape", TensorShape{ 7U, 7U, 17U, 2U }),
+        ActivationDatasetForPaddingAfterConfigure,
+        make("DataType", DataType::F32))
+)
+{
+    validate(Accessor(_target), _reference, helper::relative_tolerance(_data_type, _function), 0.f, helper::absolute_tolerance(_data_type, _function));
+}
 // Run only on SME Devices to stress Logistic SME kernel
 #ifdef ARM_COMPUTE_ENABLE_SME2
 TEST_SUITE(SME)
-const auto LogsisticDataset = combine(framework::dataset::make("InPlace", { false }), framework::dataset::make("Function", ActivationLayerInfo::ActivationFunction::LOGISTIC), framework::dataset::make("AlphaBeta", { 1.f }));
-FIXTURE_DATA_TEST_CASE(RunLogistic5D, NEActivationLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::Tiny5dShapes(), LogsisticDataset, framework::dataset::make("DataType",
+const auto LogisticDataset = combine(make("InPlace", { false }), make("Function", ActivationLayerInfo::ActivationFunction::LOGISTIC), make("AlphaBeta", { 1.f }));
+FIXTURE_DATA_TEST_CASE(RunLogistic5D, NEActivationLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::Tiny5dShapes(), LogisticDataset, make("DataType",
                                                                                                        DataType::F32)))
 
 {
     // Validate output
     validate(Accessor(_target), _reference, helper::relative_tolerance(_data_type, _function), 0.f, helper::absolute_tolerance(_data_type, _function));
 }
 
-FIXTURE_DATA_TEST_CASE(RunLogisticSME, NEActivationLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::LogisticSMEStressShapesFp32(), LogsisticDataset, framework::dataset::make("DataType",
+FIXTURE_DATA_TEST_CASE(RunLogisticSME, NEActivationLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::LogisticSMEStressShapesFp32(), LogisticDataset, make("DataType",
                                                                                                        DataType::F32)))
 
+{
+    // Validate output
+    validate(Accessor(_target), _reference, helper::relative_tolerance(_data_type, _function), 0.f, helper::absolute_tolerance(_data_type, _function));
+}
+FIXTURE_DATA_TEST_CASE(PaddingAfterConfigure, NEActivationLayerWithPaddingFixture<float>, framework::DatasetMode::ALL,
+    combine(datasets::LogisticSMEStressShapesFp32(), LogisticDataset, make("DataType", DataType::F32)))
+
 {
     // Validate output
     validate(Accessor(_target), _reference, helper::relative_tolerance(_data_type, _function), 0.f, helper::absolute_tolerance(_data_type, _function));
@@ -245,6 +289,8 @@ TEST_SUITE_END() // Float
 
 template <typename T>
 using NEActivationLayerQuantizedFixture = ActivationValidationQuantizedFixture<Tensor, Accessor, NEActivationLayer, T>;
+template <typename T>
+using NEActivationLayerWithPaddingQuantizedFixture = ActivationWithPaddingValidationQuantizedFixture<Tensor, Accessor, NEActivationLayer, T>;
 
 /** Input data sets. */
 const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationFunction",
@@ -263,6 +309,13 @@ const auto QuantizedActivationFunctionsDataset = framework::dataset::make("Activ
 const auto QuantizedActivationDataset = combine(combine(framework::dataset::make("InPlace", { false }),
                                                 concat(QuantizedActivationFunctionsDataset, framework::dataset::make("ActivationFunction", ActivationLayerInfo::ActivationFunction::HARD_SWISH))),
                                                 framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
+const auto QuantizedActivationDatasetForPaddingAfterConfigure = combine(
+    make("InPlace", { false }),
+    concat(QuantizedActivationFunctionsDataset,
+        make("ActivationFunction", ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+    ),
+    make("AlphaBeta", { 0.5f})
+);
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
@@ -274,6 +327,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture<uint8_t>, fra
     // Validate output
     validate(Accessor(_target), _reference, helper::tolerance_qasymm8(_function));
 }
+FIXTURE_DATA_TEST_CASE(PaddingAfterConfigure, NEActivationLayerWithPaddingQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
+    combine(
+        make("Shape", TensorShape{ 7U, 7U, 17U, 2U }),
+        QuantizedActivationDatasetForPaddingAfterConfigure,
+        make("DataType", DataType::QASYMM8),
+        make("QuantizationInfo", { QuantizationInfo(0.1f, 128.0f) }
+    )))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, helper::tolerance_qasymm8(_function));
+}
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
@@ -285,6 +349,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture<int8_t>, fram
     // Validate output
     validate(Accessor(_target), _reference, helper::tolerance_qasymm8(_function));
 }
+FIXTURE_DATA_TEST_CASE(PaddingAfterConfigure, NEActivationLayerWithPaddingQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
+    combine(
+        make("Shape", TensorShape{ 7U, 7U, 17U, 2U }),
+        QuantizedActivationDatasetForPaddingAfterConfigure,
+        make("DataType", DataType::QASYMM8_SIGNED),
+        make("QuantizationInfo", { QuantizationInfo(0.5f, 10.0f) }
+    )))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, helper::tolerance_qasymm8(_function));
+}
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 /** Input data sets. */
@@ -297,6 +372,12 @@ const auto Int16QuantizedActivationFunctionsDataset = framework::dataset::make("
 const auto Int16QuantizedActivationDataset = combine(combine(framework::dataset::make("InPlace", { false }), Int16QuantizedActivationFunctionsDataset),
                                                      framework::dataset::make("AlphaBeta", { 0.5f, 1.f }));
 
+const auto Int16QuantizedActivationDatasetForPaddingAfterConfigure = combine(
+    make("InPlace", { false }),
+    Int16QuantizedActivationFunctionsDataset,
+    make("AlphaBeta", { 0.5f })
+);
+
 TEST_SUITE(QSYMM16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), Int16QuantizedActivationDataset),
                                                                                                                   framework::dataset::make("DataType",
@@ -306,6 +387,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEActivationLayerQuantizedFixture<int16_t>, fra
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
 }
+FIXTURE_DATA_TEST_CASE(PaddingAfterConfigure, NEActivationLayerWithPaddingQuantizedFixture<int16_t>, framework::DatasetMode::ALL,
+    combine(
+        make("Shape", TensorShape{ 7U, 7U, 17U, 2U }),
+        Int16QuantizedActivationDatasetForPaddingAfterConfigure,
+        make("DataType", DataType::QSYMM16),
+        make("QuantizationInfo", { QuantizationInfo(1.f / 32768.f, 0.f) }))
+)
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qsymm16);
+}
 TEST_SUITE_END() // QSYMM16
 TEST_SUITE_END() // Quantized
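
The PaddingAfterConfigure cases rely on ActivationWithPaddingValidationFixture and ActivationWithPaddingValidationQuantizedFixture, which are referenced here but not part of this diff. Presumably the fixture's target path configures the layer first and only then extends the tensor padding; a plausible core of such a fixture, with all names, includes, and padding values assumed:

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"

using namespace arm_compute;

// Hypothetical sketch of the fixture's target computation (not from this diff).
template <typename TensorType, typename FunctionType>
void run_with_late_padding(TensorType &src, TensorType &dst, const TensorShape &shape, DataType dt, const ActivationLayerInfo &info)
{
    src.allocator()->init(TensorInfo(shape, 1, dt));
    dst.allocator()->init(TensorInfo(shape, 1, dt));

    FunctionType act;
    act.configure(&src, &dst, info);

    // The step under test: padding appears only after configure().
    src.info()->extend_padding(PaddingSize(0, 1, 1, 0));
    dst.info()->extend_padding(PaddingSize(0, 1, 1, 0));

    src.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src with reference input, then:
    act.run(); // exercises CpuActivationKernel::prepare()'s padding check
}
```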
