diff --git a/README.md b/README.md index c6c1eb2c..f2107f82 100755 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ The IRON Python API for Ryzen™ AI NPUs is described in the following paper: | [Reduction]() | Reduction | bfloat16 | | | 🟡 | | | [Dequant](./aie_kernels/generic/expand.cc) | Dequant Q4NX from [AWQ](https://github.com/mit-han-lab/llm-awq) to bfloat16 | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/dequant/](./iron/operators/dequant/) | | [RELU](./aie_kernels/aie2/relu.cc) | RELU | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/relu/](./iron/operators/relu/) | -| [Leaky RELU](./aie_kernels/aie2p/leaky_relu.cc) (WIP) | Leaky RELU kernel | bfloat16 | | ✓ | ⚪ | [iron/operators/leaky_relu/](./iron/operators/leaky_relu/) | +| [Leaky RELU](./aie_kernels/aie2/leaky_relu.cc) | Leaky RELU | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/leaky_relu/](./iron/operators/leaky_relu/) | | [GELU](./aie_kernels/aie2/gelu.cc) | GELU | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/gelu/](./iron/operators/gelu/) | | [LayerNorm](./aie_kernels/aie2/layer_norm.cc) | LayerNorm | bfloat16 | ✓ | ✓ | 🟢 | [iron/operators/layer_norm/](./iron/operators/layer_norm/) | | [Convolution]() | Convolution | bfloat16 | | | 🟡 | | diff --git a/aie_kernels/aie2/leaky_relu.cc b/aie_kernels/aie2/leaky_relu.cc new file mode 100644 index 00000000..1c8ec05f --- /dev/null +++ b/aie_kernels/aie2/leaky_relu.cc @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "../aie_kernel_utils.h" + +#include <aie_api/aie.hpp> +#include <stdint.h> + +using namespace aie; + +void leaky_relu_vectorized_bf16(bfloat16 *restrict a, + bfloat16 *restrict c, + const int32_t vector_size, + const bfloat16 alpha) +{ + event0(); + + auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)a); + auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)c); + + // Broadcast alpha to a vector + vector<bfloat16, 16> alpha_vec = aie::broadcast<bfloat16, 16>(alpha); + + AIE_PREPARE_FOR_PIPELINING + AIE_LOOP_MIN_ITERATION_COUNT(16) + for (int i = 0; i < vector_size; i += 16) { + vector<bfloat16, 16> input = *it_in++; + // Leaky RELU: f(x) = max(x, alpha * x) where alpha is typically 0.01 + // When alpha < 1: if x > 0 then x, else alpha * x + vector<bfloat16, 16> alpha_times_input = aie::mul(input, alpha_vec); + vector<bfloat16, 16> output = aie::max(input, alpha_times_input); + *it_out++ = output; + } + + event1(); + + return; +} + +extern "C" { + +void leaky_relu_bf16(bfloat16 *restrict input, bfloat16 *restrict output, int input_size, bfloat16 alpha) +{ + leaky_relu_vectorized_bf16(input, output, input_size, alpha); +} + +} // extern "C" diff --git a/iron/operators/leaky_relu/design.py b/iron/operators/leaky_relu/design.py index 278bd046..825986c4 100644 --- a/iron/operators/leaky_relu/design.py +++ b/iron/operators/leaky_relu/design.py @@ -50,7 +50,7 @@ def my_leaky_relu( leaky_relu_fcn = Kernel( "leaky_relu_bf16", "leaky_relu.o", - [line_type, line_type, np.int32, np.dtype[xfr_dtype]], + [line_type, line_type, np.int32, xfr_dtype], ) # Task for the core to perform diff --git a/iron/operators/leaky_relu/test.py b/iron/operators/leaky_relu/test.py index 77bc9db9..851a114b 100755 --- a/iron/operators/leaky_relu/test.py +++ b/iron/operators/leaky_relu/test.py @@ -25,7 +25,6 @@ def get_params(): @pytest.mark.parametrize( "input_length,num_aie_columns,num_channels,tile_size,alpha", get_params() ) -@pytest.mark.skip(reason="Leaky ReLU is currently broken (#36)") @pytest.mark.metrics( Latency=r"Latency 
\(us\): (?P<latency>[\d\.]+)", Bandwidth=r"Effective Bandwidth: (?P<bandwidth>[\d\.e\+-]+) GB/s",