Skip to content

Commit ece76c0

Browse files
ekochetkigcbot
authored and committed
Make AllocaSinking more aggressive for high register pressure kernels
When private memory is allocated on stack (e.g. -O0), rematerialization does not handle address arithmetic. This commit makes alloca sinking more aggressive during PrivateMemoryResolution.
1 parent ddc7a1d commit ece76c0

File tree

2 files changed

+72
-5
lines changed

2 files changed

+72
-5
lines changed

IGC/Compiler/Optimizer/OpenCLPasses/PrivateMemory/PrivateMemoryResolution.cpp

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -442,7 +442,7 @@ bool PrivateMemoryResolution::runOnModule(llvm::Module &M) {
442442
// [6] load j
443443
// [7] bar(j)
444444
//
445-
static void sinkAllocas(SmallVectorImpl<AllocaInst *> &Allocas) {
445+
static void sinkAllocas(SmallVectorImpl<AllocaInst *> &Allocas, bool highAllocaRAPressure) {
446446
IGC_ASSERT(false == Allocas.empty());
447447
DominatorTree DT;
448448
llvm::LoopInfoBase<llvm::BasicBlock, llvm::Loop> LI;
@@ -499,9 +499,12 @@ static void sinkAllocas(SmallVectorImpl<AllocaInst *> &Allocas) {
499499
}
500500

501501
// Find the nearest dominator outside loops to prevent multiple allocations
502-
BasicBlock *CurBB = AI->getParent();
503-
while (DomBB && DomBB != CurBB && LI.getLoopFor(DomBB) != nullptr) {
504-
DomBB = DT.getNode(DomBB)->getIDom()->getBlock();
502+
// When there are too many allocas increasing register pressure, skip this optimization
503+
if (!highAllocaRAPressure) {
504+
BasicBlock *CurBB = AI->getParent();
505+
while (DomBB && DomBB != CurBB && LI.getLoopFor(DomBB) != nullptr) {
506+
DomBB = DT.getNode(DomBB)->getIDom()->getBlock();
507+
}
505508
}
506509

507510
if (DomBB) {
@@ -861,7 +864,12 @@ bool PrivateMemoryResolution::resolveAllocaInstructions(bool privateOnStack) {
861864
if (Ctx.m_instrTypes.numAllocaInsts > IGC_GET_FLAG_VALUE(AllocaRAPressureThreshold)) {
862865
sinkAllocaSingleUse(allocaInsts);
863866
}
864-
sinkAllocas(allocaInsts);
867+
868+
// If privateOnStack is set, address rematerialization will not work for allocas; in this case
869+
// we need to sink aggressively, even into loops
870+
// TODO: combine this and the heuristic above into a single one that depends on register pressure rather than on the number of allocas
871+
bool highAllocaRAPressure = allocaInsts.size() > IGC_GET_FLAG_VALUE(AllocaRAPressureThreshold) && privateOnStack;
872+
sinkAllocas(allocaInsts, highAllocaRAPressure);
865873

866874
// Each AllocaInst creates a buffer and all buffers are put together
867875
// sequentially like the following:
Lines changed: 59 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,59 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2025 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
;
9+
; REQUIRES: regkeys
10+
; RUN: igc_opt --opaque-pointers --regkey AllocaRAPressureThreshold=2 --igc-private-mem-resolution -S %s | FileCheck %s
11+
12+
; CHECK: for.cond:
13+
; CHECK: for.body:
14+
; CHECK: call{{.*}}llvm.genx.GenISA.StackAlloca
15+
; CHECK: br label %for.cond
16+
17+
18+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-n8:16:32"
19+
target triple = "spir64-unknown-unknown"
20+
21+
define spir_kernel void @call_func_in_loop() #0 {
22+
entry:
23+
%out.addr = alloca ptr addrspace(1), align 8
24+
%gid = alloca i32, align 4
25+
%j = alloca i32, align 4
26+
br label %for.cond
27+
28+
for.cond: ; preds = %for.body, %entry
29+
br label %for.body
30+
31+
for.body: ; preds = %for.cond
32+
%0 = load i32, ptr %j, align 4
33+
br label %for.cond
34+
}
35+
36+
declare spir_func void @add_one()
37+
38+
attributes #0 = { "visaStackCall" }
39+
40+
!igc.functions = !{!0, !4}
41+
42+
!0 = !{ptr @add_one, !1}
43+
!1 = !{!2, !3}
44+
!2 = !{!"function_type", i32 2}
45+
!3 = !{!"implicit_arg_desc"}
46+
!4 = !{ptr @call_func_in_loop, !5}
47+
!5 = !{!6, !7}
48+
!6 = !{!"function_type", i32 0}
49+
!7 = !{!"implicit_arg_desc", !8, !9, !10, !11, !12, !13, !14, !15, !17}
50+
!8 = !{i32 0}
51+
!9 = !{i32 2}
52+
!10 = !{i32 7}
53+
!11 = !{i32 8}
54+
!12 = !{i32 9}
55+
!13 = !{i32 10}
56+
!14 = !{i32 13}
57+
!15 = !{i32 15, !16}
58+
!16 = !{!"explicit_arg_num", i32 0}
59+
!17 = !{i32 59, !16}

0 commit comments

Comments
 (0)