Skip to content

Commit a8fdd2e

Browse files
committed
[SDAG][AMDGPU] Allow opting in to OOB-generating PTRADD transforms
This PR adds a TargetLowering hook, canTransformPtrArithOutOfBounds, that targets can use to allow transformations to introduce out-of-bounds pointer arithmetic. It also moves two such transformations from the AMDGPU-specific DAG combines to the generic DAGCombiner. This is motivated by target features like AArch64's checked pointer arithmetic, CPA, which does not tolerate the introduction of out-of-bounds pointer arithmetic.
1 parent 67b8a8d commit a8fdd2e

File tree

4 files changed

+94
-100
lines changed

4 files changed

+94
-100
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3498,6 +3498,13 @@ class LLVM_ABI TargetLoweringBase {
34983498
return false;
34993499
}
35003500

3501+
/// True if the target allows transformations of in-bounds pointer
3502+
/// arithmetic that cause out-of-bounds intermediate results.
3503+
virtual bool canTransformPtrArithOutOfBounds(const Function &F,
3504+
EVT PtrVT) const {
3505+
return false;
3506+
}
3507+
35013508
/// Does this target support complex deinterleaving
35023509
virtual bool isComplexDeinterleavingSupported() const { return false; }
35033510

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 74 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -2689,59 +2689,82 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
26892689
if (PtrVT == IntVT && isNullConstant(N0))
26902690
return N1;
26912691

2692-
if (N0.getOpcode() != ISD::PTRADD ||
2693-
reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1))
2694-
return SDValue();
2695-
2696-
SDValue X = N0.getOperand(0);
2697-
SDValue Y = N0.getOperand(1);
2698-
SDValue Z = N1;
2699-
bool N0OneUse = N0.hasOneUse();
2700-
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
2701-
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
2702-
2703-
// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
2704-
// * y is a constant and (ptradd x, y) has one use; or
2705-
// * y and z are both constants.
2706-
if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
2707-
// If both additions in the original were NUW, the new ones are as well.
2708-
SDNodeFlags Flags =
2709-
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
2710-
SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
2711-
AddToWorklist(Add.getNode());
2712-
return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
2692+
if (N0.getOpcode() == ISD::PTRADD &&
2693+
!reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) {
2694+
SDValue X = N0.getOperand(0);
2695+
SDValue Y = N0.getOperand(1);
2696+
SDValue Z = N1;
2697+
bool N0OneUse = N0.hasOneUse();
2698+
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
2699+
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
2700+
2701+
// (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
2702+
// * y is a constant and (ptradd x, y) has one use; or
2703+
// * y and z are both constants.
2704+
if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
2705+
// If both additions in the original were NUW, the new ones are as well.
2706+
SDNodeFlags Flags =
2707+
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
2708+
SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
2709+
AddToWorklist(Add.getNode());
2710+
return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
2711+
}
2712+
}
2713+
2714+
// The following combines can turn in-bounds pointer arithmetic out of bounds.
2715+
// That is problematic for settings like AArch64's CPA, which checks that
2716+
// intermediate results of pointer arithmetic remain in bounds. The target
2717+
// therefore needs to opt-in to enable them.
2718+
if (!TLI.canTransformPtrArithOutOfBounds(
2719+
DAG.getMachineFunction().getFunction(), PtrVT))
2720+
return SDValue();
2721+
2722+
if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
2723+
// Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
2724+
// global address GA and constant c, such that c can be folded into GA.
2725+
SDValue GAValue = N0.getOperand(0);
2726+
if (const GlobalAddressSDNode *GA =
2727+
dyn_cast<GlobalAddressSDNode>(GAValue)) {
2728+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2729+
if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
2730+
// If both additions in the original were NUW, reassociation preserves
2731+
// that.
2732+
SDNodeFlags Flags =
2733+
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
2734+
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
2735+
AddToWorklist(Inner.getNode());
2736+
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
2737+
}
2738+
}
27132739
}
27142740

2715-
// TODO: There is another possible fold here that was proven useful.
2716-
// It would be this:
2717-
//
2718-
// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if:
2719-
// * (ptradd x, y) has one use; and
2720-
// * y is a constant; and
2721-
// * z is not a constant.
2722-
//
2723-
// In some cases, specifically in AArch64's FEAT_CPA, it exposes the
2724-
// opportunity to select more complex instructions such as SUBPT and
2725-
// MSUBPT. However, a hypothetical corner case has been found that we could
2726-
// not avoid. Consider this (pseudo-POSIX C):
2727-
//
2728-
// char *foo(char *x, int z) {return (x + LARGE_CONSTANT) + z;}
2729-
// char *p = mmap(LARGE_CONSTANT);
2730-
// char *q = foo(p, -LARGE_CONSTANT);
2731-
//
2732-
// Then x + LARGE_CONSTANT is one-past-the-end, so valid, and a
2733-
// further + z takes it back to the start of the mapping, so valid,
2734-
// regardless of the address mmap gave back. However, if mmap gives you an
2735-
// address < LARGE_CONSTANT (ignoring high bits), x - LARGE_CONSTANT will
2736-
// borrow from the high bits (with the subsequent + z carrying back into
2737-
// the high bits to give you a well-defined pointer) and thus trip
2738-
// FEAT_CPA's pointer corruption checks.
2739-
//
2740-
// We leave this fold as an opportunity for future work, addressing the
2741-
// corner case for FEAT_CPA, as well as reconciling the solution with the
2742-
// more general application of pointer arithmetic in other future targets.
2743-
// For now each architecture that wants this fold must implement it in the
2744-
// target-specific code (see e.g. SITargetLowering::performPtrAddCombine)
2741+
if (N1.getOpcode() == ISD::ADD && N1.hasOneUse()) {
2742+
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
2743+
// y is not, and (add y, z) is used only once.
2744+
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
2745+
// z is not, and (add y, z) is used only once.
2746+
// The goal is to move constant offsets to the outermost ptradd, to create
2747+
// more opportunities to fold offsets into memory instructions.
2748+
// Together with the another combine above, this also implements
2749+
// (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
2750+
SDValue X = N0;
2751+
SDValue Y = N1.getOperand(0);
2752+
SDValue Z = N1.getOperand(1);
2753+
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
2754+
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
2755+
2756+
// If both additions in the original were NUW, reassociation preserves that.
2757+
SDNodeFlags ReassocFlags =
2758+
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
2759+
2760+
if (ZIsConstant != YIsConstant) {
2761+
if (YIsConstant)
2762+
std::swap(Y, Z);
2763+
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
2764+
AddToWorklist(Inner.getNode());
2765+
return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
2766+
}
2767+
}
27452768

27462769
return SDValue();
27472770
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 10 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -10817,6 +10817,11 @@ bool SITargetLowering::shouldPreservePtrArith(const Function &F,
1081710817
return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
1081810818
}
1081910819

10820+
bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
10821+
EVT PtrVT) const {
10822+
return true;
10823+
}
10824+
1082010825
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
1082110826
// offset (the offset that is included in bounds checking and swizzling, to be
1082210827
// split between the instruction's voffset and immoffset fields) and soffset
@@ -15365,64 +15370,17 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
1536515370
return Folded;
1536615371
}
1536715372

15368-
if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
15369-
// Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with
15370-
// global address GA and constant c, such that c can be folded into GA.
15371-
SDValue GAValue = N0.getOperand(0);
15372-
if (const GlobalAddressSDNode *GA =
15373-
dyn_cast<GlobalAddressSDNode>(GAValue)) {
15374-
if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) {
15375-
// If both additions in the original were NUW, reassociation preserves
15376-
// that.
15377-
SDNodeFlags Flags =
15378-
(N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15379-
SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
15380-
DCI.AddToWorklist(Inner.getNode());
15381-
return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
15382-
}
15383-
}
15384-
}
15385-
1538615373
if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
1538715374
return SDValue();
1538815375

15389-
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
15390-
// y is not, and (add y, z) is used only once.
15391-
// (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
15392-
// z is not, and (add y, z) is used only once.
15393-
// The goal is to move constant offsets to the outermost ptradd, to create
15394-
// more opportunities to fold offsets into memory instructions.
15395-
// Together with the generic combines in DAGCombiner.cpp, this also
15396-
// implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
15397-
//
15398-
// This transform is here instead of in the general DAGCombiner as it can
15399-
// turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
15400-
// AArch64's CPA.
1540115376
SDValue X = N0;
1540215377
SDValue Y = N1.getOperand(0);
1540315378
SDValue Z = N1.getOperand(1);
1540415379
bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
1540515380
bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
1540615381

15407-
// If both additions in the original were NUW, reassociation preserves that.
15408-
SDNodeFlags ReassocFlags =
15409-
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
15410-
15411-
if (ZIsConstant != YIsConstant) {
15412-
if (YIsConstant)
15413-
std::swap(Y, Z);
15414-
SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
15415-
DCI.AddToWorklist(Inner.getNode());
15416-
return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
15417-
}
15418-
15419-
// If one of Y and Z is constant, they have been handled above. If both were
15420-
// constant, the addition would have been folded in SelectionDAG::getNode
15421-
// already. This ensures that the generic DAG combines won't undo the
15422-
// following reassociation.
15423-
assert(!YIsConstant && !ZIsConstant);
15424-
15425-
if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
15382+
if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
15383+
Y->isDivergent() != Z->isDivergent()) {
1542615384
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
1542715385
// y are uniform and z isn't.
1542815386
// Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
@@ -15433,6 +15391,9 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
1543315391
// reassociate.
1543415392
if (Y->isDivergent())
1543515393
std::swap(Y, Z);
15394+
// If both additions in the original were NUW, reassociation preserves that.
15395+
SDNodeFlags ReassocFlags =
15396+
(N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
1543615397
SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
1543715398
DCI.AddToWorklist(UniformInner.getNode());
1543815399
return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);

llvm/lib/Target/AMDGPU/SIISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
265265

266266
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
267267

268+
bool canTransformPtrArithOutOfBounds(const Function &F,
269+
EVT PtrVT) const override;
270+
268271
private:
269272
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
270273
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]

0 commit comments

Comments
 (0)