[Reassociate] Compute value ranks iteratively#204952
Conversation
ReassociatePass::getRank recursively walks operand def-use chains before memoizing ranks. For a very deep acyclic chain whose tail feeds a reassociable operation, this can recurse once per chain element and overflow the native stack. Compute ranks with an explicit post-order worklist instead. The worklist preserves the existing rank calculation and memoization behavior while bounding native stack use. Add a regression test that generates a 65536-deep select chain and runs reassociate under an 8 MB stack.
|
@llvm/pr-subscribers-llvm-transforms Author: Hao Ren (nvidia-moomoo) ChangesReassociatePass::getRank recursively walks operand def-use chains before memoizing ranks. For a very deep acyclic chain whose tail feeds a reassociable operation, this can recurse once per chain element and overflow the native stack. Compute ranks with an explicit post-order worklist instead. The worklist preserves the existing rank calculation and memoization behavior while bounding native stack use. Add a regression test that generates a 65536-deep select chain and runs reassociate under an 8 MB stack. Full diff: https://github.com/llvm/llvm-project/pull/204952.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 1ef4d02127ed2..b0c5b7d3f8f32 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -212,24 +212,78 @@ unsigned ReassociatePass::getRank(Value *V) {
if (unsigned Rank = ValueRankMap[I])
return Rank; // Rank already known?
- // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
- // we can reassociate expressions for code motion! Since we do not recurse
- // for PHI nodes, we cannot have infinite recursion here, because there
- // cannot be loops in the value graph that do not go through PHI nodes.
- unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
- for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i)
- Rank = std::max(Rank, getRank(I->getOperand(i)));
-
- // If this is a 'not' or 'neg' instruction, do not count it for rank. This
- // assures us that X and ~X will have the same rank.
- if (!match(I, m_Not(m_Value())) && !match(I, m_Neg(m_Value())) &&
- !match(I, m_FNeg(m_Value())))
- ++Rank;
-
- LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank
- << "\n");
-
- return ValueRankMap[I] = Rank;
+ // Return 1+MAX(rank(LHS), rank(RHS)) for expressions so we can reassociate
+ // expressions for code motion. Use an explicit worklist rather than native
+ // recursion so long acyclic use-def chains do not overflow the stack.
+ struct RankWorkItem {
+ Instruction *I;
+ unsigned OpNo;
+ unsigned Rank;
+ unsigned MaxRank;
+ };
+
+ auto GetLeafRank = [this](Value *V) {
+ return isa<Argument>(V) ? ValueRankMap[V] : 0;
+ };
+
+ SmallVector<RankWorkItem, 16> Worklist;
+ Worklist.push_back(RankWorkItem{I, 0, 0, RankMap[I->getParent()]});
+
+ auto CompleteRank = [&](unsigned Rank) {
+ // A rank has been computed for the current work item. Pop it and, if a
+ // parent is waiting, fold that rank into the parent before resuming there.
+ Worklist.pop_back();
+ if (Worklist.empty())
+ return true;
+
+ RankWorkItem &Parent = Worklist.back();
+ Parent.Rank = std::max(Parent.Rank, Rank);
+ ++Parent.OpNo;
+ return false;
+ };
+
+ do {
+ RankWorkItem &Item = Worklist.back();
+ Instruction *CurI = Item.I;
+
+ if (unsigned Rank = ValueRankMap[CurI]) {
+ if (CompleteRank(Rank))
+ return Rank;
+ continue;
+ }
+
+ if (Item.OpNo != CurI->getNumOperands() && Item.Rank != Item.MaxRank) {
+ Value *Op = CurI->getOperand(Item.OpNo);
+ if (Instruction *OpI = dyn_cast<Instruction>(Op)) {
+ if (unsigned OpRank = ValueRankMap[OpI]) {
+ Item.Rank = std::max(Item.Rank, OpRank);
+ ++Item.OpNo;
+ } else {
+ Worklist.push_back(
+ RankWorkItem{OpI, 0, 0, RankMap[OpI->getParent()]});
+ }
+ } else {
+ Item.Rank = std::max(Item.Rank, GetLeafRank(Op));
+ ++Item.OpNo;
+ }
+ continue;
+ }
+
+ unsigned Rank = Item.Rank;
+
+ // If this is a 'not' or 'neg' instruction, do not count it for rank. This
+ // assures us that X and ~X will have the same rank.
+ if (!match(CurI, m_Not(m_Value())) && !match(CurI, m_Neg(m_Value())) &&
+ !match(CurI, m_FNeg(m_Value())))
+ ++Rank;
+
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << CurI->getName() << "] = "
+ << Rank << "\n");
+
+ ValueRankMap[CurI] = Rank;
+ if (CompleteRank(Rank))
+ return Rank;
+ } while (true);
}
// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
diff --git a/llvm/test/Transforms/Reassociate/Inputs/deep-select-chain.py b/llvm/test/Transforms/Reassociate/Inputs/deep-select-chain.py
new file mode 100644
index 0000000000000..637782e6f09a5
--- /dev/null
+++ b/llvm/test/Transforms/Reassociate/Inputs/deep-select-chain.py
@@ -0,0 +1,19 @@
+import sys
+
+
+def main():
+ depth = int(sys.argv[1])
+ assert depth > 0
+
+ print("define i32 @deep_select_chain(i1 %p, i32 %a, i32 %b) {")
+ print("entry:")
+ print(" %s0 = select i1 %p, i32 %a, i32 %b")
+ for i in range(1, depth):
+ print(f" %s{i} = select i1 %p, i32 %s{i - 1}, i32 %b")
+ print(f" %r = add i32 %s{depth - 1}, %a")
+ print(" ret i32 %r")
+ print("}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/llvm/test/Transforms/Reassociate/deep-select-chain.ll b/llvm/test/Transforms/Reassociate/deep-select-chain.ll
new file mode 100644
index 0000000000000..f32f4ffa39bc3
--- /dev/null
+++ b/llvm/test/Transforms/Reassociate/deep-select-chain.ll
@@ -0,0 +1,17 @@
+; RUN: opt -S -passes=reassociate < %s | FileCheck %s
+; RUN: %python %S/Inputs/deep-select-chain.py 65536 > %t.deep.ll
+; RUN: ulimit -s 8192 && opt -S -passes=reassociate < %t.deep.ll -o /dev/null
+; REQUIRES: system-linux
+
+define i32 @small_select_chain(i1 %p, i32 %a, i32 %b) {
+; CHECK-LABEL: @small_select_chain(
+; CHECK: %s0 = select i1 %p, i32 %a, i32 %b
+; CHECK-NEXT: %s1 = select i1 %p, i32 %s0, i32 %b
+; CHECK-NEXT: %r = add i32 %s1, %a
+; CHECK-NEXT: ret i32 %r
+;
+ %s0 = select i1 %p, i32 %a, i32 %b
+ %s1 = select i1 %p, i32 %s0, i32 %b
+ %r = add i32 %s1, %a
+ ret i32 %r
+}
|
You can test this locally with the following command:git-clang-format --diff origin/main HEAD --extensions cpp -- llvm/lib/Transforms/Scalar/Reassociate.cpp --diff_from_common_commit
View the diff from clang-format here.diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index b0c5b7d3f..9e7c0507b 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -277,8 +277,8 @@ unsigned ReassociatePass::getRank(Value *V) {
!match(CurI, m_FNeg(m_Value())))
++Rank;
- LLVM_DEBUG(dbgs() << "Calculated Rank[" << CurI->getName() << "] = "
- << Rank << "\n");
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << CurI->getName() << "] = " << Rank
+ << "\n");
ValueRankMap[CurI] = Rank;
if (CompleteRank(Rank))
|
ReassociatePass::getRank recursively walks operand def-use chains before memoizing ranks. For a very deep acyclic chain whose tail feeds a reassociable operation, this can recurse once per chain element and overflow the native stack.
Compute ranks with an explicit post-order worklist instead. The worklist preserves the existing rank calculation and memoization behavior while bounding native stack use.
Add a regression test that generates a 65536-deep select chain and runs reassociate under an 8 MB stack.