Skip to content

Commit 07a31ad

Browse files
authored
[X86] EltsFromConsecutiveLoads - recognise reverse load patterns. (#168706)
See if we can create a vector load from the source elements in reverse order and then shuffle the loaded vector back into place. SLP will (usually) catch this pattern in the middle-end, but a few cases (BUILD_VECTOR scalarizations etc.) only appear during DAG legalization. I did start looking at a more general permute fold, but I haven't found any good test examples for it yet - happy to take another look if somebody has examples.
1 parent e44646b commit 07a31ad

File tree

7 files changed

+163
-502
lines changed

7 files changed

+163
-502
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7557,6 +7557,19 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
75577557
}
75587558
}
75597559

7560+
// REVERSE - attempt to match the loads in reverse and then shuffle back.
7561+
// TODO: Do this for any permute or mismatching element counts.
7562+
if (Depth == 0 && !ZeroMask && TLI.isTypeLegal(VT) && VT.isVector() &&
7563+
NumElems == VT.getVectorNumElements()) {
7564+
SmallVector<SDValue, 16> ReverseElts(Elts.rbegin(), Elts.rend());
7565+
if (SDValue RevLd = EltsFromConsecutiveLoads(
7566+
VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) {
7567+
SmallVector<int, 16> ReverseMask(NumElems);
7568+
std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0);
7569+
return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask);
7570+
}
7571+
}
7572+
75607573
return SDValue();
75617574
}
75627575

@@ -59490,8 +59503,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
5949059503
if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
5949159504
*FirstLd->getMemOperand(), &Fast) &&
5949259505
Fast) {
59493-
if (SDValue Ld =
59494-
EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
59506+
if (SDValue Ld = EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget,
59507+
false, Depth + 1))
5949559508
return Ld;
5949659509
}
5949759510
}

llvm/test/CodeGen/X86/bitcnt-big-integer.ll

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -844,13 +844,11 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind {
844844
; AVX512-NEXT: vmovq %rcx, %xmm2
845845
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
846846
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
847-
; AVX512-NEXT: vmovq %r8, %xmm1
848-
; AVX512-NEXT: vmovq %r9, %xmm2
849-
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
850-
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
851-
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
847+
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
848+
; AVX512-NEXT: vmovq %r8, %xmm2
849+
; AVX512-NEXT: vmovq %r9, %xmm3
852850
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
853-
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
851+
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
854852
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
855853
; AVX512-NEXT: vplzcntq %zmm0, %zmm1
856854
; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
@@ -2071,13 +2069,11 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind {
20712069
; AVX512-NEXT: vmovq %rcx, %xmm2
20722070
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
20732071
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
2074-
; AVX512-NEXT: vmovq %r8, %xmm1
2075-
; AVX512-NEXT: vmovq %r9, %xmm2
2076-
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
2077-
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
2078-
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
2072+
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
2073+
; AVX512-NEXT: vmovq %r8, %xmm2
2074+
; AVX512-NEXT: vmovq %r9, %xmm3
20792075
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
2080-
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
2076+
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
20812077
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
20822078
; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1
20832079
; AVX512-NEXT: vplzcntq %zmm0, %zmm0

llvm/test/CodeGen/X86/build-vector-256.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -417,9 +417,8 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
417417
define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
418418
; AVX1-32-LABEL: test_buildvector_4f64_2_var:
419419
; AVX1-32: # %bb.0:
420-
; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
421-
; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
422-
; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
420+
; AVX1-32-NEXT: vmovupd {{[0-9]+}}(%esp), %xmm0
421+
; AVX1-32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
423422
; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
424423
; AVX1-32-NEXT: retl
425424
;

llvm/test/CodeGen/X86/chain_order.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,8 @@ define void @cftx020(ptr nocapture %a) {
66
; CHECK-LABEL: cftx020:
77
; CHECK: # %bb.0: # %entry
88
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
9-
; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
109
; CHECK-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
11-
; CHECK-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
10+
; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0]
1211
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
1312
; CHECK-NEXT: vmovupd (%rdi), %xmm1
1413
; CHECK-NEXT: vmovupd %xmm0, (%rdi)

0 commit comments

Comments
 (0)