[X86] Use the Divider resource consistently for scalar integer division#204972
[X86] Use the Divider resource consistently for scalar integer division#204972as4230 wants to merge 1 commit into
Conversation
65c102d to
45c8bb2
Compare
|
@llvm/pr-subscribers-backend-x86 Author: Adam Scott (as4230) Changes: The Intel scheduler models from Haswell through Ice Lake model scalar integer division inconsistently. The Divider resource is used by some forms but not others, with no consistent rule, and on most of these models the 16/32/64-bit register forms just put one flat latency across the issue ports with no divider at all. On Skylake, for example, the register form of DIV32 is given a latency of 76 with 32 uops, and IDIV32 a latency of 102 with 66 uops, when the real numbers are closer to 28 cycles and 10 uops. The same flat latency is used for every width, so DIV stays at 76 whether it is 16, 32 or 64 bits wide and IDIV stays at 102, even though a 64-bit divide is slower than a 16-bit one. Every WriteDiv and WriteIDiv form now goes through the Divider. The latency, throughput and uop counts come from uops.info. Data sources:
Changes: The format below is latency / divider / uops. Here, divider is the number of cycles the Divider resource is held (or the ports used, if that form didn't use the Divider), and latency is the worst-case maximum across all operand paths. Haswell
Broadwell
Skylake
Ice Lake
Fixes #201568. Patch is 108.09 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/204972.diff — 14 Files Affected:
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 4b697c4336341..da0fa935c7567 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -212,24 +212,15 @@ defm : BWWriteResPair<WriteBEXTR, [BWPort06,BWPort15], 2, [1,1], 2>;
defm : BWWriteResPair<WriteBLS, [BWPort15], 1>;
defm : BWWriteResPair<WriteBZHI, [BWPort15], 1>;
-// TODO: Why isn't the BWDivider used consistently?
-defm : X86WriteRes<WriteDiv8, [BWPort0, BWDivider], 25, [1, 10], 1>;
-defm : X86WriteRes<WriteDiv16, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv32, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv64, [BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156], 80, [7,7,3,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv8Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteDiv16Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteDiv32Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteDiv64Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 34, [2,2,2,1,1], 8>;
-
-defm : X86WriteRes<WriteIDiv8, [BWPort0, BWDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv16, [BWPort0, BWDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv32, [BWPort0, BWDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv64, [BWPort0, BWDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv8Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteIDiv16Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteIDiv32Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
-defm : X86WriteRes<WriteIDiv64Ld, [BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156], 35, [2,2,2,1,1], 8>;
+defm : BWWriteResPair<WriteDiv8, [BWPort0, BWDivider], 24, [1,9], 9>;
+defm : BWWriteResPair<WriteDiv16, [BWPort0, BWDivider], 25, [1,9], 11>;
+defm : BWWriteResPair<WriteDiv32, [BWPort0, BWDivider], 28, [1,9], 10>;
+defm : BWWriteResPair<WriteDiv64, [BWPort0, BWDivider], 92, [1,21], 36>;
+
+defm : BWWriteResPair<WriteIDiv8, [BWPort0, BWDivider], 24, [1,6], 9>;
+defm : BWWriteResPair<WriteIDiv16, [BWPort0, BWDivider], 26, [1,6], 10>;
+defm : BWWriteResPair<WriteIDiv32, [BWPort0, BWDivider], 28, [1,6], 9>;
+defm : BWWriteResPair<WriteIDiv64, [BWPort0, BWDivider], 100, [1,24], 59>;
// Floating point. This covers both scalar and vector operations.
defm : X86WriteRes<WriteFLD0, [BWPort01], 1, [1], 1>;
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index dd1e1c3a6cb53..d6f0d3f4e179f 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -214,24 +214,15 @@ defm : HWWriteResPair<WriteBEXTR, [HWPort06,HWPort15], 2, [1,1], 2>;
defm : HWWriteResPair<WriteBLS, [HWPort15], 1>;
defm : HWWriteResPair<WriteBZHI, [HWPort15], 1>;
-// TODO: Why isn't the HWDivider used?
-defm : X86WriteRes<WriteDiv8, [HWPort0,HWPort1,HWPort5,HWPort6], 22, [], 9>;
-defm : X86WriteRes<WriteDiv16, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156], 98, [7,7,3,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv32, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156], 98, [7,7,3,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv64, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156], 98, [7,7,3,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv8Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteDiv16Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteDiv32Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteDiv64Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
-
-defm : X86WriteRes<WriteIDiv8, [HWPort0,HWPort1,HWPort5,HWPort6], 23, [], 9>;
-defm : X86WriteRes<WriteIDiv16, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156], 112, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv32, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156], 112, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv64, [HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156], 112, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv8Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteIDiv16Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteIDiv32Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteIDiv64Ld, [HWPort0,HWPort23,HWDivider], 29, [1,1,10], 2>;
+defm : HWWriteResPair<WriteDiv8, [HWPort0, HWDivider], 24, [1,9], 9>;
+defm : HWWriteResPair<WriteDiv16, [HWPort0, HWDivider], 25, [1,9], 11>;
+defm : HWWriteResPair<WriteDiv32, [HWPort0, HWDivider], 28, [1,9], 10>;
+defm : HWWriteResPair<WriteDiv64, [HWPort0, HWDivider], 94, [1,21], 36>;
+
+defm : HWWriteResPair<WriteIDiv8, [HWPort0, HWDivider], 24, [1,8], 9>;
+defm : HWWriteResPair<WriteIDiv16, [HWPort0, HWDivider], 25, [1,8], 10>;
+defm : HWWriteResPair<WriteIDiv32, [HWPort0, HWDivider], 28, [1,8], 9>;
+defm : HWWriteResPair<WriteIDiv64, [HWPort0, HWDivider], 101, [1,24], 59>;
// Floating point. This covers both scalar and vector operations.
defm : X86WriteRes<WriteFLD0, [HWPort01], 1, [1], 1>;
diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
index 1c57093c89796..594296355592a 100644
--- a/llvm/lib/Target/X86/X86SchedIceLake.td
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -148,23 +148,15 @@ defm : X86WriteRes<WriteCMPXCHG,[ICXPort06, ICXPort0156], 5, [2,3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW,[ICXPort23,ICXPort06,ICXPort0156,ICXPort78,ICXPort49], 8, [1,2,1,1,1], 6>;
defm : X86WriteRes<WriteXCHG, [ICXPort0156], 2, [3], 3>;
-// TODO: Why isn't the ICXDivider used?
-defm : ICXWriteResPair<WriteDiv8, [ICXPort0, ICXDivider], 25, [1,10], 1, 4>;
-defm : X86WriteRes<WriteDiv16, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort05,ICXPort0156], 76, [7,2,8,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv32, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort05,ICXPort0156], 76, [7,2,8,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv64, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort05,ICXPort0156], 76, [7,2,8,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv16Ld, [ICXPort0,ICXPort23,ICXDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteDiv32Ld, [ICXPort0,ICXPort23,ICXDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteDiv64Ld, [ICXPort0,ICXPort23,ICXDivider], 29, [1,1,10], 2>;
-
-defm : X86WriteRes<WriteIDiv8, [ICXPort0, ICXDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv16, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort06,ICXPort0156], 102, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv32, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort06,ICXPort0156], 102, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv64, [ICXPort0,ICXPort1,ICXPort5,ICXPort6,ICXPort06,ICXPort0156], 102, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv8Ld, [ICXPort0,ICXPort5,ICXPort23,ICXPort0156], 28, [2,4,1,1], 8>;
-defm : X86WriteRes<WriteIDiv16Ld, [ICXPort0,ICXPort5,ICXPort23,ICXPort0156], 28, [2,4,1,1], 8>;
-defm : X86WriteRes<WriteIDiv32Ld, [ICXPort0,ICXPort5,ICXPort23,ICXPort0156], 28, [2,4,1,1], 8>;
-defm : X86WriteRes<WriteIDiv64Ld, [ICXPort0,ICXPort5,ICXPort23,ICXPort0156], 28, [2,4,1,1], 8>;
+defm : ICXWriteResPair<WriteDiv8, [ICXPort0, ICXDivider], 15, [1,6], 4>;
+defm : ICXWriteResPair<WriteDiv16, [ICXPort0, ICXDivider], 16, [1,6], 6>;
+defm : ICXWriteResPair<WriteDiv32, [ICXPort0, ICXDivider], 15, [1,6], 4>;
+defm : ICXWriteResPair<WriteDiv64, [ICXPort0, ICXDivider], 18, [1,10], 4>;
+
+defm : ICXWriteResPair<WriteIDiv8, [ICXPort0, ICXDivider], 15, [1,6], 4>;
+defm : ICXWriteResPair<WriteIDiv16, [ICXPort0, ICXDivider], 16, [1,6], 6>;
+defm : ICXWriteResPair<WriteIDiv32, [ICXPort0, ICXDivider], 15, [1,6], 4>;
+defm : ICXWriteResPair<WriteIDiv64, [ICXPort0, ICXDivider], 18, [1,10], 4>;
defm : ICXWriteResPair<WriteCRC32, [ICXPort1], 3>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index f15a7c7076414..2d2e5d0daa634 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -146,23 +146,15 @@ defm : X86WriteRes<WriteCMPXCHG,[SKLPort06, SKLPort0156], 5, [2,3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW,[SKLPort23,SKLPort06,SKLPort0156,SKLPort237,SKLPort4], 8, [1,2,1,1,1], 6>;
defm : X86WriteRes<WriteXCHG, [SKLPort0156], 2, [3], 3>;
-// TODO: Why isn't the SKLDivider used?
-defm : SKLWriteResPair<WriteDiv8, [SKLPort0,SKLDivider], 25, [1,10], 1, 4>;
-defm : X86WriteRes<WriteDiv16, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156], 76, [7,2,8,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv32, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156], 76, [7,2,8,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv64, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156], 76, [7,2,8,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv16Ld, [SKLPort0,SKLPort23,SKLDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteDiv32Ld, [SKLPort0,SKLPort23,SKLDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteDiv64Ld, [SKLPort0,SKLPort23,SKLDivider], 29, [1,1,10], 2>;
-
-defm : X86WriteRes<WriteIDiv8, [SKLPort0,SKLDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv16, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort06,SKLPort0156], 102, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv32, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort06,SKLPort0156], 102, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv64, [SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort06,SKLPort0156], 102, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv8Ld, [SKLPort0,SKLPort5,SKLPort23,SKLPort0156], 28, [2,4,1,1], 8>;
-defm : X86WriteRes<WriteIDiv16Ld, [SKLPort0,SKLPort5,SKLPort23,SKLPort0156], 28, [2,4,1,1], 8>;
-defm : X86WriteRes<WriteIDiv32Ld, [SKLPort0,SKLPort5,SKLPort23,SKLPort0156], 28, [2,4,1,1], 8>;
-defm : X86WriteRes<WriteIDiv64Ld, [SKLPort0,SKLPort5,SKLPort23,SKLPort0156], 28, [2,4,1,1], 8>;
+defm : SKLWriteResPair<WriteDiv8, [SKLPort0, SKLDivider], 25, [1,6], 10>;
+defm : SKLWriteResPair<WriteDiv16, [SKLPort0, SKLDivider], 24, [1,6], 10>;
+defm : SKLWriteResPair<WriteDiv32, [SKLPort0, SKLDivider], 28, [1,6], 10>;
+defm : SKLWriteResPair<WriteDiv64, [SKLPort0, SKLDivider], 90, [1,21], 36>;
+
+defm : SKLWriteResPair<WriteIDiv8, [SKLPort0, SKLDivider], 25, [1,6], 11>;
+defm : SKLWriteResPair<WriteIDiv16, [SKLPort0, SKLDivider], 24, [1,6], 10>;
+defm : SKLWriteResPair<WriteIDiv32, [SKLPort0, SKLDivider], 28, [1,6], 10>;
+defm : SKLWriteResPair<WriteIDiv64, [SKLPort0, SKLDivider], 96, [1,24], 57>;
defm : SKLWriteResPair<WriteCRC32, [SKLPort1], 3>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 2a793d0205986..fb726b44e57d0 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -147,23 +147,15 @@ defm : X86WriteRes<WriteCMPXCHG,[SKXPort06, SKXPort0156], 5, [2,3], 5>;
defm : X86WriteRes<WriteCMPXCHGRMW,[SKXPort23,SKXPort06,SKXPort0156,SKXPort237,SKXPort4], 8, [1,2,1,1,1], 6>;
defm : X86WriteRes<WriteXCHG, [SKXPort0156], 2, [3], 3>;
-// TODO: Why isn't the SKXDivider used?
-defm : SKXWriteResPair<WriteDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>;
-defm : X86WriteRes<WriteDiv16, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156], 76, [7,2,8,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv32, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156], 76, [7,2,8,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv64, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156], 76, [7,2,8,3,1,11], 32>;
-defm : X86WriteRes<WriteDiv16Ld, [SKXPort0,SKXPort23,SKXDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteDiv32Ld, [SKXPort0,SKXPort23,SKXDivider], 29, [1,1,10], 2>;
-defm : X86WriteRes<WriteDiv64Ld, [SKXPort0,SKXPort23,SKXDivider], 29, [1,1,10], 2>;
-
-defm : X86WriteRes<WriteIDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1>;
-defm : X86WriteRes<WriteIDiv16, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort06,SKXPort0156], 102, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv32, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort06,SKXPort0156], 102, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv64, [SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort06,SKXPort0156], 102, [4,2,4,8,14,34], 66>;
-defm : X86WriteRes<WriteIDiv8Ld, [SKXPort0,SKXPort5,SKXPort23,SKXPort0156], 28, [2,4,1,1], 8>;
-defm : X86WriteRes<WriteIDiv16Ld, [SKXPort0,SKXPort5,SKXPort23,SKXPort0156], 28, [2,4,1,1], 8>;
-defm : X86WriteRes<WriteIDiv32Ld, [SKXPort0,SKXPort5,SKXPort23,SKXPort0156], 28, [2,4,1,1], 8>;
-defm : X86WriteRes<WriteIDiv64Ld, [SKXPort0,SKXPort5,SKXPort23,SKXPort0156], 28, [2,4,1,1], 8>;
+defm : SKXWriteResPair<WriteDiv8, [SKXPort0, SKXDivider], 25, [1,6], 10>;
+defm : SKXWriteResPair<WriteDiv16, [SKXPort0, SKXDivider], 24, [1,6], 10>;
+defm : SKXWriteResPair<WriteDiv32, [SKXPort0, SKXDivider], 28, [1,6], 10>;
+defm : SKXWriteResPair<WriteDiv64, [SKXPort0, SKXDivider], 89, [1,21], 36>;
+
+defm : SKXWriteResPair<WriteIDiv8, [SKXPort0, SKXDivider], 25, [1,6], 11>;
+defm : SKXWriteResPair<WriteIDiv16, [SKXPort0, SKXDivider], 24, [1,6], 10>;
+defm : SKXWriteResPair<WriteIDiv32, [SKXPort0, SKXDivider], 28, [1,6], 10>;
+defm : SKXWriteResPair<WriteIDiv64, [SKXPort0, SKXDivider], 96, [1,24], 57>;
defm : SKXWriteResPair<WriteCRC32, [SKXPort1], 3>;
diff --git a/llvm/test/CodeGen/X86/masked-sdiv.ll b/llvm/test/CodeGen/X86/masked-sdiv.ll
index 30d7f116dcb47..5f7366ccd9ebe 100644
--- a/llvm/test/CodeGen/X86/masked-sdiv.ll
+++ b/llvm/test/CodeGen/X86/masked-sdiv.ll
@@ -91,47 +91,47 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i1> %m) {
; AVX2-NEXT: cltd
; AVX2-NEXT: idivl %esi
; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $2, %xmm1, %esi
; AVX2-NEXT: vpextrd $2, %xmm0, %eax
; AVX2-NEXT: cltd
-; AVX2-NEXT: idivl %ecx
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: vpextrd $3, %xmm1, %esi
+; AVX2-NEXT: idivl %esi
+; AVX2-NEXT: movl %eax, %esi
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
; AVX2-NEXT: vpextrd $3, %xmm0, %eax
; AVX2-NEXT: cltd
-; AVX2-NEXT: idivl %esi
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm0
+; AVX2-NEXT: idivl %ecx
+; AVX2-NEXT: vpinsrd $2, %esi, %xmm2, %xmm0
; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sdiv_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1]
; AVX512-NEXT: vpmovd2m %xmm2, %k1
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX512-NEXT: vmovdqa32 %xmm1, %xmm2 {%k1}
-; AVX512-NEXT: vpextrd $1, %xmm2, %ecx
+; AVX512-NEXT: vmovdqa32 %xmm1, %xmm3 {%k1}
+; AVX512-NEXT: vpextrd $1, %xmm3, %ecx
; AVX512-NEXT: vpextrd $1, %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %ecx
; AVX512-NEXT: movl %eax, %ecx
-; AVX512-NEXT: vmovd %xmm2, %esi
+; AVX512-NEXT: vmovd %xmm3, %esi
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %esi
; AVX512-NEXT: movl %eax, %esi
-; AVX512-NEXT: vpextrd $2, %xmm2, %edi
+; AVX512-NEXT: vpextrd $2, %xmm3, %edi
; AVX512-NEXT: vpextrd $2, %xmm0, %eax
; AVX512-NEXT: cltd
; AVX512-NEXT: idivl %edi
; AVX512-NEXT: movl %eax, %edi
-; AVX512-NEXT: vmovd %esi, %xmm1
-; AVX512-NEXT: vpextrd $3, %xmm2, %esi
+; AVX512-NEXT: vpextrd $3, %xmm3, %r8d
; AVX512-NEXT: vpextrd $3, %xmm0, %eax
+; AVX512-NEXT: vmovd %esi, %xmm0
; AVX512-NEXT: cltd
-; AVX512-NEXT: idivl %esi
-; AVX512-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm0
+; AVX512-NEXT: idivl %r8d
+; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX512-NEXT: vpinsrd $2, %edi, %xmm0, %xmm0
; AVX512-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -323,25 +323,24 @@ define <4 x i64> @sdiv_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i1> %m) {
; AVX2-NEXT: vpextrq $1, %xmm3, %rax
; AVX2-NEXT: cqto
; AVX2-NEXT: idivq %rcx
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: vmovq %xmm2, %rsi
+; AVX2-NEXT: vmovq %rax, %xmm4
+; AVX2-NEXT: vmovq %xmm2, %rcx
; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: cqto
-; AVX2-NEXT: idivq %rsi
-; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: vpextrq $1, %xmm1, %rdi
+; AVX2-NEXT: idivq %rcx
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: cqto
-; AVX2-NEXT: idivq %rdi
-; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: idivq %rsi
+; AVX2-NEXT: movq %rax, %rsi
; AVX2-NEXT: vmovq %rcx, %xmm2
-; AVX2-NEXT: vmovq %rsi, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: cqto
; AVX2-NEXT: idivq %rcx
-; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vmovq %rsi, %xmm0
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -350,34 +349,34 @@ define <4 x i64> @sdiv_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i1> %m) {
; AVX512-LABEL: sdiv_v4i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX512-NEXT: vpmovd2m %xmm2, %k1
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm2 {%k1}
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm1
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm3 {%k1}
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm1
; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpextrq $1, %xmm3, %rax
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm2, %rax
; AVX512-NEXT: cqto
; AVX512-NEXT: idivq %rcx
; AVX512-NEXT: movq %rax, %rcx
; AVX512-NEXT: vmovq %xmm1, %rsi
-; AVX512-NEXT: vmovq %xmm3, %rax
+; AVX512-NEXT: vmovq %xmm2, %rax
; AVX512-NEXT: cqto
; AVX512-NEXT: idivq %rsi
; AVX512-NEXT: movq %rax, %rsi
-; AVX512-NEXT: vpextrq $1, %xmm2, %rdi
+; AVX512-NEXT: vmovq %rcx, %xmm1
+; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: cqto
-; AVX512-NEXT: idivq %rdi
-; AVX512-NEXT: movq %rax, %rdi
-; AVX512-NEXT: vmovq %rcx, %xmm1
-; AVX512-NEXT: vmovq %xmm2, %rcx
+; AVX512-NEXT: idivq %rcx
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: vmovq %xmm3, %rdi
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: cqto
-; AVX512-NEXT: idivq %rcx
+; AVX512-NEXT: idivq %rdi
; AVX512-NEXT: vmovq %rsi, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vmovq %rdi, %xmm1
+; AVX512-NEXT: vmovq %rcx, %xmm1
; AVX512-NEXT: vmovq %rax, %xmm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
@@ -477,47 +476,47 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i1> %m) {
; AVX2-NEXT: cltd
; AVX2-NEXT: idivl %esi
; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX2-NEXT: vpextrd $2, %xmm1, %esi
; AVX2-NEXT: vpextrd $2, %xmm0, %eax
; AVX2-NEXT: cltd
-; AVX2-NEXT: idivl %ecx
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: vpextrd $3, %xmm1, %esi
+; AVX2-NEXT: idivl %esi
+; AVX2-NEXT: movl %eax, %esi
+; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; AVX2-NEXT: vpextrd $3, %xmm1, %ecx
; AVX2-NEXT: vpextrd $3, %xmm0, %eax
; AVX2-NEXT: cltd
-; AVX2-NEXT: idivl %esi
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm0
+; AVX2-NEXT: idivl %ecx
+; AVX2-NEXT: vpinsrd $2, %esi, %xmm2, %x...
[truncated]
|
The Intel scheduler models from Haswell through Ice Lake model scalar integer division inconsistently. The Divider resource is used by some forms but not others, with no consistent rule, and on most of these models the 16/32/64-bit register forms just put one flat latency across the issue ports with no divider at all.
On Skylake, for example, the register form of DIV32 is given a latency of 76 with 32 uops, and IDIV32 a latency of 102 with 66 uops, when the real numbers are closer to 28 cycles and 10 uops. The same flat latency is used for every width, so DIV stays at 76 whether it is 16, 32 or 64 bits wide and IDIV stays at 102, even though a 64-bit divide is slower than a 16-bit one.
Every WriteDiv and WriteIDiv form now goes through the Divider. The latency, throughput and uop counts come from uops.info.
Data sources:
https://uops.info/html-instr/DIV_R8l.html
https://uops.info/html-instr/DIV_R16.html
https://uops.info/html-instr/DIV_R32.html
https://uops.info/html-instr/DIV_R64.html
https://uops.info/html-instr/IDIV_R8l.html
https://uops.info/html-instr/IDIV_R16.html
https://uops.info/html-instr/IDIV_R32.html
https://uops.info/html-instr/IDIV_R64.html
Changes:
The format below is latency / divider / uops. Here, divider is the number of cycles the Divider resource is held (or the ports used, if that form didn't use the Divider), and latency is the worst-case maximum across all operand paths.
Haswell
Broadwell
Skylake
Ice Lake
Fixes #201568.