diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 19eabb46752bf..582a353632436 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -9,6 +9,10 @@ def BITOP3_32 : ComplexPattern; def BITOP3_16 : ComplexPattern; +// Matches PTRADD as a commutative operation. +def ptradd_commutative : PatFrags<(ops node:$src0, node:$src1), + [(ptradd node:$src0, node:$src1), (ptradd node:$src1, node:$src0)]>; + // Special case for v_div_fmas_{f32|f64}, since it seems to be the // only VOP instruction that implicitly reads VCC. let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in { @@ -938,12 +942,18 @@ def : GCNPat< (DivergentBinFrag i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; -let SubtargetPredicate = HasLshlAddU64Inst in +let SubtargetPredicate = HasLshlAddU64Inst in { def : GCNPat< (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) >; +def : GCNPat < + // (ptradd z, (shl x, y)) or (ptradd (shl x, y), z) -> ((x << y) + z) + (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), + (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)>; +} // End SubtargetPredicate = HasLshlAddU64Inst + let SubtargetPredicate = HasAddMinMaxInsts in { def : ThreeOp_i32_Pats; def : ThreeOp_i32_Pats; @@ -1019,19 +1029,24 @@ multiclass IMAD32_Pats { // Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul. // We need to separate this because otherwise OtherPredicates would be overriden. -class IMAD32_Mul24_Pat: GCNPat < - (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)), +class IMAD32_Mul24_Pats_Impl : GCNPat < + (i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)), (inst $src0, $src1, $src2, 0 /* clamp */) >; +multiclass IMAD32_Mul24_Pats { + def : IMAD32_Mul24_Pats_Impl; + def : IMAD32_Mul24_Pats_Impl; +} + // exclude pre-GFX9 where it was slow let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in { defm : IMAD32_Pats; - def : IMAD32_Mul24_Pat; + defm : IMAD32_Mul24_Pats; } let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in { defm : IMAD32_Pats; - def : IMAD32_Mul24_Pat; + defm : IMAD32_Mul24_Pats; } def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll index 6bb68e1e26a14..9c49aade6099f 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll @@ -119,10 +119,7 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX942-GISEL: ; %bb.0: ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX942-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX942-GISEL-NEXT: s_nop 1 -; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] ; GFX942-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 ; GFX942-GISEL-NEXT: s_nop 1 ; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll index f571030077870..3019c1d897d98 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll @@ -270,29 +270,15 @@ entry: } define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) { -; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 m0, s2 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] -; GFX1250-SDAG-NEXT: cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX1250-GISEL-NEXT: s_mov_b32 m0, s2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo -; GFX1250-GISEL-NEXT: cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_mov_b32 m0, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] +; GFX1250-NEXT: cluster_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT +; GFX1250-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll index dd679101047ea..d5fae1e4a9657 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll @@ -160,27 +160,14 @@ entry: } define amdgpu_ps void @global_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { -; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] -; GFX1250-SDAG-NEXT: global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo -; GFX1250-GISEL-NEXT: global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] +; GFX1250-NEXT: global_load_async_to_lds_b64 v0, v[2:3], off offset:16 th:TH_LOAD_NT +; GFX1250-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll index fd35313802558..22563f8e5ff46 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll @@ -160,27 +160,14 @@ entry: } define amdgpu_ps void @global_store_async_from_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) { -; GFX1250-SDAG-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] -; GFX1250-SDAG-NEXT: global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, v1 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo -; GFX1250-GISEL-NEXT: global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] +; GFX1250-NEXT: global_store_async_from_lds_b64 v[2:3], v0, off offset:16 th:TH_STORE_NT +; GFX1250-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 %gep = getelementptr i32, ptr addrspace(1) %gaddr, i64 %idxprom diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll index 3377290ecb1e0..350d468344f65 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll @@ -169,29 +169,15 @@ entry: } define amdgpu_ps void @global_load_monitor_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) { -; GFX1250-SDAG-LABEL: global_load_monitor_b64_saddr_no_scale_offset: -; GFX1250-SDAG: ; %bb.0: ; %entry -; GFX1250-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] -; GFX1250-SDAG-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: global_load_monitor_b64_saddr_no_scale_offset: -; GFX1250-GISEL: ; %bb.0: ; %entry -; GFX1250-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo -; GFX1250-GISEL-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off -; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-LABEL: global_load_monitor_b64_saddr_no_scale_offset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1] +; GFX1250-NEXT: global_load_monitor_b64 v[2:3], v[2:3], off th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 %gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom @@ -199,3 +185,6 @@ entry: store <2 x i32> %val, ptr addrspace(1) %use ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1250-GISEL: {{.*}} +; GFX1250-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 4db232cbfa8c7..0fe4d337a5bd7 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -265,18 +265,11 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { ; Use non-zero shift amounts in v_lshl_add_u64. define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) { -; GFX942_PTRADD-LABEL: select_v_lshl_add_u64: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: select_v_lshl_add_u64: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1] -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: select_v_lshl_add_u64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i64, ptr %base, i64 %voffset ret ptr %gep } @@ -284,23 +277,13 @@ define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) { ; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the ; mul into a mul24. define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) { -; GFX942_PTRADD-LABEL: fold_mul24_into_mad: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_and_b32_e32 v2, 0xfffff, v2 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v4, 0xfffff, v4 -; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v3, v2, v4 -; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v2, v2, v4 -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: fold_mul24_into_mad: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_and_b32_e32 v2, 0xfffff, v2 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v3, 0xfffff, v4 -; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1] -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: fold_mul24_into_mad: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v2, 0xfffff, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xfffff, v4 +; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] %a_masked = and i64 %a, u0xfffff %b_masked = and i64 %b, u0xfffff %mul = mul i64 %a_masked, %b_masked diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll index 32f2395c7b2ad..9dd25025d4381 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll @@ -25,20 +25,12 @@ define ptr @gep_as0(ptr %p, i64 %offset) { ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX942_PTRADD-LABEL: gep_as0: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: gep_as0: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: gep_as0: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: gep_as0: ; GFX10: ; %bb.0: ; %entry @@ -188,20 +180,12 @@ define ptr @multi_gep_as0(ptr %p, i64 %offset) { ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX942_PTRADD-LABEL: multi_gep_as0: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: multi_gep_as0: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: multi_gep_as0: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: multi_gep_as0: ; GFX10: ; %bb.0: ; %entry @@ -537,3 +521,5 @@ entry: ; GFX12_PTRADD: {{.*}} ; GFX8_LEGACY: {{.*}} ; GFX8_PTRADD: {{.*}} +; GFX942_LEGACY: {{.*}} +; GFX942_PTRADD: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index 307ff046d48c2..335d58c43c936 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -28,27 +28,14 @@ entry: } define amdgpu_ps float @flat_load_b32_idxprom_wrong_stride(ptr align 4 inreg %p, i32 %idx) { -; SDAG-LABEL: flat_load_b32_idxprom_wrong_stride: -; SDAG: ; %bb.0: ; %entry -; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] -; SDAG-NEXT: flat_load_b32 v0, v[0:1] -; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: flat_load_b32_idxprom_wrong_stride: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1] -; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo -; GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-NEXT: ; return to shader part epilog +; GCN-LABEL: flat_load_b32_idxprom_wrong_stride: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom @@ -380,16 +367,12 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v6, src_flat_scratch_base_hi -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[2:3] +; GISEL-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v0, src_flat_scratch_base_hi ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0 -; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo +; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GISEL-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 3, s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_xor_b32_e32 v0, v5, v6 +; GISEL-NEXT: v_xor_b32_e32 v0, v5, v0 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo @@ -405,7 +388,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: s_branch .LBB21_5 ; GISEL-NEXT: .LBB21_3: ; %atomicrmw.global ; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1 -; GISEL-NEXT: ; implicit-def: $vgpr4 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GISEL-NEXT: s_wait_xcnt 0x0 ; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll index faea84e34d7eb..a0fca0e2bdc72 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll @@ -28,27 +28,14 @@ entry: } define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; SDAG-LABEL: global_load_b32_idxprom_wrong_stride: -; SDAG: ; %bb.0: ; %entry -; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] -; SDAG-NEXT: global_load_b32 v0, v[0:1], off -; SDAG-NEXT: s_wait_loadcnt 0x0 -; SDAG-NEXT: ; return to shader part epilog -; -; GISEL-LABEL: global_load_b32_idxprom_wrong_stride: -; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1] -; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo -; GISEL-NEXT: global_load_b32 v0, v[0:1], off -; GISEL-NEXT: s_wait_loadcnt 0x0 -; GISEL-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_b32_idxprom_wrong_stride: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom @@ -349,3 +336,6 @@ entry: } !0 = !{i32 0, i32 1024} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}}