Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 64 additions & 34 deletions src/llvm-multiversioning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,8 @@ struct CloneCtx {
void clone_partial(Group &grp, Target &tgt);
uint32_t get_func_id(Function *F) const;
std::pair<uint32_t,GlobalVariable*> get_reloc_slot(Function *F) const;

Function *create_trampoline(Function *F, GlobalVariable *slot, bool autoinit=false);
void rewrite_alias(GlobalAlias *alias, Function* F);

MDNode *tbaa_const;
Expand Down Expand Up @@ -493,6 +495,53 @@ void CloneCtx::prepare_vmap(ValueToValueMapTy &vmap)
}
}

// Build a forwarding function ("trampoline") with the same type and attributes as `F`.
// Its single basic block loads a function pointer from `slot`, calls it with the
// trampoline's own arguments as a tail call, and returns the result.
//
//   F        - function whose type and attributes the trampoline copies.
//   slot     - global variable from which the call target is loaded.
//   autoinit - when true, a call to `jl_autoinit_and_adopt_thread` is emitted
//              before the slot is loaded.
//
// The returned function is created unnamed in `M` with external linkage, hidden
// visibility and dso_local set; callers rename it / adjust linkage as needed.
Function *CloneCtx::create_trampoline(Function *F, GlobalVariable *slot, bool autoinit)
{
    auto &ctx = F->getContext();
    auto *fty = F->getFunctionType();

    Function *tramp = Function::Create(fty, GlobalValue::ExternalLinkage, "", &M);
    tramp->copyAttributesFrom(F);
    tramp->setVisibility(GlobalValue::HiddenVisibility);
    tramp->setDSOLocal(true);

    // The multiversioning markers copied over from F do not apply to the trampoline.
    tramp->removeFnAttr("julia.mv.reloc");
    tramp->removeFnAttr("julia.mv.clones");

    IRBuilder<> builder(BasicBlock::Create(ctx, "top", tramp));

    if (autoinit) {
        // Declared as taking no arguments and returning a pointer; the result is unused.
        auto init_fn = F->getParent()->getOrInsertFunction(
            XSTR(jl_autoinit_and_adopt_thread),
            PointerType::get(ctx, 0)
        );
        builder.CreateCall(init_fn);
    }

    // Fetch the call target from the relocation slot.
    // NOTE(review): with autoinit the slot is presumably overwritten while
    // jl_autoinit_and_adopt_thread runs -- confirm that !invariant.load is
    // still appropriate for this load in that case.
    auto target = builder.CreateLoad(F->getType(), slot);
    target->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
    target->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(ctx, None));

    // Forward every incoming argument unchanged.
    SmallVector<Value *, 0> forwarded;
    for (auto &param : tramp->args())
        forwarded.push_back(&param);
    auto fwd_call = builder.CreateCall(fty, target, ArrayRef<Value *>(forwarded));

    // Variadic functions use musttail; everything else gets a plain tail call.
    const bool variadic = F->isVarArg();
    if (variadic) {
        assert(!TT.isARM() && !TT.isPPC() && "musttail not supported on ARM/PPC!");
    }
    fwd_call->setTailCallKind(variadic ? CallInst::TCK_MustTail : CallInst::TCK_Tail);

    if (!F->getReturnType()->isVoidTy())
        builder.CreateRet(fwd_call);
    else
        builder.CreateRetVoid();

    return tramp;
}

void CloneCtx::prepare_slots()
{
for (auto &F : orig_funcs) {
Expand All @@ -507,7 +556,12 @@ void CloneCtx::prepare_slots()
else {
auto id = get_func_id(F);
const_relocs[id] = GV;
GV->setInitializer(Constant::getNullValue(F->getType()));

// Initialize with a single-use trampoline that calls `jl_autoinit_and_adopt_thread`,
// so that auto-initialization works with multi-versioned entrypoints.
Function *trampoline = create_trampoline(F, GV, /* autoinit */ true);
trampoline->setName(F->getName() + ".autoinit_trampoline");
GV->setInitializer(trampoline);
}
}
}
Expand Down Expand Up @@ -665,45 +719,21 @@ void CloneCtx::rewrite_alias(GlobalAlias *alias, Function *F)
{
assert(!is_vector(F->getFunctionType()));

Function *trampoline =
Function::Create(F->getFunctionType(), alias->getLinkage(), "", &M);
trampoline->copyAttributesFrom(F);
trampoline->takeName(alias);
trampoline->setVisibility(alias->getVisibility());
trampoline->setDSOLocal(alias->isDSOLocal());
// drop multiversioning attributes, add alias attribute for testing purposes
trampoline->removeFnAttr("julia.mv.reloc");
trampoline->removeFnAttr("julia.mv.clones");
trampoline->addFnAttr("julia.mv.alias");
trampoline->setDLLStorageClass(alias->getDLLStorageClass());
alias->eraseFromParent();

uint32_t id;
GlobalVariable *slot;
std::tie(id, slot) = get_reloc_slot(F);
assert(slot);

auto BB = BasicBlock::Create(F->getContext(), "top", trampoline);
IRBuilder<> irbuilder(BB);
Function *trampoline = create_trampoline(F, slot, /* autoinit */ false);
trampoline->addFnAttr("julia.mv.alias"); // add alias attribute for testing purposes

auto ptr = irbuilder.CreateLoad(F->getType(), slot);
ptr->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const);
ptr->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(F->getContext(), None));

SmallVector<Value *, 0> Args;
for (auto &arg : trampoline->args())
Args.push_back(&arg);
auto call = irbuilder.CreateCall(F->getFunctionType(), ptr, ArrayRef<Value *>(Args));
if (F->isVarArg()) {
assert(!TT.isARM() && !TT.isPPC() && "musttail not supported on ARM/PPC!");
call->setTailCallKind(CallInst::TCK_MustTail);
} else {
call->setTailCallKind(CallInst::TCK_Tail);
}
trampoline->takeName(alias);
trampoline->setLinkage(alias->getLinkage());
trampoline->setVisibility(alias->getVisibility());
trampoline->setDSOLocal(alias->isDSOLocal());
trampoline->setDLLStorageClass(alias->getDLLStorageClass());

if (F->getReturnType() == Type::getVoidTy(F->getContext()))
irbuilder.CreateRetVoid();
else
irbuilder.CreateRet(call);
alias->eraseFromParent();
}

void CloneCtx::fix_gv_uses()
Expand Down
20 changes: 15 additions & 5 deletions test/llvmpasses/multiversioning-clone-only.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
; CHECK: @jl_fvar_idxs = hidden constant [1 x i32] zeroinitializer
; CHECK: @jl_gvar_idxs = hidden constant [0 x i32] zeroinitializer
; OPAQUE: @subtarget_cloned_gv = hidden global ptr null
; OPAQUE: @subtarget_cloned.reloc_slot = hidden global ptr null
; OPAQUE: @subtarget_cloned.reloc_slot = hidden global ptr @subtarget_cloned.autoinit_trampoline
; CHECK: @jl_fvar_count = hidden constant i64 1
; OPAQUE: @jl_fvar_ptrs = hidden global [1 x ptr] [ptr @subtarget_cloned]
; CHECK: @jl_clone_slots = hidden constant [5 x i32]
Expand Down Expand Up @@ -57,7 +57,7 @@ define noundef i32 @subtarget_cloned(i32 noundef %0) #2 {
; COM: should fixup this callsite since 2 is cloned for a subtarget
; CHECK: define{{.*}}@call_subtarget_cloned({{.*}}#[[CALL_SUBTARGET_CLONED_DEFAULT_ATTRS:[0-9]+]]
; CHECK-NEXT: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA:[0-9]+]], !invariant.load
; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]
; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]({{.*}})
; CHECK: ret i32
define noundef i32 @call_subtarget_cloned(i32 noundef %0) #3 {
%2 = call noundef i32 @subtarget_cloned(i32 noundef %0)
Expand All @@ -66,13 +66,23 @@ define noundef i32 @call_subtarget_cloned(i32 noundef %0) #3 {

; CHECK: define{{.*}}@call_subtarget_cloned_but_not_cloned({{.*}}#[[BORING_DEFAULT_ATTRS]]
; CHECK-NEXT: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA]], !invariant.load
; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]
; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]({{.*}})
; CHECK: ret i32
; COM: test input: a direct call to @subtarget_cloned. The FileCheck lines preceding
; COM: this definition expect the pass output to load the callee from
; COM: @subtarget_cloned.reloc_slot and call through that pointer instead.
define noundef i32 @call_subtarget_cloned_but_not_cloned(i32 noundef %0) #0 {
%2 = call noundef i32 @subtarget_cloned(i32 noundef %0)
ret i32 %2
}

; COM: check that the autoinit trampoline is generated correctly
; CHECK: define{{.*}}@subtarget_cloned.autoinit_trampoline({{.*}}
; CHECK-NEXT: top:
; CHECK-NEXT: call ptr @ijl_autoinit_and_adopt_thread()
; CHECK-NEXT: [[FUNC_PTR:%[0-9]+]] = load ptr, ptr @subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA]], !invariant.load
; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]({{.*}})
; CHECK: ret i32

declare ptr @ijl_autoinit_and_adopt_thread()

; CHECK: define{{.*}}@boring.1({{.*}}#[[BORING_CLONEALL_ATTRS:[0-9]+]]
; CHECK-NEXT: ret i32 %0

Expand Down Expand Up @@ -106,10 +116,10 @@ define noundef i32 @call_subtarget_cloned_but_not_cloned(i32 noundef %0) #0 {
; CHECK-NOT: @subtarget_cloned_but_not_cloned.2

; COM: check for alias being rewritten to a function trampoline
; CHECK: define{{.*}}@subtarget_cloned_aliased{{.*}}#[[SUBTARGET_ALIASED_ATTRS:[0-9]+]]
; CHECK: define{{.*}}@subtarget_cloned_aliased{{[^.]*}}#[[SUBTARGET_ALIASED_ATTRS:[0-9]+]]
; CHECK-NOT: }
; CHECK: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA]], !invariant.load
; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]
; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]({{.*}})
; CHECK: ret i32

; CHECK: attributes #[[BORING_DEFAULT_ATTRS]]
Expand Down
2 changes: 1 addition & 1 deletion test/llvmpasses/multiversioning-x86.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
; OPAQUE: @jl_gvar_ptrs = global [0 x ptr] zeroinitializer, align 8
; CHECK: @jl_fvar_idxs = hidden constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 8
; CHECK: @jl_gvar_idxs = hidden constant [0 x i32] zeroinitializer, align 8
; OPAQUE: @simd_test.reloc_slot = hidden global ptr null
; OPAQUE: @simd_test.reloc_slot = hidden global ptr @simd_test.autoinit_trampoline
; OPAQUE: @jl_fvar_ptrs = hidden global [5 x ptr] [ptr @boring, ptr @fastmath_test, ptr @loop_test, ptr @simd_test, ptr @simd_test_call]
; OPAQUE: @jl_clone_slots = hidden constant [3 x i32] [i32 1, i32 3, i32 trunc (i64 sub (i64 ptrtoint (ptr @simd_test.reloc_slot to i64), i64 ptrtoint (ptr @jl_clone_slots to i64)) to i32)]
; CHECK: @jl_clone_idxs = hidden constant [10 x i32] [i32 -2147483647, i32 3, i32 -2147483647, i32 3, i32 4, i32 1, i32 1, i32 2, i32 -2147483645, i32 4]
Expand Down
Loading