From d8bb729c4daebe3571539ad300d52064709ce80f Mon Sep 17 00:00:00 2001 From: Cody Tapscott Date: Wed, 19 Nov 2025 03:18:38 +0000 Subject: [PATCH] Add uninitialized multi-versioning trampoline for autoinit support --- src/llvm-multiversioning.cpp | 98 ++++++++++++------- test/llvmpasses/multiversioning-clone-only.ll | 20 +++- test/llvmpasses/multiversioning-x86.ll | 2 +- 3 files changed, 80 insertions(+), 40 deletions(-) diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index e9276526ef263..5387ae1ed93ba 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -378,6 +378,8 @@ struct CloneCtx { void clone_partial(Group &grp, Target &tgt); uint32_t get_func_id(Function *F) const; std::pair get_reloc_slot(Function *F) const; + + Function *create_trampoline(Function *F, GlobalVariable *slot, bool autoinit=false); void rewrite_alias(GlobalAlias *alias, Function* F); MDNode *tbaa_const; @@ -493,6 +495,53 @@ void CloneCtx::prepare_vmap(ValueToValueMapTy &vmap) } } +Function *CloneCtx::create_trampoline(Function *F, GlobalVariable *slot, bool autoinit) +{ + Function *trampoline = + Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, "", &M); + + trampoline->copyAttributesFrom(F); + trampoline->setVisibility(GlobalValue::HiddenVisibility); + trampoline->setDSOLocal(true); + + // drop multiversioning attributes + trampoline->removeFnAttr("julia.mv.reloc"); + trampoline->removeFnAttr("julia.mv.clones"); + + auto BB = BasicBlock::Create(F->getContext(), "top", trampoline); + IRBuilder<> irbuilder(BB); + + if (autoinit) { + irbuilder.CreateCall(F->getParent()->getOrInsertFunction( + XSTR(jl_autoinit_and_adopt_thread), + PointerType::get(F->getContext(), 0) + )); + } + + auto ptr = irbuilder.CreateLoad(F->getType(), slot); + ptr->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const); + ptr->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(F->getContext(), None)); + + SmallVector Args; + for (auto &arg : trampoline->args()) + Args.push_back(&arg); + auto call = irbuilder.CreateCall(F->getFunctionType(), ptr, ArrayRef(Args)); + if (F->isVarArg()) { + assert(!TT.isARM() && !TT.isPPC() && "musttail not supported on ARM/PPC!"); + call->setTailCallKind(CallInst::TCK_MustTail); + } else { + call->setTailCallKind(CallInst::TCK_Tail); + + } + + if (F->getReturnType() == Type::getVoidTy(F->getContext())) + irbuilder.CreateRetVoid(); + else + irbuilder.CreateRet(call); + + return trampoline; +} + void CloneCtx::prepare_slots() { for (auto &F : orig_funcs) { @@ -507,7 +556,12 @@ void CloneCtx::prepare_slots() else { auto id = get_func_id(F); const_relocs[id] = GV; - GV->setInitializer(Constant::getNullValue(F->getType())); + + // Initialize with a single-use trampoline that calls `jl_autoinit_and_adopt_thread`, + // so that auto-initialization works with multi-versioned entrypoints. + Function *trampoline = create_trampoline(F, GV, /* autoinit */ true); + trampoline->setName(F->getName() + ".autoinit_trampoline"); + GV->setInitializer(trampoline); } } } @@ -665,45 +719,21 @@ void CloneCtx::rewrite_alias(GlobalAlias *alias, Function *F) { assert(!is_vector(F->getFunctionType())); - Function *trampoline = - Function::Create(F->getFunctionType(), alias->getLinkage(), "", &M); - trampoline->copyAttributesFrom(F); - trampoline->takeName(alias); - trampoline->setVisibility(alias->getVisibility()); - trampoline->setDSOLocal(alias->isDSOLocal()); - // drop multiversioning attributes, add alias attribute for testing purposes - trampoline->removeFnAttr("julia.mv.reloc"); - trampoline->removeFnAttr("julia.mv.clones"); - trampoline->addFnAttr("julia.mv.alias"); - trampoline->setDLLStorageClass(alias->getDLLStorageClass()); - alias->eraseFromParent(); - uint32_t id; GlobalVariable *slot; std::tie(id, slot) = get_reloc_slot(F); + assert(slot); - auto BB = BasicBlock::Create(F->getContext(), "top", trampoline); - IRBuilder<> irbuilder(BB); + Function *trampoline = create_trampoline(F, slot, /* autoinit */ false); + trampoline->addFnAttr("julia.mv.alias"); // add alias attribute for testing purposes - auto ptr = irbuilder.CreateLoad(F->getType(), slot); - ptr->setMetadata(llvm::LLVMContext::MD_tbaa, tbaa_const); - ptr->setMetadata(llvm::LLVMContext::MD_invariant_load, MDNode::get(F->getContext(), None)); - - SmallVector Args; - for (auto &arg : trampoline->args()) - Args.push_back(&arg); - auto call = irbuilder.CreateCall(F->getFunctionType(), ptr, ArrayRef(Args)); - if (F->isVarArg()) { - assert(!TT.isARM() && !TT.isPPC() && "musttail not supported on ARM/PPC!"); - call->setTailCallKind(CallInst::TCK_MustTail); - } else { - call->setTailCallKind(CallInst::TCK_Tail); - } + trampoline->takeName(alias); + trampoline->setLinkage(alias->getLinkage()); + trampoline->setVisibility(alias->getVisibility()); + trampoline->setDSOLocal(alias->isDSOLocal()); + trampoline->setDLLStorageClass(alias->getDLLStorageClass()); - if (F->getReturnType() == Type::getVoidTy(F->getContext())) - irbuilder.CreateRetVoid(); - else - irbuilder.CreateRet(call); + alias->eraseFromParent(); } void CloneCtx::fix_gv_uses() diff --git a/test/llvmpasses/multiversioning-clone-only.ll b/test/llvmpasses/multiversioning-clone-only.ll index 00f0db0aa1e91..c4f5257a59988 100644 --- a/test/llvmpasses/multiversioning-clone-only.ll +++ b/test/llvmpasses/multiversioning-clone-only.ll @@ -7,7 +7,7 @@ ; CHECK: @jl_fvar_idxs = hidden constant [1 x i32] zeroinitializer ; CHECK: @jl_gvar_idxs = hidden constant [0 x i32] zeroinitializer ; OPAQUE: @subtarget_cloned_gv = hidden global ptr null -; OPAQUE: @subtarget_cloned.reloc_slot = hidden global ptr null +; OPAQUE: @subtarget_cloned.reloc_slot = hidden global ptr @subtarget_cloned.autoinit_trampoline ; CHECK: @jl_fvar_count = hidden constant i64 1 ; OPAQUE: @jl_fvar_ptrs = hidden global [1 x ptr] [ptr @subtarget_cloned] ; CHECK: @jl_clone_slots = hidden constant [5 x i32] @@ -57,7 +57,7 @@ define noundef i32 @subtarget_cloned(i32 noundef %0) #2 { ; COM: should fixup this callsite since 2 is cloned for a subtarget ; CHECK: define{{.*}}@call_subtarget_cloned({{.*}}#[[CALL_SUBTARGET_CLONED_DEFAULT_ATTRS:[0-9]+]] ; CHECK-NEXT: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA:[0-9]+]], !invariant.load -; CHECK-NEXT: call{{.*}}[[FUNC_PTR]] +; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]({{.*}}) ; CHECK: ret i32 define noundef i32 @call_subtarget_cloned(i32 noundef %0) #3 { %2 = call noundef i32 @subtarget_cloned(i32 noundef %0) @@ -66,13 +66,23 @@ define noundef i32 @call_subtarget_cloned(i32 noundef %0) #3 { ; CHECK: define{{.*}}@call_subtarget_cloned_but_not_cloned({{.*}}#[[BORING_DEFAULT_ATTRS]] ; CHECK-NEXT: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA]], !invariant.load -; CHECK-NEXT: call{{.*}}[[FUNC_PTR]] +; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]({{.*}}) ; CHECK: ret i32 define noundef i32 @call_subtarget_cloned_but_not_cloned(i32 noundef %0) #0 { %2 = call noundef i32 @subtarget_cloned(i32 noundef %0) ret i32 %2 } +; COM: check that the autoinit trampoline is generated correctly +; CHECK: define{{.*}}@subtarget_cloned.autoinit_trampoline({{.*}} +; CHECK-NEXT: top: +; CHECK-NEXT: call ptr @ijl_autoinit_and_adopt_thread() +; CHECK-NEXT: [[FUNC_PTR:%[0-9]+]] = load ptr, ptr @subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA]], !invariant.load +; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]({{.*}}) +; CHECK: ret i32 + +declare ptr @ijl_autoinit_and_adopt_thread() + ; CHECK: define{{.*}}@boring.1({{.*}}#[[BORING_CLONEALL_ATTRS:[0-9]+]] ; CHECK-NEXT: ret i32 %0 @@ -106,10 +116,10 @@ define noundef i32 @call_subtarget_cloned_but_not_cloned(i32 noundef %0) #0 { ; CHECK-NOT: @subtarget_cloned_but_not_cloned.2 ; COM: check for alias being rewritten to a function trampoline -; CHECK: define{{.*}}@subtarget_cloned_aliased{{.*}}#[[SUBTARGET_ALIASED_ATTRS:[0-9]+]] +; CHECK: define{{.*}}@subtarget_cloned_aliased{{[^.]*}}#[[SUBTARGET_ALIASED_ATTRS:[0-9]+]] ; CHECK-NOT: } ; CHECK: [[FUNC_PTR:%[0-9]+]] = load{{.*}}@subtarget_cloned.reloc_slot{{.*}}!tbaa ![[TBAA_CONST_METADATA]], !invariant.load -; CHECK-NEXT: call{{.*}}[[FUNC_PTR]] +; CHECK-NEXT: call{{.*}}[[FUNC_PTR]]({{.*}}) ; CHECK: ret i32 ; CHECK: attributes #[[BORING_DEFAULT_ATTRS]] diff --git a/test/llvmpasses/multiversioning-x86.ll b/test/llvmpasses/multiversioning-x86.ll index ff4a8abba5252..e2918d0c20eec 100644 --- a/test/llvmpasses/multiversioning-x86.ll +++ b/test/llvmpasses/multiversioning-x86.ll @@ -11,7 +11,7 @@ ; OPAQUE: @jl_gvar_ptrs = global [0 x ptr] zeroinitializer, align 8 ; CHECK: @jl_fvar_idxs = hidden constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 8 ; CHECK: @jl_gvar_idxs = hidden constant [0 x i32] zeroinitializer, align 8 -; OPAQUE: @simd_test.reloc_slot = hidden global ptr null +; OPAQUE: @simd_test.reloc_slot = hidden global ptr @simd_test.autoinit_trampoline ; OPAQUE: @jl_fvar_ptrs = hidden global [5 x ptr] [ptr @boring, ptr @fastmath_test, ptr @loop_test, ptr @simd_test, ptr @simd_test_call] ; OPAQUE: @jl_clone_slots = hidden constant [3 x i32] [i32 1, i32 3, i32 trunc (i64 sub (i64 ptrtoint (ptr @simd_test.reloc_slot to i64), i64 ptrtoint (ptr @jl_clone_slots to i64)) to i32)] ; CHECK: @jl_clone_idxs = hidden constant [10 x i32] [i32 -2147483647, i32 3, i32 -2147483647, i32 3, i32 4, i32 1, i32 1, i32 2, i32 -2147483645, i32 4]