@@ -420,6 +420,94 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
420420 return EventImpl;
421421}
422422
423+ EventImplPtr queue_impl::submit_kernel_scheduler_bypass(
424+ KernelData &KData, std::vector<detail::EventImplPtr> &DepEvents,
425+ bool EventNeeded, detail::kernel_impl *KernelImplPtr,
426+ detail::kernel_bundle_impl *KernelBundleImpPtr,
427+ const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
428+ std::vector<ur_event_handle_t> RawEvents;
429+
430+ // TODO: Checking the size here and skipping the call for an empty
431+ // vector is more efficient than handling the empty case inside getUrEvents.
432+ if (DepEvents.size() > 0) {
433+ RawEvents = detail::Command::getUrEvents(DepEvents, this, false);
434+ }
435+
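+ // Discard the result event when the caller does not need it and the
+ // backend supports enqueueing without producing an event.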
436+ bool DiscardEvent = !EventNeeded && supportsDiscardingPiEvents();
437+ if (DiscardEvent) {
438+ // Only a non-interop kernel can use assert.
439+ bool KernelUsesAssert =
440+ !(KernelImplPtr && KernelImplPtr->isInterop()) && KData.usesAssert();
441+ DiscardEvent = !KernelUsesAssert;
442+ }
443+
444+ std::shared_ptr<detail::event_impl> ResultEvent =
445+ DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this);
446+
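+ // Enqueue logic shared by the discard and non-discard paths below.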
447+ auto EnqueueKernel = [&]() {
448+ #ifdef XPTI_ENABLE_INSTRUMENTATION
449+ xpti_td *CmdTraceEvent = nullptr;
450+ uint64_t InstanceID = 0;
451+ auto StreamID = detail::getActiveXPTIStreamID();
452+ // Only enable instrumentation if there are subscribers to the SYCL
453+ // stream.
454+ const bool xptiEnabled = xptiCheckTraceEnabled(StreamID);
455+ if (xptiEnabled) {
456+ std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
457+ StreamID, KernelImplPtr, CodeLoc, IsTopCodeLoc,
458+ *KData.getDeviceKernelInfoPtr(), this, KData.getNDRDesc(),
459+ KernelBundleImpPtr, KData.getArgs());
460+ detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
461+ xpti::trace_task_begin, nullptr);
462+ }
463+ #endif
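+ // For JIT-compiled AMDGCN/PTX kernels, the binary image is retrieved
+ // up front and handed to enqueueImpKernel.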
464+ const detail::RTDeviceBinaryImage *BinImage = nullptr;
465+ if (detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_KERNELS>::get()) {
466+ BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName());
467+ assert(BinImage && "Failed to obtain a binary image.");
468+ }
469+ enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(),
470+ KernelBundleImpPtr, KernelImplPtr,
471+ *KData.getDeviceKernelInfoPtr(), RawEvents,
472+ ResultEvent.get(), nullptr, KData.getKernelCacheConfig(),
473+ KData.isCooperative(), KData.usesClusterLaunch(),
474+ KData.getKernelWorkGroupMemorySize(), BinImage,
475+ KData.getKernelFuncPtr());
476+ #ifdef XPTI_ENABLE_INSTRUMENTATION
477+ if (xptiEnabled) {
478+ // Emit the signal only when an event has been created.
479+ if (!DiscardEvent) {
480+ detail::emitInstrumentationGeneral(
481+ StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal,
482+ static_cast<const void *>(ResultEvent->getHandle()));
483+ }
484+ detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent,
485+ xpti::trace_task_end, nullptr);
486+ }
487+ #endif
488+ };
489+
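+ // With a discarded event the kernel can be enqueued immediately;
+ // otherwise the result event is initialized before and finalized after
+ // the enqueue.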
490+ if (DiscardEvent) {
491+ EnqueueKernel();
492+ } else {
493+ ResultEvent->setWorkerQueue(weak_from_this());
494+ ResultEvent->setStateIncomplete();
495+ ResultEvent->setSubmissionTime();
496+
497+ EnqueueKernel();
498+ ResultEvent->setEnqueued();
499+ // Connect the returned event with its dependent events.
500+ if (!isInOrder()) {
501+ // DepEvents is not used anymore, so it can be moved from.
502+ ResultEvent->getPreparedDepsEvents() = std::move(DepEvents);
503+ // ResultEvent is local to the current thread, so no locking is needed.
504+ ResultEvent->cleanDepEventsThroughOneLevelUnlocked();
505+ }
506+ }
507+
508+ return ResultEvent;
509+ }
510+
423511EventImplPtr queue_impl::submit_command_to_graph(
424512 ext::oneapi::experimental::detail::graph_impl &GraphImpl,
425513 std::unique_ptr<detail::CG> CommandGroup, sycl::detail::CGType CGType,
@@ -475,26 +563,31 @@ EventImplPtr queue_impl::submit_command_to_graph(
475563 return EventImpl;
476564}
477565
478- detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
566+ EventImplPtr queue_impl::submit_kernel_direct_impl(
479567 const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel,
480568 detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
481569 const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
482570
483571 KernelData KData;
484572
485- std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
486- HostKernel.takeOrCopyOwnership();
487-
488573 KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
489- KData.setKernelFunc(HostKernelPtr->getPtr());
574+ KData.setKernelFunc(HostKernel.getPtr());
490575 KData.setNDRDesc(NDRDesc);
491576
492- auto SubmitKernelFunc =
493- [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr {
577+ auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
578+ bool SchedulerBypass) -> EventImplPtr {
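+ // Fast path: hand the kernel directly to the backend, skipping
+ // command-group creation and the scheduler.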
579+ if (SchedulerBypass) {
580+ return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
581+ CallerNeedsEvent, nullptr, nullptr,
582+ CodeLoc, IsTopCodeLoc);
583+ }
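+ // Scheduler path: build a full command group and submit it through the
+ // scheduler.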
494584 std::unique_ptr<detail::CG> CommandGroup;
495585 std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
496586 std::vector<std::shared_ptr<const void>> AuxiliaryResources;
497587
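+ // The scheduler may defer execution, so it takes (or copies) ownership
+ // of the host kernel; the bypass path above uses the caller's kernel
+ // pointer directly instead.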
588+ std::shared_ptr<detail::HostKernelBase> HostKernelPtr =
589+ HostKernel.takeOrCopyOwnership();
590+
498591 KData.extractArgsAndReqsFromLambda();
499592
500593 CommandGroup.reset(new detail::CGExecKernel(
@@ -504,10 +597,8 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
504597 std::move(CGData), std::move(KData).getArgs(),
505598 *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
506599 std::move(AuxiliaryResources), detail::CGType::Kernel,
507- UR_KERNEL_CACHE_CONFIG_DEFAULT,
508- false, // KernelIsCooperative
509- false, // KernelUsesClusterLaunch
510- 0, // KernelWorkGroupMemorySize
600+ KData.getKernelCacheConfig(), KData.isCooperative(),
601+ KData.usesClusterLaunch(), KData.getKernelWorkGroupMemorySize(),
511602 CodeLoc));
512603 CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
513604
@@ -567,11 +658,21 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
567658 }
568659 }
569660
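+ // The scheduler can be bypassed only if all dependency events are safe
+ // to pass directly to the backend and the queue is not recording to a
+ // command graph.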
570- EventImplPtr EventImpl = SubmitCommandFunc(CGData);
661+ bool SchedulerBypass =
662+ (CGData.MEvents.size() > 0
663+ ? detail::Scheduler::areEventsSafeForSchedulerBypass(
664+ CGData.MEvents, getContextImpl())
665+ : true) &&
666+ !hasCommandGraph();
571667
572- // Sync with the last event for in order queue
573- if (isInOrder() && !EventImpl->isDiscarded()) {
574- LastEvent = EventImpl;
668+ EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);
669+
670+ // Sync with the last event for an in-order queue. For the
671+ // scheduler-bypass flow, ordering is handled by the layers below the
672+ // SYCL runtime, but for the scheduler-based flow it must be done here,
673+ // since the scheduler also handles host task submissions.
674+ if (isInOrder()) {
675+ LastEvent = SchedulerBypass ? nullptr : EventImpl;
575676 }
576677
577678 // Barrier and un-enqueued commands synchronization for out-of-order queue