Skip to content

Commit fd54539

Browse files
committed
Report job success immediately after main graph completes, before post steps
- Report success to users as soon as the main graph completes successfully.
- Post steps (promotion/cleanup) now run as best-effort and don't affect the job result.
- Add metrics tracking: main_graph_duration_seconds, post_steps_duration_seconds, time_saved_seconds.
- Metrics are extractable from ci-operator-metrics.json in test_platform_insights events.
- Prevent duplicate success reporting by tracking early-report state in options.
1 parent b751022 commit fd54539

1 file changed

Lines changed: 41 additions & 4 deletions

File tree

cmd/ci-operator/main.go

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -454,6 +454,8 @@ type options struct {
454454
metricsAgent *metrics.MetricsAgent
455455

456456
skippedImages sets.Set[string]
457+
458+
successReported bool
457459
}
458460

459461
func bindOptions(flag *flag.FlagSet) *options {
@@ -919,6 +921,11 @@ func (o *options) Report(errs ...error) {
919921
}
920922

921923
if len(errorToReport) == 0 {
924+
// Skip reporting success if it was already reported early (before post steps)
925+
if o.successReported {
926+
logrus.Debug("Success was already reported early, skipping duplicate report.")
927+
return
928+
}
922929
reporter.Report(nil)
923930
}
924931
}
@@ -1057,26 +1064,56 @@ func (o *options) Run() []error {
10571064
return wrapped
10581065
}
10591066

1060-
// Run each of the promotion steps concurrently
1067+
// Main graph completed successfully - report success immediately before post steps
1068+
mainGraphCompletedAt := time.Now()
1069+
mainGraphDuration := mainGraphCompletedAt.Sub(start)
1070+
eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace))
1071+
1072+
// Report success to users immediately (post steps are best-effort cleanup)
1073+
reporter, loadErr := o.resultsOptions.Reporter(o.jobSpec, o.consoleHost)
1074+
if loadErr != nil {
1075+
logrus.WithError(loadErr).Warn("Could not load result reporting options, skipping early success report.")
1076+
} else {
1077+
reporter.Report(nil)
1078+
o.successReported = true
1079+
}
1080+
1081+
// Run each of the promotion steps concurrently (best-effort cleanup)
1082+
postStepsStart := time.Now()
10611083
lenOfPromotionSteps := len(promotionSteps)
10621084
detailsChan := make(chan api.CIOperatorStepDetails, lenOfPromotionSteps)
10631085
errChan := make(chan error, lenOfPromotionSteps)
10641086
for _, step := range promotionSteps {
10651087
go runPromotionStep(ctx, step, detailsChan, errChan, o.metricsAgent)
10661088
}
1089+
postStepsFailed := false
10671090
for i := 0; i < lenOfPromotionSteps; i++ {
10681091
select {
10691092
case details := <-detailsChan:
10701093
graph.MergeFrom(details)
10711094
case err := <-errChan:
10721095
errorDesc := fmt.Sprintf("post step failed while %s. with error: %v", eventJobDescription(o.jobSpec, o.namespace), err)
10731096
eventRecorder.Event(runtimeObject, coreapi.EventTypeWarning, "PostStepFailed", errorDesc)
1074-
return []error{results.ForReason("executing_post").WithError(err).Unwrap()} // If any of the promotion steps fail, it is considered a failure
1097+
logrus.WithError(err).Warn("Post step failed, but job success was already reported. Continuing with cleanup.")
1098+
postStepsFailed = true
1099+
// Post step failures don't affect job success (already reported), but we still record them
10751100
}
10761101
}
10771102

1078-
eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace))
1079-
o.metricsAgent.Record(metrics.NewInsightsEvent(metrics.InsightExecutionCompleted, metrics.Context{"duration_seconds": time.Since(start).Seconds(), "success": true}))
1103+
// Record final metrics including post steps duration
1104+
postStepsDuration := time.Since(postStepsStart)
1105+
totalDuration := time.Since(start)
1106+
metricsContext := metrics.Context{
1107+
"duration_seconds": totalDuration.Seconds(),
1108+
"main_graph_duration_seconds": mainGraphDuration.Seconds(),
1109+
"post_steps_duration_seconds": postStepsDuration.Seconds(),
1110+
"time_saved_seconds": postStepsDuration.Seconds(),
1111+
"success": true,
1112+
}
1113+
if postStepsFailed {
1114+
metricsContext["post_steps_failed"] = true
1115+
}
1116+
o.metricsAgent.Record(metrics.NewInsightsEvent(metrics.InsightExecutionCompleted, metricsContext))
10801117

10811118
return nil
10821119
})

0 commit comments

Comments
 (0)