11package redisfailover
22
33import (
4+ "context"
45 "errors"
6+ "github.com/spotahome/redis-operator/service/k8s"
7+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
58 "strconv"
69 "time"
710
@@ -85,6 +88,15 @@ func (r *RedisFailoverHandler) UpdateRedisesPods(rf *redisfailoverv1.RedisFailov
8588// CheckAndHeal runs verification checks to ensure the RedisFailover is in an expected and healthy state.
8689// If the checks do not match up to expectations, an attempt will be made to "heal" the RedisFailover into a healthy state.
8790func (r * RedisFailoverHandler ) CheckAndHeal (rf * redisfailoverv1.RedisFailover ) error {
91+
92+ oldState := rf .Status .State
93+
94+ rf .Status = redisfailoverv1.RedisFailoverStatus {
95+ State : redisfailoverv1 .HealthyState ,
96+ }
97+
98+ defer updateStatus (r .k8sservice , rf , oldState )
99+
88100 if rf .Bootstrapping () {
89101 return r .checkAndHealBootstrapMode (rf )
90102 }
@@ -99,19 +111,33 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
99111 // Sentinel knows the correct slave number
100112
101113 if ! r .rfChecker .IsRedisRunning (rf ) {
102- setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .REDIS_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New ("not all replicas running" ))
114+ errorMsg := "not all replicas running"
115+ rf .Status = redisfailoverv1.RedisFailoverStatus {
116+ State : redisfailoverv1 .NotHealthyState ,
117+ Message : errorMsg ,
118+ }
119+ setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .REDIS_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New (errorMsg ))
103120 r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Debugf ("Number of redis mismatch, waiting for redis statefulset reconcile" )
104121 return nil
105122 }
106123
107124 if ! r .rfChecker .IsSentinelRunning (rf ) {
108- setRedisCheckerMetrics (r .mClient , "sentinel" , rf .Namespace , rf .Name , metrics .SENTINEL_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New ("not all replicas running" ))
125+ errorMsg := "not all replicas running"
126+ rf .Status = redisfailoverv1.RedisFailoverStatus {
127+ State : redisfailoverv1 .NotHealthyState ,
128+ Message : errorMsg ,
129+ }
130+ setRedisCheckerMetrics (r .mClient , "sentinel" , rf .Namespace , rf .Name , metrics .SENTINEL_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New (errorMsg ))
109131 r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Debugf ("Number of sentinel mismatch, waiting for sentinel deployment reconcile" )
110132 return nil
111133 }
112134
113135 nMasters , err := r .rfChecker .GetNumberMasters (rf )
114136 if err != nil {
137+ rf .Status = redisfailoverv1.RedisFailoverStatus {
138+ State : redisfailoverv1 .NotHealthyState ,
139+ Message : "unable to get number of masters" ,
140+ }
115141 return err
116142 }
117143
@@ -125,7 +151,12 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
125151 err = r .rfHealer .SetOldestAsMaster (rf )
126152 setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .NO_MASTER , metrics .NOT_APPLICABLE , err )
127153 if err != nil {
128- r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf ("Error in Setting oldest Pod as master" )
154+ errorMsg := "Error in Setting oldest Pod as master"
155+ rf .Status = redisfailoverv1.RedisFailoverStatus {
156+ State : redisfailoverv1 .NotHealthyState ,
157+ Message : errorMsg ,
158+ }
159+ r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf (errorMsg )
129160 return err
130161 }
131162 return nil
@@ -138,6 +169,10 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
138169 r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Warningf ("Number of Masters running is 0" )
139170 maxUptime , err := r .rfChecker .GetMaxRedisPodTime (rf )
140171 if err != nil {
172+ rf .Status = redisfailoverv1.RedisFailoverStatus {
173+ State : redisfailoverv1 .NotHealthyState ,
174+ Message : "unable to get Redis POD time" ,
175+ }
141176 return err
142177 }
143178
@@ -150,13 +185,22 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
150185 err2 := r .rfHealer .SetOldestAsMaster (rf )
151186 setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .NO_MASTER , metrics .NOT_APPLICABLE , err2 )
152187 if err2 != nil {
153- r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf ("Error in Setting oldest Pod as master" )
188+ errorMsg := "Error in Setting oldest Pod as master"
189+ rf .Status = redisfailoverv1.RedisFailoverStatus {
190+ State : redisfailoverv1 .NotHealthyState ,
191+ Message : errorMsg ,
192+ }
193+ r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf (errorMsg )
154194 return err2
155195 }
156196 } else {
157197 //sentinels are having a quorum to make a failover , but check if redis are not having local hostip (first boot) as master
158198 status , err2 := r .rfChecker .CheckIfMasterLocalhost (rf )
159199 if err2 != nil {
200+ rf .Status = redisfailoverv1.RedisFailoverStatus {
201+ State : redisfailoverv1 .NotHealthyState ,
202+ Message : "unable to check if master localhost" ,
203+ }
160204 r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf ("CheckIfMasterLocalhost failed retry later" )
161205 return err2
162206 } else if status {
@@ -165,7 +209,12 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
165209 err3 := r .rfHealer .SetOldestAsMaster (rf )
166210 setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .NO_MASTER , metrics .NOT_APPLICABLE , err3 )
167211 if err3 != nil {
168- r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf ("Error in Setting oldest Pod as master" )
212+ errorMsg := "Error in Setting oldest Pod as master"
213+ rf .Status = redisfailoverv1.RedisFailoverStatus {
214+ State : redisfailoverv1 .NotHealthyState ,
215+ Message : errorMsg ,
216+ }
217+ r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Errorf (errorMsg )
169218 return err3
170219 }
171220
@@ -183,11 +232,20 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
183232 setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .NUMBER_OF_MASTERS , metrics .NOT_APPLICABLE , nil )
184233 default :
185234 setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .NUMBER_OF_MASTERS , metrics .NOT_APPLICABLE , errors .New ("multiple masters detected" ))
186- return errors .New ("more than one master, fix manually" )
235+ errorMsg := "more than one master, fix manually"
236+ rf .Status = redisfailoverv1.RedisFailoverStatus {
237+ State : redisfailoverv1 .NotHealthyState ,
238+ Message : errorMsg ,
239+ }
240+ return errors .New (errorMsg )
187241 }
188242
189243 master , err := r .rfChecker .GetMasterIP (rf )
190244 if err != nil {
245+ rf .Status = redisfailoverv1.RedisFailoverStatus {
246+ State : redisfailoverv1 .NotHealthyState ,
247+ Message : "unable to get master IP" ,
248+ }
191249 return err
192250 }
193251
@@ -196,23 +254,38 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
196254 if err != nil {
197255 r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Warningf ("Slave not associated to master: %s" , err .Error ())
198256 if err = r .rfHealer .SetMasterOnAll (master , rf ); err != nil {
257+ rf .Status = redisfailoverv1.RedisFailoverStatus {
258+ State : redisfailoverv1 .NotHealthyState ,
259+ }
199260 return err
200261 }
201262 }
202263
203264 err = r .applyRedisCustomConfig (rf )
204265 setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .APPLY_REDIS_CONFIG , metrics .NOT_APPLICABLE , err )
205266 if err != nil {
267+ rf .Status = redisfailoverv1.RedisFailoverStatus {
268+ State : redisfailoverv1 .NotHealthyState ,
269+ Message : "unable to apply custom config" ,
270+ }
206271 return err
207272 }
208273
209274 err = r .UpdateRedisesPods (rf )
210275 if err != nil {
276+ rf .Status = redisfailoverv1.RedisFailoverStatus {
277+ State : redisfailoverv1 .NotHealthyState ,
278+ Message : "unable to update redis PODs" ,
279+ }
211280 return err
212281 }
213282
214283 sentinels , err := r .rfChecker .GetSentinelsIPs (rf )
215284 if err != nil {
285+ rf .Status = redisfailoverv1.RedisFailoverStatus {
286+ State : redisfailoverv1 .NotHealthyState ,
287+ Message : "unable to get sentinels IPs" ,
288+ }
216289 return err
217290 }
218291
@@ -223,6 +296,9 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
223296 if err != nil {
224297 r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Warningf ("Fixing sentinel not monitoring expected master: %s" , err .Error ())
225298 if err := r .rfHealer .NewSentinelMonitor (sip , master , rf ); err != nil {
299+ rf .Status = redisfailoverv1.RedisFailoverStatus {
300+ State : redisfailoverv1 .NotHealthyState ,
301+ }
226302 return err
227303 }
228304 }
@@ -233,37 +309,62 @@ func (r *RedisFailoverHandler) CheckAndHeal(rf *redisfailoverv1.RedisFailover) e
233309func (r * RedisFailoverHandler ) checkAndHealBootstrapMode (rf * redisfailoverv1.RedisFailover ) error {
234310
235311 if ! r .rfChecker .IsRedisRunning (rf ) {
236- setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .REDIS_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New ("not all replicas running" ))
312+ errorMsg := "not all replicas running"
313+ r .k8sservice .UpdateRedisFailoverStatus (context .Background (), rf .Namespace , rf , metav1.PatchOptions {})
314+ setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .REDIS_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New (errorMsg ))
237315 r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Debugf ("Number of redis mismatch, waiting for redis statefulset reconcile" )
238316 return nil
239317 }
240318
241319 err := r .UpdateRedisesPods (rf )
242320 if err != nil {
243- return err
321+ rf .Status = redisfailoverv1.RedisFailoverStatus {
322+ State : redisfailoverv1 .NotHealthyState ,
323+ Message : "unable to update Redis PODs" ,
324+ }
244325 }
245326 err = r .applyRedisCustomConfig (rf )
246327 setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .APPLY_REDIS_CONFIG , metrics .NOT_APPLICABLE , err )
247328 if err != nil {
329+ rf .Status = redisfailoverv1.RedisFailoverStatus {
330+ State : redisfailoverv1 .NotHealthyState ,
331+ Message : "unable to set Redis custom config" ,
332+ }
248333 return err
249334 }
250335
251336 bootstrapSettings := rf .Spec .BootstrapNode
252337 err = r .rfHealer .SetExternalMasterOnAll (bootstrapSettings .Host , bootstrapSettings .Port , rf )
253338 setRedisCheckerMetrics (r .mClient , "redis" , rf .Namespace , rf .Name , metrics .APPLY_EXTERNAL_MASTER , metrics .NOT_APPLICABLE , err )
254339 if err != nil {
340+ rf .Status = redisfailoverv1.RedisFailoverStatus {
341+ State : redisfailoverv1 .NotHealthyState ,
342+ Message : "unable to set external master to all" ,
343+ }
255344 return err
256345 }
257346
258347 if rf .SentinelsAllowed () {
259348 if ! r .rfChecker .IsSentinelRunning (rf ) {
260- setRedisCheckerMetrics (r .mClient , "sentinel" , rf .Namespace , rf .Name , metrics .SENTINEL_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New ("not all replicas running" ))
349+ errorMsg := "not all replicas running"
350+ rf .Status = redisfailoverv1.RedisFailoverStatus {
351+ State : redisfailoverv1 .NotHealthyState ,
352+ Message : errorMsg ,
353+ }
354+ r .k8sservice .UpdateRedisFailoverStatus (context .Background (), rf .Namespace , rf , metav1.PatchOptions {})
355+ setRedisCheckerMetrics (r .mClient , "sentinel" , rf .Namespace , rf .Name , metrics .SENTINEL_REPLICA_MISMATCH , metrics .NOT_APPLICABLE , errors .New (errorMsg ))
261356 r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Debugf ("Number of sentinel mismatch, waiting for sentinel deployment reconcile" )
262357 return nil
358+ } else {
359+ r .k8sservice .UpdateRedisFailoverStatus (context .Background (), rf .Namespace , rf , metav1.PatchOptions {})
263360 }
264361
265362 sentinels , err := r .rfChecker .GetSentinelsIPs (rf )
266363 if err != nil {
364+ rf .Status = redisfailoverv1.RedisFailoverStatus {
365+ State : redisfailoverv1 .NotHealthyState ,
366+ Message : "unable to get sentinels IPs" ,
367+ }
267368 return err
268369 }
269370 for _ , sip := range sentinels {
@@ -272,6 +373,10 @@ func (r *RedisFailoverHandler) checkAndHealBootstrapMode(rf *redisfailoverv1.Red
272373 if err != nil {
273374 r .logger .WithField ("redisfailover" , rf .ObjectMeta .Name ).WithField ("namespace" , rf .ObjectMeta .Namespace ).Warningf ("Fixing sentinel not monitoring expected master: %s" , err .Error ())
274375 if err := r .rfHealer .NewSentinelMonitorWithPort (sip , bootstrapSettings .Host , bootstrapSettings .Port , rf ); err != nil {
376+ rf .Status = redisfailoverv1.RedisFailoverStatus {
377+ State : redisfailoverv1 .NotHealthyState ,
378+ Message : "unable to check sentinel monitor" ,
379+ }
275380 return err
276381 }
277382 }
@@ -346,3 +451,10 @@ func setRedisCheckerMetrics(metricsClient metrics.Recorder, mode /* redis or sen
346451 }
347452 }
348453}
454+
455+ func updateStatus (k8sservice k8s.Services , rf * redisfailoverv1.RedisFailover , oldState string ) {
456+ if oldState != rf .Status .State {
457+ rf .Status .LastChanged = time .Now ().Format (time .RFC3339 )
458+ }
459+ k8sservice .UpdateRedisFailoverStatus (context .Background (), rf .Namespace , rf , metav1.PatchOptions {})
460+ }
0 commit comments