99 "os"
1010 "path"
1111 "regexp"
12+ "sort"
1213 "strconv"
1314 "strings"
1415 "sync"
@@ -103,6 +104,11 @@ const (
 	concurrentMG = 4
 	// annotation to look for in ClusterServiceVersions and ClusterOperators when using --all-images
 	mgAnnotation = "operators.openshift.io/must-gather-image"
+
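+	// well-known Kubernetes taints and node-role label used below to assess node health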
+	notReadyTaintKey          = "node.kubernetes.io/not-ready"
+	unreachableTaintKey       = "node.kubernetes.io/unreachable"
+	controlPlaneNodeRoleLabel = "node-role.kubernetes.io/control-plane"
 )
 
 func NewMustGatherCommand(f kcmdutil.Factory, streams genericiooptions.IOStreams) *cobra.Command {
@@ -476,6 +482,10 @@ func (o *MustGatherOptions) Run() error {
 	}
 
 	// ... and create must-gather pod(s)
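+	// pin the pods to a small set of healthy candidate nodes via node affinity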
+	candidateNames := getCandidateNodeNames(nodes, hasMaster)
+	affinity := buildNodeAffinity(candidateNames)
+
 	var pods []*corev1.Pod
 	for _, image := range o.Images {
 		_, err := imagereference.Parse(image)
@@ -496,7 +506,7 @@ func (o *MustGatherOptions) Run() error {
 				return err
 			}
 			for _, node := range nodes.Items {
-				pods = append(pods, o.newPod(node.Name, image, hasMaster))
+				pods = append(pods, o.newPod(node.Name, image, hasMaster, affinity))
 			}
 		} else {
 			if o.NodeName != "" {
@@ -506,7 +516,7 @@ func (o *MustGatherOptions) Run() error {
 					return err
 				}
 			}
-			pods = append(pods, o.newPod(o.NodeName, image, hasMaster))
+			pods = append(pods, o.newPod(o.NodeName, image, hasMaster, affinity))
 		}
 	}
 
@@ -924,7 +934,7 @@ func newClusterRoleBinding(ns *corev1.Namespace) *rbacv1.ClusterRoleBinding {
 // newPod creates a pod with 2 containers with a shared volume mount:
 // - gather: init containers that run gather command
 // - copy: no-op container we can exec into
-func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.Pod {
+func (o *MustGatherOptions) newPod(node, image string, hasMaster bool, affinity *corev1.Affinity) *corev1.Pod {
 	zero := int64(0)
 
 	nodeSelector := map[string]string{
@@ -956,6 +966,8 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P
 			// so setting priority class to system-cluster-critical
 			PriorityClassName: "system-cluster-critical",
 			RestartPolicy:     corev1.RestartPolicyNever,
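+			// constrain scheduling to the healthy candidate nodes computed in Run()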
+			Affinity:          affinity,
 			Volumes: []corev1.Volume{
 				{
 					Name: "must-gather-output",
@@ -1058,6 +1070,132 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P
 	return ret
 }
 
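+// getNodeLastHeartbeatTime returns the last heartbeat recorded on the node's Ready condition, or nil if none exists.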
+func getNodeLastHeartbeatTime(node corev1.Node) *metav1.Time {
+	for _, cond := range node.Status.Conditions {
+		if cond.Type == corev1.NodeReady {
+			if !cond.LastHeartbeatTime.IsZero() {
+				return &cond.LastHeartbeatTime
+			}
+			return nil
+		}
+	}
+	return nil
+}
+
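+// isNodeReadyByCondition reports whether the node's Ready condition is True.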
+func isNodeReadyByCondition(node corev1.Node) bool {
+	for _, cond := range node.Status.Conditions {
+		if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
+			return true
+		}
+	}
+	return false
+}
+
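+// isNodeReadyAndReachableByTaint reports whether the node carries neither the not-ready nor the unreachable taint.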
+func isNodeReadyAndReachableByTaint(node corev1.Node) bool {
+	for _, taint := range node.Spec.Taints {
+		if taint.Key == unreachableTaintKey || taint.Key == notReadyTaintKey {
+			return false
+		}
+	}
+	return true
+}
+
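+// getCandidateNodeNames buckets nodes by role, readiness, and schedulability, then
+// returns up to 10 node names ordered by most recent Ready heartbeat.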
+func getCandidateNodeNames(nodes *corev1.NodeList, hasMaster bool) []string {
+	var controlPlaneNodes, allControlPlaneNodes, workerNodes, unschedulableNodes, remainingNodes, selectedNodes []corev1.Node
+	for _, node := range nodes.Items {
+		if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
+			allControlPlaneNodes = append(allControlPlaneNodes, node)
+		}
+		if !isNodeReadyByCondition(node) || !isNodeReadyAndReachableByTaint(node) {
+			remainingNodes = append(remainingNodes, node)
+			continue
+		}
+		if node.Spec.Unschedulable {
+			unschedulableNodes = append(unschedulableNodes, node)
+			continue
+		}
+		if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
+			controlPlaneNodes = append(controlPlaneNodes, node)
+		} else {
+			workerNodes = append(workerNodes, node)
+		}
+	}
+
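+	// Choose the healthiest non-empty bucket. With masters present, only
+	// control-plane nodes are considered; otherwise fall back tier by tier.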
+	if hasMaster {
+		if len(controlPlaneNodes) > 0 {
+			selectedNodes = controlPlaneNodes
+		} else {
+			selectedNodes = allControlPlaneNodes
+		}
+	} else {
+		selectedNodes = controlPlaneNodes
+		if len(selectedNodes) == 0 {
+			selectedNodes = workerNodes
+		}
+		if len(selectedNodes) == 0 {
+			selectedNodes = unschedulableNodes
+		}
+		if len(selectedNodes) == 0 {
+			selectedNodes = remainingNodes
+		}
+	}
+
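+	// Most recently heartbeating nodes first; nodes without a heartbeat sort last.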
+	sort.SliceStable(selectedNodes, func(i, j int) bool {
+		iTime := getNodeLastHeartbeatTime(selectedNodes[i])
+		jTime := getNodeLastHeartbeatTime(selectedNodes[j])
+		if iTime == nil {
+			return false
+		}
+		if jTime == nil {
+			return true
+		}
+		return jTime.Before(iTime)
+	})
+
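+	// Keep only the first 10 candidates.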
+	nodeNames := []string{}
+	for idx, n := range selectedNodes {
+		if idx >= 10 {
+			break
+		}
+		nodeNames = append(nodeNames, n.Name)
+	}
+	return nodeNames
+}
+
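+// buildNodeAffinity returns a required node affinity over kubernetes.io/hostname
+// for the given names, or nil when there are no candidates.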
+func buildNodeAffinity(nodeHostnames []string) *corev1.Affinity {
+	if len(nodeHostnames) == 0 {
+		return nil
+	}
+	return &corev1.Affinity{
+		NodeAffinity: &corev1.NodeAffinity{
+			RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
+				NodeSelectorTerms: []corev1.NodeSelectorTerm{
+					{
+						MatchExpressions: []corev1.NodeSelectorRequirement{
+							{
+								Key:      "kubernetes.io/hostname",
+								Operator: corev1.NodeSelectorOpIn,
+								Values:   nodeHostnames,
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+}
+
 // BackupGathering is called if the full must-gather has an error. This is useful for making sure we get *something*
 // no matter what has failed. It should be focused on universal openshift failures.
 func (o *MustGatherOptions) BackupGathering(ctx context.Context, errs []error) {