Skip to content

Commit 0975f04

Browse files
committed
backport of openshift#2071 to release-4.19
1 parent 298429b commit 0975f04

File tree

1 file changed

+127
-3
lines changed

1 file changed

+127
-3
lines changed

pkg/cli/admin/mustgather/mustgather.go

Lines changed: 127 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"os"
1010
"path"
1111
"regexp"
12+
"sort"
1213
"strconv"
1314
"strings"
1415
"sync"
@@ -103,6 +104,10 @@ const (
103104
concurrentMG = 4
104105
// annotation to look for in ClusterServiceVersions and ClusterOperators when using --all-images
105106
mgAnnotation = "operators.openshift.io/must-gather-image"
107+
108+
notReadyTaintKey = "node.kubernetes.io/not-ready"
109+
unreachableTaintKey = "node.kubernetes.io/unreachable"
110+
controlPlaneNodeRoleLabel = "node-role.kubernetes.io/control-plane"
106111
)
107112

108113
func NewMustGatherCommand(f kcmdutil.Factory, streams genericiooptions.IOStreams) *cobra.Command {
@@ -476,6 +481,9 @@ func (o *MustGatherOptions) Run() error {
476481
}
477482

478483
// ... and create must-gather pod(s)
484+
candidateNames := getCandidateNodeNames(nodes, hasMaster)
485+
affinity := buildNodeAffinity(candidateNames)
486+
479487
var pods []*corev1.Pod
480488
for _, image := range o.Images {
481489
_, err := imagereference.Parse(image)
@@ -496,7 +504,7 @@ func (o *MustGatherOptions) Run() error {
496504
return err
497505
}
498506
for _, node := range nodes.Items {
499-
pods = append(pods, o.newPod(node.Name, image, hasMaster))
507+
pods = append(pods, o.newPod(node.Name, image, hasMaster, affinity))
500508
}
501509
} else {
502510
if o.NodeName != "" {
@@ -506,7 +514,7 @@ func (o *MustGatherOptions) Run() error {
506514
return err
507515
}
508516
}
509-
pods = append(pods, o.newPod(o.NodeName, image, hasMaster))
517+
pods = append(pods, o.newPod(o.NodeName, image, hasMaster, affinity))
510518
}
511519
}
512520

@@ -924,7 +932,7 @@ func newClusterRoleBinding(ns *corev1.Namespace) *rbacv1.ClusterRoleBinding {
924932
// newPod creates a pod with 2 containers with a shared volume mount:
925933
// - gather: init containers that run gather command
926934
// - copy: no-op container we can exec into
927-
func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.Pod {
935+
func (o *MustGatherOptions) newPod(node, image string, hasMaster bool, affinity *corev1.Affinity) *corev1.Pod {
928936
zero := int64(0)
929937

930938
nodeSelector := map[string]string{
@@ -956,6 +964,7 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P
956964
// so setting priority class to system-cluster-critical
957965
PriorityClassName: "system-cluster-critical",
958966
RestartPolicy: corev1.RestartPolicyNever,
967+
Affinity: affinity,
959968
Volumes: []corev1.Volume{
960969
{
961970
Name: "must-gather-output",
@@ -1058,6 +1067,121 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P
10581067
return ret
10591068
}
10601069

1070+
func getNodeLastHeartbeatTime(node corev1.Node) *metav1.Time {
1071+
for _, cond := range node.Status.Conditions {
1072+
if cond.Type == corev1.NodeReady {
1073+
if !cond.LastHeartbeatTime.IsZero() {
1074+
return &cond.LastHeartbeatTime
1075+
}
1076+
return nil
1077+
}
1078+
}
1079+
return nil
1080+
}
1081+
1082+
func isNodeReadyByCondition(node corev1.Node) bool {
1083+
for _, cond := range node.Status.Conditions {
1084+
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
1085+
return true
1086+
}
1087+
}
1088+
return false
1089+
}
1090+
1091+
func isNodeReadyAndReachableByTaint(node corev1.Node) bool {
1092+
for _, taint := range node.Spec.Taints {
1093+
if taint.Key == unreachableTaintKey || taint.Key == notReadyTaintKey {
1094+
return false
1095+
}
1096+
}
1097+
return true
1098+
}
1099+
1100+
func getCandidateNodeNames(nodes *corev1.NodeList, hasMaster bool) []string {
1101+
var controlPlaneNodes, allControlPlaneNodes, workerNodes, unschedulableNodes, remainingNodes, selectedNodes []corev1.Node
1102+
for _, node := range nodes.Items {
1103+
if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
1104+
allControlPlaneNodes = append(allControlPlaneNodes, node)
1105+
}
1106+
if !isNodeReadyByCondition(node) || !isNodeReadyAndReachableByTaint(node) {
1107+
remainingNodes = append(remainingNodes, node)
1108+
continue
1109+
}
1110+
if node.Spec.Unschedulable {
1111+
unschedulableNodes = append(unschedulableNodes, node)
1112+
continue
1113+
}
1114+
if _, ok := node.Labels[controlPlaneNodeRoleLabel]; ok {
1115+
controlPlaneNodes = append(controlPlaneNodes, node)
1116+
} else {
1117+
workerNodes = append(workerNodes, node)
1118+
}
1119+
}
1120+
1121+
if hasMaster {
1122+
if len(controlPlaneNodes) > 0 {
1123+
selectedNodes = controlPlaneNodes
1124+
} else {
1125+
selectedNodes = allControlPlaneNodes
1126+
}
1127+
} else {
1128+
selectedNodes = controlPlaneNodes
1129+
if len(selectedNodes) == 0 {
1130+
selectedNodes = workerNodes
1131+
}
1132+
if len(selectedNodes) == 0 {
1133+
selectedNodes = unschedulableNodes
1134+
}
1135+
if len(selectedNodes) == 0 {
1136+
selectedNodes = remainingNodes
1137+
}
1138+
}
1139+
1140+
sort.SliceStable(selectedNodes, func(i, j int) bool {
1141+
iTime := getNodeLastHeartbeatTime(selectedNodes[i])
1142+
jTime := getNodeLastHeartbeatTime(selectedNodes[j])
1143+
if jTime == nil {
1144+
return true
1145+
}
1146+
if iTime == nil {
1147+
return false
1148+
}
1149+
return jTime.Before(iTime)
1150+
})
1151+
1152+
nodeNames := []string{}
1153+
for idx, n := range selectedNodes {
1154+
if idx >= 10 {
1155+
break
1156+
}
1157+
nodeNames = append(nodeNames, n.Name)
1158+
}
1159+
return nodeNames
1160+
}
1161+
1162+
func buildNodeAffinity(nodeHostnames []string) *corev1.Affinity {
1163+
if len(nodeHostnames) == 0 {
1164+
return nil
1165+
}
1166+
return &corev1.Affinity{
1167+
NodeAffinity: &corev1.NodeAffinity{
1168+
RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
1169+
NodeSelectorTerms: []corev1.NodeSelectorTerm{
1170+
{
1171+
MatchExpressions: []corev1.NodeSelectorRequirement{
1172+
{
1173+
Key: "kubernetes.io/hostname",
1174+
Operator: corev1.NodeSelectorOpIn,
1175+
Values: nodeHostnames,
1176+
},
1177+
},
1178+
},
1179+
},
1180+
},
1181+
},
1182+
}
1183+
}
1184+
10611185
// BackupGathering is called if the full must-gather has an error. This is useful for making sure we get *something*
10621186
// no matter what has failed. It should be focused on universal openshift failures.
10631187
func (o *MustGatherOptions) BackupGathering(ctx context.Context, errs []error) {

0 commit comments

Comments
 (0)