diff --git a/api/v1/tensorfusionconnection_types.go b/api/v1/tensorfusionconnection_types.go index d3116426..617e6405 100644 --- a/api/v1/tensorfusionconnection_types.go +++ b/api/v1/tensorfusionconnection_types.go @@ -31,8 +31,9 @@ const ( type Resource struct { Tflops resource.Quantity `json:"tflops"` + // +optional // 0-100 percentage, mutually exclusive with TFLOPs - ComputePercent resource.Quantity `json:"compute"` + ComputePercent resource.Quantity `json:"compute,omitempty"` Vram resource.Quantity `json:"vram"` } diff --git a/charts/tensor-fusion/Chart.yaml b/charts/tensor-fusion/Chart.yaml index 32068abf..1644fe3c 100644 --- a/charts/tensor-fusion/Chart.yaml +++ b/charts/tensor-fusion/Chart.yaml @@ -15,10 +15,10 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.7.1 +version: 1.7.2 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.48.2" +appVersion: "1.48.3" diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpuresourcequotas.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpuresourcequotas.yaml index f3c5ffdf..b579a45b 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_gpuresourcequotas.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_gpuresourcequotas.yaml @@ -82,7 +82,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -110,7 +109,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -139,7 +137,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -166,7 +163,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -206,7 +202,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -238,7 +233,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -349,7 +343,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -376,7 +369,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml index bb2a3330..9069664d 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml @@ -89,7 +89,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -115,7 +114,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml index 70ea9e71..ac79c7ca 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -166,7 +166,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -193,7 +192,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml index 12315a1c..e89011eb 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -187,7 +187,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -214,7 +213,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -507,7 +505,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -533,7 +530,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -594,7 +590,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -621,7 +616,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -747,7 +741,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -773,7 +766,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml index 13f34962..f3f4a37d 100644 --- a/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml +++ b/charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml @@ -174,7 +174,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -201,7 +200,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -494,7 +492,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -520,7 +517,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object diff --git a/charts/tensor-fusion/values.yaml b/charts/tensor-fusion/values.yaml index 87c57d68..115a992a 100644 --- a/charts/tensor-fusion/values.yaml +++ b/charts/tensor-fusion/values.yaml @@ -51,14 +51,16 @@ controller: tolerations: [] affinity: podAntiAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - - topologyKey: kubernetes.io/hostname - labelSelector: - matchExpressions: - - key: tensor-fusion.ai/component - operator: In - values: - - operator + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + topologyKey: kubernetes.io/hostname + labelSelector: + matchExpressions: + - key: tensor-fusion.ai/component + operator: In + values: + - operator livenessProbe: httpGet: diff --git a/cmd/nodediscovery/main.go b/cmd/nodediscovery/main.go index ef366855..7ed30c0f 100644 --- a/cmd/nodediscovery/main.go +++ b/cmd/nodediscovery/main.go @@ -149,8 +149,8 @@ func main() { numaNodeId, ret := device.GetNumaNodeId() if ret != nvml.SUCCESS { - ctrl.Log.Error(errors.New(nvml.ErrorString(ret)), "unable to get NUMA node ID of device", "index", i) - os.Exit(1) + ctrl.Log.Info("unable to get NUMA node ID of device, will set to -1", "index", i, "msg", nvml.ErrorString(ret)) + numaNodeId = -1 } // Nvidia mobile series GPU chips are the same as desktop series GPU, but clock speed is lower diff --git a/config/crd/bases/tensor-fusion.ai_gpuresourcequotas.yaml b/config/crd/bases/tensor-fusion.ai_gpuresourcequotas.yaml index f3c5ffdf..b579a45b 100644 --- a/config/crd/bases/tensor-fusion.ai_gpuresourcequotas.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpuresourcequotas.yaml @@ -82,7 +82,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -110,7 +109,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -139,7 +137,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -166,7 +163,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -206,7 +202,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -238,7 +233,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -349,7 +343,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -376,7 +369,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object diff --git a/config/crd/bases/tensor-fusion.ai_gpus.yaml b/config/crd/bases/tensor-fusion.ai_gpus.yaml index bb2a3330..9069664d 100644 --- a/config/crd/bases/tensor-fusion.ai_gpus.yaml +++ b/config/crd/bases/tensor-fusion.ai_gpus.yaml @@ -89,7 +89,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -115,7 +114,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object diff --git a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml index 70ea9e71..ac79c7ca 100644 --- a/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml +++ b/config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml @@ -166,7 +166,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -193,7 +192,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml index 12315a1c..e89011eb 100644 --- a/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml +++ b/config/crd/bases/tensor-fusion.ai_tensorfusionworkloads.yaml @@ -187,7 +187,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -214,7 +213,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -507,7 +505,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -533,7 +530,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -594,7 +590,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -621,7 +616,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -747,7 +741,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -773,7 +766,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object diff --git a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml index 13f34962..f3f4a37d 100644 --- a/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml +++ b/config/crd/bases/tensor-fusion.ai_workloadprofiles.yaml @@ -174,7 +174,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -201,7 +200,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -494,7 +492,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object @@ -520,7 +517,6 @@ spec: pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true required: - - compute - tflops - vram type: object diff --git a/internal/utils/config.go b/internal/utils/config.go index f18ac5fc..ee4a228c 100644 --- a/internal/utils/config.go +++ b/internal/utils/config.go @@ -4,6 +4,7 @@ import ( context "context" "encoding/base64" "encoding/json" + "fmt" "os" "strings" "time" @@ -128,23 +129,20 @@ func GetGPUResource(pod *corev1.Pod, isRequest bool) (tfv1.Resource, error) { vramKey = constants.VRAMLimitAnnotation computePercentKey = constants.ComputeLimitAnnotation } - tflops, err := resource.ParseQuantity(pod.Annotations[tflopsKey]) - if err != nil { - return tfv1.Resource{}, err - } - vram, err := resource.ParseQuantity(pod.Annotations[vramKey]) - if err != nil { - return tfv1.Resource{}, err + + tflops, tflopsErr := resource.ParseQuantity(pod.Annotations[tflopsKey]) + computePercent, percentErr := resource.ParseQuantity(pod.Annotations[computePercentKey]) + if tflopsErr == nil && percentErr == nil { + return tfv1.Resource{}, fmt.Errorf("tflops and compute-percent are mutually exclusive, please specify only one") + } else if tflopsErr != nil && percentErr != nil { + return tfv1.Resource{}, fmt.Errorf("failed to parse tflops or compute-percent, must specify one: %w", tflopsErr) } - // TFLOPs and compute percent are mutually exclusive, if TFLOPs is set, compute percent will be ignored - computePercent := resource.Quantity{} - if tflops.IsZero() { - computePercent, err = resource.ParseQuantity(pod.Annotations[computePercentKey]) - if err != nil { - return tfv1.Resource{}, err - } + vram, vramErr := resource.ParseQuantity(pod.Annotations[vramKey]) + if vramErr != nil { + return tfv1.Resource{}, vramErr } + return tfv1.Resource{ Tflops: tflops, Vram: vram,