Skip to content

Commit 6323781

Browse files
authored
fix: numa node not found and required compute percent issue (#416)
1 parent ddb7ae5 commit 6323781

15 files changed

+28
-75
lines changed

api/v1/tensorfusionconnection_types.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,9 @@ const (
3131
type Resource struct {
3232
Tflops resource.Quantity `json:"tflops"`
3333

34+
// +optional
3435
// 0-100 percentage, mutually exclusive with TFLOPs
35-
ComputePercent resource.Quantity `json:"compute"`
36+
ComputePercent resource.Quantity `json:"compute,omitempty"`
3637

3738
Vram resource.Quantity `json:"vram"`
3839
}

charts/tensor-fusion/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.7.1
18+
version: 1.7.2
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to
2222
# follow Semantic Versioning. They should reflect the version the application is using.
2323
# It is recommended to use it with quotes.
24-
appVersion: "1.48.2"
24+
appVersion: "1.48.3"

charts/tensor-fusion/crds/tensor-fusion.ai_gpuresourcequotas.yaml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ spec:
8282
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
8383
x-kubernetes-int-or-string: true
8484
required:
85-
- compute
8685
- tflops
8786
- vram
8887
type: object
@@ -110,7 +109,6 @@ spec:
110109
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
111110
x-kubernetes-int-or-string: true
112111
required:
113-
- compute
114112
- tflops
115113
- vram
116114
type: object
@@ -139,7 +137,6 @@ spec:
139137
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
140138
x-kubernetes-int-or-string: true
141139
required:
142-
- compute
143140
- tflops
144141
- vram
145142
type: object
@@ -166,7 +163,6 @@ spec:
166163
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
167164
x-kubernetes-int-or-string: true
168165
required:
169-
- compute
170166
- tflops
171167
- vram
172168
type: object
@@ -206,7 +202,6 @@ spec:
206202
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
207203
x-kubernetes-int-or-string: true
208204
required:
209-
- compute
210205
- tflops
211206
- vram
212207
type: object
@@ -238,7 +233,6 @@ spec:
238233
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
239234
x-kubernetes-int-or-string: true
240235
required:
241-
- compute
242236
- tflops
243237
- vram
244238
type: object
@@ -349,7 +343,6 @@ spec:
349343
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
350344
x-kubernetes-int-or-string: true
351345
required:
352-
- compute
353346
- tflops
354347
- vram
355348
type: object
@@ -376,7 +369,6 @@ spec:
376369
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
377370
x-kubernetes-int-or-string: true
378371
required:
379-
- compute
380372
- tflops
381373
- vram
382374
type: object

charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ spec:
8989
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
9090
x-kubernetes-int-or-string: true
9191
required:
92-
- compute
9392
- tflops
9493
- vram
9594
type: object
@@ -115,7 +114,6 @@ spec:
115114
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
116115
x-kubernetes-int-or-string: true
117116
required:
118-
- compute
119117
- tflops
120118
- vram
121119
type: object

charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,6 @@ spec:
166166
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
167167
x-kubernetes-int-or-string: true
168168
required:
169-
- compute
170169
- tflops
171170
- vram
172171
type: object
@@ -193,7 +192,6 @@ spec:
193192
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
194193
x-kubernetes-int-or-string: true
195194
required:
196-
- compute
197195
- tflops
198196
- vram
199197
type: object

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionworkloads.yaml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,6 @@ spec:
187187
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
188188
x-kubernetes-int-or-string: true
189189
required:
190-
- compute
191190
- tflops
192191
- vram
193192
type: object
@@ -214,7 +213,6 @@ spec:
214213
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
215214
x-kubernetes-int-or-string: true
216215
required:
217-
- compute
218216
- tflops
219217
- vram
220218
type: object
@@ -507,7 +505,6 @@ spec:
507505
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
508506
x-kubernetes-int-or-string: true
509507
required:
510-
- compute
511508
- tflops
512509
- vram
513510
type: object
@@ -533,7 +530,6 @@ spec:
533530
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
534531
x-kubernetes-int-or-string: true
535532
required:
536-
- compute
537533
- tflops
538534
- vram
539535
type: object
@@ -594,7 +590,6 @@ spec:
594590
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
595591
x-kubernetes-int-or-string: true
596592
required:
597-
- compute
598593
- tflops
599594
- vram
600595
type: object
@@ -621,7 +616,6 @@ spec:
621616
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
622617
x-kubernetes-int-or-string: true
623618
required:
624-
- compute
625619
- tflops
626620
- vram
627621
type: object
@@ -747,7 +741,6 @@ spec:
747741
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
748742
x-kubernetes-int-or-string: true
749743
required:
750-
- compute
751744
- tflops
752745
- vram
753746
type: object
@@ -773,7 +766,6 @@ spec:
773766
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
774767
x-kubernetes-int-or-string: true
775768
required:
776-
- compute
777769
- tflops
778770
- vram
779771
type: object

charts/tensor-fusion/crds/tensor-fusion.ai_workloadprofiles.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,6 @@ spec:
174174
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
175175
x-kubernetes-int-or-string: true
176176
required:
177-
- compute
178177
- tflops
179178
- vram
180179
type: object
@@ -201,7 +200,6 @@ spec:
201200
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
202201
x-kubernetes-int-or-string: true
203202
required:
204-
- compute
205203
- tflops
206204
- vram
207205
type: object
@@ -494,7 +492,6 @@ spec:
494492
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
495493
x-kubernetes-int-or-string: true
496494
required:
497-
- compute
498495
- tflops
499496
- vram
500497
type: object
@@ -520,7 +517,6 @@ spec:
520517
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
521518
x-kubernetes-int-or-string: true
522519
required:
523-
- compute
524520
- tflops
525521
- vram
526522
type: object

charts/tensor-fusion/values.yaml

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,16 @@ controller:
5151
tolerations: []
5252
affinity:
5353
podAntiAffinity:
54-
requiredDuringSchedulingIgnoredDuringExecution:
55-
- topologyKey: kubernetes.io/hostname
56-
labelSelector:
57-
matchExpressions:
58-
- key: tensor-fusion.ai/component
59-
operator: In
60-
values:
61-
- operator
54+
preferredDuringSchedulingIgnoredDuringExecution:
55+
- weight: 100
56+
podAffinityTerm:
57+
topologyKey: kubernetes.io/hostname
58+
labelSelector:
59+
matchExpressions:
60+
- key: tensor-fusion.ai/component
61+
operator: In
62+
values:
63+
- operator
6264

6365
livenessProbe:
6466
httpGet:

cmd/nodediscovery/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,8 @@ func main() {
149149

150150
numaNodeId, ret := device.GetNumaNodeId()
151151
if ret != nvml.SUCCESS {
152-
ctrl.Log.Error(errors.New(nvml.ErrorString(ret)), "unable to get NUMA node ID of device", "index", i)
153-
os.Exit(1)
152+
ctrl.Log.Info("unable to get NUMA node ID of device, will set to -1", "index", i, "msg", nvml.ErrorString(ret))
153+
numaNodeId = -1
154154
}
155155

156156
// NVIDIA mobile-series GPUs use the same chips as desktop-series GPUs, but run at lower clock speeds

config/crd/bases/tensor-fusion.ai_gpuresourcequotas.yaml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ spec:
8282
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
8383
x-kubernetes-int-or-string: true
8484
required:
85-
- compute
8685
- tflops
8786
- vram
8887
type: object
@@ -110,7 +109,6 @@ spec:
110109
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
111110
x-kubernetes-int-or-string: true
112111
required:
113-
- compute
114112
- tflops
115113
- vram
116114
type: object
@@ -139,7 +137,6 @@ spec:
139137
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
140138
x-kubernetes-int-or-string: true
141139
required:
142-
- compute
143140
- tflops
144141
- vram
145142
type: object
@@ -166,7 +163,6 @@ spec:
166163
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
167164
x-kubernetes-int-or-string: true
168165
required:
169-
- compute
170166
- tflops
171167
- vram
172168
type: object
@@ -206,7 +202,6 @@ spec:
206202
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
207203
x-kubernetes-int-or-string: true
208204
required:
209-
- compute
210205
- tflops
211206
- vram
212207
type: object
@@ -238,7 +233,6 @@ spec:
238233
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
239234
x-kubernetes-int-or-string: true
240235
required:
241-
- compute
242236
- tflops
243237
- vram
244238
type: object
@@ -349,7 +343,6 @@ spec:
349343
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
350344
x-kubernetes-int-or-string: true
351345
required:
352-
- compute
353346
- tflops
354347
- vram
355348
type: object
@@ -376,7 +369,6 @@ spec:
376369
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
377370
x-kubernetes-int-or-string: true
378371
required:
379-
- compute
380372
- tflops
381373
- vram
382374
type: object

0 commit comments

Comments
 (0)