Skip to content

Commit ccff1e4

Browse files
nikhilsksajmera-pensando
authored and committed
Adding DCM E2Es (#690)
* Adding DCM E2Es * sanity
1 parent e5d5bcd commit ccff1e4

File tree

2 files changed

+39
-68
lines changed

2 files changed

+39
-68
lines changed

tests/e2e/Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ lint:
6161
@goimports -w ./
6262
@go vet ./...
6363
dcm_e2e:
64-
go test -test.timeout=360m -check.f TestDCM.* -v -deviceConfigName test-deviceconfig
64+
go test -test.timeout=360m -check.f TestDCM.* -v -deviceConfigName test-deviceconfig -simEnable
6565
agfhc:
6666
go test -test.timeout=360m -check.f TestTestRunner.* -v -framework "AGFHC"
6767

tests/e2e/dcm_e2e_test.go

Lines changed: 38 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@ import (
2020
"bufio"
2121
"context"
2222
"encoding/json"
23+
"errors"
2324
"fmt"
2425
"os"
2526
"sort"
@@ -67,15 +68,13 @@ type GPUConfigProfile struct {
6768

6869
func (s *E2ESuite) addRemoveNodeLabels(nodeName string, selectedProfile string) {
6970
err := utils.AddNodeLabel(s.clientSet, nodeName, "dcm.amd.com/gpu-config-profile", selectedProfile)
70-
_ = utils.AddNodeLabel(s.clientSet, nodeName, "dcm.amd.com/apply-gpu-config-profile", "apply")
7171
if err != nil {
7272
logger.Infof("Error adding node lbels: %s\n", err.Error())
7373
return
7474
}
75-
time.Sleep(45 * time.Second)
75+
time.Sleep(15 * time.Second)
7676
// Allow partition to happen
7777
err = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/gpu-config-profile")
78-
_ = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/apply-gpu-config-profile")
7978
if err != nil {
8079
logger.Infof("Error removing node lbels: %s\n", err.Error())
8180
return
@@ -118,6 +117,7 @@ func (s *E2ESuite) GetPodName(ns string) (string, error) {
118117
func (s *E2ESuite) GetLatestEvents(ns string) ([]corev1.Event, error) {
119118

120119
dsName := s.cfgName + "-device-config-manager"
120+
logger.Infof("dsName: %s\n", dsName)
121121
fieldSelector := fields.Set{
122122
"involvedObject.name": dsName,
123123
}.AsSelector().String()
@@ -218,8 +218,9 @@ func (s *E2ESuite) createConfigMap() GPUConfigProfiles {
218218

219219
profiles_set1 := []*ProfileConfig{
220220
{
221-
ComputePartition: "SPX",
221+
ComputePartition: "DPX",
222222
MemoryPartition: "NPS1",
223+
NumGPUsAssigned: 1,
223224
},
224225
}
225226

@@ -406,18 +407,20 @@ func (s *E2ESuite) TestDCMConfigMapPartitionHomogenous(c *C) {
406407
s.configMapHelper(c)
407408
// Trigger partition using labels
408409
logger.Infof("Add node label after pod comes up")
409-
time.Sleep(30 * time.Second)
410+
time.Sleep(5 * time.Second)
410411

411412
nodeName := s.getWorkerNode(c)
412413
logger.Infof("NODE NAME %v", nodeName)
413414

414415
s.addRemoveNodeLabels(nodeName, "default")
415416

417+
logger.Infof("Getting pod logs")
416418
logs := s.getLogs()
417-
if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) {
419+
420+
if strings.Contains(logs, "Partition not required") || (strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal"))) {
418421
logger.Infof("Successfully tested homogenous default partitioning")
419422
} else {
420-
logger.Errorf("Failure test homogenous partitioning")
423+
assert.NoError(c, errors.New("Test case Failed"), "failure test homogenous partitioning")
421424
}
422425
}
423426

@@ -432,18 +435,19 @@ func (s *E2ESuite) TestDCMConfigMapPartitionHeterogenous(c *C) {
432435
s.configMapHelper(c)
433436
// Trigger partition using labels
434437
logger.Infof("Add node label after pod comes up")
435-
time.Sleep(30 * time.Second)
438+
time.Sleep(5 * time.Second)
436439

437440
nodeName := s.getWorkerNode(c)
438441
logger.Infof("NODE NAME %v", nodeName)
439442

440443
s.addRemoveNodeLabels(nodeName, "e2e_profile1")
441444

445+
logger.Infof("Getting pod logs")
442446
logs := s.getLogs()
443-
if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) {
447+
if strings.Contains(logs, "Partition not required") || (strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal"))) {
444448
logger.Infof("Successfully tested heterogenous partitioning")
445449
} else {
446-
logger.Errorf("Failure test heterogenous partitioning")
450+
assert.NoError(c, errors.New("Test case Failed"), "failure test heterogenous partitioning")
447451
}
448452
}
449453

@@ -458,131 +462,98 @@ func (s *E2ESuite) TestDCMPartitionNPS4(c *C) {
458462
s.configMapHelper(c)
459463
// Trigger partition using labels
460464
logger.Infof("Add node label after pod comes up")
461-
time.Sleep(30 * time.Second)
465+
time.Sleep(5 * time.Second)
462466

463467
nodeName := s.getWorkerNode(c)
464468
logger.Infof("NODE NAME %v", nodeName)
465469

466470
s.addRemoveNodeLabels(nodeName, "e2e_profile2")
467-
time.Sleep(30 * time.Second)
468471

472+
logger.Infof("Getting pod logs")
469473
logs := s.getLogs()
470-
if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) {
474+
if strings.Contains(logs, "Partition not required") || (strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal"))) {
471475
logger.Infof("Successfully tested NPS4 partitioning")
472476
} else {
473-
logger.Errorf("Failure test NPS4 partitioning")
477+
assert.NoError(c, errors.New("Test case Failed"), "failure test NPS4 partitioning")
474478
}
475479
}
476480

477481
func (s *E2ESuite) TestDCMInvalidComputeType(c *C) {
482+
if s.simEnable {
483+
c.Skip("Skipping for non amd gpu testbed")
484+
}
478485
if !dcmImageDefined {
479486
c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined")
480487
}
481488
c.Skip("Skipping DCM Partition test for now, enable after fixing the test")
482489
s.configMapHelper(c)
483490
// Trigger partition using labels
484491
logger.Infof("Add node label after pod comes up")
485-
time.Sleep(30 * time.Second)
492+
time.Sleep(5 * time.Second)
486493

487494
nodeName := s.getWorkerNode(c)
488495
logger.Infof("NODE NAME %v", nodeName)
489496

490497
s.addRemoveNodeLabels(nodeName, "inval_prof1")
491498

499+
logger.Infof("Getting pod logs")
492500
logs := s.getLogs()
493-
if strings.Contains(logs, "Invalid partition types") && (s.eventHelper("InvalidComputeType", "Warning")) {
501+
if strings.Contains(logs, "Invalid partition types") && (s.eventHelper("InvalidProfileInfo", "Warning")) {
494502
logger.Infof("Successfully tested invalid compute type profile")
495503
} else {
496-
logger.Errorf("Failure testing invalid compute type")
504+
assert.NoError(c, errors.New("Test case Failed"), "failure testing invalid compute type")
497505
}
498506
}
499507

500508
func (s *E2ESuite) TestDCMInvalidMemoryType(c *C) {
509+
if s.simEnable {
510+
c.Skip("Skipping for non amd gpu testbed")
511+
}
501512
if !dcmImageDefined {
502513
c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined")
503514
}
504515
c.Skip("Skipping DCM Partition test for now, enable after fixing the test")
505516
s.configMapHelper(c)
506517
// Trigger partition using labels
507518
logger.Infof("Add node label after pod comes up")
508-
time.Sleep(30 * time.Second)
519+
time.Sleep(5 * time.Second)
509520

510521
nodeName := s.getWorkerNode(c)
511522
logger.Infof("NODE NAME %v", nodeName)
512523

513524
s.addRemoveNodeLabels(nodeName, "inval_prof2")
514525

526+
logger.Infof("Getting pod logs")
515527
logs := s.getLogs()
516-
if strings.Contains(logs, "Invalid partition types") && (s.eventHelper("InvalidMemoryType", "Warning")) {
528+
if strings.Contains(logs, "Invalid partition types") && (s.eventHelper("InvalidProfileInfo", "Warning")) {
517529
logger.Infof("Successfully tested invalid memory type profile")
518530
} else {
519-
logger.Errorf("Failure testing invalid memory type")
531+
assert.NoError(c, errors.New("Test case Failed"), "failure testing invalid memory type")
520532
}
521533
}
522534

523535
func (s *E2ESuite) TestDCMInvalidGPUFilter(c *C) {
536+
if s.simEnable {
537+
c.Skip("Skipping for non amd gpu testbed")
538+
}
524539
if !dcmImageDefined {
525540
c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined")
526541
}
527542
c.Skip("Skipping DCM Partition test for now, enable after fixing the test")
528543
s.configMapHelper(c)
529544
// Trigger partition using labels
530545
logger.Infof("Add node label after pod comes up")
531-
time.Sleep(30 * time.Second)
546+
time.Sleep(5 * time.Second)
532547

533548
nodeName := s.getWorkerNode(c)
534549
logger.Infof("NODE NAME %v", nodeName)
535550

536551
s.addRemoveNodeLabels(nodeName, "inval_prof3")
537552

538-
logs := s.getLogs()
539-
if strings.Contains(logs, "exceeding the total number") && strings.Contains(logs, "ERROR") && (s.eventHelper("InvalidProfileInfo", "Warning")) {
553+
if s.eventHelper("InvalidProfileInfo", "Warning") {
540554
logger.Infof("Successfully tested invalid GPU filter profile")
541555
} else {
542-
logger.Errorf("Failure testing invalid GPU filter profile")
543-
}
544-
}
545-
546-
func (s *E2ESuite) TestDCMDefaultPartition(c *C) {
547-
if s.simEnable {
548-
c.Skip("Skipping for non amd gpu testbed")
549-
}
550-
if !dcmImageDefined {
551-
c.Skip("skip DCM test because E2E_DCM_IMAGE is not defined")
552-
}
553-
logger.Infof("###BEGIN TESTCASE###\n")
554-
// check to see existing deviceconfig DS pods
555-
_, err := s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{})
556-
assert.Errorf(c, err, fmt.Sprintf("config %v exists", s.cfgName))
557-
558-
// fetch the CR
559-
devCfg := s.getDeviceConfigForDCM(c)
560-
logger.Infof("create device-config %+v", devCfg.Spec.ConfigManager)
561-
s.createDeviceConfig(devCfg, c)
562-
563-
s.checkDeviceConfigManagerStatus(devCfg, s.ns, c)
564-
logger.Infof("SUCCESSFULLY DEPLOYED DCM DAEMONSET")
565-
time.Sleep(30 * time.Second)
566-
567-
nodeName := s.getWorkerNode(c)
568-
err = utils.AddNodeLabel(s.clientSet, nodeName, "dcm.amd.com/apply-gpu-config-profile", "apply")
569-
if err != nil {
570-
logger.Infof("Error adding node lbels: %s\n", err.Error())
571-
return
572-
}
573-
time.Sleep(15 * time.Second)
574-
// Allow partition to happen
575-
err = utils.DeleteNodeLabel(s.clientSet, nodeName, "dcm.amd.com/apply-gpu-config-profile")
576-
if err != nil {
577-
logger.Infof("Error removing node lbels: %s\n", err.Error())
578-
return
579-
}
580-
581-
logs := s.getLogs()
582-
if strings.Contains(logs, "Partition completed successfully") && (!strings.Contains(logs, "ERROR")) && (s.eventHelper("SuccessfullyPartitioned", "Normal")) {
583-
logger.Infof("Successfully tested default partitioning")
584-
} else {
585-
logger.Errorf("Failure testing default partitioning")
556+
assert.NoError(c, errors.New("Test case Failed"), "failure testing invalid GPdvsdU filter profile")
586557
}
587558
}
588559

0 commit comments

Comments
 (0)