From 90dced841f379d2af1dde4306edc9c1f6a2734e5 Mon Sep 17 00:00:00 2001 From: Kyle Felter Date: Tue, 30 Jun 2026 11:09:12 -0500 Subject: [PATCH 1/2] feat: Add DPU reprovision endpoint Signed-off-by: Kyle Felter --- rest-api/api/pkg/api/handler/adminmachine.go | 100 ++++++++ rest-api/api/pkg/api/handler/adminops_test.go | 150 ++++++++++++ .../api/pkg/api/handler/dpureprovision.go | 70 ++++++ rest-api/api/pkg/api/model/adminops_test.go | 27 +++ rest-api/api/pkg/api/model/dpureprovision.go | 78 ++++++ rest-api/api/pkg/api/routes.go | 5 + rest-api/api/pkg/api/routes_test.go | 5 +- rest-api/openapi/spec.yaml | 80 +++++++ rest-api/sdk/standard/api_dpu_reprovision.go | 188 +++++++++++++++ rest-api/sdk/standard/client.go | 3 + .../standard/model_dpu_reprovision_request.go | 198 ++++++++++++++++ .../model_dpu_reprovision_response.go | 223 ++++++++++++++++++ 12 files changed, 1126 insertions(+), 1 deletion(-) create mode 100644 rest-api/api/pkg/api/handler/adminmachine.go create mode 100644 rest-api/api/pkg/api/handler/adminops_test.go create mode 100644 rest-api/api/pkg/api/handler/dpureprovision.go create mode 100644 rest-api/api/pkg/api/model/adminops_test.go create mode 100644 rest-api/api/pkg/api/model/dpureprovision.go create mode 100644 rest-api/sdk/standard/api_dpu_reprovision.go create mode 100644 rest-api/sdk/standard/model_dpu_reprovision_request.go create mode 100644 rest-api/sdk/standard/model_dpu_reprovision_response.go diff --git a/rest-api/api/pkg/api/handler/adminmachine.go b/rest-api/api/pkg/api/handler/adminmachine.go new file mode 100644 index 0000000000..93a678a535 --- /dev/null +++ b/rest-api/api/pkg/api/handler/adminmachine.go @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package handler + +import ( + "context" + "errors" + "fmt" + "net/http" + + "github.com/labstack/echo/v4" + "github.com/rs/zerolog" + tClient "go.temporal.io/sdk/client" + + "github.com/NVIDIA/infra-controller/rest-api/api/internal/config" + "github.com/NVIDIA/infra-controller/rest-api/api/pkg/api/handler/util/common" + sc "github.com/NVIDIA/infra-controller/rest-api/api/pkg/client/site" + auth "github.com/NVIDIA/infra-controller/rest-api/auth/pkg/authorization" + cutil "github.com/NVIDIA/infra-controller/rest-api/common/pkg/util" + cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" + cdbm "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db/model" +) + +type adminMachineBase struct { + dbSession *cdb.Session + scp *sc.ClientPool + cfg *config.Config + tracerSpan *cutil.TracerSpan +} + +func (b adminMachineBase) authorizeMachine( + ctx context.Context, + c echo.Context, + logger zerolog.Logger, + org string, + dbUser *cdbm.User, + machineID string, +) (tClient.Client, string, *cdbm.Machine, error) { + if dbUser == nil { + logger.Error().Msg("invalid User object found in request context") + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve current user", nil) + } + if machineID == "" { + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusBadRequest, "Machine ID is required", nil) + } + + ok, err := auth.ValidateOrgMembership(dbUser, org) + if !ok { + if err != nil { + logger.Error().Err(err).Msg("error validating org membership for User in request") + } else { + logger.Warn().Msg("could not validate org membership for user, access denied") + } + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusForbidden, fmt.Sprintf("Failed to validate membership for org: %s", org), nil) + } + + if ok := auth.ValidateUserRoles(dbUser, org, nil, auth.ProviderAdminRole); !ok { + logger.Warn().Msg("user does not have Provider Admin role, access denied") + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusForbidden, "User does not have Provider Admin role with org", nil) + } + + provider, err := common.GetInfrastructureProviderForOrg(ctx, nil, b.dbSession, org) + if err != nil { + logger.Warn().Err(err).Msg("error getting infrastructure provider for org") + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusBadRequest, "Failed to retrieve Infrastructure Provider for org", nil) + } + + machine, err := cdbm.NewMachineDAO(b.dbSession).GetByID(ctx, nil, machineID, nil, false) + if err != nil { + if errors.Is(err, cdb.ErrDoesNotExist) { + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusNotFound, "Could not find Machine with specified ID", nil) + } + logger.Error().Err(err).Msg("error retrieving Machine DB entity") + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Could not retrieve Machine", nil) + } + + if machine.InfrastructureProviderID != provider.ID { + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusForbidden, "Machine doesn't belong to org's Infrastructure provider", nil) + } + + site, err := common.GetSiteFromIDString(ctx, nil, machine.SiteID.String(), b.dbSession) + if err != nil { + if errors.Is(err, cdb.ErrDoesNotExist) || errors.Is(err, common.ErrInvalidID) { + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusBadRequest, "Machine Site does not exist", nil) + } + logger.Error().Err(err).Msg("error retrieving Machine Site from DB") + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve Machine Site due to DB error", nil) + } + if site.InfrastructureProviderID != provider.ID { + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusForbidden, "Machine Site doesn't belong to current org's Provider", nil) + } + + stc, err := b.scp.GetClientByID(site.ID) + if err != nil { + logger.Error().Err(err).Msg("failed to retrieve Temporal client for Site") + return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve client for Site", nil) + } + return stc, site.ID.String(), machine, nil +} diff --git a/rest-api/api/pkg/api/handler/adminops_test.go b/rest-api/api/pkg/api/handler/adminops_test.go new file mode 100644 index 0000000000..4b7a42b5a1 --- /dev/null +++ b/rest-api/api/pkg/api/handler/adminops_test.go @@ -0,0 +1,150 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package handler + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "github.com/labstack/echo/v4" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + tmocks "go.temporal.io/sdk/mocks" + "google.golang.org/protobuf/encoding/protojson" + + "github.com/NVIDIA/infra-controller/rest-api/api/pkg/api/handler/util/common" + "github.com/NVIDIA/infra-controller/rest-api/api/pkg/api/model" + sc "github.com/NVIDIA/infra-controller/rest-api/api/pkg/client/site" + authz "github.com/NVIDIA/infra-controller/rest-api/auth/pkg/authorization" + "github.com/NVIDIA/infra-controller/rest-api/common/pkg/coreproxy" + cutil "github.com/NVIDIA/infra-controller/rest-api/common/pkg/util" + cdbm "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db/model" + cwssaws "github.com/NVIDIA/infra-controller/rest-api/workflow-schema/schema/site-agent/workflows/v1" +) + +type dpuReprovisionHandlerFixture struct { + org string + siteID string + machineID string + user interface{} + handler echo.HandlerFunc + proxiedReq *coreproxy.Request +} + +func newDpuReprovisionHandlerFixture(t *testing.T) dpuReprovisionHandlerFixture { + t.Helper() + + dbSession := common.TestInitDB(t) + t.Cleanup(dbSession.Close) + common.TestSetupSchema(t, dbSession) + + org := "test-org" + user := common.TestBuildUser(t, dbSession, "test-starfleet-id", org, []string{authz.ProviderAdminRole}) + ip := common.TestBuildInfrastructureProvider(t, dbSession, "Test Infrastructure Provider", org, user) + site := common.TestBuildSite(t, dbSession, ip, "Test Site", user) + it := common.TestBuildInstanceType(t, dbSession, "test-instance-type", cutil.GetPtr(site.ID), site, nil, user) + machine := common.TestBuildMachine(t, dbSession, ip, site, &it.ID, cutil.GetPtr("test-controller-machine-type"), cdbm.MachineStatusReady) + + proxiedReq := &coreproxy.Request{} + wrun := &tmocks.WorkflowRun{} + wrun.On("Get", mock.Anything, mock.Anything).Return(nil) + + tsc := &tmocks.Client{} + tsc.On( + "ExecuteWorkflow", + mock.Anything, + mock.Anything, + coreproxy.WorkflowName, + mock.MatchedBy(func(req coreproxy.Request) bool { + *proxiedReq = req + return true + }), + ).Return(wrun, nil) + + scp := sc.NewClientPool(nil) + scp.IDClientMap[site.ID.String()] = tsc + + cfg := common.GetTestConfig() + h := NewDpuReprovisionHandler(dbSession, scp, cfg) + + return dpuReprovisionHandlerFixture{ + org: org, + siteID: site.ID.String(), + machineID: machine.ID, + user: user, + handler: h.Handle, + proxiedReq: proxiedReq, + } +} + +func (f dpuReprovisionHandlerFixture) request(t *testing.T, body any) *httptest.ResponseRecorder { + t.Helper() + + var reqBody string + if body != nil { + bodyBytes, err := json.Marshal(body) + require.NoError(t, err) + reqBody = string(bodyBytes) + } + + e := echo.New() + req := httptest.NewRequest(http.MethodPatch, "/", strings.NewReader(reqBody)) + if body != nil { + req.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON) + } + rec := httptest.NewRecorder() + ec := e.NewContext(req, rec) + ec.SetParamNames("orgName", "machineId") + ec.SetParamValues(f.org, f.machineID) + ec.Set("user", f.user) + + require.NoError(t, f.handler(ec)) + return rec +} + +func TestDpuReprovisionHandlerProxiesRequest(t *testing.T) { + fixture := newDpuReprovisionHandlerFixture(t) + + rec := fixture.request(t, model.APIDpuReprovisionRequest{Mode: model.DpuReprovisionModeRestart, UpdateFirmware: true}) + assert.Equal(t, http.StatusOK, rec.Code) + assert.Equal(t, cwssaws.Forge_TriggerDpuReprovisioning_FullMethodName, fixture.proxiedReq.FullMethod) + assert.Empty(t, fixture.proxiedReq.EncryptedSecrets) + + var coreReq cwssaws.DpuReprovisioningRequest + require.NoError(t, protojson.Unmarshal(fixture.proxiedReq.RequestJSON, &coreReq)) + assert.Equal(t, fixture.machineID, coreReq.GetMachineId().GetId()) + assert.Equal(t, cwssaws.DpuReprovisioningRequest_Restart, coreReq.GetMode()) + assert.Equal(t, cwssaws.UpdateInitiator_AdminCli, coreReq.GetInitiator()) + assert.True(t, coreReq.GetUpdateFirmware()) +} + +func TestDpuReprovisionHandlerRejectsInvalidMode(t *testing.T) { + fixture := newDpuReprovisionHandlerFixture(t) + + rec := fixture.request(t, model.APIDpuReprovisionRequest{Mode: "restart"}) + assert.Equal(t, http.StatusBadRequest, rec.Code) + assert.Empty(t, fixture.proxiedReq.FullMethod) +} + +func TestDpuReprovisionHandlerRequiresProviderAdmin(t *testing.T) { + fixture := newDpuReprovisionHandlerFixture(t) + fixture.user = &cdbm.User{OrgData: cdbm.OrgData{fixture.org: cdbm.Org{Name: fixture.org}}} + + rec := fixture.request(t, model.APIDpuReprovisionRequest{Mode: model.DpuReprovisionModeSet}) + assert.Equal(t, http.StatusForbidden, rec.Code) + assert.Empty(t, fixture.proxiedReq.FullMethod) +} + +func TestDpuReprovisionHandlerRejectsMissingMachine(t *testing.T) { + fixture := newDpuReprovisionHandlerFixture(t) + fixture.machineID = "missing-machine" + + rec := fixture.request(t, model.APIDpuReprovisionRequest{Mode: model.DpuReprovisionModeSet}) + assert.Equal(t, http.StatusNotFound, rec.Code) + assert.Empty(t, fixture.proxiedReq.FullMethod) +} diff --git a/rest-api/api/pkg/api/handler/dpureprovision.go b/rest-api/api/pkg/api/handler/dpureprovision.go new file mode 100644 index 0000000000..c3ddda8905 --- /dev/null +++ b/rest-api/api/pkg/api/handler/dpureprovision.go @@ -0,0 +1,70 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package handler + +import ( + "net/http" + + "github.com/labstack/echo/v4" + + "github.com/NVIDIA/infra-controller/rest-api/api/internal/config" + "github.com/NVIDIA/infra-controller/rest-api/api/pkg/api/handler/util/common" + "github.com/NVIDIA/infra-controller/rest-api/api/pkg/api/model" + sc "github.com/NVIDIA/infra-controller/rest-api/api/pkg/client/site" + cutil "github.com/NVIDIA/infra-controller/rest-api/common/pkg/util" + cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" + cwssaws "github.com/NVIDIA/infra-controller/rest-api/workflow-schema/schema/site-agent/workflows/v1" +) + +type DpuReprovisionHandler struct { + adminMachineBase +} + +func NewDpuReprovisionHandler(dbSession *cdb.Session, scp *sc.ClientPool, cfg *config.Config) DpuReprovisionHandler { + return DpuReprovisionHandler{ + adminMachineBase{dbSession: dbSession, scp: scp, cfg: cfg, tracerSpan: cutil.NewTracerSpan()}, + } +} + +// Handle godoc +// @Summary Trigger DPU Reprovisioning +// @Description Trigger DPU reprovisioning for a Machine through NICo Core. Provider Admin only. +// @Tags dpu-reprovision +// @Accept json +// @Produce json +// @Security ApiKeyAuth +// @Param org path string true "Name of NGC organization" +// @Param machineId path string true "ID of Machine" +// @Param request body model.APIDpuReprovisionRequest true "DPU reprovision request" +// @Success 200 {object} model.APIDpuReprovisionResponse +// @Router /v2/org/{org}/nico/machine/{machineId}/dpu-reprovision [patch] +func (h DpuReprovisionHandler) Handle(c echo.Context) error { + org, dbUser, ctx, logger, handlerSpan := common.SetupHandler("DpuReprovision", "Trigger", c, h.tracerSpan) + if handlerSpan != nil { + defer handlerSpan.End() + } + + machineID := c.Param("machineId") + var apiReq model.APIDpuReprovisionRequest + if err := c.Bind(&apiReq); err != nil { + return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, "Invalid request body", nil) + } + if err := apiReq.Validate(); err != nil { + return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, err.Error(), nil) + } + + stc, siteID, _, errResp := h.authorizeMachine(ctx, c, logger, org, dbUser, machineID) + if errResp != nil || stc == nil { + return errResp + } + + logger.Info().Str("machineID", machineID).Str("mode", apiReq.Mode).Str("siteID", siteID).Msg("triggering DPU reprovisioning via Core proxy") + code, err := common.ExecuteCoreGRPC(ctx, stc, cwssaws.Forge_TriggerDpuReprovisioning_FullMethodName, apiReq.ToProto(machineID), nil, siteID) + if err != nil { + logger.Error().Err(err).Msg("failed to trigger DPU reprovisioning") + return cutil.NewAPIErrorResponse(c, code, "Failed to trigger DPU reprovisioning", nil) + } + + return c.JSON(http.StatusOK, model.NewAPIDpuReprovisionResponse(machineID, &apiReq)) +} diff --git a/rest-api/api/pkg/api/model/adminops_test.go b/rest-api/api/pkg/api/model/adminops_test.go new file mode 100644 index 0000000000..989685cf9d --- /dev/null +++ b/rest-api/api/pkg/api/model/adminops_test.go @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package model + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + cwssaws "github.com/NVIDIA/infra-controller/rest-api/workflow-schema/schema/site-agent/workflows/v1" +) + +func TestAPIDpuReprovisionRequestValidateAndToProto(t *testing.T) { + req := APIDpuReprovisionRequest{Mode: DpuReprovisionModeRestart, UpdateFirmware: true} + require.NoError(t, req.Validate()) + + protoReq := req.ToProto("machine-1") + assert.Equal(t, "machine-1", protoReq.GetMachineId().GetId()) + assert.Equal(t, cwssaws.DpuReprovisioningRequest_Restart, protoReq.GetMode()) + assert.Equal(t, cwssaws.UpdateInitiator_AdminCli, protoReq.GetInitiator()) + assert.True(t, protoReq.GetUpdateFirmware()) + + assert.Error(t, (&APIDpuReprovisionRequest{}).Validate()) + assert.Error(t, (&APIDpuReprovisionRequest{Mode: "restart"}).Validate()) +} diff --git a/rest-api/api/pkg/api/model/dpureprovision.go b/rest-api/api/pkg/api/model/dpureprovision.go new file mode 100644 index 0000000000..514ff68de3 --- /dev/null +++ b/rest-api/api/pkg/api/model/dpureprovision.go @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package model + +import ( + "fmt" + + cwssaws "github.com/NVIDIA/infra-controller/rest-api/workflow-schema/schema/site-agent/workflows/v1" + validation "github.com/go-ozzo/ozzo-validation/v4" +) + +const ( + DpuReprovisionModeSet = "Set" + DpuReprovisionModeClear = "Clear" + DpuReprovisionModeRestart = "Restart" +) + +var validDpuReprovisionModes = []string{ + DpuReprovisionModeSet, + DpuReprovisionModeClear, + DpuReprovisionModeRestart, +} + +var validDpuReprovisionModesAny = func() []interface{} { + result := make([]interface{}, len(validDpuReprovisionModes)) + for i, mode := range validDpuReprovisionModes { + result[i] = mode + } + return result +}() + +type APIDpuReprovisionRequest struct { + Mode string `json:"mode"` + UpdateFirmware bool `json:"updateFirmware,omitempty"` +} + +func (r *APIDpuReprovisionRequest) Validate() error { + return validation.ValidateStruct(r, + validation.Field(&r.Mode, + validation.Required.Error(validationErrorValueRequired), + validation.In(validDpuReprovisionModesAny...).Error(fmt.Sprintf("must be one of %v", validDpuReprovisionModes))), + ) +} + +func (r *APIDpuReprovisionRequest) ToProto(machineID string) *cwssaws.DpuReprovisioningRequest { + return &cwssaws.DpuReprovisioningRequest{ + MachineId: &cwssaws.MachineId{Id: machineID}, + Mode: dpuReprovisionModeToProto(r.Mode), + Initiator: cwssaws.UpdateInitiator_AdminCli, + UpdateFirmware: r.UpdateFirmware, + } +} + +type APIDpuReprovisionResponse struct { + MachineID string `json:"machineId"` + Mode string `json:"mode"` + UpdateFirmware bool `json:"updateFirmware,omitempty"` +} + +func NewAPIDpuReprovisionResponse(machineID string, req *APIDpuReprovisionRequest) *APIDpuReprovisionResponse { + return &APIDpuReprovisionResponse{ + MachineID: machineID, + Mode: req.Mode, + UpdateFirmware: req.UpdateFirmware, + } +} + +func dpuReprovisionModeToProto(mode string) cwssaws.DpuReprovisioningRequest_Mode { + switch mode { + case DpuReprovisionModeClear: + return cwssaws.DpuReprovisioningRequest_Clear + case DpuReprovisionModeRestart: + return cwssaws.DpuReprovisioningRequest_Restart + default: + return cwssaws.DpuReprovisioningRequest_Set + } +} diff --git a/rest-api/api/pkg/api/routes.go b/rest-api/api/pkg/api/routes.go index 4b69aefd74..d0a693f41c 100644 --- a/rest-api/api/pkg/api/routes.go +++ b/rest-api/api/pkg/api/routes.go @@ -546,6 +546,11 @@ func NewAPIRoutes(dbSession *cdb.Session, tc tClient.Client, tnc tClient.Namespa Method: http.MethodGet, Handler: apiHandler.NewGetMachineHandler(dbSession, tc, cfg), }, + { + Path: apiPathPrefix + "/machine/:machineId/dpu-reprovision", + Method: http.MethodPatch, + Handler: apiHandler.NewDpuReprovisionHandler(dbSession, scp, cfg), + }, { Path: apiPathPrefix + "/machine/:id", Method: http.MethodPatch, diff --git a/rest-api/api/pkg/api/routes_test.go b/rest-api/api/pkg/api/routes_test.go index 50ceca7614..5e20f547eb 100644 --- a/rest-api/api/pkg/api/routes_test.go +++ b/rest-api/api/pkg/api/routes_test.go @@ -56,7 +56,7 @@ func TestNewAPIRoutes(t *testing.T) { "expected-rack": 7, "expected-switch": 5, "instance-type": 5, - "machine": 6, + "machine": 7, "allocation": 6, "subnet": 5, "machine-instance-type": 3, @@ -112,6 +112,9 @@ func TestNewAPIRoutes(t *testing.T) { bmcCredentialPath := "/org/:orgName/" + cfg.GetAPIName() + "/credential/bmc" assertRouteExists(t, got, http.MethodPut, bmcCredentialPath) + machineAdminPath := "/org/:orgName/" + cfg.GetAPIName() + "/machine/:machineId" + assertRouteExists(t, got, http.MethodPatch, machineAdminPath+"/dpu-reprovision") + expectedMachineBatchPath := "/org/:orgName/" + cfg.GetAPIName() + "/expected-machine/batch" assertRouteExists(t, got, http.MethodPost, expectedMachineBatchPath) assertRouteExists(t, got, http.MethodPatch, expectedMachineBatchPath) diff --git a/rest-api/openapi/spec.yaml b/rest-api/openapi/spec.yaml index 327eea4265..aca2d04fc2 100644 --- a/rest-api/openapi/spec.yaml +++ b/rest-api/openapi/spec.yaml @@ -9123,6 +9123,51 @@ paths: description: Describes an error response for 500 Internal Server Error $ref: '#/components/responses/GenericHttpError' description: Org must have an Infrastructure Provider entity. Machine must belong to the Provider. User must have authorization role with `PROVIDER_ADMIN` suffix. Machine must meet certain criteria to be eligible for deletion. + '/v2/org/{org}/nico/machine/{machineId}/dpu-reprovision': + parameters: + - schema: + type: string + name: org + in: path + required: true + description: Name of the Org + - schema: + type: string + name: machineId + in: path + required: true + description: ID of the Machine + patch: + summary: Trigger DPU reprovisioning + tags: + - DPU Reprovision + requestBody: + required: true + content: + application/json: + schema: + $ref: '#/components/schemas/DpuReprovisionRequest' + responses: + '200': + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/DpuReprovisionResponse' + '400': + $ref: '#/components/responses/ValidationError' + '403': + $ref: '#/components/responses/ForbiddenError' + '404': + $ref: '#/components/responses/NotFoundError' + '500': + $ref: '#/components/responses/GenericHttpError' + operationId: trigger-dpu-reprovisioning + description: |- + Trigger DPU reprovisioning for a Machine through NICo Core. The request + is authorized, machine-scoped, and proxied to the Machine's owning Site. + This endpoint calls Core's `TriggerDpuReprovisioning` RPC directly. + User must have authorization role with `PROVIDER_ADMIN` suffix. '/v2/org/{org}/nico/machine/{machineId}/status-history': parameters: - schema: @@ -13150,6 +13195,41 @@ components: macAddress: type: string description: BMC MAC address. Required for kind BMCRoot, ignored for SiteWideRoot. + DpuReprovisionRequest: + type: object + title: DpuReprovisionRequest + description: Request to trigger DPU reprovisioning through NICo Core. + required: + - mode + properties: + mode: + type: string + description: DPU reprovisioning mode. + enum: + - Set + - Clear + - Restart + updateFirmware: + type: boolean + default: false + description: Whether to update firmware during reprovisioning. + DpuReprovisionResponse: + type: object + title: DpuReprovisionResponse + description: Response for a DPU reprovisioning request. + required: + - machineId + - mode + properties: + machineId: + type: string + description: ID of the Machine. + mode: + type: string + description: DPU reprovisioning mode accepted. + updateFirmware: + type: boolean + description: Whether firmware update was requested. InfrastructureProvider: description: Infrastructure providers own and manage datacenters type: object diff --git a/rest-api/sdk/standard/api_dpu_reprovision.go b/rest-api/sdk/standard/api_dpu_reprovision.go new file mode 100644 index 0000000000..9eed836572 --- /dev/null +++ b/rest-api/sdk/standard/api_dpu_reprovision.go @@ -0,0 +1,188 @@ +/* +NVIDIA Infra Controller REST API + +NVIDIA Infra Controller REST API allows users to create and manage resources, e.g., VPCs, Subnets, and Instances, across all connected NVIDIA Infra Controller datacenters, also referred to as Sites. + +API version: 1.6.0 +*/ + +// Code generated by OpenAPI Generator (https://openapi-generator.tech); DO NOT EDIT. + +package standard + +import ( + "bytes" + "context" + "io" + "net/http" + "net/url" + "strings" +) + +// DPUReprovisionAPIService DPUReprovisionAPI service +type DPUReprovisionAPIService service + +type ApiTriggerDpuReprovisioningRequest struct { + ctx context.Context + ApiService *DPUReprovisionAPIService + org string + machineId string + dpuReprovisionRequest *DpuReprovisionRequest +} + +func (r ApiTriggerDpuReprovisioningRequest) DpuReprovisionRequest(dpuReprovisionRequest DpuReprovisionRequest) ApiTriggerDpuReprovisioningRequest { + r.dpuReprovisionRequest = &dpuReprovisionRequest + return r +} + +func (r ApiTriggerDpuReprovisioningRequest) Execute() (*DpuReprovisionResponse, *http.Response, error) { + return r.ApiService.TriggerDpuReprovisioningExecute(r) +} + +/* +TriggerDpuReprovisioning Trigger DPU reprovisioning + +Trigger DPU reprovisioning for a Machine through NICo Core. The request +is authorized, machine-scoped, and proxied to the Machine's owning Site. +This endpoint calls Core's `TriggerDpuReprovisioning` RPC directly. +User must have authorization role with `PROVIDER_ADMIN` suffix. + + @param ctx context.Context - for authentication, logging, cancellation, deadlines, tracing, etc. Passed from http.Request or context.Background(). + @param org Name of the Org + @param machineId ID of the Machine + @return ApiTriggerDpuReprovisioningRequest +*/ +func (a *DPUReprovisionAPIService) TriggerDpuReprovisioning(ctx context.Context, org string, machineId string) ApiTriggerDpuReprovisioningRequest { + return ApiTriggerDpuReprovisioningRequest{ + ApiService: a, + ctx: ctx, + org: org, + machineId: machineId, + } +} + +// Execute executes the request +// +// @return DpuReprovisionResponse +func (a *DPUReprovisionAPIService) TriggerDpuReprovisioningExecute(r ApiTriggerDpuReprovisioningRequest) (*DpuReprovisionResponse, *http.Response, error) { + var ( + localVarHTTPMethod = http.MethodPatch + localVarPostBody interface{} + formFiles []formFile + localVarReturnValue *DpuReprovisionResponse + ) + + localBasePath, err := a.client.cfg.ServerURLWithContext(r.ctx, "DPUReprovisionAPIService.TriggerDpuReprovisioning") + if err != nil { + return localVarReturnValue, nil, &GenericOpenAPIError{error: err.Error()} + } + + localVarPath := localBasePath + "/v2/org/{org}/nico/machine/{machineId}/dpu-reprovision" + localVarPath = strings.Replace(localVarPath, "{"+"org"+"}", url.PathEscape(parameterValueToString(r.org, "org")), -1) + localVarPath = strings.Replace(localVarPath, "{"+"machineId"+"}", url.PathEscape(parameterValueToString(r.machineId, "machineId")), -1) + + localVarHeaderParams := make(map[string]string) + localVarQueryParams := url.Values{} + localVarFormParams := url.Values{} + if r.dpuReprovisionRequest == nil { + return localVarReturnValue, nil, reportError("dpuReprovisionRequest is required and must be specified") + } + + // to determine the Content-Type header + localVarHTTPContentTypes := []string{"application/json"} + + // set Content-Type header + localVarHTTPContentType := selectHeaderContentType(localVarHTTPContentTypes) + if localVarHTTPContentType != "" { + localVarHeaderParams["Content-Type"] = localVarHTTPContentType + } + + // to determine the Accept header + localVarHTTPHeaderAccepts := []string{"application/json"} + + // set Accept header + localVarHTTPHeaderAccept := selectHeaderAccept(localVarHTTPHeaderAccepts) + if localVarHTTPHeaderAccept != "" { + localVarHeaderParams["Accept"] = localVarHTTPHeaderAccept + } + // body params + localVarPostBody = r.dpuReprovisionRequest + req, err := a.client.prepareRequest(r.ctx, localVarPath, localVarHTTPMethod, localVarPostBody, localVarHeaderParams, localVarQueryParams, localVarFormParams, formFiles) + if err != nil { + return localVarReturnValue, nil, err + } + + localVarHTTPResponse, err := a.client.callAPI(req) + if err != nil || localVarHTTPResponse == nil { + return localVarReturnValue, localVarHTTPResponse, err + } + + localVarBody, err := io.ReadAll(localVarHTTPResponse.Body) + localVarHTTPResponse.Body.Close() + localVarHTTPResponse.Body = io.NopCloser(bytes.NewBuffer(localVarBody)) + if err != nil { + return localVarReturnValue, localVarHTTPResponse, err + } + + if localVarHTTPResponse.StatusCode >= 300 { + newErr := &GenericOpenAPIError{ + body: localVarBody, + error: localVarHTTPResponse.Status, + } + if localVarHTTPResponse.StatusCode == 400 { + var v NICoAPIError + err = a.client.decode(&v, localVarBody, localVarHTTPResponse.Header.Get("Content-Type")) + if err != nil { + newErr.error = err.Error() + return localVarReturnValue, localVarHTTPResponse, newErr + } + newErr.error = formatErrorMessage(localVarHTTPResponse.Status, &v) + newErr.model = v + return localVarReturnValue, localVarHTTPResponse, newErr + } + if localVarHTTPResponse.StatusCode == 403 { + var v NICoAPIError + err = a.client.decode(&v, localVarBody, localVarHTTPResponse.Header.Get("Content-Type")) + if err != nil { + newErr.error = err.Error() + return localVarReturnValue, localVarHTTPResponse, newErr + } + newErr.error = formatErrorMessage(localVarHTTPResponse.Status, &v) + newErr.model = v + return localVarReturnValue, localVarHTTPResponse, newErr + } + if localVarHTTPResponse.StatusCode == 404 { + var v NICoAPIError + err = a.client.decode(&v, localVarBody, localVarHTTPResponse.Header.Get("Content-Type")) + if err != nil { + newErr.error = err.Error() + return localVarReturnValue, localVarHTTPResponse, newErr + } + newErr.error = formatErrorMessage(localVarHTTPResponse.Status, &v) + newErr.model = v + return localVarReturnValue, localVarHTTPResponse, newErr + } + if localVarHTTPResponse.StatusCode == 500 { + var v NICoAPIError + err = a.client.decode(&v, localVarBody, localVarHTTPResponse.Header.Get("Content-Type")) + if err != nil { + newErr.error = err.Error() + return localVarReturnValue, localVarHTTPResponse, newErr + } + newErr.error = formatErrorMessage(localVarHTTPResponse.Status, &v) + newErr.model = v + } + return localVarReturnValue, localVarHTTPResponse, newErr + } + + err = a.client.decode(&localVarReturnValue, localVarBody, localVarHTTPResponse.Header.Get("Content-Type")) + if err != nil { + newErr := &GenericOpenAPIError{ + body: localVarBody, + error: err.Error(), + } + return localVarReturnValue, localVarHTTPResponse, newErr + } + + return localVarReturnValue, localVarHTTPResponse, nil +} diff --git a/rest-api/sdk/standard/client.go b/rest-api/sdk/standard/client.go index 30b573f2ab..121ecf6408 100644 --- a/rest-api/sdk/standard/client.go +++ b/rest-api/sdk/standard/client.go @@ -59,6 +59,8 @@ type APIClient struct { DPUExtensionServiceAPI *DPUExtensionServiceAPIService + DPUReprovisionAPI *DPUReprovisionAPIService + ExpectedMachineAPI *ExpectedMachineAPIService ExpectedPowerShelfAPI *ExpectedPowerShelfAPIService @@ -142,6 +144,7 @@ func NewAPIClient(cfg *Configuration) *APIClient { c.AuditAPI = (*AuditAPIService)(&c.common) c.BMCCredentialAPI = (*BMCCredentialAPIService)(&c.common) c.DPUExtensionServiceAPI = (*DPUExtensionServiceAPIService)(&c.common) + c.DPUReprovisionAPI = (*DPUReprovisionAPIService)(&c.common) c.ExpectedMachineAPI = (*ExpectedMachineAPIService)(&c.common) c.ExpectedPowerShelfAPI = (*ExpectedPowerShelfAPIService)(&c.common) c.ExpectedRackAPI = (*ExpectedRackAPIService)(&c.common) diff --git a/rest-api/sdk/standard/model_dpu_reprovision_request.go b/rest-api/sdk/standard/model_dpu_reprovision_request.go new file mode 100644 index 0000000000..abd20b377a --- /dev/null +++ b/rest-api/sdk/standard/model_dpu_reprovision_request.go @@ -0,0 +1,198 @@ +/* +NVIDIA Infra Controller REST API + +NVIDIA Infra Controller REST API allows users to create and manage resources, e.g., VPCs, Subnets, and Instances, across all connected NVIDIA Infra Controller datacenters, also referred to as Sites. + +API version: 1.6.0 +*/ + +// Code generated by OpenAPI Generator (https://openapi-generator.tech); DO NOT EDIT. + +package standard + +import ( + "bytes" + "encoding/json" + "fmt" +) + +// checks if the DpuReprovisionRequest type satisfies the MappedNullable interface at compile time +var _ MappedNullable = &DpuReprovisionRequest{} + +// DpuReprovisionRequest Request to trigger DPU reprovisioning through NICo Core. +type DpuReprovisionRequest struct { + // DPU reprovisioning mode. + Mode string `json:"mode"` + // Whether to update firmware during reprovisioning. + UpdateFirmware *bool `json:"updateFirmware,omitempty"` +} + +type _DpuReprovisionRequest DpuReprovisionRequest + +// NewDpuReprovisionRequest instantiates a new DpuReprovisionRequest object +// This constructor will assign default values to properties that have it defined, +// and makes sure properties required by API are set, but the set of arguments +// will change when the set of required properties is changed +func NewDpuReprovisionRequest(mode string) *DpuReprovisionRequest { + this := DpuReprovisionRequest{} + this.Mode = mode + var updateFirmware bool = false + this.UpdateFirmware = &updateFirmware + return &this +} + +// NewDpuReprovisionRequestWithDefaults instantiates a new DpuReprovisionRequest object +// This constructor will only assign default values to properties that have it defined, +// but it doesn't guarantee that properties required by API are set +func NewDpuReprovisionRequestWithDefaults() *DpuReprovisionRequest { + this := DpuReprovisionRequest{} + var updateFirmware bool = false + this.UpdateFirmware = &updateFirmware + return &this +} + +// GetMode returns the Mode field value +func (o *DpuReprovisionRequest) GetMode() string { + if o == nil { + var ret string + return ret + } + + return o.Mode +} + +// GetModeOk returns a tuple with the Mode field value +// and a boolean to check if the value has been set. +func (o *DpuReprovisionRequest) GetModeOk() (*string, bool) { + if o == nil { + return nil, false + } + return &o.Mode, true +} + +// SetMode sets field value +func (o *DpuReprovisionRequest) SetMode(v string) { + o.Mode = v +} + +// GetUpdateFirmware returns the UpdateFirmware field value if set, zero value otherwise. +func (o *DpuReprovisionRequest) GetUpdateFirmware() bool { + if o == nil || IsNil(o.UpdateFirmware) { + var ret bool + return ret + } + return *o.UpdateFirmware +} + +// GetUpdateFirmwareOk returns a tuple with the UpdateFirmware field value if set, nil otherwise +// and a boolean to check if the value has been set. +func (o *DpuReprovisionRequest) GetUpdateFirmwareOk() (*bool, bool) { + if o == nil || IsNil(o.UpdateFirmware) { + return nil, false + } + return o.UpdateFirmware, true +} + +// HasUpdateFirmware returns a boolean if a field has been set. +func (o *DpuReprovisionRequest) HasUpdateFirmware() bool { + if o != nil && !IsNil(o.UpdateFirmware) { + return true + } + + return false +} + +// SetUpdateFirmware gets a reference to the given bool and assigns it to the UpdateFirmware field. +func (o *DpuReprovisionRequest) SetUpdateFirmware(v bool) { + o.UpdateFirmware = &v +} + +func (o DpuReprovisionRequest) MarshalJSON() ([]byte, error) { + toSerialize, err := o.ToMap() + if err != nil { + return []byte{}, err + } + return json.Marshal(toSerialize) +} + +func (o DpuReprovisionRequest) ToMap() (map[string]interface{}, error) { + toSerialize := map[string]interface{}{} + toSerialize["mode"] = o.Mode + if !IsNil(o.UpdateFirmware) { + toSerialize["updateFirmware"] = o.UpdateFirmware + } + return toSerialize, nil +} + +func (o *DpuReprovisionRequest) UnmarshalJSON(data []byte) (err error) { + // This validates that all required properties are included in the JSON object + // by unmarshalling the object into a generic map with string keys and checking + // that every required field exists as a key in the generic map. + requiredProperties := []string{ + "mode", + } + + allProperties := make(map[string]interface{}) + + err = json.Unmarshal(data, &allProperties) + + if err != nil { + return err + } + + for _, requiredProperty := range requiredProperties { + if _, exists := allProperties[requiredProperty]; !exists { + return fmt.Errorf("no value given for required property %v", requiredProperty) + } + } + + varDpuReprovisionRequest := _DpuReprovisionRequest{} + + decoder := json.NewDecoder(bytes.NewReader(data)) + decoder.DisallowUnknownFields() + err = decoder.Decode(&varDpuReprovisionRequest) + + if err != nil { + return err + } + + *o = DpuReprovisionRequest(varDpuReprovisionRequest) + + return err +} + +type NullableDpuReprovisionRequest struct { + value *DpuReprovisionRequest + isSet bool +} + +func (v NullableDpuReprovisionRequest) Get() *DpuReprovisionRequest { + return v.value +} + +func (v *NullableDpuReprovisionRequest) Set(val *DpuReprovisionRequest) { + v.value = val + v.isSet = true +} + +func (v NullableDpuReprovisionRequest) IsSet() bool { + return v.isSet +} + +func (v *NullableDpuReprovisionRequest) Unset() { + v.value = nil + v.isSet = false +} + +func NewNullableDpuReprovisionRequest(val *DpuReprovisionRequest) *NullableDpuReprovisionRequest { + return &NullableDpuReprovisionRequest{value: val, isSet: true} +} + +func (v NullableDpuReprovisionRequest) MarshalJSON() ([]byte, error) { + return json.Marshal(v.value) +} + +func (v *NullableDpuReprovisionRequest) UnmarshalJSON(src []byte) error { + v.isSet = true + return json.Unmarshal(src, &v.value) +} diff --git a/rest-api/sdk/standard/model_dpu_reprovision_response.go b/rest-api/sdk/standard/model_dpu_reprovision_response.go new file mode 100644 index 0000000000..719ed4e281 --- /dev/null +++ b/rest-api/sdk/standard/model_dpu_reprovision_response.go @@ -0,0 +1,223 @@ +/* +NVIDIA Infra Controller REST API + +NVIDIA Infra Controller REST API allows users to create and manage resources, e.g., VPCs, Subnets, and Instances, across all connected NVIDIA Infra Controller datacenters, also referred to as Sites. + +API version: 1.6.0 +*/ + +// Code generated by OpenAPI Generator (https://openapi-generator.tech); DO NOT EDIT. + +package standard + +import ( + "bytes" + "encoding/json" + "fmt" +) + +// checks if the DpuReprovisionResponse type satisfies the MappedNullable interface at compile time +var _ MappedNullable = &DpuReprovisionResponse{} + +// DpuReprovisionResponse Response for a DPU reprovisioning request. +type DpuReprovisionResponse struct { + // ID of the Machine. + MachineId string `json:"machineId"` + // DPU reprovisioning mode accepted. + Mode string `json:"mode"` + // Whether firmware update was requested. + UpdateFirmware *bool `json:"updateFirmware,omitempty"` +} + +type _DpuReprovisionResponse DpuReprovisionResponse + +// NewDpuReprovisionResponse instantiates a new DpuReprovisionResponse object +// This constructor will assign default values to properties that have it defined, +// and makes sure properties required by API are set, but the set of arguments +// will change when the set of required properties is changed +func NewDpuReprovisionResponse(machineId string, mode string) *DpuReprovisionResponse { + this := DpuReprovisionResponse{} + this.MachineId = machineId + this.Mode = mode + return &this +} + +// NewDpuReprovisionResponseWithDefaults instantiates a new DpuReprovisionResponse object +// This constructor will only assign default values to properties that have it defined, +// but it doesn't guarantee that properties required by API are set +func NewDpuReprovisionResponseWithDefaults() *DpuReprovisionResponse { + this := DpuReprovisionResponse{} + return &this +} + +// GetMachineId returns the MachineId field value +func (o *DpuReprovisionResponse) GetMachineId() string { + if o == nil { + var ret string + return ret + } + + return o.MachineId +} + +// GetMachineIdOk returns a tuple with the MachineId field value +// and a boolean to check if the value has been set. +func (o *DpuReprovisionResponse) GetMachineIdOk() (*string, bool) { + if o == nil { + return nil, false + } + return &o.MachineId, true +} + +// SetMachineId sets field value +func (o *DpuReprovisionResponse) SetMachineId(v string) { + o.MachineId = v +} + +// GetMode returns the Mode field value +func (o *DpuReprovisionResponse) GetMode() string { + if o == nil { + var ret string + return ret + } + + return o.Mode +} + +// GetModeOk returns a tuple with the Mode field value +// and a boolean to check if the value has been set. +func (o *DpuReprovisionResponse) GetModeOk() (*string, bool) { + if o == nil { + return nil, false + } + return &o.Mode, true +} + +// SetMode sets field value +func (o *DpuReprovisionResponse) SetMode(v string) { + o.Mode = v +} + +// GetUpdateFirmware returns the UpdateFirmware field value if set, zero value otherwise. +func (o *DpuReprovisionResponse) GetUpdateFirmware() bool { + if o == nil || IsNil(o.UpdateFirmware) { + var ret bool + return ret + } + return *o.UpdateFirmware +} + +// GetUpdateFirmwareOk returns a tuple with the UpdateFirmware field value if set, nil otherwise +// and a boolean to check if the value has been set. +func (o *DpuReprovisionResponse) GetUpdateFirmwareOk() (*bool, bool) { + if o == nil || IsNil(o.UpdateFirmware) { + return nil, false + } + return o.UpdateFirmware, true +} + +// HasUpdateFirmware returns a boolean if a field has been set. +func (o *DpuReprovisionResponse) HasUpdateFirmware() bool { + if o != nil && !IsNil(o.UpdateFirmware) { + return true + } + + return false +} + +// SetUpdateFirmware gets a reference to the given bool and assigns it to the UpdateFirmware field. +func (o *DpuReprovisionResponse) SetUpdateFirmware(v bool) { + o.UpdateFirmware = &v +} + +func (o DpuReprovisionResponse) MarshalJSON() ([]byte, error) { + toSerialize, err := o.ToMap() + if err != nil { + return []byte{}, err + } + return json.Marshal(toSerialize) +} + +func (o DpuReprovisionResponse) ToMap() (map[string]interface{}, error) { + toSerialize := map[string]interface{}{} + toSerialize["machineId"] = o.MachineId + toSerialize["mode"] = o.Mode + if !IsNil(o.UpdateFirmware) { + toSerialize["updateFirmware"] = o.UpdateFirmware + } + return toSerialize, nil +} + +func (o *DpuReprovisionResponse) UnmarshalJSON(data []byte) (err error) { + // This validates that all required properties are included in the JSON object + // by unmarshalling the object into a generic map with string keys and checking + // that every required field exists as a key in the generic map. + requiredProperties := []string{ + "machineId", + "mode", + } + + allProperties := make(map[string]interface{}) + + err = json.Unmarshal(data, &allProperties) + + if err != nil { + return err + } + + for _, requiredProperty := range requiredProperties { + if _, exists := allProperties[requiredProperty]; !exists { + return fmt.Errorf("no value given for required property %v", requiredProperty) + } + } + + varDpuReprovisionResponse := _DpuReprovisionResponse{} + + decoder := json.NewDecoder(bytes.NewReader(data)) + decoder.DisallowUnknownFields() + err = decoder.Decode(&varDpuReprovisionResponse) + + if err != nil { + return err + } + + *o = DpuReprovisionResponse(varDpuReprovisionResponse) + + return err +} + +type NullableDpuReprovisionResponse struct { + value *DpuReprovisionResponse + isSet bool +} + +func (v NullableDpuReprovisionResponse) Get() *DpuReprovisionResponse { + return v.value +} + +func (v *NullableDpuReprovisionResponse) Set(val *DpuReprovisionResponse) { + v.value = val + v.isSet = true +} + +func (v NullableDpuReprovisionResponse) IsSet() bool { + return v.isSet +} + +func (v *NullableDpuReprovisionResponse) Unset() { + v.value = nil + v.isSet = false +} + +func NewNullableDpuReprovisionResponse(val *DpuReprovisionResponse) *NullableDpuReprovisionResponse { + return &NullableDpuReprovisionResponse{value: val, isSet: true} +} + +func (v NullableDpuReprovisionResponse) MarshalJSON() ([]byte, error) { + return json.Marshal(v.value) +} + +func (v *NullableDpuReprovisionResponse) UnmarshalJSON(src []byte) error { + v.isSet = true + return json.Unmarshal(src, &v.value) +} From 5425d407b307f0b135719fa73392650a25acefd5 Mon Sep 17 00:00:00 2001 From: Kyle Felter Date: Tue, 30 Jun 2026 12:38:06 -0500 Subject: [PATCH 2/2] refactor: Clarify DPU reprovision request assembly Signed-off-by: Kyle Felter --- rest-api/api/pkg/api/handler/adminmachine.go | 100 ------------------ .../api/pkg/api/handler/dpureprovision.go | 62 +++++++++-- rest-api/api/pkg/api/model/adminops_test.go | 4 +- rest-api/api/pkg/api/model/dpureprovision.go | 9 +- 4 files changed, 61 insertions(+), 114 deletions(-) delete mode 100644 rest-api/api/pkg/api/handler/adminmachine.go diff --git a/rest-api/api/pkg/api/handler/adminmachine.go b/rest-api/api/pkg/api/handler/adminmachine.go deleted file mode 100644 index 93a678a535..0000000000 --- a/rest-api/api/pkg/api/handler/adminmachine.go +++ /dev/null @@ -1,100 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -package handler - -import ( - "context" - "errors" - "fmt" - "net/http" - - "github.com/labstack/echo/v4" - "github.com/rs/zerolog" - tClient "go.temporal.io/sdk/client" - - "github.com/NVIDIA/infra-controller/rest-api/api/internal/config" - "github.com/NVIDIA/infra-controller/rest-api/api/pkg/api/handler/util/common" - sc "github.com/NVIDIA/infra-controller/rest-api/api/pkg/client/site" - auth "github.com/NVIDIA/infra-controller/rest-api/auth/pkg/authorization" - cutil "github.com/NVIDIA/infra-controller/rest-api/common/pkg/util" - cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" - cdbm "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db/model" -) - -type adminMachineBase struct { - dbSession *cdb.Session - scp *sc.ClientPool - cfg *config.Config - tracerSpan *cutil.TracerSpan -} - -func (b adminMachineBase) authorizeMachine( - ctx context.Context, - c echo.Context, - logger zerolog.Logger, - org string, - dbUser *cdbm.User, - machineID string, -) (tClient.Client, string, *cdbm.Machine, error) { - if dbUser == nil { - logger.Error().Msg("invalid User object found in request context") - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve current user", nil) - } - if machineID == "" { - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusBadRequest, "Machine ID is required", nil) - } - - ok, err := auth.ValidateOrgMembership(dbUser, org) - if !ok { - if err != nil { - logger.Error().Err(err).Msg("error validating org membership for User in request") - } else { - logger.Warn().Msg("could not validate org membership for user, access denied") - } - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusForbidden, fmt.Sprintf("Failed to validate membership for org: %s", org), nil) - } - - if ok := auth.ValidateUserRoles(dbUser, org, nil, auth.ProviderAdminRole); !ok { - logger.Warn().Msg("user does not have Provider Admin role, access denied") - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusForbidden, "User does not have Provider Admin role with org", nil) - } - - provider, err := common.GetInfrastructureProviderForOrg(ctx, nil, b.dbSession, org) - if err != nil { - logger.Warn().Err(err).Msg("error getting infrastructure provider for org") - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusBadRequest, "Failed to retrieve Infrastructure Provider for org", nil) - } - - machine, err := cdbm.NewMachineDAO(b.dbSession).GetByID(ctx, nil, machineID, nil, false) - if err != nil { - if errors.Is(err, cdb.ErrDoesNotExist) { - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusNotFound, "Could not find Machine with specified ID", nil) - } - logger.Error().Err(err).Msg("error retrieving Machine DB entity") - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Could not retrieve Machine", nil) - } - - if machine.InfrastructureProviderID != provider.ID { - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusForbidden, "Machine doesn't belong to org's Infrastructure provider", nil) - } - - site, err := common.GetSiteFromIDString(ctx, nil, machine.SiteID.String(), b.dbSession) - if err != nil { - if errors.Is(err, cdb.ErrDoesNotExist) || errors.Is(err, common.ErrInvalidID) { - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusBadRequest, "Machine Site does not exist", nil) - } - logger.Error().Err(err).Msg("error retrieving Machine Site from DB") - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve Machine Site due to DB error", nil) - } - if site.InfrastructureProviderID != provider.ID { - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusForbidden, "Machine Site doesn't belong to current org's Provider", nil) - } - - stc, err := b.scp.GetClientByID(site.ID) - if err != nil { - logger.Error().Err(err).Msg("failed to retrieve Temporal client for Site") - return nil, "", nil, cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve client for Site", nil) - } - return stc, site.ID.String(), machine, nil -} diff --git a/rest-api/api/pkg/api/handler/dpureprovision.go b/rest-api/api/pkg/api/handler/dpureprovision.go index c3ddda8905..6cd0bdd881 100644 --- a/rest-api/api/pkg/api/handler/dpureprovision.go +++ b/rest-api/api/pkg/api/handler/dpureprovision.go @@ -4,6 +4,7 @@ package handler import ( + "errors" "net/http" "github.com/labstack/echo/v4" @@ -14,16 +15,21 @@ import ( sc "github.com/NVIDIA/infra-controller/rest-api/api/pkg/client/site" cutil "github.com/NVIDIA/infra-controller/rest-api/common/pkg/util" cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" + cdbm "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db/model" cwssaws "github.com/NVIDIA/infra-controller/rest-api/workflow-schema/schema/site-agent/workflows/v1" ) type DpuReprovisionHandler struct { - adminMachineBase + dbSession *cdb.Session + scp *sc.ClientPool + tracerSpan *cutil.TracerSpan } -func NewDpuReprovisionHandler(dbSession *cdb.Session, scp *sc.ClientPool, cfg *config.Config) DpuReprovisionHandler { +func NewDpuReprovisionHandler(dbSession *cdb.Session, scp *sc.ClientPool, _ *config.Config) DpuReprovisionHandler { return DpuReprovisionHandler{ - adminMachineBase{dbSession: dbSession, scp: scp, cfg: cfg, tracerSpan: cutil.NewTracerSpan()}, + dbSession: dbSession, + scp: scp, + tracerSpan: cutil.NewTracerSpan(), } } @@ -44,8 +50,16 @@ func (h DpuReprovisionHandler) Handle(c echo.Context) error { if handlerSpan != nil { defer handlerSpan.End() } + if dbUser == nil { + logger.Error().Msg("invalid User object found in request context") + return cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve current user", nil) + } machineID := c.Param("machineId") + if machineID == "" { + return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, "Machine ID is required", nil) + } + var apiReq model.APIDpuReprovisionRequest if err := c.Bind(&apiReq); err != nil { return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, "Invalid request body", nil) @@ -53,18 +67,50 @@ func (h DpuReprovisionHandler) Handle(c echo.Context) error { if err := apiReq.Validate(); err != nil { return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, err.Error(), nil) } + apiReq.MachineID = machineID + + provider, apiErr := common.IsProvider(ctx, logger, h.dbSession, org, dbUser, false) + if apiErr != nil { + return cutil.NewAPIErrorResponse(c, apiErr.Code, apiErr.Message, apiErr.Data) + } + + machine, err := cdbm.NewMachineDAO(h.dbSession).GetByID(ctx, nil, machineID, nil, false) + if err != nil { + if errors.Is(err, cdb.ErrDoesNotExist) { + return cutil.NewAPIErrorResponse(c, http.StatusNotFound, "Could not find Machine with specified ID", nil) + } + logger.Error().Err(err).Msg("error retrieving Machine DB entity") + return cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Could not retrieve Machine", nil) + } + if machine.InfrastructureProviderID != provider.ID { + return cutil.NewAPIErrorResponse(c, http.StatusForbidden, "Machine doesn't belong to org's Infrastructure provider", nil) + } - stc, siteID, _, errResp := h.authorizeMachine(ctx, c, logger, org, dbUser, machineID) - if errResp != nil || stc == nil { - return errResp + site, err := common.GetSiteFromIDString(ctx, nil, machine.SiteID.String(), h.dbSession) + if err != nil { + if errors.Is(err, cdb.ErrDoesNotExist) || errors.Is(err, common.ErrInvalidID) { + return cutil.NewAPIErrorResponse(c, http.StatusBadRequest, "Machine Site does not exist", nil) + } + logger.Error().Err(err).Msg("error retrieving Machine Site from DB") + return cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve Machine Site due to DB error", nil) + } + if site.InfrastructureProviderID != provider.ID { + return cutil.NewAPIErrorResponse(c, http.StatusForbidden, "Machine Site doesn't belong to current org's Provider", nil) + } + + stc, err := h.scp.GetClientByID(site.ID) + if err != nil { + logger.Error().Err(err).Msg("failed to retrieve Temporal client for Site") + return cutil.NewAPIErrorResponse(c, http.StatusInternalServerError, "Failed to retrieve client for Site", nil) } + siteID := site.ID.String() logger.Info().Str("machineID", machineID).Str("mode", apiReq.Mode).Str("siteID", siteID).Msg("triggering DPU reprovisioning via Core proxy") - code, err := common.ExecuteCoreGRPC(ctx, stc, cwssaws.Forge_TriggerDpuReprovisioning_FullMethodName, apiReq.ToProto(machineID), nil, siteID) + code, err := common.ExecuteCoreGRPC(ctx, stc, cwssaws.Forge_TriggerDpuReprovisioning_FullMethodName, apiReq.ToProto(), nil, siteID) if err != nil { logger.Error().Err(err).Msg("failed to trigger DPU reprovisioning") return cutil.NewAPIErrorResponse(c, code, "Failed to trigger DPU reprovisioning", nil) } - return c.JSON(http.StatusOK, model.NewAPIDpuReprovisionResponse(machineID, &apiReq)) + return c.JSON(http.StatusOK, model.NewAPIDpuReprovisionResponse(&apiReq)) } diff --git a/rest-api/api/pkg/api/model/adminops_test.go b/rest-api/api/pkg/api/model/adminops_test.go index 989685cf9d..ca52617aad 100644 --- a/rest-api/api/pkg/api/model/adminops_test.go +++ b/rest-api/api/pkg/api/model/adminops_test.go @@ -13,10 +13,10 @@ import ( ) func TestAPIDpuReprovisionRequestValidateAndToProto(t *testing.T) { - req := APIDpuReprovisionRequest{Mode: DpuReprovisionModeRestart, UpdateFirmware: true} + req := APIDpuReprovisionRequest{MachineID: "machine-1", Mode: DpuReprovisionModeRestart, UpdateFirmware: true} require.NoError(t, req.Validate()) - protoReq := req.ToProto("machine-1") + protoReq := req.ToProto() assert.Equal(t, "machine-1", protoReq.GetMachineId().GetId()) assert.Equal(t, cwssaws.DpuReprovisioningRequest_Restart, protoReq.GetMode()) assert.Equal(t, cwssaws.UpdateInitiator_AdminCli, protoReq.GetInitiator()) diff --git a/rest-api/api/pkg/api/model/dpureprovision.go b/rest-api/api/pkg/api/model/dpureprovision.go index 514ff68de3..784649054e 100644 --- a/rest-api/api/pkg/api/model/dpureprovision.go +++ b/rest-api/api/pkg/api/model/dpureprovision.go @@ -31,6 +31,7 @@ var validDpuReprovisionModesAny = func() []interface{} { }() type APIDpuReprovisionRequest struct { + MachineID string `json:"-"` Mode string `json:"mode"` UpdateFirmware bool `json:"updateFirmware,omitempty"` } @@ -43,9 +44,9 @@ func (r *APIDpuReprovisionRequest) Validate() error { ) } -func (r *APIDpuReprovisionRequest) ToProto(machineID string) *cwssaws.DpuReprovisioningRequest { +func (r *APIDpuReprovisionRequest) ToProto() *cwssaws.DpuReprovisioningRequest { return &cwssaws.DpuReprovisioningRequest{ - MachineId: &cwssaws.MachineId{Id: machineID}, + MachineId: &cwssaws.MachineId{Id: r.MachineID}, Mode: dpuReprovisionModeToProto(r.Mode), Initiator: cwssaws.UpdateInitiator_AdminCli, UpdateFirmware: r.UpdateFirmware, @@ -58,9 +59,9 @@ type APIDpuReprovisionResponse struct { UpdateFirmware bool `json:"updateFirmware,omitempty"` } -func NewAPIDpuReprovisionResponse(machineID string, req *APIDpuReprovisionRequest) *APIDpuReprovisionResponse { +func NewAPIDpuReprovisionResponse(req *APIDpuReprovisionRequest) *APIDpuReprovisionResponse { return &APIDpuReprovisionResponse{ - MachineID: machineID, + MachineID: req.MachineID, Mode: req.Mode, UpdateFirmware: req.UpdateFirmware, }