From 819b01507f4a54ff17855d4b372684502050b3cf Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Sun, 2 Nov 2025 10:59:41 +0100 Subject: [PATCH 01/21] fix: health endpoint now detects stopped block production This commit fixes issue #2643 where the health endpoint still reports OK when a node has stopped producing blocks. Changes: - Updated HealthServer to accept store, config, and logger dependencies - Implemented block production monitoring in the Livez endpoint: * For aggregator nodes, checks if LastBlockTime is recent * Returns WARN if block production is slow (> 3x block time) * Returns FAIL if block production has stopped (> 5x block time) * Uses LazyBlockInterval for lazy mode aggregators * Non-aggregator nodes continue to return PASS - Added constants for health check thresholds: * healthCheckWarnMultiplier = 3 * healthCheckFailMultiplier = 5 - Added comprehensive unit tests covering all scenarios: Server tests (pkg/rpc/server/server_test.go): * Non-aggregator nodes * Aggregator with no blocks * Aggregator with recent blocks (PASS) * Aggregator with slow production (WARN) * Aggregator with stopped production (FAIL) * Lazy aggregator with correct thresholds * Error handling Client tests (pkg/rpc/client/client_test.go): * Non-aggregator returns PASS * Aggregator with recent blocks returns PASS * Aggregator with slow block production returns WARN * Aggregator with stopped block production returns FAIL - Updated setupTestServer to pass new dependencies - Added createCustomTestServer helper for testing with custom configs The thresholds are configurable based on the node's BlockTime or LazyBlockInterval settings, making the health check adaptive to different node configurations. Fixes #2643 --- node/helpers_test.go | 6 + node/single_sequencer_integration_test.go | 67 +++++++++++ pkg/rpc/client/client_test.go | 135 +++++++++++++++++++--- pkg/rpc/server/server.go | 77 +++++++++++- pkg/rpc/server/server_test.go | 130 ++++++++++++++++++++- 5 files changed, 389 insertions(+), 26 deletions(-) diff --git a/node/helpers_test.go b/node/helpers_test.go index 06d789060f..e9b2719fb8 100644 --- a/node/helpers_test.go +++ b/node/helpers_test.go @@ -24,6 +24,7 @@ import ( evconfig "github.com/evstack/ev-node/pkg/config" "github.com/evstack/ev-node/pkg/p2p" "github.com/evstack/ev-node/pkg/p2p/key" + rpcclient "github.com/evstack/ev-node/pkg/rpc/client" remote_signer "github.com/evstack/ev-node/pkg/signer/noop" "github.com/evstack/ev-node/types" ) @@ -317,3 +318,8 @@ func verifyNodesSynced(node1, syncingNode Node, source Source) error { return fmt.Errorf("nodes not synced: sequencer at height %v, syncing node at height %v", sequencerHeight, syncingHeight) }) } + +// NewRPCClient creates a new RPC client for testing +func NewRPCClient(address string) *rpcclient.Client { + return rpcclient.NewClient("http://" + address) +} diff --git a/node/single_sequencer_integration_test.go b/node/single_sequencer_integration_test.go index 213760df6e..1056a0f65e 100644 --- a/node/single_sequencer_integration_test.go +++ b/node/single_sequencer_integration_test.go @@ -418,3 +418,70 @@ func waitForBlockN(t *testing.T, n uint64, node *FullNode, blockInterval time.Du return got >= n }, timeout[0], blockInterval/2) } +// TestHealthEndpointWhenBlockProductionStops verifies that the health endpoint +// correctly reports WARN and FAIL states when an aggregator stops producing blocks. 
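+// The test stalls block production by configuring a DA block time far larger
+// than the node block time, so the aggregator halts once
+// MaxPendingHeadersAndData headers are pending, and then asserts that the
+// reported health degrades from PASS through WARN to FAIL.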
+func TestHealthEndpointWhenBlockProductionStops(t *testing.T) { + require := require.New(t) + + // Set up configuration with specific block time for predictable health checks + config := getTestConfig(t, 1) + config.Node.Aggregator = true + config.Node.BlockTime = evconfig.DurationWrapper{Duration: 500 * time.Millisecond} + config.Node.MaxPendingHeadersAndData = 2 + + // Set DA block time large enough to avoid header submission to DA layer + // This will cause block production to stop once MaxPendingHeadersAndData is reached + config.DA.BlockTime = evconfig.DurationWrapper{Duration: 100 * time.Second} + + node, cleanup := createNodeWithCleanup(t, config) + defer cleanup() + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var runningWg sync.WaitGroup + startNodeInBackground(t, []*FullNode{node}, []context.Context{ctx}, &runningWg, 0, nil) + + // Wait for first block to be produced + waitForBlockN(t, 1, node, config.Node.BlockTime.Duration) + + // Create RPC client + rpcClient := NewRPCClient(config.RPC.Address) + + // Verify health is PASS while blocks are being produced + health, err := rpcClient.GetHealth(ctx) + require.NoError(err) + require.Equal("PASS", health.String(), "Health should be PASS while producing blocks") + + // Wait for block production to stop (when MaxPendingHeadersAndData is reached) + time.Sleep(time.Duration(config.Node.MaxPendingHeadersAndData+2) * config.Node.BlockTime.Duration) + + // Get the height to confirm blocks stopped + height, err := getNodeHeight(node, Store) + require.NoError(err) + require.LessOrEqual(height, config.Node.MaxPendingHeadersAndData) + + // Health check threshold calculations: + // blockTime = 500ms + // warnThreshold = blockTime * 3 = 1500ms = 1.5s + // failThreshold = blockTime * 5 = 2500ms = 2.5s + + // Wait for WARN threshold (3x block time = 1.5 seconds after last block) + // We need to wait a bit longer to account for the time blocks take to stop + time.Sleep(1700 * time.Millisecond) + + health, err = rpcClient.GetHealth(ctx) + require.NoError(err) + // Health could be WARN or FAIL depending on exact timing, but should not be PASS + require.NotEqual("PASS", health.String(), "Health should not be PASS after block production stops") + + // Wait for FAIL threshold (5x block time = 2.5 seconds total after last block) + time.Sleep(1500 * time.Millisecond) + + health, err = rpcClient.GetHealth(ctx) + require.NoError(err) + require.Equal("FAIL", health.String(), "Health should be FAIL after 5x block time without new blocks") + + // Stop the node and wait for shutdown + shutdownAndWait(t, []context.CancelFunc{cancel}, &runningWg, 10*time.Second) +} diff --git a/pkg/rpc/client/client_test.go b/pkg/rpc/client/client_test.go index 97c2475137..23082cdc1a 100644 --- a/pkg/rpc/client/client_test.go +++ b/pkg/rpc/client/client_test.go @@ -30,13 +30,12 @@ func setupTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.Mo // Create the servers logger := zerolog.Nop() - storeServer := server.NewStoreServer(mockStore, logger) - p2pServer := server.NewP2PServer(mockP2P) - healthServer := server.NewHealthServer() - - // Create config server with test config testConfig := config.DefaultConfig() testConfig.DA.Namespace = "test-headers" + + storeServer := server.NewStoreServer(mockStore, logger) + p2pServer := server.NewP2PServer(mockP2P) + healthServer := server.NewHealthServer(mockStore, testConfig, logger) configServer := server.NewConfigServer(testConfig, nil, logger) // Register the store service @@ -242,21 
+241,123 @@ func TestClientGetNetInfo(t *testing.T) { } func TestClientGetHealth(t *testing.T) { - // Create mocks - mockStore := mocks.NewMockStore(t) - mockP2P := mocks.NewMockP2PRPC(t) + t.Run("non-aggregator returns PASS", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + mockP2P := mocks.NewMockP2PRPC(t) + + testServer, client := setupTestServer(t, mockStore, mockP2P) + defer testServer.Close() + + healthStatus, err := client.GetHealth(context.Background()) + + require.NoError(t, err) + require.Equal(t, "PASS", healthStatus.String()) + }) + + t.Run("aggregator with recent blocks returns PASS", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + mockP2P := mocks.NewMockP2PRPC(t) + + // Setup aggregator config + testConfig := config.DefaultConfig() + testConfig.Node.Aggregator = true + testConfig.Node.BlockTime.Duration = 1 * time.Second + + // Create state with recent block + state := types.State{ + LastBlockHeight: 100, + LastBlockTime: time.Now().Add(-500 * time.Millisecond), + } + mockStore.On("GetState", mock.Anything).Return(state, nil) + + // Create custom test server with aggregator config + testServer := createCustomTestServer(t, mockStore, mockP2P, testConfig) + defer testServer.Close() + + client := NewClient(testServer.URL) + healthStatus, err := client.GetHealth(context.Background()) + + require.NoError(t, err) + require.Equal(t, "PASS", healthStatus.String()) + mockStore.AssertExpectations(t) + }) + + t.Run("aggregator with slow block production returns WARN", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + mockP2P := mocks.NewMockP2PRPC(t) + + testConfig := config.DefaultConfig() + testConfig.Node.Aggregator = true + testConfig.Node.BlockTime.Duration = 1 * time.Second + + // State with block older than 3x block time + state := types.State{ + LastBlockHeight: 100, + LastBlockTime: time.Now().Add(-4 * time.Second), + } + mockStore.On("GetState", mock.Anything).Return(state, nil) + + testServer := createCustomTestServer(t, mockStore, mockP2P, testConfig) + defer testServer.Close() + + client := NewClient(testServer.URL) + healthStatus, err := client.GetHealth(context.Background()) + + require.NoError(t, err) + require.Equal(t, "WARN", healthStatus.String()) + mockStore.AssertExpectations(t) + }) + + t.Run("aggregator with stopped block production returns FAIL", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + mockP2P := mocks.NewMockP2PRPC(t) + + testConfig := config.DefaultConfig() + testConfig.Node.Aggregator = true + testConfig.Node.BlockTime.Duration = 1 * time.Second + + // State with block older than 5x block time + state := types.State{ + LastBlockHeight: 100, + LastBlockTime: time.Now().Add(-10 * time.Second), + } + mockStore.On("GetState", mock.Anything).Return(state, nil) + + testServer := createCustomTestServer(t, mockStore, mockP2P, testConfig) + defer testServer.Close() + + client := NewClient(testServer.URL) + healthStatus, err := client.GetHealth(context.Background()) + + require.NoError(t, err) + require.Equal(t, "FAIL", healthStatus.String()) + mockStore.AssertExpectations(t) + }) +} - // Setup test server and client - testServer, client := setupTestServer(t, mockStore, mockP2P) - defer testServer.Close() +// createCustomTestServer creates a test server with custom configuration +func createCustomTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.MockP2PRPC, testConfig config.Config) *httptest.Server { + mux := http.NewServeMux() + + logger := zerolog.Nop() + storeServer := 
server.NewStoreServer(mockStore, logger) + p2pServer := server.NewP2PServer(mockP2P) + healthServer := server.NewHealthServer(mockStore, testConfig, logger) + configServer := server.NewConfigServer(testConfig, nil, logger) - // Call GetHealth - healthStatus, err := client.GetHealth(context.Background()) + storePath, storeHandler := rpc.NewStoreServiceHandler(storeServer) + mux.Handle(storePath, storeHandler) - // Assert expectations - require.NoError(t, err) - // Health server always returns PASS in Livez - require.NotEqual(t, healthStatus.String(), "UNKNOWN") + p2pPath, p2pHandler := rpc.NewP2PServiceHandler(p2pServer) + mux.Handle(p2pPath, p2pHandler) + + healthPath, healthHandler := rpc.NewHealthServiceHandler(healthServer) + mux.Handle(healthPath, healthHandler) + + configPath, configHandler := rpc.NewConfigServiceHandler(configServer) + mux.Handle(configPath, configHandler) + + return httptest.NewServer(h2c.NewHandler(mux, &http2.Server{})) } func TestClientGetNamespace(t *testing.T) { diff --git a/pkg/rpc/server/server.go b/pkg/rpc/server/server.go index a38f1e802b..923b322fc7 100644 --- a/pkg/rpc/server/server.go +++ b/pkg/rpc/server/server.go @@ -28,6 +28,16 @@ import ( rpc "github.com/evstack/ev-node/types/pb/evnode/v1/v1connect" ) +const ( + // healthCheckWarnMultiplier is the multiplier for block time to determine WARN threshold + // If no block has been produced in (blockTime * healthCheckWarnMultiplier), return WARN + healthCheckWarnMultiplier = 3 + + // healthCheckFailMultiplier is the multiplier for block time to determine FAIL threshold + // If no block has been produced in (blockTime * healthCheckFailMultiplier), return FAIL + healthCheckFailMultiplier = 5 +) + var _ rpc.StoreServiceHandler = (*StoreServer)(nil) // StoreServer implements the StoreService defined in the proto file @@ -288,11 +298,19 @@ func (p *P2PServer) GetNetInfo( } // HealthServer implements the HealthService defined in the proto file -type HealthServer struct{} +type HealthServer struct { + store store.Store + config config.Config + logger zerolog.Logger +} // NewHealthServer creates a new HealthServer instance -func NewHealthServer() *HealthServer { - return &HealthServer{} +func NewHealthServer(store store.Store, config config.Config, logger zerolog.Logger) *HealthServer { + return &HealthServer{ + store: store, + config: config, + logger: logger, + } } // Livez implements the HealthService.Livez RPC @@ -300,7 +318,56 @@ func (h *HealthServer) Livez( ctx context.Context, req *connect.Request[emptypb.Empty], ) (*connect.Response[pb.GetHealthResponse], error) { - // always return healthy + // For aggregator nodes, check if block production is healthy + if h.config.Node.Aggregator { + state, err := h.store.GetState(ctx) + if err != nil { + h.logger.Error().Err(err).Msg("Failed to get state for health check") + return connect.NewResponse(&pb.GetHealthResponse{ + Status: pb.HealthStatus_FAIL, + }), nil + } + + // If we have blocks, check if the last block time is recent + if state.LastBlockHeight > 0 { + timeSinceLastBlock := time.Since(state.LastBlockTime) + + // Calculate the threshold based on block time + blockTime := h.config.Node.BlockTime.Duration + + // For lazy mode, use the lazy block interval instead + if h.config.Node.LazyMode { + blockTime = h.config.Node.LazyBlockInterval.Duration + } + + warnThreshold := blockTime * healthCheckWarnMultiplier + failThreshold := blockTime * healthCheckFailMultiplier + + if timeSinceLastBlock > failThreshold { + h.logger.Warn(). 
+ Dur("time_since_last_block", timeSinceLastBlock). + Dur("fail_threshold", failThreshold). + Uint64("last_block_height", state.LastBlockHeight). + Time("last_block_time", state.LastBlockTime). + Msg("Health check: node has stopped producing blocks (FAIL)") + return connect.NewResponse(&pb.GetHealthResponse{ + Status: pb.HealthStatus_FAIL, + }), nil + } else if timeSinceLastBlock > warnThreshold { + h.logger.Warn(). + Dur("time_since_last_block", timeSinceLastBlock). + Dur("warn_threshold", warnThreshold). + Uint64("last_block_height", state.LastBlockHeight). + Time("last_block_time", state.LastBlockTime). + Msg("Health check: block production is slow (WARN)") + return connect.NewResponse(&pb.GetHealthResponse{ + Status: pb.HealthStatus_WARN, + }), nil + } + } + } + + // For non-aggregator nodes or if checks pass, return healthy return connect.NewResponse(&pb.GetHealthResponse{ Status: pb.HealthStatus_PASS, }), nil @@ -310,7 +377,7 @@ func (h *HealthServer) Livez( func NewServiceHandler(store store.Store, peerManager p2p.P2PRPC, proposerAddress []byte, logger zerolog.Logger, config config.Config, bestKnown BestKnownHeightProvider) (http.Handler, error) { storeServer := NewStoreServer(store, logger) p2pServer := NewP2PServer(peerManager) - healthServer := NewHealthServer() + healthServer := NewHealthServer(store, config, logger) configServer := NewConfigServer(config, proposerAddress, logger) mux := http.NewServeMux() diff --git a/pkg/rpc/server/server_test.go b/pkg/rpc/server/server_test.go index 6b3848e8cb..1aa71c11e2 100644 --- a/pkg/rpc/server/server_test.go +++ b/pkg/rpc/server/server_test.go @@ -361,10 +361,132 @@ func TestP2PServer_GetNetInfo(t *testing.T) { } func TestHealthServer_Livez(t *testing.T) { - h := NewHealthServer() - resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) - require.NoError(t, err) - require.Equal(t, pb.HealthStatus_PASS, resp.Msg.Status) + logger := zerolog.Nop() + + t.Run("non-aggregator always returns PASS", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + testConfig := config.DefaultConfig() + testConfig.Node.Aggregator = false + + h := NewHealthServer(mockStore, testConfig, logger) + resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + require.NoError(t, err) + require.Equal(t, pb.HealthStatus_PASS, resp.Msg.Status) + }) + + t.Run("aggregator with no blocks returns PASS", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + testConfig := config.DefaultConfig() + testConfig.Node.Aggregator = true + + // State with no blocks yet + state := types.State{ + LastBlockHeight: 0, + } + mockStore.On("GetState", mock.Anything).Return(state, nil) + + h := NewHealthServer(mockStore, testConfig, logger) + resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + require.NoError(t, err) + require.Equal(t, pb.HealthStatus_PASS, resp.Msg.Status) + mockStore.AssertExpectations(t) + }) + + t.Run("aggregator with recent blocks returns PASS", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + testConfig := config.DefaultConfig() + testConfig.Node.Aggregator = true + testConfig.Node.BlockTime.Duration = 1 * time.Second + + // State with recent block + state := types.State{ + LastBlockHeight: 100, + LastBlockTime: time.Now().Add(-500 * time.Millisecond), // Recent block + } + mockStore.On("GetState", mock.Anything).Return(state, nil) + + h := NewHealthServer(mockStore, testConfig, logger) + resp, err := h.Livez(context.Background(), 
connect.NewRequest(&emptypb.Empty{})) + require.NoError(t, err) + require.Equal(t, pb.HealthStatus_PASS, resp.Msg.Status) + mockStore.AssertExpectations(t) + }) + + t.Run("aggregator with slow block production returns WARN", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + testConfig := config.DefaultConfig() + testConfig.Node.Aggregator = true + testConfig.Node.BlockTime.Duration = 1 * time.Second + + // State with block older than 3x block time (warn threshold) + state := types.State{ + LastBlockHeight: 100, + LastBlockTime: time.Now().Add(-4 * time.Second), // 4 seconds ago > 3x block time + } + mockStore.On("GetState", mock.Anything).Return(state, nil) + + h := NewHealthServer(mockStore, testConfig, logger) + resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + require.NoError(t, err) + require.Equal(t, pb.HealthStatus_WARN, resp.Msg.Status) + mockStore.AssertExpectations(t) + }) + + t.Run("aggregator with stopped block production returns FAIL", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + testConfig := config.DefaultConfig() + testConfig.Node.Aggregator = true + testConfig.Node.BlockTime.Duration = 1 * time.Second + + // State with block older than 5x block time (fail threshold) + state := types.State{ + LastBlockHeight: 100, + LastBlockTime: time.Now().Add(-10 * time.Second), // 10 seconds ago > 5x block time + } + mockStore.On("GetState", mock.Anything).Return(state, nil) + + h := NewHealthServer(mockStore, testConfig, logger) + resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + require.NoError(t, err) + require.Equal(t, pb.HealthStatus_FAIL, resp.Msg.Status) + mockStore.AssertExpectations(t) + }) + + t.Run("lazy aggregator uses lazy block interval for threshold", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + testConfig := config.DefaultConfig() + testConfig.Node.Aggregator = true + testConfig.Node.LazyMode = true + testConfig.Node.BlockTime.Duration = 1 * time.Second + testConfig.Node.LazyBlockInterval.Duration = 10 * time.Second + + // State with block older than 3x lazy block interval (warn threshold) + state := types.State{ + LastBlockHeight: 100, + LastBlockTime: time.Now().Add(-35 * time.Second), // 35 seconds ago > 3x lazy interval (30s) + } + mockStore.On("GetState", mock.Anything).Return(state, nil) + + h := NewHealthServer(mockStore, testConfig, logger) + resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + require.NoError(t, err) + require.Equal(t, pb.HealthStatus_WARN, resp.Msg.Status) + mockStore.AssertExpectations(t) + }) + + t.Run("aggregator with state error returns FAIL", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + testConfig := config.DefaultConfig() + testConfig.Node.Aggregator = true + + mockStore.On("GetState", mock.Anything).Return(types.State{}, fmt.Errorf("state error")) + + h := NewHealthServer(mockStore, testConfig, logger) + resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + require.NoError(t, err) + require.Equal(t, pb.HealthStatus_FAIL, resp.Msg.Status) + mockStore.AssertExpectations(t) + }) } func TestHealthLiveEndpoint(t *testing.T) { From 17d84028cf60bec439b4437743257f49d4feffd8 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Sun, 2 Nov 2025 11:18:39 +0100 Subject: [PATCH 02/21] refactor: eliminate code duplication in client tests Remove createCustomTestServer function which was redundant after making setupTestServer accept optional config parameter. 
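After the change, call sites select the config inline; both resulting forms
appear in the diff below:

    testServer, client := setupTestServer(t, mockStore, mockP2P)             // default config
    testServer, client := setupTestServer(t, mockStore, mockP2P, testConfig) // custom config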
This eliminates 36 lines of duplicated server setup code. Changes: - Make setupTestServer accept variadic config parameter - Update all test cases to use setupTestServer directly - Remove createCustomTestServer function entirely Result: -21 net lines of code, improved maintainability, cleaner API. --- pkg/rpc/client/client_test.go | 51 +++++++++++------------------------ 1 file changed, 15 insertions(+), 36 deletions(-) diff --git a/pkg/rpc/client/client_test.go b/pkg/rpc/client/client_test.go index 23082cdc1a..3d7974476c 100644 --- a/pkg/rpc/client/client_test.go +++ b/pkg/rpc/client/client_test.go @@ -23,15 +23,23 @@ import ( rpc "github.com/evstack/ev-node/types/pb/evnode/v1/v1connect" ) -// setupTestServer creates a test server with mock store and mock p2p manager -func setupTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.MockP2PRPC) (*httptest.Server, *Client) { +// setupTestServer creates a test server with mock store and mock p2p manager. +// An optional custom config can be provided; if not provided, uses DefaultConfig with test-headers namespace. +func setupTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.MockP2PRPC, customConfig ...config.Config) (*httptest.Server, *Client) { // Create a new HTTP test server mux := http.NewServeMux() // Create the servers logger := zerolog.Nop() - testConfig := config.DefaultConfig() - testConfig.DA.Namespace = "test-headers" + + // Use custom config if provided, otherwise use default + var testConfig config.Config + if len(customConfig) > 0 { + testConfig = customConfig[0] + } else { + testConfig = config.DefaultConfig() + testConfig.DA.Namespace = "test-headers" + } storeServer := server.NewStoreServer(mockStore, logger) p2pServer := server.NewP2PServer(mockP2P) @@ -271,10 +279,8 @@ func TestClientGetHealth(t *testing.T) { mockStore.On("GetState", mock.Anything).Return(state, nil) // Create custom test server with aggregator config - testServer := createCustomTestServer(t, mockStore, mockP2P, testConfig) + testServer, client := setupTestServer(t, mockStore, mockP2P, testConfig) defer testServer.Close() - - client := NewClient(testServer.URL) healthStatus, err := client.GetHealth(context.Background()) require.NoError(t, err) @@ -297,10 +303,9 @@ func TestClientGetHealth(t *testing.T) { } mockStore.On("GetState", mock.Anything).Return(state, nil) - testServer := createCustomTestServer(t, mockStore, mockP2P, testConfig) + testServer, client := setupTestServer(t, mockStore, mockP2P, testConfig) defer testServer.Close() - client := NewClient(testServer.URL) healthStatus, err := client.GetHealth(context.Background()) require.NoError(t, err) @@ -323,10 +328,9 @@ func TestClientGetHealth(t *testing.T) { } mockStore.On("GetState", mock.Anything).Return(state, nil) - testServer := createCustomTestServer(t, mockStore, mockP2P, testConfig) + testServer, client := setupTestServer(t, mockStore, mockP2P, testConfig) defer testServer.Close() - client := NewClient(testServer.URL) healthStatus, err := client.GetHealth(context.Background()) require.NoError(t, err) @@ -335,31 +339,6 @@ func TestClientGetHealth(t *testing.T) { }) } -// createCustomTestServer creates a test server with custom configuration -func createCustomTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.MockP2PRPC, testConfig config.Config) *httptest.Server { - mux := http.NewServeMux() - - logger := zerolog.Nop() - storeServer := server.NewStoreServer(mockStore, logger) - p2pServer := server.NewP2PServer(mockP2P) - healthServer := 
server.NewHealthServer(mockStore, testConfig, logger) - configServer := server.NewConfigServer(testConfig, nil, logger) - - storePath, storeHandler := rpc.NewStoreServiceHandler(storeServer) - mux.Handle(storePath, storeHandler) - - p2pPath, p2pHandler := rpc.NewP2PServiceHandler(p2pServer) - mux.Handle(p2pPath, p2pHandler) - - healthPath, healthHandler := rpc.NewHealthServiceHandler(healthServer) - mux.Handle(healthPath, healthHandler) - - configPath, configHandler := rpc.NewConfigServiceHandler(configServer) - mux.Handle(configPath, configHandler) - - return httptest.NewServer(h2c.NewHandler(mux, &http2.Server{})) -} - func TestClientGetNamespace(t *testing.T) { // Create mocks mockStore := mocks.NewMockStore(t) From f9abded74e3d139cfbc58afee2e1b3879d8f3e6a Mon Sep 17 00:00:00 2001 From: Randy Grok <98407738+randygrok@users.noreply.github.com> Date: Sun, 2 Nov 2025 11:19:42 +0100 Subject: [PATCH 03/21] Apply suggestion from @gemini-code-assist[bot] Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- pkg/rpc/server/server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/rpc/server/server.go b/pkg/rpc/server/server.go index 923b322fc7..06ede9ca36 100644 --- a/pkg/rpc/server/server.go +++ b/pkg/rpc/server/server.go @@ -344,7 +344,7 @@ func (h *HealthServer) Livez( failThreshold := blockTime * healthCheckFailMultiplier if timeSinceLastBlock > failThreshold { - h.logger.Warn(). + h.logger.Error(). Dur("time_since_last_block", timeSinceLastBlock). Dur("fail_threshold", failThreshold). Uint64("last_block_height", state.LastBlockHeight). From 35f7c74f9d1401af459892fa30e19071a26f8296 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Sun, 2 Nov 2025 11:23:20 +0100 Subject: [PATCH 04/21] test: replace time.Sleep with require.Eventually in health integration test Replace fixed duration time.Sleep calls with require.Eventually polling to make the health endpoint integration test more robust and less flaky. 
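The polling pattern, as applied in the diff below (condensed here):

    require.Eventually(func() bool {
        health, err := rpcClient.GetHealth(ctx)
        return err == nil && health.String() == "FAIL"
    }, 10*time.Second, 100*time.Millisecond,
        "Health should be FAIL after 5x block time without new blocks")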
Changes: - Use require.Eventually to poll for health state transitions - Poll every 100ms instead of fixed sleeps - Generous timeouts (5s and 10s) that terminate early on success - Better error handling during polling Benefits: - More resilient to timing variations in CI/CD environments - Faster test execution (completes as soon as conditions are met) - Eliminates magic numbers (1700ms compensation) - Expresses intent clearly (wait until condition is met) - Non-flaky (tested 3x consecutively) --- node/single_sequencer_integration_test.go | 34 +++++++++++++---------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/node/single_sequencer_integration_test.go b/node/single_sequencer_integration_test.go index 1056a0f65e..219b95eb68 100644 --- a/node/single_sequencer_integration_test.go +++ b/node/single_sequencer_integration_test.go @@ -466,21 +466,25 @@ func TestHealthEndpointWhenBlockProductionStops(t *testing.T) { // warnThreshold = blockTime * 3 = 1500ms = 1.5s // failThreshold = blockTime * 5 = 2500ms = 2.5s - // Wait for WARN threshold (3x block time = 1.5 seconds after last block) - // We need to wait a bit longer to account for the time blocks take to stop - time.Sleep(1700 * time.Millisecond) - - health, err = rpcClient.GetHealth(ctx) - require.NoError(err) - // Health could be WARN or FAIL depending on exact timing, but should not be PASS - require.NotEqual("PASS", health.String(), "Health should not be PASS after block production stops") - - // Wait for FAIL threshold (5x block time = 2.5 seconds total after last block) - time.Sleep(1500 * time.Millisecond) - - health, err = rpcClient.GetHealth(ctx) - require.NoError(err) - require.Equal("FAIL", health.String(), "Health should be FAIL after 5x block time without new blocks") + // Poll for health to transition away from PASS (to WARN or FAIL) + // This is more robust than fixed time.Sleep as it handles timing variations + require.Eventually(func() bool { + health, err := rpcClient.GetHealth(ctx) + if err != nil { + return false + } + return health.String() != "PASS" + }, 5*time.Second, 100*time.Millisecond, "Health should transition away from PASS after block production stops") + + // Poll for health to reach FAIL state + // Timeout is set to 10 seconds to be safe, but should happen around 2.5s + require.Eventually(func() bool { + health, err := rpcClient.GetHealth(ctx) + if err != nil { + return false + } + return health.String() == "FAIL" + }, 10*time.Second, 100*time.Millisecond, "Health should be FAIL after 5x block time without new blocks") // Stop the node and wait for shutdown shutdownAndWait(t, []context.CancelFunc{cancel}, &runningWg, 10*time.Second) From a65b37348e01cd45c9961a857aea6661c092b085 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Wed, 5 Nov 2025 10:33:40 +0100 Subject: [PATCH 05/21] Refactor health check implementation: Replace gRPC HealthService with HTTP endpoint - Removed the HealthService and its related proto definitions. - Implemented a new HTTP health check endpoint at `/health/live`. - Updated the Client to use the new HTTP health check instead of the gRPC call. - Enhanced health check logic to return PASS, WARN, or FAIL based on block production status. - Modified tests to validate the new health check endpoint and its responses. - Updated server to register custom HTTP endpoints including the new health check. 
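For illustration, a liveness probe against the new endpoint might look like
the sketch below. This is an assumption-laden example, not part of the patch:
it assumes the node's HTTP RPC server is reachable at localhost:9090 (the
example address used by the Rust client in this patch) and that the handler
writes the PASS/WARN/FAIL status in the response body.

    package main

    import (
        "fmt"
        "io"
        "log"
        "net/http"
    )

    func main() {
        // Hypothetical address; substitute the node's actual RPC listen address.
        resp, err := http.Get("http://localhost:9090/health/live")
        if err != nil {
            log.Fatal(err)
        }
        defer resp.Body.Close()

        body, err := io.ReadAll(resp.Body)
        if err != nil {
            log.Fatal(err)
        }
        // Assumed response shape: status string in the body, e.g. "PASS".
        fmt.Printf("HTTP %d: %s\n", resp.StatusCode, body)
    }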
--- Cargo.lock | 858 +++++++++++++++++- client/crates/client/Cargo.toml | 1 + client/crates/client/src/error.rs | 3 + client/crates/client/src/health.rs | 55 +- .../types/src/proto/evnode.v1.messages.rs | 47 +- .../types/src/proto/evnode.v1.services.rs | 339 +------ pkg/rpc/client/client.go | 74 +- pkg/rpc/client/client_test.go | 8 +- pkg/rpc/server/da_visualization_test.go | 6 +- pkg/rpc/server/http.go | 64 +- pkg/rpc/server/http_test.go | 4 +- pkg/rpc/server/server.go | 96 +- pkg/rpc/server/server_test.go | 157 ++-- proto/evnode/v1/health.proto | 32 - test/docker-e2e/upgrade_test.go | 5 +- types/pb/evnode/v1/health.pb.go | 198 ---- .../pb/evnode/v1/v1connect/health.connect.go | 111 --- 17 files changed, 1118 insertions(+), 940 deletions(-) delete mode 100644 proto/evnode/v1/health.proto delete mode 100644 types/pb/evnode/v1/health.pb.go delete mode 100644 types/pb/evnode/v1/v1connect/health.connect.go diff --git a/Cargo.lock b/Cargo.lock index 4128d840b7..6ab5036225 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -65,6 +65,12 @@ dependencies = [ "syn", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.5.0" @@ -82,9 +88,9 @@ dependencies = [ "bitflags 1.3.2", "bytes", "futures-util", - "http", - "http-body", - "hyper", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.32", "itoa", "matchit", "memchr", @@ -93,8 +99,8 @@ dependencies = [ "pin-project-lite", "rustversion", "serde", - "sync_wrapper", - "tower", + "sync_wrapper 0.1.2", + "tower 0.4.13", "tower-layer", "tower-service", ] @@ -108,8 +114,8 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http", - "http-body", + "http 0.2.12", + "http-body 0.4.6", "mime", "rustversion", "tower-layer", @@ -137,6 +143,12 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "bitflags" version = "1.3.2" @@ -149,6 +161,12 @@ version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + [[package]] name = "byteorder" version = "1.5.0" @@ -176,6 +194,23 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "either" version = "1.15.0" @@ -205,11 +240,12 @@ dependencies = [ "async-trait", "ev-types", "futures", - "thiserror", + "reqwest", + "thiserror 1.0.69", 
"tokio", "tokio-test", "tonic", - "tower", + "tower 0.4.13", "tracing", "tracing-subscriber", ] @@ -244,6 +280,15 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + [[package]] name = "futures" version = "0.3.31" @@ -340,8 +385,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi 0.11.1+wasi-snapshot-preview1", + "wasm-bindgen", ] [[package]] @@ -351,9 +398,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi", "wasi 0.14.2+wasi-0.2.4", + "wasm-bindgen", ] [[package]] @@ -373,7 +422,7 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http", + "http 0.2.12", "indexmap 2.10.0", "slab", "tokio", @@ -420,6 +469,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http-body" version = "0.4.6" @@ -427,7 +487,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", - "http", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.3.1", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.3.1", + "http-body 1.0.1", "pin-project-lite", ] @@ -454,8 +537,8 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", - "http-body", + "http 0.2.12", + "http-body 0.4.6", "httparse", "httpdate", "itoa", @@ -467,18 +550,182 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "http 1.3.1", + "http-body 1.0.1", + "httparse", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http 1.3.1", + "hyper 1.7.0", + "hyper-util", + "rustls 0.23.35", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.26.4", + "tower-service", + "webpki-roots", +] + [[package]] name = "hyper-timeout" version = "0.4.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.32", "pin-project-lite", "tokio", "tokio-io-timeout", ] +[[package]] +name = "hyper-util" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "hyper 1.7.0", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "icu_collections" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" + +[[package]] +name = "icu_properties" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899" + +[[package]] +name = "icu_provider" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -499,6 +746,22 @@ dependencies = [ "hashbrown 0.15.4", ] +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" 
+ +[[package]] +name = "iri-string" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f867b9d1d896b67beb18518eda36fdb77a32ea590de864f1325b294a6d14397" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "itertools" version = "0.12.1" @@ -514,6 +777,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "js-sys" +version = "0.3.82" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b011eec8cc36da2aab2d5cff675ec18454fad408585853910a202391cf9f8e65" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + [[package]] name = "lazy_static" version = "1.5.0" @@ -532,6 +805,12 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +[[package]] +name = "litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + [[package]] name = "lock_api" version = "0.4.13" @@ -548,6 +827,12 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + [[package]] name = "matchit" version = "0.7.3" @@ -696,6 +981,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "potential_utf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +dependencies = [ + "zerovec", +] + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -777,6 +1071,61 @@ dependencies = [ "prost", ] +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls 0.23.35", + "socket2", + "thiserror 2.0.17", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +dependencies = [ + "bytes", + "getrandom 0.3.3", + "lru-slab", + "rand 0.9.2", + "ring", + "rustc-hash", + "rustls 0.23.35", + "rustls-pki-types", + "slab", + "thiserror 2.0.17", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.60.2", +] + [[package]] name = "quote" version = "1.0.40" @@ -799,8 +1148,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - 
"rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.3", ] [[package]] @@ -810,7 +1169,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.3", ] [[package]] @@ -822,6 +1191,15 @@ dependencies = [ "getrandom 0.2.16", ] +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.3", +] + [[package]] name = "redox_syscall" version = "0.5.13" @@ -860,6 +1238,44 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "reqwest" +version = "0.12.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-core", + "http 1.3.1", + "http-body 1.0.1", + "http-body-util", + "hyper 1.7.0", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls 0.23.35", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper 1.0.2", + "tokio", + "tokio-rustls 0.26.4", + "tower 0.5.2", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "webpki-roots", +] + [[package]] name = "ring" version = "0.17.14" @@ -880,6 +1296,12 @@ version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustix" version = "1.0.7" @@ -901,17 +1323,41 @@ checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", "ring", - "rustls-webpki", + "rustls-webpki 0.101.7", "sct", ] +[[package]] +name = "rustls" +version = "0.23.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "533f54bc6a7d4f647e46ad909549eda97bf5afc1585190ef692b4286b198bd8f" +dependencies = [ + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki 0.103.8", + "subtle", + "zeroize", +] + [[package]] name = "rustls-pemfile" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" dependencies = [ - "base64", + "base64 0.21.7", +] + +[[package]] +name = "rustls-pki-types" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"94182ad936a0c91c324cd46c6511b9510ed16af436d7b5bab34beab0afd55f7a" +dependencies = [ + "web-time", + "zeroize", ] [[package]] @@ -924,12 +1370,29 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustls-webpki" +version = "0.103.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + [[package]] name = "same-file" version = "1.0.6" @@ -975,6 +1438,30 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.143" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -1021,6 +1508,18 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.104" @@ -1038,6 +1537,26 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tempfile" version = "3.20.0" @@ -1057,7 +1576,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +dependencies = [ + "thiserror-impl 2.0.17", ] [[package]] @@ -1071,6 +1599,17 @@ dependencies = [ "syn", ] +[[package]] +name = "thiserror-impl" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "thread_local" version = "1.1.9" @@ -1080,6 +1619,31 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "tinystr" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.45.1" @@ -1125,7 +1689,17 @@ version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ - "rustls", + "rustls 0.21.12", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls 0.23.35", "tokio", ] @@ -1175,22 +1749,22 @@ dependencies = [ "async-stream", "async-trait", "axum", - "base64", + "base64 0.21.7", "bytes", "h2", - "http", - "http-body", - "hyper", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.32", "hyper-timeout", "percent-encoding", "pin-project", "prost", - "rustls", + "rustls 0.21.12", "rustls-pemfile", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.1", "tokio-stream", - "tower", + "tower 0.4.13", "tower-layer", "tower-service", "tracing", @@ -1221,7 +1795,7 @@ dependencies = [ "indexmap 1.9.3", "pin-project", "pin-project-lite", - "rand", + "rand 0.8.5", "slab", "tokio", "tokio-util", @@ -1230,6 +1804,39 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper 1.0.2", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" +dependencies = [ + "bitflags 2.9.1", + "bytes", + "futures-util", + "http 1.3.1", + "http-body 1.0.1", + "iri-string", + "pin-project-lite", + "tower 0.5.2", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-layer" version = "0.3.3" @@ -1318,6 +1925,23 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "url" +version = "2.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "valuable" version 
= "0.1.1" @@ -1358,6 +1982,93 @@ dependencies = [ "wit-bindgen-rt", ] +[[package]] +name = "wasm-bindgen" +version = "0.2.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da95793dfc411fbbd93f5be7715b0578ec61fe87cb1a42b12eb625caa5c5ea60" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "551f88106c6d5e7ccc7cd9a16f312dd3b5d36ea8b4954304657d5dfba115d4a0" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04264334509e04a7bf8690f2384ef5265f05143a4bff3889ab7a3269adab59c2" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "420bc339d9f322e562942d52e115d57e950d12d88983a14c79b86859ee6c7ebc" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76f218a38c84bcb33c25ec7059b07847d465ce0e0a76b995e134a45adcb6af76" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.82" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a1f95c0d03a47f4ae1f7a64643a6bb97465d9b740f0fa8f90ea33915c99a9a1" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2878ef029c47c6e8cf779119f20fcf52bde7ad42a731b2a304bc221df17571e" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "winapi-util" version = "0.1.9" @@ -1531,6 +2242,35 @@ dependencies = [ "bitflags 2.9.1", ] +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + +[[package]] +name = "yoke" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.8.26" @@ -1550,3 +2290,63 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zerotrie" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/client/crates/client/Cargo.toml b/client/crates/client/Cargo.toml index d325eb73fb..de149a0ee0 100644 --- a/client/crates/client/Cargo.toml +++ b/client/crates/client/Cargo.toml @@ -20,6 +20,7 @@ thiserror = "1.0" tracing = "0.1" futures = "0.3" async-trait = "0.1" +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } [dev-dependencies] tokio-test = "0.4" diff --git a/client/crates/client/src/error.rs b/client/crates/client/src/error.rs index f7a5f8cb05..190126d1c8 100644 --- a/client/crates/client/src/error.rs +++ b/client/crates/client/src/error.rs @@ -8,6 +8,9 @@ pub enum ClientError { #[error("RPC error: {0}")] Rpc(#[from] tonic::Status), + #[error("HTTP error: {0}")] + Http(#[from] reqwest::Error), + #[error("Connection error: {0}")] Connection(String), diff --git a/client/crates/client/src/health.rs b/client/crates/client/src/health.rs index 6042a9d8b4..ac499a44be 100644 --- a/client/crates/client/src/health.rs +++ b/client/crates/client/src/health.rs @@ -1,32 +1,55 @@ use crate::{client::Client, error::Result}; -use ev_types::v1::{health_service_client::HealthServiceClient, GetHealthResponse, HealthStatus}; -use tonic::Request; + +/// Health status of the node +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum HealthStatus { + /// Node is operating normally + Pass, + /// Node is degraded but still serving + Warn, + /// Node has failed health checks + Fail, + /// Unknown health status + Unknown, +} pub struct HealthClient { - inner: HealthServiceClient, + base_url: String, + http_client: reqwest::Client, } impl HealthClient { /// Create a new HealthClient from a Client - pub fn new(client: &Client) -> Self { - let inner = HealthServiceClient::new(client.channel().clone()); - Self { inner } + /// + /// Note: The base_url should be the HTTP endpoint (e.g., "http://localhost:9090") + pub fn new(_client: &Client) -> Self { + // For now, we'll need to construct the base URL from the client + // This is a workaround since we're mixing gRPC and HTTP endpoints + // TODO: Consider adding a method to Client to get the base URL + Self::with_base_url("http://localhost:9090".to_string()) + } + + /// Create a new HealthClient with an explicit base URL + pub fn with_base_url(base_url: String) -> Self { + Self { + base_url: base_url.trim_end_matches('/').to_string(), + http_client: 
reqwest::Client::new(), + } } /// Check if the node is alive and get its health status pub async fn livez(&self) -> Result { - let request = Request::new(()); - let response = self.inner.clone().livez(request).await?; - - Ok(response.into_inner().status()) - } + let url = format!("{}/health/live", self.base_url); + let response = self.http_client.get(&url).send().await?; - /// Get the full health response - pub async fn get_health(&self) -> Result { - let request = Request::new(()); - let response = self.inner.clone().livez(request).await?; + let status_text = response.text().await?.trim().to_string(); - Ok(response.into_inner()) + match status_text.as_str() { + "OK" => Ok(HealthStatus::Pass), + "WARN" => Ok(HealthStatus::Warn), + "FAIL" => Ok(HealthStatus::Fail), + _ => Ok(HealthStatus::Unknown), + } } /// Check if the node is healthy (status is PASS) diff --git a/client/crates/types/src/proto/evnode.v1.messages.rs b/client/crates/types/src/proto/evnode.v1.messages.rs index b5b35c6fd8..1aad84efe0 100644 --- a/client/crates/types/src/proto/evnode.v1.messages.rs +++ b/client/crates/types/src/proto/evnode.v1.messages.rs @@ -169,6 +169,8 @@ pub struct State { pub da_height: u64, #[prost(bytes = "vec", tag = "8")] pub app_hash: ::prost::alloc::vec::Vec, + #[prost(bytes = "vec", tag = "9")] + pub last_header_hash: ::prost::alloc::vec::Vec, } /// GetPeerInfoResponse defines the response for retrieving peer information #[allow(clippy::derive_partial_eq_without_eq)] @@ -218,51 +220,6 @@ pub struct Batch { #[prost(bytes = "vec", repeated, tag = "1")] pub txs: ::prost::alloc::vec::Vec<::prost::alloc::vec::Vec>, } -/// GetHealthResponse defines the response for retrieving health status -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct GetHealthResponse { - /// Health status - #[prost(enumeration = "HealthStatus", tag = "1")] - pub status: i32, -} -/// HealthStatus defines the health status of the node -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] -#[repr(i32)] -pub enum HealthStatus { - /// Unknown health status - Unknown = 0, - /// Healthy status (Healthy) - Pass = 1, - /// Degraded but still serving - Warn = 2, - /// Hard fail - Fail = 3, -} -impl HealthStatus { - /// String value of the enum field names used in the ProtoBuf definition. - /// - /// The values are not transformed in any way and thus are considered stable - /// (if the ProtoBuf definition does not change) and safe for programmatic use. - pub fn as_str_name(&self) -> &'static str { - match self { - HealthStatus::Unknown => "UNKNOWN", - HealthStatus::Pass => "PASS", - HealthStatus::Warn => "WARN", - HealthStatus::Fail => "FAIL", - } - } - /// Creates an enum from field names used in the ProtoBuf definition. 
- pub fn from_str_name(value: &str) -> ::core::option::Option { - match value { - "UNKNOWN" => Some(Self::Unknown), - "PASS" => Some(Self::Pass), - "WARN" => Some(Self::Warn), - "FAIL" => Some(Self::Fail), - _ => None, - } - } -} /// InitChainRequest contains the genesis parameters for chain initialization #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/client/crates/types/src/proto/evnode.v1.services.rs b/client/crates/types/src/proto/evnode.v1.services.rs index 7923876305..e0103af6c1 100644 --- a/client/crates/types/src/proto/evnode.v1.services.rs +++ b/client/crates/types/src/proto/evnode.v1.services.rs @@ -539,6 +539,8 @@ pub struct State { pub da_height: u64, #[prost(bytes = "vec", tag = "8")] pub app_hash: ::prost::alloc::vec::Vec, + #[prost(bytes = "vec", tag = "9")] + pub last_header_hash: ::prost::alloc::vec::Vec, } /// GetPeerInfoResponse defines the response for retrieving peer information #[allow(clippy::derive_partial_eq_without_eq)] @@ -955,343 +957,6 @@ pub struct Batch { #[prost(bytes = "vec", repeated, tag = "1")] pub txs: ::prost::alloc::vec::Vec<::prost::alloc::vec::Vec>, } -/// GetHealthResponse defines the response for retrieving health status -#[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct GetHealthResponse { - /// Health status - #[prost(enumeration = "HealthStatus", tag = "1")] - pub status: i32, -} -/// HealthStatus defines the health status of the node -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] -#[repr(i32)] -pub enum HealthStatus { - /// Unknown health status - Unknown = 0, - /// Healthy status (Healthy) - Pass = 1, - /// Degraded but still serving - Warn = 2, - /// Hard fail - Fail = 3, -} -impl HealthStatus { - /// String value of the enum field names used in the ProtoBuf definition. - /// - /// The values are not transformed in any way and thus are considered stable - /// (if the ProtoBuf definition does not change) and safe for programmatic use. - pub fn as_str_name(&self) -> &'static str { - match self { - HealthStatus::Unknown => "UNKNOWN", - HealthStatus::Pass => "PASS", - HealthStatus::Warn => "WARN", - HealthStatus::Fail => "FAIL", - } - } - /// Creates an enum from field names used in the ProtoBuf definition. - pub fn from_str_name(value: &str) -> ::core::option::Option { - match value { - "UNKNOWN" => Some(Self::Unknown), - "PASS" => Some(Self::Pass), - "WARN" => Some(Self::Warn), - "FAIL" => Some(Self::Fail), - _ => None, - } - } -} -/// Generated client implementations. -pub mod health_service_client { - #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] - use tonic::codegen::*; - use tonic::codegen::http::Uri; - /// HealthService defines the RPC service for the health package - #[derive(Debug, Clone)] - pub struct HealthServiceClient { - inner: tonic::client::Grpc, - } - impl HealthServiceClient { - /// Attempt to create a new client by connecting to a given endpoint. 
- pub async fn connect(dst: D) -> Result - where - D: TryInto, - D::Error: Into, - { - let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; - Ok(Self::new(conn)) - } - } - impl HealthServiceClient - where - T: tonic::client::GrpcService, - T::Error: Into, - T::ResponseBody: Body + Send + 'static, - ::Error: Into + Send, - { - pub fn new(inner: T) -> Self { - let inner = tonic::client::Grpc::new(inner); - Self { inner } - } - pub fn with_origin(inner: T, origin: Uri) -> Self { - let inner = tonic::client::Grpc::with_origin(inner, origin); - Self { inner } - } - pub fn with_interceptor( - inner: T, - interceptor: F, - ) -> HealthServiceClient> - where - F: tonic::service::Interceptor, - T::ResponseBody: Default, - T: tonic::codegen::Service< - http::Request, - Response = http::Response< - >::ResponseBody, - >, - >, - , - >>::Error: Into + Send + Sync, - { - HealthServiceClient::new(InterceptedService::new(inner, interceptor)) - } - /// Compress requests with the given encoding. - /// - /// This requires the server to support it otherwise it might respond with an - /// error. - #[must_use] - pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { - self.inner = self.inner.send_compressed(encoding); - self - } - /// Enable decompressing responses. - #[must_use] - pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { - self.inner = self.inner.accept_compressed(encoding); - self - } - /// Limits the maximum size of a decoded message. - /// - /// Default: `4MB` - #[must_use] - pub fn max_decoding_message_size(mut self, limit: usize) -> Self { - self.inner = self.inner.max_decoding_message_size(limit); - self - } - /// Limits the maximum size of an encoded message. - /// - /// Default: `usize::MAX` - #[must_use] - pub fn max_encoding_message_size(mut self, limit: usize) -> Self { - self.inner = self.inner.max_encoding_message_size(limit); - self - } - /// Livez returns the health status of the node - pub async fn livez( - &mut self, - request: impl tonic::IntoRequest<()>, - ) -> std::result::Result< - tonic::Response, - tonic::Status, - > { - self.inner - .ready() - .await - .map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/evnode.v1.HealthService/Livez", - ); - let mut req = request.into_request(); - req.extensions_mut() - .insert(GrpcMethod::new("evnode.v1.HealthService", "Livez")); - self.inner.unary(req, path, codec).await - } - } -} -/// Generated server implementations. -pub mod health_service_server { - #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] - use tonic::codegen::*; - /// Generated trait containing gRPC methods that should be implemented for use with HealthServiceServer. 
- #[async_trait] - pub trait HealthService: Send + Sync + 'static { - /// Livez returns the health status of the node - async fn livez( - &self, - request: tonic::Request<()>, - ) -> std::result::Result< - tonic::Response, - tonic::Status, - >; - } - /// HealthService defines the RPC service for the health package - #[derive(Debug)] - pub struct HealthServiceServer { - inner: _Inner, - accept_compression_encodings: EnabledCompressionEncodings, - send_compression_encodings: EnabledCompressionEncodings, - max_decoding_message_size: Option, - max_encoding_message_size: Option, - } - struct _Inner(Arc); - impl HealthServiceServer { - pub fn new(inner: T) -> Self { - Self::from_arc(Arc::new(inner)) - } - pub fn from_arc(inner: Arc) -> Self { - let inner = _Inner(inner); - Self { - inner, - accept_compression_encodings: Default::default(), - send_compression_encodings: Default::default(), - max_decoding_message_size: None, - max_encoding_message_size: None, - } - } - pub fn with_interceptor( - inner: T, - interceptor: F, - ) -> InterceptedService - where - F: tonic::service::Interceptor, - { - InterceptedService::new(Self::new(inner), interceptor) - } - /// Enable decompressing requests with the given encoding. - #[must_use] - pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { - self.accept_compression_encodings.enable(encoding); - self - } - /// Compress responses with the given encoding, if the client supports it. - #[must_use] - pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { - self.send_compression_encodings.enable(encoding); - self - } - /// Limits the maximum size of a decoded message. - /// - /// Default: `4MB` - #[must_use] - pub fn max_decoding_message_size(mut self, limit: usize) -> Self { - self.max_decoding_message_size = Some(limit); - self - } - /// Limits the maximum size of an encoded message. 
- /// - /// Default: `usize::MAX` - #[must_use] - pub fn max_encoding_message_size(mut self, limit: usize) -> Self { - self.max_encoding_message_size = Some(limit); - self - } - } - impl tonic::codegen::Service> for HealthServiceServer - where - T: HealthService, - B: Body + Send + 'static, - B::Error: Into + Send + 'static, - { - type Response = http::Response; - type Error = std::convert::Infallible; - type Future = BoxFuture; - fn poll_ready( - &mut self, - _cx: &mut Context<'_>, - ) -> Poll> { - Poll::Ready(Ok(())) - } - fn call(&mut self, req: http::Request) -> Self::Future { - let inner = self.inner.clone(); - match req.uri().path() { - "/evnode.v1.HealthService/Livez" => { - #[allow(non_camel_case_types)] - struct LivezSvc(pub Arc); - impl tonic::server::UnaryService<()> - for LivezSvc { - type Response = super::GetHealthResponse; - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call(&mut self, request: tonic::Request<()>) -> Self::Future { - let inner = Arc::clone(&self.0); - let fut = async move { - ::livez(&inner, request).await - }; - Box::pin(fut) - } - } - let accept_compression_encodings = self.accept_compression_encodings; - let send_compression_encodings = self.send_compression_encodings; - let max_decoding_message_size = self.max_decoding_message_size; - let max_encoding_message_size = self.max_encoding_message_size; - let inner = self.inner.clone(); - let fut = async move { - let inner = inner.0; - let method = LivezSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = tonic::server::Grpc::new(codec) - .apply_compression_config( - accept_compression_encodings, - send_compression_encodings, - ) - .apply_max_message_size_config( - max_decoding_message_size, - max_encoding_message_size, - ); - let res = grpc.unary(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - _ => { - Box::pin(async move { - Ok( - http::Response::builder() - .status(200) - .header("grpc-status", "12") - .header("content-type", "application/grpc") - .body(empty_body()) - .unwrap(), - ) - }) - } - } - } - } - impl Clone for HealthServiceServer { - fn clone(&self) -> Self { - let inner = self.inner.clone(); - Self { - inner, - accept_compression_encodings: self.accept_compression_encodings, - send_compression_encodings: self.send_compression_encodings, - max_decoding_message_size: self.max_decoding_message_size, - max_encoding_message_size: self.max_encoding_message_size, - } - } - } - impl Clone for _Inner { - fn clone(&self) -> Self { - Self(Arc::clone(&self.0)) - } - } - impl std::fmt::Debug for _Inner { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.0) - } - } - impl tonic::server::NamedService for HealthServiceServer { - const NAME: &'static str = "evnode.v1.HealthService"; - } -} /// InitChainRequest contains the genesis parameters for chain initialization #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/pkg/rpc/client/client.go b/pkg/rpc/client/client.go index 316b028f66..ee98dd6979 100644 --- a/pkg/rpc/client/client.go +++ b/pkg/rpc/client/client.go @@ -2,7 +2,10 @@ package client import ( "context" + "fmt" + "io" "net/http" + "strings" "connectrpc.com/connect" "google.golang.org/protobuf/types/known/emptypb" @@ -11,12 +14,40 @@ import ( rpc "github.com/evstack/ev-node/types/pb/evnode/v1/v1connect" ) -// Client is the client for StoreService, P2PService, HealthService, and ConfigService +// HealthStatus represents the health status of a 
node +type HealthStatus int32 + +const ( + // HealthStatus_UNKNOWN represents an unknown health status + HealthStatus_UNKNOWN HealthStatus = 0 + // HealthStatus_PASS represents a healthy node + HealthStatus_PASS HealthStatus = 1 + // HealthStatus_WARN represents a degraded but still serving node + HealthStatus_WARN HealthStatus = 2 + // HealthStatus_FAIL represents a failed node + HealthStatus_FAIL HealthStatus = 3 +) + +func (h HealthStatus) String() string { + switch h { + case HealthStatus_PASS: + return "PASS" + case HealthStatus_WARN: + return "WARN" + case HealthStatus_FAIL: + return "FAIL" + default: + return "UNKNOWN" + } +} + +// Client is the client for StoreService, P2PService, and ConfigService type Client struct { storeClient rpc.StoreServiceClient p2pClient rpc.P2PServiceClient - healthClient rpc.HealthServiceClient configClient rpc.ConfigServiceClient + baseURL string + httpClient *http.Client } // NewClient creates a new RPC client @@ -24,14 +55,14 @@ func NewClient(baseURL string) *Client { httpClient := http.DefaultClient storeClient := rpc.NewStoreServiceClient(httpClient, baseURL, connect.WithGRPC()) p2pClient := rpc.NewP2PServiceClient(httpClient, baseURL, connect.WithGRPC()) - healthClient := rpc.NewHealthServiceClient(httpClient, baseURL, connect.WithGRPC()) configClient := rpc.NewConfigServiceClient(httpClient, baseURL, connect.WithGRPC()) return &Client{ storeClient: storeClient, p2pClient: p2pClient, - healthClient: healthClient, configClient: configClient, + baseURL: baseURL, + httpClient: httpClient, } } @@ -114,14 +145,37 @@ func (c *Client) GetNetInfo(ctx context.Context) (*pb.NetInfo, error) { return resp.Msg.NetInfo, nil } -// GetHealth calls the HealthService.Livez endpoint and returns the HealthStatus -func (c *Client) GetHealth(ctx context.Context) (pb.HealthStatus, error) { - req := connect.NewRequest(&emptypb.Empty{}) - resp, err := c.healthClient.Livez(ctx, req) +// GetHealth calls the /health/live HTTP endpoint and returns the HealthStatus +func (c *Client) GetHealth(ctx context.Context) (HealthStatus, error) { + healthURL := fmt.Sprintf("%s/health/live", c.baseURL) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, nil) + if err != nil { + return HealthStatus_UNKNOWN, fmt.Errorf("failed to create health request: %w", err) + } + + resp, err := c.httpClient.Do(req) if err != nil { - return pb.HealthStatus_UNKNOWN, err + return HealthStatus_UNKNOWN, fmt.Errorf("failed to get health: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return HealthStatus_UNKNOWN, fmt.Errorf("failed to read health response: %w", err) + } + + // Parse the text response + status := strings.TrimSpace(string(body)) + switch status { + case "OK": + return HealthStatus_PASS, nil + case "WARN": + return HealthStatus_WARN, nil + case "FAIL": + return HealthStatus_FAIL, nil + default: + return HealthStatus_UNKNOWN, fmt.Errorf("unknown health status: %s", status) } - return resp.Msg.Status, nil } // GetNamespace returns the namespace configuration for this network diff --git a/pkg/rpc/client/client_test.go b/pkg/rpc/client/client_test.go index 3d7974476c..340fe1ea42 100644 --- a/pkg/rpc/client/client_test.go +++ b/pkg/rpc/client/client_test.go @@ -43,7 +43,6 @@ func setupTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.Mo storeServer := server.NewStoreServer(mockStore, logger) p2pServer := server.NewP2PServer(mockP2P) - healthServer := server.NewHealthServer(mockStore, testConfig, logger) 
configServer := server.NewConfigServer(testConfig, nil, logger) // Register the store service @@ -54,14 +53,13 @@ func setupTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.Mo p2pPath, p2pHandler := rpc.NewP2PServiceHandler(p2pServer) mux.Handle(p2pPath, p2pHandler) - // Register the health service - healthPath, healthHandler := rpc.NewHealthServiceHandler(healthServer) - mux.Handle(healthPath, healthHandler) - // Register the config service configPath, configHandler := rpc.NewConfigServiceHandler(configServer) mux.Handle(configPath, configHandler) + // Register custom HTTP endpoints (including health) + server.RegisterCustomHTTPEndpoints(mux, mockStore, mockP2P, testConfig, nil, logger) + // Create an HTTP server with h2c for HTTP/2 support testServer := httptest.NewServer(h2c.NewHandler(mux, &http2.Server{})) diff --git a/pkg/rpc/server/da_visualization_test.go b/pkg/rpc/server/da_visualization_test.go index 099bd3c5b1..2b7d3d7f86 100644 --- a/pkg/rpc/server/da_visualization_test.go +++ b/pkg/rpc/server/da_visualization_test.go @@ -255,7 +255,8 @@ func TestRegisterCustomHTTPEndpointsDAVisualization(t *testing.T) { // Create mux and register endpoints mux := http.NewServeMux() - RegisterCustomHTTPEndpoints(mux, nil, nil, config.DefaultConfig(), nil) + nopLogger := zerolog.Nop() + RegisterCustomHTTPEndpoints(mux, nil, nil, config.DefaultConfig(), nil, nopLogger) // Test /da endpoint req, err := http.NewRequest("GET", "/da", nil) @@ -292,7 +293,8 @@ func TestRegisterCustomHTTPEndpointsWithoutServer(t *testing.T) { SetDAVisualizationServer(nil) mux := http.NewServeMux() - RegisterCustomHTTPEndpoints(mux, nil, nil, config.DefaultConfig(), nil) + logger := zerolog.Nop() + RegisterCustomHTTPEndpoints(mux, nil, nil, config.DefaultConfig(), nil, logger) // Test that endpoints return service unavailable when server is not set endpoints := []string{"/da", "/da/submissions", "/da/blob"} diff --git a/pkg/rpc/server/http.go b/pkg/rpc/server/http.go index a8656c1dd5..7f9e4c731f 100644 --- a/pkg/rpc/server/http.go +++ b/pkg/rpc/server/http.go @@ -3,10 +3,22 @@ package server import ( "fmt" "net/http" + "time" "github.com/evstack/ev-node/pkg/config" "github.com/evstack/ev-node/pkg/p2p" "github.com/evstack/ev-node/pkg/store" + "github.com/rs/zerolog" +) + +const ( + // healthCheckWarnMultiplier is the multiplier for block time to determine WARN threshold + // If no block has been produced in (blockTime * healthCheckWarnMultiplier), return WARN + healthCheckWarnMultiplier = 3 + + // healthCheckFailMultiplier is the multiplier for block time to determine FAIL threshold + // If no block has been produced in (blockTime * healthCheckFailMultiplier), return FAIL + healthCheckFailMultiplier = 5 ) // BestKnownHeightProvider should return the best-known network height observed by the node @@ -15,9 +27,59 @@ type BestKnownHeightProvider func() uint64 // RegisterCustomHTTPEndpoints is the designated place to add new, non-gRPC, plain HTTP handlers. // Additional custom HTTP endpoints can be registered on the mux here. 
-func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRPC, cfg config.Config, bestKnownHeightProvider BestKnownHeightProvider) { +func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRPC, cfg config.Config, bestKnownHeightProvider BestKnownHeightProvider, logger zerolog.Logger) { + // Liveness endpoint - checks if block production is healthy for aggregator nodes mux.HandleFunc("/health/live", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") + + // For aggregator nodes, check if block production is healthy + if cfg.Node.Aggregator { + state, err := s.GetState(r.Context()) + if err != nil { + logger.Error().Err(err).Msg("Failed to get state for health check") + http.Error(w, "FAIL", http.StatusServiceUnavailable) + return + } + + // If we have blocks, check if the last block time is recent + if state.LastBlockHeight > 0 { + timeSinceLastBlock := time.Since(state.LastBlockTime) + + // Calculate the threshold based on block time + blockTime := cfg.Node.BlockTime.Duration + + // For lazy mode, use the lazy block interval instead + if cfg.Node.LazyMode { + blockTime = cfg.Node.LazyBlockInterval.Duration + } + + warnThreshold := blockTime * healthCheckWarnMultiplier + failThreshold := blockTime * healthCheckFailMultiplier + + if timeSinceLastBlock > failThreshold { + logger.Error(). + Dur("time_since_last_block", timeSinceLastBlock). + Dur("fail_threshold", failThreshold). + Uint64("last_block_height", state.LastBlockHeight). + Time("last_block_time", state.LastBlockTime). + Msg("Health check: node has stopped producing blocks (FAIL)") + http.Error(w, "FAIL", http.StatusServiceUnavailable) + return + } else if timeSinceLastBlock > warnThreshold { + logger.Warn(). + Dur("time_since_last_block", timeSinceLastBlock). + Dur("warn_threshold", warnThreshold). + Uint64("last_block_height", state.LastBlockHeight). + Time("last_block_time", state.LastBlockTime). 
+ Msg("Health check: block production is slow (WARN)") + w.WriteHeader(http.StatusOK) + fmt.Fprintln(w, "WARN") + return + } + } + } + + // For non-aggregator nodes or if checks pass, return healthy w.WriteHeader(http.StatusOK) fmt.Fprintln(w, "OK") }) diff --git a/pkg/rpc/server/http_test.go b/pkg/rpc/server/http_test.go index 5fe0a3d5f3..335113d0af 100644 --- a/pkg/rpc/server/http_test.go +++ b/pkg/rpc/server/http_test.go @@ -7,15 +7,17 @@ import ( "testing" "github.com/evstack/ev-node/pkg/config" + "github.com/rs/zerolog" "github.com/stretchr/testify/assert" ) func TestRegisterCustomHTTPEndpoints(t *testing.T) { // Create a new ServeMux mux := http.NewServeMux() + logger := zerolog.Nop() // Register custom HTTP endpoints - RegisterCustomHTTPEndpoints(mux, nil, nil, config.DefaultConfig(), nil) + RegisterCustomHTTPEndpoints(mux, nil, nil, config.DefaultConfig(), nil, logger) // Create a new HTTP test server with the mux testServer := httptest.NewServer(mux) diff --git a/pkg/rpc/server/server.go b/pkg/rpc/server/server.go index 06ede9ca36..f649fda37d 100644 --- a/pkg/rpc/server/server.go +++ b/pkg/rpc/server/server.go @@ -28,16 +28,6 @@ import ( rpc "github.com/evstack/ev-node/types/pb/evnode/v1/v1connect" ) -const ( - // healthCheckWarnMultiplier is the multiplier for block time to determine WARN threshold - // If no block has been produced in (blockTime * healthCheckWarnMultiplier), return WARN - healthCheckWarnMultiplier = 3 - - // healthCheckFailMultiplier is the multiplier for block time to determine FAIL threshold - // If no block has been produced in (blockTime * healthCheckFailMultiplier), return FAIL - healthCheckFailMultiplier = 5 -) - var _ rpc.StoreServiceHandler = (*StoreServer)(nil) // StoreServer implements the StoreService defined in the proto file @@ -297,87 +287,10 @@ func (p *P2PServer) GetNetInfo( }), nil } -// HealthServer implements the HealthService defined in the proto file -type HealthServer struct { - store store.Store - config config.Config - logger zerolog.Logger -} - -// NewHealthServer creates a new HealthServer instance -func NewHealthServer(store store.Store, config config.Config, logger zerolog.Logger) *HealthServer { - return &HealthServer{ - store: store, - config: config, - logger: logger, - } -} - -// Livez implements the HealthService.Livez RPC -func (h *HealthServer) Livez( - ctx context.Context, - req *connect.Request[emptypb.Empty], -) (*connect.Response[pb.GetHealthResponse], error) { - // For aggregator nodes, check if block production is healthy - if h.config.Node.Aggregator { - state, err := h.store.GetState(ctx) - if err != nil { - h.logger.Error().Err(err).Msg("Failed to get state for health check") - return connect.NewResponse(&pb.GetHealthResponse{ - Status: pb.HealthStatus_FAIL, - }), nil - } - - // If we have blocks, check if the last block time is recent - if state.LastBlockHeight > 0 { - timeSinceLastBlock := time.Since(state.LastBlockTime) - - // Calculate the threshold based on block time - blockTime := h.config.Node.BlockTime.Duration - - // For lazy mode, use the lazy block interval instead - if h.config.Node.LazyMode { - blockTime = h.config.Node.LazyBlockInterval.Duration - } - - warnThreshold := blockTime * healthCheckWarnMultiplier - failThreshold := blockTime * healthCheckFailMultiplier - - if timeSinceLastBlock > failThreshold { - h.logger.Error(). - Dur("time_since_last_block", timeSinceLastBlock). - Dur("fail_threshold", failThreshold). - Uint64("last_block_height", state.LastBlockHeight). 
- Time("last_block_time", state.LastBlockTime). - Msg("Health check: node has stopped producing blocks (FAIL)") - return connect.NewResponse(&pb.GetHealthResponse{ - Status: pb.HealthStatus_FAIL, - }), nil - } else if timeSinceLastBlock > warnThreshold { - h.logger.Warn(). - Dur("time_since_last_block", timeSinceLastBlock). - Dur("warn_threshold", warnThreshold). - Uint64("last_block_height", state.LastBlockHeight). - Time("last_block_time", state.LastBlockTime). - Msg("Health check: block production is slow (WARN)") - return connect.NewResponse(&pb.GetHealthResponse{ - Status: pb.HealthStatus_WARN, - }), nil - } - } - } - - // For non-aggregator nodes or if checks pass, return healthy - return connect.NewResponse(&pb.GetHealthResponse{ - Status: pb.HealthStatus_PASS, - }), nil -} - -// NewServiceHandler creates a new HTTP handler for Store, P2P and Health services +// NewServiceHandler creates a new HTTP handler for Store, P2P and Config services func NewServiceHandler(store store.Store, peerManager p2p.P2PRPC, proposerAddress []byte, logger zerolog.Logger, config config.Config, bestKnown BestKnownHeightProvider) (http.Handler, error) { storeServer := NewStoreServer(store, logger) p2pServer := NewP2PServer(peerManager) - healthServer := NewHealthServer(store, config, logger) configServer := NewConfigServer(config, proposerAddress, logger) mux := http.NewServeMux() @@ -386,7 +299,6 @@ func NewServiceHandler(store store.Store, peerManager p2p.P2PRPC, proposerAddres reflector := grpcreflect.NewStaticReflector( rpc.StoreServiceName, rpc.P2PServiceName, - rpc.HealthServiceName, rpc.ConfigServiceName, ) mux.Handle(grpcreflect.NewHandlerV1(reflector, compress1KB)) @@ -400,15 +312,11 @@ func NewServiceHandler(store store.Store, peerManager p2p.P2PRPC, proposerAddres p2pPath, p2pHandler := rpc.NewP2PServiceHandler(p2pServer) mux.Handle(p2pPath, p2pHandler) - // Register HealthService - healthPath, healthHandler := rpc.NewHealthServiceHandler(healthServer) - mux.Handle(healthPath, healthHandler) - configPath, configHandler := rpc.NewConfigServiceHandler(configServer) mux.Handle(configPath, configHandler) // Register custom HTTP endpoints - RegisterCustomHTTPEndpoints(mux, store, peerManager, config, bestKnown) + RegisterCustomHTTPEndpoints(mux, store, peerManager, config, bestKnown, logger) // Use h2c to support HTTP/2 without TLS return h2c.NewHandler(mux, &http2.Server{ diff --git a/pkg/rpc/server/server_test.go b/pkg/rpc/server/server_test.go index 1aa71c11e2..9f60aaf58f 100644 --- a/pkg/rpc/server/server_test.go +++ b/pkg/rpc/server/server_test.go @@ -360,22 +360,34 @@ func TestP2PServer_GetNetInfo(t *testing.T) { require.Nil(t, resp2) } -func TestHealthServer_Livez(t *testing.T) { +func TestHealthLiveEndpoint(t *testing.T) { logger := zerolog.Nop() - t.Run("non-aggregator always returns PASS", func(t *testing.T) { + t.Run("non-aggregator always returns OK", func(t *testing.T) { mockStore := mocks.NewMockStore(t) + mockP2PManager := &mocks.MockP2PRPC{} testConfig := config.DefaultConfig() testConfig.Node.Aggregator = false - h := NewHealthServer(mockStore, testConfig, logger) - resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) + require.NoError(t, err) + + server := httptest.NewServer(handler) + defer server.Close() + + resp, err := http.Get(server.URL + "/health/live") + require.NoError(t, err) + defer resp.Body.Close() + + require.Equal(t, http.StatusOK, resp.StatusCode) 
+ body, err := io.ReadAll(resp.Body) require.NoError(t, err) - require.Equal(t, pb.HealthStatus_PASS, resp.Msg.Status) + require.Equal(t, "OK\n", string(body)) }) - t.Run("aggregator with no blocks returns PASS", func(t *testing.T) { + t.Run("aggregator with no blocks returns OK", func(t *testing.T) { mockStore := mocks.NewMockStore(t) + mockP2PManager := &mocks.MockP2PRPC{} testConfig := config.DefaultConfig() testConfig.Node.Aggregator = true @@ -385,15 +397,26 @@ func TestHealthServer_Livez(t *testing.T) { } mockStore.On("GetState", mock.Anything).Return(state, nil) - h := NewHealthServer(mockStore, testConfig, logger) - resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) + require.NoError(t, err) + + server := httptest.NewServer(handler) + defer server.Close() + + resp, err := http.Get(server.URL + "/health/live") + require.NoError(t, err) + defer resp.Body.Close() + + require.Equal(t, http.StatusOK, resp.StatusCode) + body, err := io.ReadAll(resp.Body) require.NoError(t, err) - require.Equal(t, pb.HealthStatus_PASS, resp.Msg.Status) + require.Equal(t, "OK\n", string(body)) mockStore.AssertExpectations(t) }) - t.Run("aggregator with recent blocks returns PASS", func(t *testing.T) { + t.Run("aggregator with recent blocks returns OK", func(t *testing.T) { mockStore := mocks.NewMockStore(t) + mockP2PManager := &mocks.MockP2PRPC{} testConfig := config.DefaultConfig() testConfig.Node.Aggregator = true testConfig.Node.BlockTime.Duration = 1 * time.Second @@ -405,15 +428,26 @@ func TestHealthServer_Livez(t *testing.T) { } mockStore.On("GetState", mock.Anything).Return(state, nil) - h := NewHealthServer(mockStore, testConfig, logger) - resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) require.NoError(t, err) - require.Equal(t, pb.HealthStatus_PASS, resp.Msg.Status) + + server := httptest.NewServer(handler) + defer server.Close() + + resp, err := http.Get(server.URL + "/health/live") + require.NoError(t, err) + defer resp.Body.Close() + + require.Equal(t, http.StatusOK, resp.StatusCode) + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + require.Equal(t, "OK\n", string(body)) mockStore.AssertExpectations(t) }) t.Run("aggregator with slow block production returns WARN", func(t *testing.T) { mockStore := mocks.NewMockStore(t) + mockP2PManager := &mocks.MockP2PRPC{} testConfig := config.DefaultConfig() testConfig.Node.Aggregator = true testConfig.Node.BlockTime.Duration = 1 * time.Second @@ -425,15 +459,26 @@ func TestHealthServer_Livez(t *testing.T) { } mockStore.On("GetState", mock.Anything).Return(state, nil) - h := NewHealthServer(mockStore, testConfig, logger) - resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) + require.NoError(t, err) + + server := httptest.NewServer(handler) + defer server.Close() + + resp, err := http.Get(server.URL + "/health/live") + require.NoError(t, err) + defer resp.Body.Close() + + require.Equal(t, http.StatusOK, resp.StatusCode) + body, err := io.ReadAll(resp.Body) require.NoError(t, err) - require.Equal(t, pb.HealthStatus_WARN, resp.Msg.Status) + require.Equal(t, "WARN\n", string(body)) mockStore.AssertExpectations(t) }) t.Run("aggregator with stopped block production returns FAIL", 
func(t *testing.T) { mockStore := mocks.NewMockStore(t) + mockP2PManager := &mocks.MockP2PRPC{} testConfig := config.DefaultConfig() testConfig.Node.Aggregator = true testConfig.Node.BlockTime.Duration = 1 * time.Second @@ -445,15 +490,26 @@ func TestHealthServer_Livez(t *testing.T) { } mockStore.On("GetState", mock.Anything).Return(state, nil) - h := NewHealthServer(mockStore, testConfig, logger) - resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) require.NoError(t, err) - require.Equal(t, pb.HealthStatus_FAIL, resp.Msg.Status) + + server := httptest.NewServer(handler) + defer server.Close() + + resp, err := http.Get(server.URL + "/health/live") + require.NoError(t, err) + defer resp.Body.Close() + + require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode) + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + require.Contains(t, string(body), "FAIL") mockStore.AssertExpectations(t) }) t.Run("lazy aggregator uses lazy block interval for threshold", func(t *testing.T) { mockStore := mocks.NewMockStore(t) + mockP2PManager := &mocks.MockP2PRPC{} testConfig := config.DefaultConfig() testConfig.Node.Aggregator = true testConfig.Node.LazyMode = true @@ -467,58 +523,47 @@ func TestHealthServer_Livez(t *testing.T) { } mockStore.On("GetState", mock.Anything).Return(state, nil) - h := NewHealthServer(mockStore, testConfig, logger) - resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) require.NoError(t, err) - require.Equal(t, pb.HealthStatus_WARN, resp.Msg.Status) + + server := httptest.NewServer(handler) + defer server.Close() + + resp, err := http.Get(server.URL + "/health/live") + require.NoError(t, err) + defer resp.Body.Close() + + require.Equal(t, http.StatusOK, resp.StatusCode) + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + require.Equal(t, "WARN\n", string(body)) mockStore.AssertExpectations(t) }) t.Run("aggregator with state error returns FAIL", func(t *testing.T) { mockStore := mocks.NewMockStore(t) + mockP2PManager := &mocks.MockP2PRPC{} testConfig := config.DefaultConfig() testConfig.Node.Aggregator = true mockStore.On("GetState", mock.Anything).Return(types.State{}, fmt.Errorf("state error")) - h := NewHealthServer(mockStore, testConfig, logger) - resp, err := h.Livez(context.Background(), connect.NewRequest(&emptypb.Empty{})) + handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) require.NoError(t, err) - require.Equal(t, pb.HealthStatus_FAIL, resp.Msg.Status) - mockStore.AssertExpectations(t) - }) -} -func TestHealthLiveEndpoint(t *testing.T) { - assert := require.New(t) + server := httptest.NewServer(handler) + defer server.Close() - // Create mock dependencies - mockStore := mocks.NewMockStore(t) - mockP2PManager := &mocks.MockP2PRPC{} // Assuming this mock is sufficient or can be adapted + resp, err := http.Get(server.URL + "/health/live") + require.NoError(t, err) + defer resp.Body.Close() - // Create the service handler - logger := zerolog.Nop() - testConfig := config.DefaultConfig() - handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) - assert.NoError(err) - assert.NotNil(handler) - - // Create a new HTTP test server - server := httptest.NewServer(handler) - defer server.Close() - - // Make a GET request to the /health/live 
endpoint - resp, err := http.Get(server.URL + "/health/live") - assert.NoError(err) - defer resp.Body.Close() - - // Check the status code - assert.Equal(http.StatusOK, resp.StatusCode) - - // Check the response body - body, err := io.ReadAll(resp.Body) - assert.NoError(err) - assert.Equal("OK\n", string(body)) // fmt.Fprintln adds a newline + require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode) + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + require.Contains(t, string(body), "FAIL") + mockStore.AssertExpectations(t) + }) } func TestHealthReadyEndpoint(t *testing.T) { diff --git a/proto/evnode/v1/health.proto b/proto/evnode/v1/health.proto deleted file mode 100644 index a9a1b716c1..0000000000 --- a/proto/evnode/v1/health.proto +++ /dev/null @@ -1,32 +0,0 @@ -syntax = "proto3"; -package evnode.v1; - -import "google/protobuf/empty.proto"; -import "evnode/v1/evnode.proto"; -import "evnode/v1/state.proto"; - -option go_package = "github.com/evstack/ev-node/types/pb/evnode/v1"; - -// HealthService defines the RPC service for the health package -service HealthService { - // Livez returns the health status of the node - rpc Livez(google.protobuf.Empty) returns (GetHealthResponse) {} -} - -// HealthStatus defines the health status of the node -enum HealthStatus { - // Unknown health status - UNKNOWN = 0; - // Healthy status (Healthy) - PASS = 1; - // Degraded but still serving - WARN = 2; - // Hard fail - FAIL = 3; -} - -// GetHealthResponse defines the response for retrieving health status -message GetHealthResponse { - // Health status - HealthStatus status = 1; -} diff --git a/test/docker-e2e/upgrade_test.go b/test/docker-e2e/upgrade_test.go index a5b426d834..84b86e695a 100644 --- a/test/docker-e2e/upgrade_test.go +++ b/test/docker-e2e/upgrade_test.go @@ -210,10 +210,9 @@ func (s *EVMSingleUpgradeTestSuite) waitForEVMSingleHealthy(ctx context.Context, networkInfo, err := node.GetNetworkInfo(ctx) s.Require().NoError(err) - healthURL := fmt.Sprintf("http://0.0.0.0:%s/evnode.v1.HealthService/Livez", networkInfo.External.Ports.RPC) + healthURL := fmt.Sprintf("http://0.0.0.0:%s/health/live", networkInfo.External.Ports.RPC) s.Require().Eventually(func() bool { - req, _ := http.NewRequestWithContext(ctx, http.MethodPost, healthURL, bytes.NewBufferString("{}")) - req.Header.Set("Content-Type", "application/json") + req, _ := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, nil) resp, err := http.DefaultClient.Do(req) if err != nil { return false diff --git a/types/pb/evnode/v1/health.pb.go b/types/pb/evnode/v1/health.pb.go deleted file mode 100644 index 1a76c46c7e..0000000000 --- a/types/pb/evnode/v1/health.pb.go +++ /dev/null @@ -1,198 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.36.10 -// protoc (unknown) -// source: evnode/v1/health.proto - -package v1 - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - emptypb "google.golang.org/protobuf/types/known/emptypb" - reflect "reflect" - sync "sync" - unsafe "unsafe" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. 
- _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -// HealthStatus defines the health status of the node -type HealthStatus int32 - -const ( - // Unknown health status - HealthStatus_UNKNOWN HealthStatus = 0 - // Healthy status (Healthy) - HealthStatus_PASS HealthStatus = 1 - // Degraded but still serving - HealthStatus_WARN HealthStatus = 2 - // Hard fail - HealthStatus_FAIL HealthStatus = 3 -) - -// Enum value maps for HealthStatus. -var ( - HealthStatus_name = map[int32]string{ - 0: "UNKNOWN", - 1: "PASS", - 2: "WARN", - 3: "FAIL", - } - HealthStatus_value = map[string]int32{ - "UNKNOWN": 0, - "PASS": 1, - "WARN": 2, - "FAIL": 3, - } -) - -func (x HealthStatus) Enum() *HealthStatus { - p := new(HealthStatus) - *p = x - return p -} - -func (x HealthStatus) String() string { - return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) -} - -func (HealthStatus) Descriptor() protoreflect.EnumDescriptor { - return file_evnode_v1_health_proto_enumTypes[0].Descriptor() -} - -func (HealthStatus) Type() protoreflect.EnumType { - return &file_evnode_v1_health_proto_enumTypes[0] -} - -func (x HealthStatus) Number() protoreflect.EnumNumber { - return protoreflect.EnumNumber(x) -} - -// Deprecated: Use HealthStatus.Descriptor instead. -func (HealthStatus) EnumDescriptor() ([]byte, []int) { - return file_evnode_v1_health_proto_rawDescGZIP(), []int{0} -} - -// GetHealthResponse defines the response for retrieving health status -type GetHealthResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - // Health status - Status HealthStatus `protobuf:"varint,1,opt,name=status,proto3,enum=evnode.v1.HealthStatus" json:"status,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *GetHealthResponse) Reset() { - *x = GetHealthResponse{} - mi := &file_evnode_v1_health_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *GetHealthResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*GetHealthResponse) ProtoMessage() {} - -func (x *GetHealthResponse) ProtoReflect() protoreflect.Message { - mi := &file_evnode_v1_health_proto_msgTypes[0] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use GetHealthResponse.ProtoReflect.Descriptor instead. 
-func (*GetHealthResponse) Descriptor() ([]byte, []int) { - return file_evnode_v1_health_proto_rawDescGZIP(), []int{0} -} - -func (x *GetHealthResponse) GetStatus() HealthStatus { - if x != nil { - return x.Status - } - return HealthStatus_UNKNOWN -} - -var File_evnode_v1_health_proto protoreflect.FileDescriptor - -const file_evnode_v1_health_proto_rawDesc = "" + - "\n" + - "\x16evnode/v1/health.proto\x12\tevnode.v1\x1a\x1bgoogle/protobuf/empty.proto\x1a\x16evnode/v1/evnode.proto\x1a\x15evnode/v1/state.proto\"D\n" + - "\x11GetHealthResponse\x12/\n" + - "\x06status\x18\x01 \x01(\x0e2\x17.evnode.v1.HealthStatusR\x06status*9\n" + - "\fHealthStatus\x12\v\n" + - "\aUNKNOWN\x10\x00\x12\b\n" + - "\x04PASS\x10\x01\x12\b\n" + - "\x04WARN\x10\x02\x12\b\n" + - "\x04FAIL\x10\x032P\n" + - "\rHealthService\x12?\n" + - "\x05Livez\x12\x16.google.protobuf.Empty\x1a\x1c.evnode.v1.GetHealthResponse\"\x00B/Z-github.com/evstack/ev-node/types/pb/evnode/v1b\x06proto3" - -var ( - file_evnode_v1_health_proto_rawDescOnce sync.Once - file_evnode_v1_health_proto_rawDescData []byte -) - -func file_evnode_v1_health_proto_rawDescGZIP() []byte { - file_evnode_v1_health_proto_rawDescOnce.Do(func() { - file_evnode_v1_health_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_evnode_v1_health_proto_rawDesc), len(file_evnode_v1_health_proto_rawDesc))) - }) - return file_evnode_v1_health_proto_rawDescData -} - -var file_evnode_v1_health_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_evnode_v1_health_proto_msgTypes = make([]protoimpl.MessageInfo, 1) -var file_evnode_v1_health_proto_goTypes = []any{ - (HealthStatus)(0), // 0: evnode.v1.HealthStatus - (*GetHealthResponse)(nil), // 1: evnode.v1.GetHealthResponse - (*emptypb.Empty)(nil), // 2: google.protobuf.Empty -} -var file_evnode_v1_health_proto_depIdxs = []int32{ - 0, // 0: evnode.v1.GetHealthResponse.status:type_name -> evnode.v1.HealthStatus - 2, // 1: evnode.v1.HealthService.Livez:input_type -> google.protobuf.Empty - 1, // 2: evnode.v1.HealthService.Livez:output_type -> evnode.v1.GetHealthResponse - 2, // [2:3] is the sub-list for method output_type - 1, // [1:2] is the sub-list for method input_type - 1, // [1:1] is the sub-list for extension type_name - 1, // [1:1] is the sub-list for extension extendee - 0, // [0:1] is the sub-list for field type_name -} - -func init() { file_evnode_v1_health_proto_init() } -func file_evnode_v1_health_proto_init() { - if File_evnode_v1_health_proto != nil { - return - } - file_evnode_v1_evnode_proto_init() - file_evnode_v1_state_proto_init() - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: unsafe.Slice(unsafe.StringData(file_evnode_v1_health_proto_rawDesc), len(file_evnode_v1_health_proto_rawDesc)), - NumEnums: 1, - NumMessages: 1, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_evnode_v1_health_proto_goTypes, - DependencyIndexes: file_evnode_v1_health_proto_depIdxs, - EnumInfos: file_evnode_v1_health_proto_enumTypes, - MessageInfos: file_evnode_v1_health_proto_msgTypes, - }.Build() - File_evnode_v1_health_proto = out.File - file_evnode_v1_health_proto_goTypes = nil - file_evnode_v1_health_proto_depIdxs = nil -} diff --git a/types/pb/evnode/v1/v1connect/health.connect.go b/types/pb/evnode/v1/v1connect/health.connect.go deleted file mode 100644 index 92a926e0cc..0000000000 --- a/types/pb/evnode/v1/v1connect/health.connect.go +++ /dev/null @@ -1,111 +0,0 @@ -// Code generated by 
protoc-gen-connect-go. DO NOT EDIT. -// -// Source: evnode/v1/health.proto - -package v1connect - -import ( - connect "connectrpc.com/connect" - context "context" - errors "errors" - v1 "github.com/evstack/ev-node/types/pb/evnode/v1" - emptypb "google.golang.org/protobuf/types/known/emptypb" - http "net/http" - strings "strings" -) - -// This is a compile-time assertion to ensure that this generated file and the connect package are -// compatible. If you get a compiler error that this constant is not defined, this code was -// generated with a version of connect newer than the one compiled into your binary. You can fix the -// problem by either regenerating this code with an older version of connect or updating the connect -// version compiled into your binary. -const _ = connect.IsAtLeastVersion1_13_0 - -const ( - // HealthServiceName is the fully-qualified name of the HealthService service. - HealthServiceName = "evnode.v1.HealthService" -) - -// These constants are the fully-qualified names of the RPCs defined in this package. They're -// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. -// -// Note that these are different from the fully-qualified method names used by -// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to -// reflection-formatted method names, remove the leading slash and convert the remaining slash to a -// period. -const ( - // HealthServiceLivezProcedure is the fully-qualified name of the HealthService's Livez RPC. - HealthServiceLivezProcedure = "/evnode.v1.HealthService/Livez" -) - -// HealthServiceClient is a client for the evnode.v1.HealthService service. -type HealthServiceClient interface { - // Livez returns the health status of the node - Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) -} - -// NewHealthServiceClient constructs a client for the evnode.v1.HealthService service. By default, -// it uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, and -// sends uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the connect.WithGRPC() -// or connect.WithGRPCWeb() options. -// -// The URL supplied here should be the base URL for the Connect or gRPC server (for example, -// http://api.acme.com or https://acme.com/grpc). -func NewHealthServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) HealthServiceClient { - baseURL = strings.TrimRight(baseURL, "/") - healthServiceMethods := v1.File_evnode_v1_health_proto.Services().ByName("HealthService").Methods() - return &healthServiceClient{ - livez: connect.NewClient[emptypb.Empty, v1.GetHealthResponse]( - httpClient, - baseURL+HealthServiceLivezProcedure, - connect.WithSchema(healthServiceMethods.ByName("Livez")), - connect.WithClientOptions(opts...), - ), - } -} - -// healthServiceClient implements HealthServiceClient. -type healthServiceClient struct { - livez *connect.Client[emptypb.Empty, v1.GetHealthResponse] -} - -// Livez calls evnode.v1.HealthService.Livez. -func (c *healthServiceClient) Livez(ctx context.Context, req *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) { - return c.livez.CallUnary(ctx, req) -} - -// HealthServiceHandler is an implementation of the evnode.v1.HealthService service. 
-type HealthServiceHandler interface { - // Livez returns the health status of the node - Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) -} - -// NewHealthServiceHandler builds an HTTP handler from the service implementation. It returns the -// path on which to mount the handler and the handler itself. -// -// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf -// and JSON codecs. They also support gzip compression. -func NewHealthServiceHandler(svc HealthServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { - healthServiceMethods := v1.File_evnode_v1_health_proto.Services().ByName("HealthService").Methods() - healthServiceLivezHandler := connect.NewUnaryHandler( - HealthServiceLivezProcedure, - svc.Livez, - connect.WithSchema(healthServiceMethods.ByName("Livez")), - connect.WithHandlerOptions(opts...), - ) - return "/evnode.v1.HealthService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - switch r.URL.Path { - case HealthServiceLivezProcedure: - healthServiceLivezHandler.ServeHTTP(w, r) - default: - http.NotFound(w, r) - } - }) -} - -// UnimplementedHealthServiceHandler returns CodeUnimplemented from all methods. -type UnimplementedHealthServiceHandler struct{} - -func (UnimplementedHealthServiceHandler) Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("evnode.v1.HealthService.Livez is not implemented")) -} From ba282ec81cbfd0e8df609bfda6ed557f5d6967b1 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Wed, 5 Nov 2025 11:56:17 +0100 Subject: [PATCH 06/21] feat(health): reintroduce HealthService with Livez RPC for health status checks; deprecate legacy gRPC endpoint in favor of HTTP --- pkg/rpc/server/server.go | 87 +++++++- proto/evnode/v1/health.proto | 32 +++ test/docker-e2e/upgrade_test.go | 1 - types/pb/evnode/v1/health.pb.go | 196 ++++++++++++++++++ .../pb/evnode/v1/v1connect/health.connect.go | 113 ++++++++++ 5 files changed, 426 insertions(+), 3 deletions(-) create mode 100644 proto/evnode/v1/health.proto create mode 100644 types/pb/evnode/v1/health.pb.go create mode 100644 types/pb/evnode/v1/v1connect/health.connect.go diff --git a/pkg/rpc/server/server.go b/pkg/rpc/server/server.go index f649fda37d..a01cb6358e 100644 --- a/pkg/rpc/server/server.go +++ b/pkg/rpc/server/server.go @@ -287,10 +287,88 @@ func (p *P2PServer) GetNetInfo( }), nil } -// NewServiceHandler creates a new HTTP handler for Store, P2P and Config services +// HealthServer implements the HealthService defined in the proto file +// DEPRECATED: This is a legacy compatibility shim for external frameworks. +// New code should use the GET /health/live HTTP endpoint instead. +type HealthServer struct { + store store.Store + config config.Config + logger zerolog.Logger +} + +// NewHealthServer creates a new HealthServer instance +func NewHealthServer(store store.Store, config config.Config, logger zerolog.Logger) *HealthServer { + return &HealthServer{ + store: store, + config: config, + logger: logger, + } +} + +// Livez implements the HealthService.Livez RPC +// DEPRECATED: Use the GET /health/live HTTP endpoint instead. This endpoint exists only +// for backward compatibility with external testing frameworks.
+func (h *HealthServer) Livez( + ctx context.Context, + req *connect.Request[emptypb.Empty], +) (*connect.Response[pb.GetHealthResponse], error) { + status := pb.HealthStatus_PASS + + // For aggregator nodes, check if block production is healthy + if h.config.Node.Aggregator { + state, err := h.store.GetState(ctx) + if err != nil { + h.logger.Error().Err(err).Msg("Failed to get state for health check") + return connect.NewResponse(&pb.GetHealthResponse{ + Status: pb.HealthStatus_FAIL, + }), nil + } + + // If we have blocks, check if the last block time is recent + if state.LastBlockHeight > 0 { + timeSinceLastBlock := time.Since(state.LastBlockTime) + + // Calculate the threshold based on block time + blockTime := h.config.Node.BlockTime.Duration + + // For lazy mode, use the lazy block interval instead + if h.config.Node.LazyMode { + blockTime = h.config.Node.LazyBlockInterval.Duration + } + + // Reuse the package-level multipliers defined in http.go so the + // thresholds cannot drift from the /health/live handler + warnThreshold := blockTime * healthCheckWarnMultiplier + failThreshold := blockTime * healthCheckFailMultiplier + + if timeSinceLastBlock > failThreshold { + h.logger.Error(). + Dur("time_since_last_block", timeSinceLastBlock). + Dur("fail_threshold", failThreshold). + Uint64("last_block_height", state.LastBlockHeight). + Time("last_block_time", state.LastBlockTime). + Msg("Health check: node has stopped producing blocks (FAIL)") + status = pb.HealthStatus_FAIL + } else if timeSinceLastBlock > warnThreshold { + h.logger.Warn(). + Dur("time_since_last_block", timeSinceLastBlock). + Dur("warn_threshold", warnThreshold). + Uint64("last_block_height", state.LastBlockHeight). + Time("last_block_time", state.LastBlockTime). + Msg("Health check: block production is slow (WARN)") + status = pb.HealthStatus_WARN + } + } + } + + return connect.NewResponse(&pb.GetHealthResponse{ + Status: status, + }), nil +} + +// NewServiceHandler creates a new HTTP handler for Store, P2P, Health and Config services func NewServiceHandler(store store.Store, peerManager p2p.P2PRPC, proposerAddress []byte, logger zerolog.Logger, config config.Config, bestKnown BestKnownHeightProvider) (http.Handler, error) { storeServer := NewStoreServer(store, logger) p2pServer := NewP2PServer(peerManager) + healthServer := NewHealthServer(store, config, logger) // Legacy gRPC endpoint configServer := NewConfigServer(config, proposerAddress, logger) mux := http.NewServeMux() @@ -299,6 +377,7 @@ func NewServiceHandler(store store.Store, peerManager p2p.P2PRPC, proposerAddres reflector := grpcreflect.NewStaticReflector( rpc.StoreServiceName, rpc.P2PServiceName, + rpc.HealthServiceName, // Legacy gRPC endpoint rpc.ConfigServiceName, ) mux.Handle(grpcreflect.NewHandlerV1(reflector, compress1KB)) @@ -312,10 +391,14 @@ func NewServiceHandler(store store.Store, peerManager p2p.P2PRPC, proposerAddres p2pPath, p2pHandler := rpc.NewP2PServiceHandler(p2pServer) mux.Handle(p2pPath, p2pHandler) + // Register HealthService (legacy gRPC endpoint for backward compatibility) + healthPath, healthHandler := rpc.NewHealthServiceHandler(healthServer) + mux.Handle(healthPath, healthHandler) + configPath, configHandler := rpc.NewConfigServiceHandler(configServer) mux.Handle(configPath, configHandler) - // Register custom HTTP endpoints + // Register custom HTTP endpoints (including the preferred /health/live endpoint) RegisterCustomHTTPEndpoints(mux, store, peerManager, config, bestKnown, logger) // Use h2c to support HTTP/2 without TLS diff --git a/proto/evnode/v1/health.proto b/proto/evnode/v1/health.proto new file mode 100644 index
0000000000..3be9d7d5ab --- /dev/null +++ b/proto/evnode/v1/health.proto @@ -0,0 +1,32 @@ +syntax = "proto3"; +package evnode.v1; + +import "google/protobuf/empty.proto"; + +option go_package = "github.com/evstack/ev-node/types/pb/evnode/v1"; + +// HealthService defines the RPC service for the health package +// DEPRECATED: Use HTTP endpoint GET /health/live instead +service HealthService { + // Livez returns the health status of the node + // DEPRECATED: Use HTTP endpoint GET /health/live instead + rpc Livez(google.protobuf.Empty) returns (GetHealthResponse) {} +} + +// HealthStatus defines the health status of the node +enum HealthStatus { + // Unknown health status + UNKNOWN = 0; + // Healthy status (Healthy) + PASS = 1; + // Degraded but still serving + WARN = 2; + // Hard fail + FAIL = 3; +} + +// GetHealthResponse defines the response for retrieving health status +message GetHealthResponse { + // Health status + HealthStatus status = 1; +} diff --git a/test/docker-e2e/upgrade_test.go b/test/docker-e2e/upgrade_test.go index 84b86e695a..1a48f5f977 100644 --- a/test/docker-e2e/upgrade_test.go +++ b/test/docker-e2e/upgrade_test.go @@ -3,7 +3,6 @@ package docker_e2e import ( - "bytes" "context" "fmt" "math/big" diff --git a/types/pb/evnode/v1/health.pb.go b/types/pb/evnode/v1/health.pb.go new file mode 100644 index 0000000000..084d911d7c --- /dev/null +++ b/types/pb/evnode/v1/health.pb.go @@ -0,0 +1,196 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.10 +// protoc (unknown) +// source: evnode/v1/health.proto + +package v1 + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + emptypb "google.golang.org/protobuf/types/known/emptypb" + reflect "reflect" + sync "sync" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// HealthStatus defines the health status of the node +type HealthStatus int32 + +const ( + // Unknown health status + HealthStatus_UNKNOWN HealthStatus = 0 + // Healthy status (Healthy) + HealthStatus_PASS HealthStatus = 1 + // Degraded but still serving + HealthStatus_WARN HealthStatus = 2 + // Hard fail + HealthStatus_FAIL HealthStatus = 3 +) + +// Enum value maps for HealthStatus. +var ( + HealthStatus_name = map[int32]string{ + 0: "UNKNOWN", + 1: "PASS", + 2: "WARN", + 3: "FAIL", + } + HealthStatus_value = map[string]int32{ + "UNKNOWN": 0, + "PASS": 1, + "WARN": 2, + "FAIL": 3, + } +) + +func (x HealthStatus) Enum() *HealthStatus { + p := new(HealthStatus) + *p = x + return p +} + +func (x HealthStatus) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (HealthStatus) Descriptor() protoreflect.EnumDescriptor { + return file_evnode_v1_health_proto_enumTypes[0].Descriptor() +} + +func (HealthStatus) Type() protoreflect.EnumType { + return &file_evnode_v1_health_proto_enumTypes[0] +} + +func (x HealthStatus) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use HealthStatus.Descriptor instead. 
+func (HealthStatus) EnumDescriptor() ([]byte, []int) { + return file_evnode_v1_health_proto_rawDescGZIP(), []int{0} +} + +// GetHealthResponse defines the response for retrieving health status +type GetHealthResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Health status + Status HealthStatus `protobuf:"varint,1,opt,name=status,proto3,enum=evnode.v1.HealthStatus" json:"status,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetHealthResponse) Reset() { + *x = GetHealthResponse{} + mi := &file_evnode_v1_health_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetHealthResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetHealthResponse) ProtoMessage() {} + +func (x *GetHealthResponse) ProtoReflect() protoreflect.Message { + mi := &file_evnode_v1_health_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetHealthResponse.ProtoReflect.Descriptor instead. +func (*GetHealthResponse) Descriptor() ([]byte, []int) { + return file_evnode_v1_health_proto_rawDescGZIP(), []int{0} +} + +func (x *GetHealthResponse) GetStatus() HealthStatus { + if x != nil { + return x.Status + } + return HealthStatus_UNKNOWN +} + +var File_evnode_v1_health_proto protoreflect.FileDescriptor + +const file_evnode_v1_health_proto_rawDesc = "" + + "\n" + + "\x16evnode/v1/health.proto\x12\tevnode.v1\x1a\x1bgoogle/protobuf/empty.proto\"D\n" + + "\x11GetHealthResponse\x12/\n" + + "\x06status\x18\x01 \x01(\x0e2\x17.evnode.v1.HealthStatusR\x06status*9\n" + + "\fHealthStatus\x12\v\n" + + "\aUNKNOWN\x10\x00\x12\b\n" + + "\x04PASS\x10\x01\x12\b\n" + + "\x04WARN\x10\x02\x12\b\n" + + "\x04FAIL\x10\x032P\n" + + "\rHealthService\x12?\n" + + "\x05Livez\x12\x16.google.protobuf.Empty\x1a\x1c.evnode.v1.GetHealthResponse\"\x00B/Z-github.com/evstack/ev-node/types/pb/evnode/v1b\x06proto3" + +var ( + file_evnode_v1_health_proto_rawDescOnce sync.Once + file_evnode_v1_health_proto_rawDescData []byte +) + +func file_evnode_v1_health_proto_rawDescGZIP() []byte { + file_evnode_v1_health_proto_rawDescOnce.Do(func() { + file_evnode_v1_health_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_evnode_v1_health_proto_rawDesc), len(file_evnode_v1_health_proto_rawDesc))) + }) + return file_evnode_v1_health_proto_rawDescData +} + +var file_evnode_v1_health_proto_enumTypes = make([]protoimpl.EnumInfo, 1) +var file_evnode_v1_health_proto_msgTypes = make([]protoimpl.MessageInfo, 1) +var file_evnode_v1_health_proto_goTypes = []any{ + (HealthStatus)(0), // 0: evnode.v1.HealthStatus + (*GetHealthResponse)(nil), // 1: evnode.v1.GetHealthResponse + (*emptypb.Empty)(nil), // 2: google.protobuf.Empty +} +var file_evnode_v1_health_proto_depIdxs = []int32{ + 0, // 0: evnode.v1.GetHealthResponse.status:type_name -> evnode.v1.HealthStatus + 2, // 1: evnode.v1.HealthService.Livez:input_type -> google.protobuf.Empty + 1, // 2: evnode.v1.HealthService.Livez:output_type -> evnode.v1.GetHealthResponse + 2, // [2:3] is the sub-list for method output_type + 1, // [1:2] is the sub-list for method input_type + 1, // [1:1] is the sub-list for extension type_name + 1, // [1:1] is the sub-list for extension extendee + 0, // [0:1] is the sub-list for field type_name +} + +func init() { 
file_evnode_v1_health_proto_init() } +func file_evnode_v1_health_proto_init() { + if File_evnode_v1_health_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_evnode_v1_health_proto_rawDesc), len(file_evnode_v1_health_proto_rawDesc)), + NumEnums: 1, + NumMessages: 1, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_evnode_v1_health_proto_goTypes, + DependencyIndexes: file_evnode_v1_health_proto_depIdxs, + EnumInfos: file_evnode_v1_health_proto_enumTypes, + MessageInfos: file_evnode_v1_health_proto_msgTypes, + }.Build() + File_evnode_v1_health_proto = out.File + file_evnode_v1_health_proto_goTypes = nil + file_evnode_v1_health_proto_depIdxs = nil +} diff --git a/types/pb/evnode/v1/v1connect/health.connect.go b/types/pb/evnode/v1/v1connect/health.connect.go new file mode 100644 index 0000000000..2cbb3cb6c4 --- /dev/null +++ b/types/pb/evnode/v1/v1connect/health.connect.go @@ -0,0 +1,113 @@ +// Code generated by protoc-gen-connect-go. DO NOT EDIT. +// +// Source: evnode/v1/health.proto + +package v1connect + +import ( + connect "connectrpc.com/connect" + context "context" + errors "errors" + v1 "github.com/evstack/ev-node/types/pb/evnode/v1" + emptypb "google.golang.org/protobuf/types/known/emptypb" + http "net/http" + strings "strings" +) + +// This is a compile-time assertion to ensure that this generated file and the connect package are +// compatible. If you get a compiler error that this constant is not defined, this code was +// generated with a version of connect newer than the one compiled into your binary. You can fix the +// problem by either regenerating this code with an older version of connect or updating the connect +// version compiled into your binary. +const _ = connect.IsAtLeastVersion1_13_0 + +const ( + // HealthServiceName is the fully-qualified name of the HealthService service. + HealthServiceName = "evnode.v1.HealthService" +) + +// These constants are the fully-qualified names of the RPCs defined in this package. They're +// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. +// +// Note that these are different from the fully-qualified method names used by +// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to +// reflection-formatted method names, remove the leading slash and convert the remaining slash to a +// period. +const ( + // HealthServiceLivezProcedure is the fully-qualified name of the HealthService's Livez RPC. + HealthServiceLivezProcedure = "/evnode.v1.HealthService/Livez" +) + +// HealthServiceClient is a client for the evnode.v1.HealthService service. +type HealthServiceClient interface { + // Livez returns the health status of the node + // DEPRECATED: Use HTTP endpoint GET /health/live instead + Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) +} + +// NewHealthServiceClient constructs a client for the evnode.v1.HealthService service. By default, +// it uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, and +// sends uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the connect.WithGRPC() +// or connect.WithGRPCWeb() options. +// +// The URL supplied here should be the base URL for the Connect or gRPC server (for example, +// http://api.acme.com or https://acme.com/grpc). 
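+// A minimal, illustrative use from Go (the address is an assumption; substitute
+// the node's configured RPC listen address):
+//
+//	client := v1connect.NewHealthServiceClient(http.DefaultClient, "http://127.0.0.1:7331")
+//	resp, err := client.Livez(ctx, connect.NewRequest(&emptypb.Empty{}))
+//	if err == nil {
+//		fmt.Println(resp.Msg.Status) // PASS, WARN, or FAIL
+//	}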
+func NewHealthServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) HealthServiceClient { + baseURL = strings.TrimRight(baseURL, "/") + healthServiceMethods := v1.File_evnode_v1_health_proto.Services().ByName("HealthService").Methods() + return &healthServiceClient{ + livez: connect.NewClient[emptypb.Empty, v1.GetHealthResponse]( + httpClient, + baseURL+HealthServiceLivezProcedure, + connect.WithSchema(healthServiceMethods.ByName("Livez")), + connect.WithClientOptions(opts...), + ), + } +} + +// healthServiceClient implements HealthServiceClient. +type healthServiceClient struct { + livez *connect.Client[emptypb.Empty, v1.GetHealthResponse] +} + +// Livez calls evnode.v1.HealthService.Livez. +func (c *healthServiceClient) Livez(ctx context.Context, req *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) { + return c.livez.CallUnary(ctx, req) +} + +// HealthServiceHandler is an implementation of the evnode.v1.HealthService service. +type HealthServiceHandler interface { + // Livez returns the health status of the node + // DEPRECATED: Use HTTP endpoint GET /health/live instead + Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) +} + +// NewHealthServiceHandler builds an HTTP handler from the service implementation. It returns the +// path on which to mount the handler and the handler itself. +// +// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf +// and JSON codecs. They also support gzip compression. +func NewHealthServiceHandler(svc HealthServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { + healthServiceMethods := v1.File_evnode_v1_health_proto.Services().ByName("HealthService").Methods() + healthServiceLivezHandler := connect.NewUnaryHandler( + HealthServiceLivezProcedure, + svc.Livez, + connect.WithSchema(healthServiceMethods.ByName("Livez")), + connect.WithHandlerOptions(opts...), + ) + return "/evnode.v1.HealthService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case HealthServiceLivezProcedure: + healthServiceLivezHandler.ServeHTTP(w, r) + default: + http.NotFound(w, r) + } + }) +} + +// UnimplementedHealthServiceHandler returns CodeUnimplemented from all methods. +type UnimplementedHealthServiceHandler struct{} + +func (UnimplementedHealthServiceHandler) Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) { + return nil, connect.NewError(connect.CodeUnimplemented, errors.New("evnode.v1.HealthService.Livez is not implemented")) +} From cf475a1795f26ae1ccb79e986c1b0c14189558b9 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Wed, 5 Nov 2025 12:01:56 +0100 Subject: [PATCH 07/21] feat(health): add deprecation warnings and headers for Livez endpoint; recommend migration to HTTP --- pkg/rpc/server/server.go | 16 +++++++++++++-- proto/evnode/v1/health.proto | 13 +++++++++--- types/pb/evnode/v1/health.pb.go | 6 +++--- .../pb/evnode/v1/v1connect/health.connect.go | 20 +++++++++++++++++-- 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/pkg/rpc/server/server.go b/pkg/rpc/server/server.go index a01cb6358e..5513adfe25 100644 --- a/pkg/rpc/server/server.go +++ b/pkg/rpc/server/server.go @@ -312,6 +312,12 @@ func (h *HealthServer) Livez( ctx context.Context, req *connect.Request[emptypb.Empty], ) (*connect.Response[pb.GetHealthResponse], error) { + // Log deprecation warning + h.logger.Warn(). 
+ Str("deprecated_endpoint", "/evnode.v1.HealthService/Livez"). + Str("recommended_endpoint", "GET /health/live"). + Msg("DEPRECATED: gRPC health endpoint called. Please migrate to HTTP endpoint GET /health/live") + status := pb.HealthStatus_PASS // For aggregator nodes, check if block production is healthy @@ -359,9 +365,15 @@ func (h *HealthServer) Livez( } } - return connect.NewResponse(&pb.GetHealthResponse{ + // Add deprecation warning to response headers + resp := connect.NewResponse(&pb.GetHealthResponse{ Status: status, - }), nil + }) + resp.Header().Set("X-Deprecated", "true") + resp.Header().Set("X-Deprecated-Message", "Use GET /health/live instead") + resp.Header().Set("Warning", "299 - \"Deprecated endpoint. Use GET /health/live\"") + + return resp, nil } // NewServiceHandler creates a new HTTP handler for Store, P2P, Health and Config services diff --git a/proto/evnode/v1/health.proto b/proto/evnode/v1/health.proto index 3be9d7d5ab..ee96ee7807 100644 --- a/proto/evnode/v1/health.proto +++ b/proto/evnode/v1/health.proto @@ -6,11 +6,18 @@ import "google/protobuf/empty.proto"; option go_package = "github.com/evstack/ev-node/types/pb/evnode/v1"; // HealthService defines the RPC service for the health package -// DEPRECATED: Use HTTP endpoint GET /health/live instead +// DEPRECATED: Use HTTP endpoint GET /health/live instead. +// This service is maintained only for backward compatibility with external frameworks. +// It will be removed in a future version. service HealthService { + option deprecated = true; + // Livez returns the health status of the node - // DEPRECATED: Use HTTP endpoint GET /health/live instead - rpc Livez(google.protobuf.Empty) returns (GetHealthResponse) {} + // DEPRECATED: Use HTTP endpoint GET /health/live instead. + // This endpoint logs deprecation warnings and adds deprecation headers to responses. + rpc Livez(google.protobuf.Empty) returns (GetHealthResponse) { + option deprecated = true; + } } // HealthStatus defines the health status of the node diff --git a/types/pb/evnode/v1/health.pb.go b/types/pb/evnode/v1/health.pb.go index 084d911d7c..5d85950a29 100644 --- a/types/pb/evnode/v1/health.pb.go +++ b/types/pb/evnode/v1/health.pb.go @@ -136,9 +136,9 @@ const file_evnode_v1_health_proto_rawDesc = "" + "\aUNKNOWN\x10\x00\x12\b\n" + "\x04PASS\x10\x01\x12\b\n" + "\x04WARN\x10\x02\x12\b\n" + - "\x04FAIL\x10\x032P\n" + - "\rHealthService\x12?\n" + - "\x05Livez\x12\x16.google.protobuf.Empty\x1a\x1c.evnode.v1.GetHealthResponse\"\x00B/Z-github.com/evstack/ev-node/types/pb/evnode/v1b\x06proto3" + "\x04FAIL\x10\x032X\n" + + "\rHealthService\x12B\n" + + "\x05Livez\x12\x16.google.protobuf.Empty\x1a\x1c.evnode.v1.GetHealthResponse\"\x03\x88\x02\x01\x1a\x03\x88\x02\x01B/Z-github.com/evstack/ev-node/types/pb/evnode/v1b\x06proto3" var ( file_evnode_v1_health_proto_rawDescOnce sync.Once diff --git a/types/pb/evnode/v1/v1connect/health.connect.go b/types/pb/evnode/v1/v1connect/health.connect.go index 2cbb3cb6c4..31fb7c17f4 100644 --- a/types/pb/evnode/v1/v1connect/health.connect.go +++ b/types/pb/evnode/v1/v1connect/health.connect.go @@ -39,9 +39,14 @@ const ( ) // HealthServiceClient is a client for the evnode.v1.HealthService service. +// +// Deprecated: do not use. type HealthServiceClient interface { // Livez returns the health status of the node - // DEPRECATED: Use HTTP endpoint GET /health/live instead + // DEPRECATED: Use HTTP endpoint GET /health/live instead. + // This endpoint logs deprecation warnings and adds deprecation headers to responses. 
+ // + // Deprecated: do not use. Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) } @@ -52,6 +57,8 @@ type HealthServiceClient interface { // // The URL supplied here should be the base URL for the Connect or gRPC server (for example, // http://api.acme.com or https://acme.com/grpc). +// +// Deprecated: do not use. func NewHealthServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) HealthServiceClient { baseURL = strings.TrimRight(baseURL, "/") healthServiceMethods := v1.File_evnode_v1_health_proto.Services().ByName("HealthService").Methods() @@ -71,14 +78,21 @@ type healthServiceClient struct { } // Livez calls evnode.v1.HealthService.Livez. +// +// Deprecated: do not use. func (c *healthServiceClient) Livez(ctx context.Context, req *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) { return c.livez.CallUnary(ctx, req) } // HealthServiceHandler is an implementation of the evnode.v1.HealthService service. +// +// Deprecated: do not use. type HealthServiceHandler interface { // Livez returns the health status of the node - // DEPRECATED: Use HTTP endpoint GET /health/live instead + // DEPRECATED: Use HTTP endpoint GET /health/live instead. + // This endpoint logs deprecation warnings and adds deprecation headers to responses. + // + // Deprecated: do not use. Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) } @@ -87,6 +101,8 @@ type HealthServiceHandler interface { // // By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf // and JSON codecs. They also support gzip compression. +// +// Deprecated: do not use. func NewHealthServiceHandler(svc HealthServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { healthServiceMethods := v1.File_evnode_v1_health_proto.Services().ByName("HealthService").Methods() healthServiceLivezHandler := connect.NewUnaryHandler( From c0749b17b6101bb9cf378e7793b5a6169a315469 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Wed, 5 Nov 2025 12:06:35 +0100 Subject: [PATCH 08/21] refactor(health): remove deprecated HealthService and related gRPC endpoints; migrate to HTTP health checks --- pkg/rpc/server/server.go | 99 +-------- proto/evnode/v1/health.proto | 39 ---- types/pb/evnode/v1/health.pb.go | 196 ------------------ .../pb/evnode/v1/v1connect/health.connect.go | 129 ------------ 4 files changed, 2 insertions(+), 461 deletions(-) delete mode 100644 proto/evnode/v1/health.proto delete mode 100644 types/pb/evnode/v1/health.pb.go delete mode 100644 types/pb/evnode/v1/v1connect/health.connect.go diff --git a/pkg/rpc/server/server.go b/pkg/rpc/server/server.go index 5513adfe25..f649fda37d 100644 --- a/pkg/rpc/server/server.go +++ b/pkg/rpc/server/server.go @@ -287,100 +287,10 @@ func (p *P2PServer) GetNetInfo( }), nil } -// HealthServer implements the HealthService defined in the proto file -// DEPRECATED: This is a legacy compatibility shim for external frameworks. -// New code should use GET /health/live HTTP endpoint instead. 
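For callers migrating off the removed RPC, the replacement is a plain HTTP probe. A minimal sketch (the address is an assumption; note that at this point in the series /health/live still reports WARN for degraded block production):

package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
)

func main() {
	// Probe the node's HTTP liveness endpoint (address is illustrative).
	resp, err := http.Get("http://127.0.0.1:7331/health/live")
	if err != nil {
		fmt.Println("probe failed:", err)
		return
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	// Expect 200 with "OK" (or "WARN" when degraded) and 503 with "FAIL".
	fmt.Println(resp.StatusCode, strings.TrimSpace(string(body)))
}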
-type HealthServer struct { - store store.Store - config config.Config - logger zerolog.Logger -} - -// NewHealthServer creates a new HealthServer instance -func NewHealthServer(store store.Store, config config.Config, logger zerolog.Logger) *HealthServer { - return &HealthServer{ - store: store, - config: config, - logger: logger, - } -} - -// Livez implements the HealthService.Livez RPC -// DEPRECATED: Use GET /health/live HTTP endpoint instead. This endpoint exists only -// for backward compatibility with external testing frameworks. -func (h *HealthServer) Livez( - ctx context.Context, - req *connect.Request[emptypb.Empty], -) (*connect.Response[pb.GetHealthResponse], error) { - // Log deprecation warning - h.logger.Warn(). - Str("deprecated_endpoint", "/evnode.v1.HealthService/Livez"). - Str("recommended_endpoint", "GET /health/live"). - Msg("DEPRECATED: gRPC health endpoint called. Please migrate to HTTP endpoint GET /health/live") - - status := pb.HealthStatus_PASS - - // For aggregator nodes, check if block production is healthy - if h.config.Node.Aggregator { - state, err := h.store.GetState(ctx) - if err != nil { - h.logger.Error().Err(err).Msg("Failed to get state for health check") - return connect.NewResponse(&pb.GetHealthResponse{ - Status: pb.HealthStatus_FAIL, - }), nil - } - - // If we have blocks, check if the last block time is recent - if state.LastBlockHeight > 0 { - timeSinceLastBlock := time.Since(state.LastBlockTime) - - // Calculate the threshold based on block time - blockTime := h.config.Node.BlockTime.Duration - - // For lazy mode, use the lazy block interval instead - if h.config.Node.LazyMode { - blockTime = h.config.Node.LazyBlockInterval.Duration - } - - warnThreshold := blockTime * 3 // healthCheckWarnMultiplier - failThreshold := blockTime * 5 // healthCheckFailMultiplier - - if timeSinceLastBlock > failThreshold { - h.logger.Error(). - Dur("time_since_last_block", timeSinceLastBlock). - Dur("fail_threshold", failThreshold). - Uint64("last_block_height", state.LastBlockHeight). - Time("last_block_time", state.LastBlockTime). - Msg("Health check: node has stopped producing blocks (FAIL)") - status = pb.HealthStatus_FAIL - } else if timeSinceLastBlock > warnThreshold { - h.logger.Warn(). - Dur("time_since_last_block", timeSinceLastBlock). - Dur("warn_threshold", warnThreshold). - Uint64("last_block_height", state.LastBlockHeight). - Time("last_block_time", state.LastBlockTime). - Msg("Health check: block production is slow (WARN)") - status = pb.HealthStatus_WARN - } - } - } - - // Add deprecation warning to response headers - resp := connect.NewResponse(&pb.GetHealthResponse{ - Status: status, - }) - resp.Header().Set("X-Deprecated", "true") - resp.Header().Set("X-Deprecated-Message", "Use GET /health/live instead") - resp.Header().Set("Warning", "299 - \"Deprecated endpoint. 
Use GET /health/live\"") - - return resp, nil -} - -// NewServiceHandler creates a new HTTP handler for Store, P2P, Health and Config services +// NewServiceHandler creates a new HTTP handler for Store, P2P and Config services func NewServiceHandler(store store.Store, peerManager p2p.P2PRPC, proposerAddress []byte, logger zerolog.Logger, config config.Config, bestKnown BestKnownHeightProvider) (http.Handler, error) { storeServer := NewStoreServer(store, logger) p2pServer := NewP2PServer(peerManager) - healthServer := NewHealthServer(store, config, logger) // Legacy gRPC endpoint configServer := NewConfigServer(config, proposerAddress, logger) mux := http.NewServeMux() @@ -389,7 +299,6 @@ func NewServiceHandler(store store.Store, peerManager p2p.P2PRPC, proposerAddres reflector := grpcreflect.NewStaticReflector( rpc.StoreServiceName, rpc.P2PServiceName, - rpc.HealthServiceName, // Legacy gRPC endpoint rpc.ConfigServiceName, ) mux.Handle(grpcreflect.NewHandlerV1(reflector, compress1KB)) @@ -403,14 +312,10 @@ func NewServiceHandler(store store.Store, peerManager p2p.P2PRPC, proposerAddres p2pPath, p2pHandler := rpc.NewP2PServiceHandler(p2pServer) mux.Handle(p2pPath, p2pHandler) - // Register HealthService (legacy gRPC endpoint for backward compatibility) - healthPath, healthHandler := rpc.NewHealthServiceHandler(healthServer) - mux.Handle(healthPath, healthHandler) - configPath, configHandler := rpc.NewConfigServiceHandler(configServer) mux.Handle(configPath, configHandler) - // Register custom HTTP endpoints (including the preferred /health/live endpoint) + // Register custom HTTP endpoints RegisterCustomHTTPEndpoints(mux, store, peerManager, config, bestKnown, logger) // Use h2c to support HTTP/2 without TLS diff --git a/proto/evnode/v1/health.proto b/proto/evnode/v1/health.proto deleted file mode 100644 index ee96ee7807..0000000000 --- a/proto/evnode/v1/health.proto +++ /dev/null @@ -1,39 +0,0 @@ -syntax = "proto3"; -package evnode.v1; - -import "google/protobuf/empty.proto"; - -option go_package = "github.com/evstack/ev-node/types/pb/evnode/v1"; - -// HealthService defines the RPC service for the health package -// DEPRECATED: Use HTTP endpoint GET /health/live instead. -// This service is maintained only for backward compatibility with external frameworks. -// It will be removed in a future version. -service HealthService { - option deprecated = true; - - // Livez returns the health status of the node - // DEPRECATED: Use HTTP endpoint GET /health/live instead. - // This endpoint logs deprecation warnings and adds deprecation headers to responses. - rpc Livez(google.protobuf.Empty) returns (GetHealthResponse) { - option deprecated = true; - } -} - -// HealthStatus defines the health status of the node -enum HealthStatus { - // Unknown health status - UNKNOWN = 0; - // Healthy status (Healthy) - PASS = 1; - // Degraded but still serving - WARN = 2; - // Hard fail - FAIL = 3; -} - -// GetHealthResponse defines the response for retrieving health status -message GetHealthResponse { - // Health status - HealthStatus status = 1; -} diff --git a/types/pb/evnode/v1/health.pb.go b/types/pb/evnode/v1/health.pb.go deleted file mode 100644 index 5d85950a29..0000000000 --- a/types/pb/evnode/v1/health.pb.go +++ /dev/null @@ -1,196 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. 
-// versions: -// protoc-gen-go v1.36.10 -// protoc (unknown) -// source: evnode/v1/health.proto - -package v1 - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - emptypb "google.golang.org/protobuf/types/known/emptypb" - reflect "reflect" - sync "sync" - unsafe "unsafe" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -// HealthStatus defines the health status of the node -type HealthStatus int32 - -const ( - // Unknown health status - HealthStatus_UNKNOWN HealthStatus = 0 - // Healthy status (Healthy) - HealthStatus_PASS HealthStatus = 1 - // Degraded but still serving - HealthStatus_WARN HealthStatus = 2 - // Hard fail - HealthStatus_FAIL HealthStatus = 3 -) - -// Enum value maps for HealthStatus. -var ( - HealthStatus_name = map[int32]string{ - 0: "UNKNOWN", - 1: "PASS", - 2: "WARN", - 3: "FAIL", - } - HealthStatus_value = map[string]int32{ - "UNKNOWN": 0, - "PASS": 1, - "WARN": 2, - "FAIL": 3, - } -) - -func (x HealthStatus) Enum() *HealthStatus { - p := new(HealthStatus) - *p = x - return p -} - -func (x HealthStatus) String() string { - return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) -} - -func (HealthStatus) Descriptor() protoreflect.EnumDescriptor { - return file_evnode_v1_health_proto_enumTypes[0].Descriptor() -} - -func (HealthStatus) Type() protoreflect.EnumType { - return &file_evnode_v1_health_proto_enumTypes[0] -} - -func (x HealthStatus) Number() protoreflect.EnumNumber { - return protoreflect.EnumNumber(x) -} - -// Deprecated: Use HealthStatus.Descriptor instead. -func (HealthStatus) EnumDescriptor() ([]byte, []int) { - return file_evnode_v1_health_proto_rawDescGZIP(), []int{0} -} - -// GetHealthResponse defines the response for retrieving health status -type GetHealthResponse struct { - state protoimpl.MessageState `protogen:"open.v1"` - // Health status - Status HealthStatus `protobuf:"varint,1,opt,name=status,proto3,enum=evnode.v1.HealthStatus" json:"status,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache -} - -func (x *GetHealthResponse) Reset() { - *x = GetHealthResponse{} - mi := &file_evnode_v1_health_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) -} - -func (x *GetHealthResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*GetHealthResponse) ProtoMessage() {} - -func (x *GetHealthResponse) ProtoReflect() protoreflect.Message { - mi := &file_evnode_v1_health_proto_msgTypes[0] - if x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use GetHealthResponse.ProtoReflect.Descriptor instead. 
-func (*GetHealthResponse) Descriptor() ([]byte, []int) { - return file_evnode_v1_health_proto_rawDescGZIP(), []int{0} -} - -func (x *GetHealthResponse) GetStatus() HealthStatus { - if x != nil { - return x.Status - } - return HealthStatus_UNKNOWN -} - -var File_evnode_v1_health_proto protoreflect.FileDescriptor - -const file_evnode_v1_health_proto_rawDesc = "" + - "\n" + - "\x16evnode/v1/health.proto\x12\tevnode.v1\x1a\x1bgoogle/protobuf/empty.proto\"D\n" + - "\x11GetHealthResponse\x12/\n" + - "\x06status\x18\x01 \x01(\x0e2\x17.evnode.v1.HealthStatusR\x06status*9\n" + - "\fHealthStatus\x12\v\n" + - "\aUNKNOWN\x10\x00\x12\b\n" + - "\x04PASS\x10\x01\x12\b\n" + - "\x04WARN\x10\x02\x12\b\n" + - "\x04FAIL\x10\x032X\n" + - "\rHealthService\x12B\n" + - "\x05Livez\x12\x16.google.protobuf.Empty\x1a\x1c.evnode.v1.GetHealthResponse\"\x03\x88\x02\x01\x1a\x03\x88\x02\x01B/Z-github.com/evstack/ev-node/types/pb/evnode/v1b\x06proto3" - -var ( - file_evnode_v1_health_proto_rawDescOnce sync.Once - file_evnode_v1_health_proto_rawDescData []byte -) - -func file_evnode_v1_health_proto_rawDescGZIP() []byte { - file_evnode_v1_health_proto_rawDescOnce.Do(func() { - file_evnode_v1_health_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_evnode_v1_health_proto_rawDesc), len(file_evnode_v1_health_proto_rawDesc))) - }) - return file_evnode_v1_health_proto_rawDescData -} - -var file_evnode_v1_health_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_evnode_v1_health_proto_msgTypes = make([]protoimpl.MessageInfo, 1) -var file_evnode_v1_health_proto_goTypes = []any{ - (HealthStatus)(0), // 0: evnode.v1.HealthStatus - (*GetHealthResponse)(nil), // 1: evnode.v1.GetHealthResponse - (*emptypb.Empty)(nil), // 2: google.protobuf.Empty -} -var file_evnode_v1_health_proto_depIdxs = []int32{ - 0, // 0: evnode.v1.GetHealthResponse.status:type_name -> evnode.v1.HealthStatus - 2, // 1: evnode.v1.HealthService.Livez:input_type -> google.protobuf.Empty - 1, // 2: evnode.v1.HealthService.Livez:output_type -> evnode.v1.GetHealthResponse - 2, // [2:3] is the sub-list for method output_type - 1, // [1:2] is the sub-list for method input_type - 1, // [1:1] is the sub-list for extension type_name - 1, // [1:1] is the sub-list for extension extendee - 0, // [0:1] is the sub-list for field type_name -} - -func init() { file_evnode_v1_health_proto_init() } -func file_evnode_v1_health_proto_init() { - if File_evnode_v1_health_proto != nil { - return - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: unsafe.Slice(unsafe.StringData(file_evnode_v1_health_proto_rawDesc), len(file_evnode_v1_health_proto_rawDesc)), - NumEnums: 1, - NumMessages: 1, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_evnode_v1_health_proto_goTypes, - DependencyIndexes: file_evnode_v1_health_proto_depIdxs, - EnumInfos: file_evnode_v1_health_proto_enumTypes, - MessageInfos: file_evnode_v1_health_proto_msgTypes, - }.Build() - File_evnode_v1_health_proto = out.File - file_evnode_v1_health_proto_goTypes = nil - file_evnode_v1_health_proto_depIdxs = nil -} diff --git a/types/pb/evnode/v1/v1connect/health.connect.go b/types/pb/evnode/v1/v1connect/health.connect.go deleted file mode 100644 index 31fb7c17f4..0000000000 --- a/types/pb/evnode/v1/v1connect/health.connect.go +++ /dev/null @@ -1,129 +0,0 @@ -// Code generated by protoc-gen-connect-go. DO NOT EDIT. 
-// -// Source: evnode/v1/health.proto - -package v1connect - -import ( - connect "connectrpc.com/connect" - context "context" - errors "errors" - v1 "github.com/evstack/ev-node/types/pb/evnode/v1" - emptypb "google.golang.org/protobuf/types/known/emptypb" - http "net/http" - strings "strings" -) - -// This is a compile-time assertion to ensure that this generated file and the connect package are -// compatible. If you get a compiler error that this constant is not defined, this code was -// generated with a version of connect newer than the one compiled into your binary. You can fix the -// problem by either regenerating this code with an older version of connect or updating the connect -// version compiled into your binary. -const _ = connect.IsAtLeastVersion1_13_0 - -const ( - // HealthServiceName is the fully-qualified name of the HealthService service. - HealthServiceName = "evnode.v1.HealthService" -) - -// These constants are the fully-qualified names of the RPCs defined in this package. They're -// exposed at runtime as Spec.Procedure and as the final two segments of the HTTP route. -// -// Note that these are different from the fully-qualified method names used by -// google.golang.org/protobuf/reflect/protoreflect. To convert from these constants to -// reflection-formatted method names, remove the leading slash and convert the remaining slash to a -// period. -const ( - // HealthServiceLivezProcedure is the fully-qualified name of the HealthService's Livez RPC. - HealthServiceLivezProcedure = "/evnode.v1.HealthService/Livez" -) - -// HealthServiceClient is a client for the evnode.v1.HealthService service. -// -// Deprecated: do not use. -type HealthServiceClient interface { - // Livez returns the health status of the node - // DEPRECATED: Use HTTP endpoint GET /health/live instead. - // This endpoint logs deprecation warnings and adds deprecation headers to responses. - // - // Deprecated: do not use. - Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) -} - -// NewHealthServiceClient constructs a client for the evnode.v1.HealthService service. By default, -// it uses the Connect protocol with the binary Protobuf Codec, asks for gzipped responses, and -// sends uncompressed requests. To use the gRPC or gRPC-Web protocols, supply the connect.WithGRPC() -// or connect.WithGRPCWeb() options. -// -// The URL supplied here should be the base URL for the Connect or gRPC server (for example, -// http://api.acme.com or https://acme.com/grpc). -// -// Deprecated: do not use. -func NewHealthServiceClient(httpClient connect.HTTPClient, baseURL string, opts ...connect.ClientOption) HealthServiceClient { - baseURL = strings.TrimRight(baseURL, "/") - healthServiceMethods := v1.File_evnode_v1_health_proto.Services().ByName("HealthService").Methods() - return &healthServiceClient{ - livez: connect.NewClient[emptypb.Empty, v1.GetHealthResponse]( - httpClient, - baseURL+HealthServiceLivezProcedure, - connect.WithSchema(healthServiceMethods.ByName("Livez")), - connect.WithClientOptions(opts...), - ), - } -} - -// healthServiceClient implements HealthServiceClient. -type healthServiceClient struct { - livez *connect.Client[emptypb.Empty, v1.GetHealthResponse] -} - -// Livez calls evnode.v1.HealthService.Livez. -// -// Deprecated: do not use. 
-func (c *healthServiceClient) Livez(ctx context.Context, req *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) { - return c.livez.CallUnary(ctx, req) -} - -// HealthServiceHandler is an implementation of the evnode.v1.HealthService service. -// -// Deprecated: do not use. -type HealthServiceHandler interface { - // Livez returns the health status of the node - // DEPRECATED: Use HTTP endpoint GET /health/live instead. - // This endpoint logs deprecation warnings and adds deprecation headers to responses. - // - // Deprecated: do not use. - Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) -} - -// NewHealthServiceHandler builds an HTTP handler from the service implementation. It returns the -// path on which to mount the handler and the handler itself. -// -// By default, handlers support the Connect, gRPC, and gRPC-Web protocols with the binary Protobuf -// and JSON codecs. They also support gzip compression. -// -// Deprecated: do not use. -func NewHealthServiceHandler(svc HealthServiceHandler, opts ...connect.HandlerOption) (string, http.Handler) { - healthServiceMethods := v1.File_evnode_v1_health_proto.Services().ByName("HealthService").Methods() - healthServiceLivezHandler := connect.NewUnaryHandler( - HealthServiceLivezProcedure, - svc.Livez, - connect.WithSchema(healthServiceMethods.ByName("Livez")), - connect.WithHandlerOptions(opts...), - ) - return "/evnode.v1.HealthService/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - switch r.URL.Path { - case HealthServiceLivezProcedure: - healthServiceLivezHandler.ServeHTTP(w, r) - default: - http.NotFound(w, r) - } - }) -} - -// UnimplementedHealthServiceHandler returns CodeUnimplemented from all methods. 
-type UnimplementedHealthServiceHandler struct{} - -func (UnimplementedHealthServiceHandler) Livez(context.Context, *connect.Request[emptypb.Empty]) (*connect.Response[v1.GetHealthResponse], error) { - return nil, connect.NewError(connect.CodeUnimplemented, errors.New("evnode.v1.HealthService.Livez is not implemented")) -} From 29dc01da4766d195c5b39907ebd293c82a0ee219 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Wed, 5 Nov 2025 15:21:52 +0100 Subject: [PATCH 09/21] refactor(health): update health check tests and endpoints for improved clarity and accuracy --- pkg/rpc/client/client_test.go | 67 ++------ pkg/rpc/server/http.go | 109 ++++++------- pkg/rpc/server/http_test.go | 11 +- pkg/rpc/server/server_test.go | 280 +++++++++++++++------------------- 4 files changed, 192 insertions(+), 275 deletions(-) diff --git a/pkg/rpc/client/client_test.go b/pkg/rpc/client/client_test.go index 340fe1ea42..f36def6700 100644 --- a/pkg/rpc/client/client_test.go +++ b/pkg/rpc/client/client_test.go @@ -10,6 +10,7 @@ import ( "github.com/libp2p/go-libp2p/core/peer" "github.com/multiformats/go-multiaddr" "github.com/rs/zerolog" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" "golang.org/x/net/http2" @@ -247,92 +248,54 @@ func TestClientGetNetInfo(t *testing.T) { } func TestClientGetHealth(t *testing.T) { - t.Run("non-aggregator returns PASS", func(t *testing.T) { + t.Run("returns PASS when store is accessible", func(t *testing.T) { mockStore := mocks.NewMockStore(t) mockP2P := mocks.NewMockP2PRPC(t) + // Mock Height to return successfully + mockStore.On("Height", mock.Anything).Return(uint64(100), nil) + testServer, client := setupTestServer(t, mockStore, mockP2P) defer testServer.Close() healthStatus, err := client.GetHealth(context.Background()) - require.NoError(t, err) - require.Equal(t, "PASS", healthStatus.String()) - }) - - t.Run("aggregator with recent blocks returns PASS", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2P := mocks.NewMockP2PRPC(t) - - // Setup aggregator config - testConfig := config.DefaultConfig() - testConfig.Node.Aggregator = true - testConfig.Node.BlockTime.Duration = 1 * time.Second - - // Create state with recent block - state := types.State{ - LastBlockHeight: 100, - LastBlockTime: time.Now().Add(-500 * time.Millisecond), - } - mockStore.On("GetState", mock.Anything).Return(state, nil) - - // Create custom test server with aggregator config - testServer, client := setupTestServer(t, mockStore, mockP2P, testConfig) - defer testServer.Close() - healthStatus, err := client.GetHealth(context.Background()) - require.NoError(t, err) require.Equal(t, "PASS", healthStatus.String()) mockStore.AssertExpectations(t) }) - t.Run("aggregator with slow block production returns WARN", func(t *testing.T) { + t.Run("returns FAIL when store is not accessible", func(t *testing.T) { mockStore := mocks.NewMockStore(t) mockP2P := mocks.NewMockP2PRPC(t) - testConfig := config.DefaultConfig() - testConfig.Node.Aggregator = true - testConfig.Node.BlockTime.Duration = 1 * time.Second + // Mock Height to return an error + mockStore.On("Height", mock.Anything).Return(uint64(0), assert.AnError) - // State with block older than 3x block time - state := types.State{ - LastBlockHeight: 100, - LastBlockTime: time.Now().Add(-4 * time.Second), - } - mockStore.On("GetState", mock.Anything).Return(state, nil) - - testServer, client := setupTestServer(t, mockStore, mockP2P, testConfig) + testServer, client := setupTestServer(t, 
mockStore, mockP2P) defer testServer.Close() healthStatus, err := client.GetHealth(context.Background()) require.NoError(t, err) - require.Equal(t, "WARN", healthStatus.String()) + require.Equal(t, "FAIL", healthStatus.String()) mockStore.AssertExpectations(t) }) - t.Run("aggregator with stopped block production returns FAIL", func(t *testing.T) { + t.Run("returns PASS even at height 0", func(t *testing.T) { mockStore := mocks.NewMockStore(t) mockP2P := mocks.NewMockP2PRPC(t) - testConfig := config.DefaultConfig() - testConfig.Node.Aggregator = true - testConfig.Node.BlockTime.Duration = 1 * time.Second - - // State with block older than 5x block time - state := types.State{ - LastBlockHeight: 100, - LastBlockTime: time.Now().Add(-10 * time.Second), - } - mockStore.On("GetState", mock.Anything).Return(state, nil) + // Mock Height to return 0 successfully (genesis state) + mockStore.On("Height", mock.Anything).Return(uint64(0), nil) - testServer, client := setupTestServer(t, mockStore, mockP2P, testConfig) + testServer, client := setupTestServer(t, mockStore, mockP2P) defer testServer.Close() healthStatus, err := client.GetHealth(context.Background()) require.NoError(t, err) - require.Equal(t, "FAIL", healthStatus.String()) + require.Equal(t, "PASS", healthStatus.String()) mockStore.AssertExpectations(t) }) } diff --git a/pkg/rpc/server/http.go b/pkg/rpc/server/http.go index 7f9e4c731f..c9493ffa6a 100644 --- a/pkg/rpc/server/http.go +++ b/pkg/rpc/server/http.go @@ -11,16 +11,6 @@ import ( "github.com/rs/zerolog" ) -const ( - // healthCheckWarnMultiplier is the multiplier for block time to determine WARN threshold - // If no block has been produced in (blockTime * healthCheckWarnMultiplier), return WARN - healthCheckWarnMultiplier = 3 - - // healthCheckFailMultiplier is the multiplier for block time to determine FAIL threshold - // If no block has been produced in (blockTime * healthCheckFailMultiplier), return FAIL - healthCheckFailMultiplier = 5 -) - // BestKnownHeightProvider should return the best-known network height observed by the node // (e.g. min(headerSyncHeight, dataSyncHeight) for full nodes, or header height for light nodes). type BestKnownHeightProvider func() uint64 @@ -28,58 +18,22 @@ type BestKnownHeightProvider func() uint64 // RegisterCustomHTTPEndpoints is the designated place to add new, non-gRPC, plain HTTP handlers. // Additional custom HTTP endpoints can be registered on the mux here. 
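// As a sketch of the expected responses (exact bodies are defined by the
// handlers registered below):
//
//	GET /health/live   returns 200 "OK" while the store is reachable,
//	                   503 "FAIL" otherwise
//	GET /health/ready  returns 200 once the P2P, peer, and sync checks pass,
//	                   503 "UNREADY: <reason>" otherwise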
func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRPC, cfg config.Config, bestKnownHeightProvider BestKnownHeightProvider, logger zerolog.Logger) { - // Liveness endpoint - checks if block production is healthy for aggregator nodes + // Liveness endpoint - checks if the service process is alive and responsive + // A failing liveness check should result in killing/restarting the process + // This endpoint should NOT check business logic (like block production or sync status) mux.HandleFunc("/health/live", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") - // For aggregator nodes, check if block production is healthy - if cfg.Node.Aggregator { - state, err := s.GetState(r.Context()) - if err != nil { - logger.Error().Err(err).Msg("Failed to get state for health check") - http.Error(w, "FAIL", http.StatusServiceUnavailable) - return - } - - // If we have blocks, check if the last block time is recent - if state.LastBlockHeight > 0 { - timeSinceLastBlock := time.Since(state.LastBlockTime) - - // Calculate the threshold based on block time - blockTime := cfg.Node.BlockTime.Duration - - // For lazy mode, use the lazy block interval instead - if cfg.Node.LazyMode { - blockTime = cfg.Node.LazyBlockInterval.Duration - } - - warnThreshold := blockTime * healthCheckWarnMultiplier - failThreshold := blockTime * healthCheckFailMultiplier - - if timeSinceLastBlock > failThreshold { - logger.Error(). - Dur("time_since_last_block", timeSinceLastBlock). - Dur("fail_threshold", failThreshold). - Uint64("last_block_height", state.LastBlockHeight). - Time("last_block_time", state.LastBlockTime). - Msg("Health check: node has stopped producing blocks (FAIL)") - http.Error(w, "FAIL", http.StatusServiceUnavailable) - return - } else if timeSinceLastBlock > warnThreshold { - logger.Warn(). - Dur("time_since_last_block", timeSinceLastBlock). - Dur("warn_threshold", warnThreshold). - Uint64("last_block_height", state.LastBlockHeight). - Time("last_block_time", state.LastBlockTime). - Msg("Health check: block production is slow (WARN)") - w.WriteHeader(http.StatusOK) - fmt.Fprintln(w, "WARN") - return - } - } + // Basic liveness check: Can we access the store? 
+ // This verifies the process is alive and core dependencies are accessible + _, err := s.Height(r.Context()) + if err != nil { + logger.Error().Err(err).Msg("Liveness check failed: cannot access store") + http.Error(w, "FAIL", http.StatusServiceUnavailable) + return } - // For non-aggregator nodes or if checks pass, return healthy + // Process is alive and responsive w.WriteHeader(http.StatusOK) fmt.Fprintln(w, "OK") }) @@ -88,31 +42,58 @@ func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRP mux.HandleFunc("/health/ready", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") - // Peer readiness: non-aggregator nodes should have at least 1 peer - if pm != nil && !cfg.Node.Aggregator { - peers, err := pm.GetPeers() + // P2P readiness: if P2P is enabled, verify it's ready to accept connections + if pm != nil { + netInfo, err := pm.GetNetworkInfo() if err != nil { - http.Error(w, "UNREADY: failed to query peers", http.StatusServiceUnavailable) + http.Error(w, "UNREADY: failed to query P2P network info", http.StatusServiceUnavailable) return } - if len(peers) == 0 { - http.Error(w, "UNREADY: no peers connected", http.StatusServiceUnavailable) + if len(netInfo.ListenAddress) == 0 { + http.Error(w, "UNREADY: P2P not listening for connections", http.StatusServiceUnavailable) return } + + // Peer readiness: non-aggregator nodes should have at least 1 peer + if !cfg.Node.Aggregator { + peers, err := pm.GetPeers() + if err != nil { + http.Error(w, "UNREADY: failed to query peers", http.StatusServiceUnavailable) + return + } + if len(peers) == 0 { + http.Error(w, "UNREADY: no peers connected", http.StatusServiceUnavailable) + return + } + } } - localHeight, err := s.Height(r.Context()) + // Get current state + state, err := s.GetState(r.Context()) if err != nil { http.Error(w, "UNREADY: state unavailable", http.StatusServiceUnavailable) return } + localHeight := state.LastBlockHeight + // If no blocks yet, consider unready if localHeight == 0 { http.Error(w, "UNREADY: no blocks yet", http.StatusServiceUnavailable) return } + // Aggregator block production check: verify blocks are being produced at expected rate + if cfg.Node.Aggregator { + timeSinceLastBlock := time.Since(state.LastBlockTime) + maxAllowedDelay := 5 * cfg.Node.BlockTime.Duration + + if timeSinceLastBlock > maxAllowedDelay { + http.Error(w, "UNREADY: aggregator not producing blocks at expected rate", http.StatusServiceUnavailable) + return + } + } + // Require best-known height to make the readiness decision if bestKnownHeightProvider == nil { http.Error(w, "UNREADY: best-known height unavailable", http.StatusServiceUnavailable) diff --git a/pkg/rpc/server/http_test.go b/pkg/rpc/server/http_test.go index 335113d0af..c259012692 100644 --- a/pkg/rpc/server/http_test.go +++ b/pkg/rpc/server/http_test.go @@ -7,8 +7,10 @@ import ( "testing" "github.com/evstack/ev-node/pkg/config" + "github.com/evstack/ev-node/test/mocks" "github.com/rs/zerolog" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" ) func TestRegisterCustomHTTPEndpoints(t *testing.T) { @@ -16,8 +18,12 @@ func TestRegisterCustomHTTPEndpoints(t *testing.T) { mux := http.NewServeMux() logger := zerolog.Nop() + // Create mock store + mockStore := mocks.NewMockStore(t) + mockStore.On("Height", mock.Anything).Return(uint64(100), nil) + // Register custom HTTP endpoints - RegisterCustomHTTPEndpoints(mux, nil, nil, config.DefaultConfig(), nil, logger) + RegisterCustomHTTPEndpoints(mux, mockStore, 
nil, config.DefaultConfig(), nil, logger) // Create a new HTTP test server with the mux testServer := httptest.NewServer(mux) @@ -37,4 +43,7 @@ func TestRegisterCustomHTTPEndpoints(t *testing.T) { // Check the response body content assert.Equal(t, "OK\n", string(body)) // fmt.Fprintln adds a newline + + // Verify mock expectations + mockStore.AssertExpectations(t) } diff --git a/pkg/rpc/server/server_test.go b/pkg/rpc/server/server_test.go index 9f60aaf58f..1b0818e862 100644 --- a/pkg/rpc/server/server_test.go +++ b/pkg/rpc/server/server_test.go @@ -363,39 +363,13 @@ func TestP2PServer_GetNetInfo(t *testing.T) { func TestHealthLiveEndpoint(t *testing.T) { logger := zerolog.Nop() - t.Run("non-aggregator always returns OK", func(t *testing.T) { + t.Run("returns OK when store is accessible", func(t *testing.T) { mockStore := mocks.NewMockStore(t) mockP2PManager := &mocks.MockP2PRPC{} testConfig := config.DefaultConfig() - testConfig.Node.Aggregator = false - - handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) - require.NoError(t, err) - server := httptest.NewServer(handler) - defer server.Close() - - resp, err := http.Get(server.URL + "/health/live") - require.NoError(t, err) - defer resp.Body.Close() - - require.Equal(t, http.StatusOK, resp.StatusCode) - body, err := io.ReadAll(resp.Body) - require.NoError(t, err) - require.Equal(t, "OK\n", string(body)) - }) - - t.Run("aggregator with no blocks returns OK", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2PManager := &mocks.MockP2PRPC{} - testConfig := config.DefaultConfig() - testConfig.Node.Aggregator = true - - // State with no blocks yet - state := types.State{ - LastBlockHeight: 0, - } - mockStore.On("GetState", mock.Anything).Return(state, nil) + // Mock successful store access + mockStore.On("Height", mock.Anything).Return(uint64(100), nil) handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) require.NoError(t, err) @@ -414,19 +388,13 @@ func TestHealthLiveEndpoint(t *testing.T) { mockStore.AssertExpectations(t) }) - t.Run("aggregator with recent blocks returns OK", func(t *testing.T) { + t.Run("returns FAIL when store is not accessible", func(t *testing.T) { mockStore := mocks.NewMockStore(t) mockP2PManager := &mocks.MockP2PRPC{} testConfig := config.DefaultConfig() - testConfig.Node.Aggregator = true - testConfig.Node.BlockTime.Duration = 1 * time.Second - // State with recent block - state := types.State{ - LastBlockHeight: 100, - LastBlockTime: time.Now().Add(-500 * time.Millisecond), // Recent block - } - mockStore.On("GetState", mock.Anything).Return(state, nil) + // Mock store access failure + mockStore.On("Height", mock.Anything).Return(uint64(0), fmt.Errorf("store unavailable")) handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) require.NoError(t, err) @@ -438,26 +406,20 @@ func TestHealthLiveEndpoint(t *testing.T) { require.NoError(t, err) defer resp.Body.Close() - require.Equal(t, http.StatusOK, resp.StatusCode) + require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode) body, err := io.ReadAll(resp.Body) require.NoError(t, err) - require.Equal(t, "OK\n", string(body)) + require.Contains(t, string(body), "FAIL") mockStore.AssertExpectations(t) }) - t.Run("aggregator with slow block production returns WARN", func(t *testing.T) { + t.Run("returns OK even at height 0", func(t *testing.T) { mockStore := mocks.NewMockStore(t) mockP2PManager := &mocks.MockP2PRPC{} testConfig := 
config.DefaultConfig() - testConfig.Node.Aggregator = true - testConfig.Node.BlockTime.Duration = 1 * time.Second - // State with block older than 3x block time (warn threshold) - state := types.State{ - LastBlockHeight: 100, - LastBlockTime: time.Now().Add(-4 * time.Second), // 4 seconds ago > 3x block time - } - mockStore.On("GetState", mock.Anything).Return(state, nil) + // Mock successful store access at genesis + mockStore.On("Height", mock.Anything).Return(uint64(0), nil) handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) require.NoError(t, err) @@ -472,139 +434,141 @@ func TestHealthLiveEndpoint(t *testing.T) { require.Equal(t, http.StatusOK, resp.StatusCode) body, err := io.ReadAll(resp.Body) require.NoError(t, err) - require.Equal(t, "WARN\n", string(body)) + require.Equal(t, "OK\n", string(body)) mockStore.AssertExpectations(t) }) +} - t.Run("aggregator with stopped block production returns FAIL", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2PManager := &mocks.MockP2PRPC{} - testConfig := config.DefaultConfig() - testConfig.Node.Aggregator = true - testConfig.Node.BlockTime.Duration = 1 * time.Second - - // State with block older than 5x block time (fail threshold) - state := types.State{ - LastBlockHeight: 100, - LastBlockTime: time.Now().Add(-10 * time.Second), // 10 seconds ago > 5x block time +func TestHealthReadyEndpoint(t *testing.T) { + t.Run("non-aggregator tests", func(t *testing.T) { + cases := []struct { + name string + local uint64 + bestKnown uint64 + peers int + p2pListening bool + lastBlockTime time.Time + expectedCode int + }{ + {name: "at_head", local: 100, bestKnown: 100, peers: 1, p2pListening: true, lastBlockTime: time.Now(), expectedCode: http.StatusOK}, + {name: "within_1_block", local: 99, bestKnown: 100, peers: 1, p2pListening: true, lastBlockTime: time.Now(), expectedCode: http.StatusOK}, + {name: "within_15_blocks", local: 85, bestKnown: 100, peers: 1, p2pListening: true, lastBlockTime: time.Now(), expectedCode: http.StatusOK}, + {name: "just_over_15_blocks", local: 84, bestKnown: 100, peers: 1, p2pListening: true, lastBlockTime: time.Now(), expectedCode: http.StatusServiceUnavailable}, + {name: "local_ahead", local: 101, bestKnown: 100, peers: 1, p2pListening: true, lastBlockTime: time.Now(), expectedCode: http.StatusOK}, + {name: "no_blocks_yet", local: 0, bestKnown: 100, peers: 1, p2pListening: true, lastBlockTime: time.Now(), expectedCode: http.StatusServiceUnavailable}, + {name: "unknown_best_known", local: 100, bestKnown: 0, peers: 1, p2pListening: true, lastBlockTime: time.Now(), expectedCode: http.StatusServiceUnavailable}, + {name: "no_peers", local: 100, bestKnown: 100, peers: 0, p2pListening: true, lastBlockTime: time.Now(), expectedCode: http.StatusServiceUnavailable}, + {name: "p2p_not_listening", local: 100, bestKnown: 100, peers: 1, p2pListening: false, lastBlockTime: time.Now(), expectedCode: http.StatusServiceUnavailable}, } - mockStore.On("GetState", mock.Anything).Return(state, nil) - - handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) - require.NoError(t, err) - - server := httptest.NewServer(handler) - defer server.Close() - - resp, err := http.Get(server.URL + "/health/live") - require.NoError(t, err) - defer resp.Body.Close() - - require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode) - body, err := io.ReadAll(resp.Body) - require.NoError(t, err) - require.Contains(t, string(body), "FAIL") - mockStore.AssertExpectations(t) 
- }) - t.Run("lazy aggregator uses lazy block interval for threshold", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2PManager := &mocks.MockP2PRPC{} + logger := zerolog.Nop() testConfig := config.DefaultConfig() - testConfig.Node.Aggregator = true - testConfig.Node.LazyMode = true - testConfig.Node.BlockTime.Duration = 1 * time.Second - testConfig.Node.LazyBlockInterval.Duration = 10 * time.Second + testConfig.Node.Aggregator = false - // State with block older than 3x lazy block interval (warn threshold) - state := types.State{ - LastBlockHeight: 100, - LastBlockTime: time.Now().Add(-35 * time.Second), // 35 seconds ago > 3x lazy interval (30s) + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + mockP2P := mocks.NewMockP2PRPC(t) + + // Setup P2P network info + netInfo := p2p.NetworkInfo{ + ID: "test-node", + } + if tc.p2pListening { + netInfo.ListenAddress = []string{"/ip4/0.0.0.0/tcp/26656"} + } + mockP2P.On("GetNetworkInfo").Return(netInfo, nil) + + // Only expect GetPeers() when P2P is listening (handler returns early if not listening) + if tc.p2pListening { + var peers []peer.AddrInfo + for i := 0; i < tc.peers; i++ { + peers = append(peers, peer.AddrInfo{}) + } + mockP2P.On("GetPeers").Return(peers, nil) + } + + // Only expect GetState() when peers are present (handler returns early on no peers) + if tc.peers > 0 && tc.p2pListening { + state := types.State{ + LastBlockHeight: tc.local, + LastBlockTime: tc.lastBlockTime, + } + mockStore.On("GetState", mock.Anything).Return(state, nil) + } + + bestKnown := func() uint64 { return tc.bestKnown } + handler, err := NewServiceHandler(mockStore, mockP2P, nil, logger, testConfig, bestKnown) + require.NoError(t, err) + server := httptest.NewServer(handler) + defer server.Close() + + resp, err := http.Get(server.URL + "/health/ready") + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, tc.expectedCode, resp.StatusCode) + }) } - mockStore.On("GetState", mock.Anything).Return(state, nil) - - handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) - require.NoError(t, err) - - server := httptest.NewServer(handler) - defer server.Close() - - resp, err := http.Get(server.URL + "/health/live") - require.NoError(t, err) - defer resp.Body.Close() - - require.Equal(t, http.StatusOK, resp.StatusCode) - body, err := io.ReadAll(resp.Body) - require.NoError(t, err) - require.Equal(t, "WARN\n", string(body)) - mockStore.AssertExpectations(t) }) - t.Run("aggregator with state error returns FAIL", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2PManager := &mocks.MockP2PRPC{} + t.Run("aggregator tests", func(t *testing.T) { + logger := zerolog.Nop() testConfig := config.DefaultConfig() testConfig.Node.Aggregator = true + testConfig.Node.BlockTime.Duration = 1 * time.Second - mockStore.On("GetState", mock.Anything).Return(types.State{}, fmt.Errorf("state error")) - - handler, err := NewServiceHandler(mockStore, mockP2PManager, nil, logger, testConfig, nil) - require.NoError(t, err) + t.Run("producing blocks at expected rate", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + mockP2P := mocks.NewMockP2PRPC(t) - server := httptest.NewServer(handler) - defer server.Close() + // Setup P2P + netInfo := p2p.NetworkInfo{ + ID: "test-node", + ListenAddress: []string{"/ip4/0.0.0.0/tcp/26656"}, + } + mockP2P.On("GetNetworkInfo").Return(netInfo, nil) - resp, err := http.Get(server.URL + "/health/live") - 
require.NoError(t, err) - defer resp.Body.Close() + // Aggregators don't need peers check + // No GetPeers() call expected - require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode) - body, err := io.ReadAll(resp.Body) - require.NoError(t, err) - require.Contains(t, string(body), "FAIL") - mockStore.AssertExpectations(t) - }) -} + // Recent block (within 5x block time) + state := types.State{ + LastBlockHeight: 100, + LastBlockTime: time.Now().Add(-2 * time.Second), // 2 seconds ago, within 5x1s = 5s + } + mockStore.On("GetState", mock.Anything).Return(state, nil) -func TestHealthReadyEndpoint(t *testing.T) { - cases := []struct { - name string - local uint64 - bestKnown uint64 - peers int - expectedCode int - }{ - {name: "at_head", local: 100, bestKnown: 100, peers: 1, expectedCode: http.StatusOK}, - {name: "within_1_block", local: 99, bestKnown: 100, peers: 1, expectedCode: http.StatusOK}, - {name: "within_15_blocks", local: 85, bestKnown: 100, peers: 1, expectedCode: http.StatusOK}, - {name: "just_over_15_blocks", local: 84, bestKnown: 100, peers: 1, expectedCode: http.StatusServiceUnavailable}, - {name: "local_ahead", local: 101, bestKnown: 100, peers: 1, expectedCode: http.StatusOK}, - {name: "no_blocks_yet", local: 0, bestKnown: 100, peers: 1, expectedCode: http.StatusServiceUnavailable}, - {name: "unknown_best_known", local: 100, bestKnown: 0, peers: 1, expectedCode: http.StatusServiceUnavailable}, - {name: "no_peers", local: 100, bestKnown: 100, peers: 0, expectedCode: http.StatusServiceUnavailable}, - } + bestKnown := func() uint64 { return 100 } + handler, err := NewServiceHandler(mockStore, mockP2P, nil, logger, testConfig, bestKnown) + require.NoError(t, err) + server := httptest.NewServer(handler) + defer server.Close() - logger := zerolog.Nop() - testConfig := config.DefaultConfig() - testConfig.Node.Aggregator = false + resp, err := http.Get(server.URL + "/health/ready") + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) + }) - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { + t.Run("not producing blocks at expected rate", func(t *testing.T) { mockStore := mocks.NewMockStore(t) mockP2P := mocks.NewMockP2PRPC(t) - // only expect Height() when peers are present handler returns early on no peers - if tc.peers > 0 { - mockStore.On("Height", mock.Anything).Return(tc.local, nil) + // Setup P2P + netInfo := p2p.NetworkInfo{ + ID: "test-node", + ListenAddress: []string{"/ip4/0.0.0.0/tcp/26656"}, } + mockP2P.On("GetNetworkInfo").Return(netInfo, nil) - var peers []peer.AddrInfo - for i := 0; i < tc.peers; i++ { - peers = append(peers, peer.AddrInfo{}) + // Old block (beyond 5x block time) + state := types.State{ + LastBlockHeight: 100, + LastBlockTime: time.Now().Add(-10 * time.Second), // 10 seconds ago, beyond 5x1s = 5s } - mockP2P.On("GetPeers").Return(peers, nil) + mockStore.On("GetState", mock.Anything).Return(state, nil) - bestKnown := func() uint64 { return tc.bestKnown } + bestKnown := func() uint64 { return 100 } handler, err := NewServiceHandler(mockStore, mockP2P, nil, logger, testConfig, bestKnown) require.NoError(t, err) server := httptest.NewServer(handler) @@ -613,7 +577,7 @@ func TestHealthReadyEndpoint(t *testing.T) { resp, err := http.Get(server.URL + "/health/ready") require.NoError(t, err) defer resp.Body.Close() - require.Equal(t, tc.expectedCode, resp.StatusCode) + require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode) }) - } + }) } From 4eec708b860c070978c7cbb5a2b17d9c88054229 
Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Wed, 5 Nov 2025 15:27:04 +0100 Subject: [PATCH 10/21] docs(health): add comprehensive documentation for health check endpoints in config and server files --- docs/learn/config.md | 129 +++++++++++++++++++++++++++++++++++++++++ pkg/rpc/server/http.go | 16 ++++- 2 files changed, 144 insertions(+), 1 deletion(-) diff --git a/docs/learn/config.md b/docs/learn/config.md index 6fa6befe0c..b308a4b1c9 100644 --- a/docs/learn/config.md +++ b/docs/learn/config.md @@ -37,6 +37,7 @@ This document provides a comprehensive reference for all configuration options a - [RPC Configuration (`rpc`)](#rpc-configuration-rpc) - [RPC Server Address](#rpc-server-address) - [Enable DA Visualization](#enable-da-visualization) + - [Health Endpoints](#health-endpoints) - [Instrumentation Configuration (`instrumentation`)](#instrumentation-configuration-instrumentation) - [Enable Prometheus Metrics](#enable-prometheus-metrics) - [Prometheus Listen Address](#prometheus-listen-address) @@ -606,6 +607,134 @@ _Constant:_ `FlagRPCEnableDAVisualization` See the [DA Visualizer Guide](../guides/da/visualizer.md) for detailed information on using this feature. +### Health Endpoints + +Evolve exposes two HTTP health check endpoints that serve different purposes in production environments. These endpoints are automatically available on the RPC server and follow Kubernetes health check best practices. + +#### `/health/live` - Liveness Probe + +**Purpose:** Determines if the node process is alive and responsive. + +**What it checks:** +- Store accessibility (can the process access its database?) + +**Typical usage:** +- Kubernetes liveness probes +- Process monitoring systems +- Container orchestration health checks + +**Response:** +- `200 OK` with body `OK` - Process is alive and responsive +- `503 Service Unavailable` with body `FAIL` - Process is dead or unresponsive + +**Failure action:** If this endpoint fails, the process should be **killed and restarted**. A failing liveness check indicates the process is in an unrecoverable state (e.g., database connection lost, deadlock, etc.). + +**Example:** +```bash +curl http://localhost:7331/health/live +# Response: OK (HTTP 200) +``` + +**Kubernetes liveness probe configuration:** +```yaml +livenessProbe: + httpGet: + path: /health/live + port: 7331 + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 +``` + +--- + +#### `/health/ready` - Readiness Probe + +**Purpose:** Determines if the node can serve correct data to clients. + +**What it checks:** + +**For all nodes:** +1. **P2P listening** (if P2P enabled): P2P network is accepting connections +2. **Store accessible**: Can read state from the database +3. **Has blocks**: Node has synced at least one block (height > 0) +4. **Best-known height available**: Can determine network height +5. **Sync status**: Node is not too far behind the network (within `readiness_max_blocks_behind`) + +**Additional checks for non-aggregators:** +6. **Has peers**: Connected to at least one peer (for receiving blocks) + +**Additional checks for aggregators:** +7. 
**Block production rate**: Producing blocks at expected rate (within 5x `block_time`) + +**Typical usage:** +- Kubernetes readiness probes +- Load balancer health checks +- Service mesh routing decisions + +**Response:** +- `200 OK` with body `READY` - Node can serve correct data +- `503 Service Unavailable` with body `UNREADY: <reason>` - Node should not receive traffic + +**Failure action:** If this endpoint fails, the node should be **removed from the load balancer** but **NOT killed**. The process is alive but temporarily unable to serve correct data (e.g., syncing, no peers, behind network head). + +**Example:** +```bash +curl http://localhost:7331/health/ready +# Response: READY (HTTP 200) +# or +# Response: UNREADY: behind best-known head (HTTP 503) +``` + +**Kubernetes readiness probe configuration:** +```yaml +readinessProbe: + httpGet: + path: /health/ready + port: 7331 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 +``` + +--- + +#### Key Differences: Liveness vs Readiness + +| Aspect | Liveness (`/health/live`) | Readiness (`/health/ready`) | +|--------|---------------------------|----------------------------| +| **Purpose** | Is the process alive? | Can it serve correct data? | +| **Checks** | Infrastructure (store accessible) | Business logic (synced, peers, block production) | +| **Failure means** | Process is broken/deadlocked | Temporarily unable to serve | +| **Action on failure** | **Kill and restart** process | **Remove from load balancer** | +| **Check frequency** | Less frequent (every 10-30s) | More frequent (every 5-10s) | +| **Example failure** | Database corruption, deadlock | Syncing from genesis, no peers | + +**Important:** A node can be **live but not ready**. For example, a newly started full node is alive (process running, database accessible) but not ready (still syncing blocks from peers). In this state: +- `/health/live` returns `200 OK` (don't kill the process) +- `/health/ready` returns `503 UNREADY: behind best-known head` (don't route traffic yet) + +#### Configuration + +**Readiness max blocks behind:** + +The readiness endpoint uses the `readiness_max_blocks_behind` configuration to determine if a node is too far behind the network. + +**YAML:** +```yaml +node: + readiness_max_blocks_behind: 15 +``` + +**Command-line Flag:** +`--evnode.node.readiness_max_blocks_behind <value>` +_Example:_ `--evnode.node.readiness_max_blocks_behind 20` +_Default:_ `15` + +This value determines how many blocks behind the best-known network height a node can be before being considered unready. Lower values ensure tighter consistency but may cause nodes to be marked unready more frequently during network hiccups. + ## Instrumentation Configuration (`instrumentation`) Settings for enabling and configuring metrics and profiling endpoints, useful for monitoring node performance and debugging. diff --git a/pkg/rpc/server/http.go b/pkg/rpc/server/http.go index c9493ffa6a..5a2e8c0626 100644 --- a/pkg/rpc/server/http.go +++ b/pkg/rpc/server/http.go @@ -1,3 +1,9 @@ +// Package server provides HTTP endpoint handlers for the RPC server. +// +// Health Endpoints: +// This file implements health check endpoints following Kubernetes best practices.
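+// In short: /health/live answers "is the process alive?" (restart the process on failure), +// while /health/ready answers "can this node serve correct data?" (take the node out of +// the load balancer on failure).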
+// For comprehensive documentation on health endpoints, their differences, and usage examples, +// see: docs/learn/config.md#health-endpoints package server import ( @@ -17,10 +23,14 @@ type BestKnownHeightProvider func() uint64 // RegisterCustomHTTPEndpoints is the designated place to add new, non-gRPC, plain HTTP handlers. // Additional custom HTTP endpoints can be registered on the mux here. +// +// For detailed documentation on health endpoints, see: docs/learn/config.md#health-endpoints func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRPC, cfg config.Config, bestKnownHeightProvider BestKnownHeightProvider, logger zerolog.Logger) { // Liveness endpoint - checks if the service process is alive and responsive // A failing liveness check should result in killing/restarting the process // This endpoint should NOT check business logic (like block production or sync status) + // + // See docs/learn/config.md#healthlive---liveness-probe for details mux.HandleFunc("/health/live", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") @@ -38,7 +48,11 @@ func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRP fmt.Fprintln(w, "OK") }) - // Readiness endpoint + // Readiness endpoint - checks if the node can serve correct data to clients + // A failing readiness check should result in removing the node from load balancer + // but NOT killing the process (e.g., node is syncing, no peers, etc.) + // + // See docs/learn/config.md#healthready---readiness-probe for details mux.HandleFunc("/health/ready", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") From 94a9bc487d9d909ae1abcad69aa29ed4b863e8c7 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Wed, 5 Nov 2025 17:24:33 +0100 Subject: [PATCH 11/21] docs: update CHANGELOG and buf.yaml to reflect migration from gRPC to HTTP health endpoints --- CHANGELOG.md | 16 ++++++++++++++++ buf.yaml | 4 ++++ 2 files changed, 20 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9391bb6345..40e7627a38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,9 +9,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Added comprehensive health endpoint documentation in `docs/learn/config.md#health-endpoints` explaining liveness vs readiness checks, Kubernetes probe configuration, and usage examples ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Added P2P listening check to `/health/ready` endpoint to verify P2P network is ready to accept connections ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Added aggregator block production rate check to `/health/ready` endpoint to ensure aggregators are producing blocks within expected timeframe (5x block time) ([#2800](https://github.com/evstack/ev-node/pull/2800)) + ### Changed - Use cache instead of in memory store for reaper. Persist cache on reload. Autoclean after 24 hours. 
([#2811](https://github.com/evstack/ev-node/pull/2811)) +- Simplified `/health/live` endpoint to only check store accessibility (liveness) instead of business logic, following Kubernetes best practices ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Updated `/health/ready` endpoint to use `GetState()` instead of `Height()` to access block production timing information ([#2800](https://github.com/evstack/ev-node/pull/2800)) + +### Removed + +- **BREAKING:** Removed `evnode.v1.HealthService` gRPC endpoint in favor of HTTP health endpoints ([#2800](https://github.com/evstack/ev-node/pull/2800)) + - Migration: Use `GET /health/live` instead of `HealthService.Livez()` gRPC call + - See migration guide: `docs/learn/config.md#health-endpoints` + - Affected clients: Go client (`pkg/rpc/client`), Rust client (`client/crates/client`), and any external services using the gRPC health endpoint +- Removed `proto/evnode/v1/health.proto` and generated protobuf files ([#2800](https://github.com/evstack/ev-node/pull/2800)) ## v1.0.0-beta.9 diff --git a/buf.yaml b/buf.yaml index bf7debf7cc..940e2630f5 100644 --- a/buf.yaml +++ b/buf.yaml @@ -14,3 +14,7 @@ lint: breaking: use: - FILE + ignore: + # health.proto was intentionally removed - health checks migrated to HTTP endpoints + # See: docs/learn/config.md#health-endpoints + - evnode/v1/health.proto From 807e8b776dd7b12eaaccfcbc682092b68b9b28bd Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Wed, 5 Nov 2025 17:31:36 +0100 Subject: [PATCH 12/21] docs: update CHANGELOG with new readiness checks and client methods for health endpoints --- CHANGELOG.md | 3 + node/single_sequencer_integration_test.go | 38 +++----- pkg/rpc/client/client.go | 53 +++++++++++ pkg/rpc/client/client_test.go | 108 +++++++++++++++++++++- 4 files changed, 175 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40e7627a38..5edeb6317a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,12 +14,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added comprehensive health endpoint documentation in `docs/learn/config.md#health-endpoints` explaining liveness vs readiness checks, Kubernetes probe configuration, and usage examples ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Added P2P listening check to `/health/ready` endpoint to verify P2P network is ready to accept connections ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Added aggregator block production rate check to `/health/ready` endpoint to ensure aggregators are producing blocks within expected timeframe (5x block time) ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Added `GetReadiness()` method to Go RPC client for checking `/health/ready` endpoint ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Added `ReadinessStatus` type to Go RPC client with READY/UNREADY/UNKNOWN states ([#2800](https://github.com/evstack/ev-node/pull/2800)) ### Changed - Use cache instead of in memory store for reaper. Persist cache on reload. Autoclean after 24 hours. 
([#2811](https://github.com/evstack/ev-node/pull/2811)) - Simplified `/health/live` endpoint to only check store accessibility (liveness) instead of business logic, following Kubernetes best practices ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Updated `/health/ready` endpoint to use `GetState()` instead of `Height()` to access block production timing information ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Renamed integration test from `TestHealthEndpointWhenBlockProductionStops` to `TestReadinessEndpointWhenBlockProductionStops` to correctly test readiness endpoint ([#2800](https://github.com/evstack/ev-node/pull/2800)) ### Removed diff --git a/node/single_sequencer_integration_test.go b/node/single_sequencer_integration_test.go index 219b95eb68..070c95b771 100644 --- a/node/single_sequencer_integration_test.go +++ b/node/single_sequencer_integration_test.go @@ -418,12 +418,12 @@ func waitForBlockN(t *testing.T, n uint64, node *FullNode, blockInterval time.Du return got >= n }, timeout[0], blockInterval/2) } -// TestHealthEndpointWhenBlockProductionStops verifies that the health endpoint -// correctly reports WARN and FAIL states when an aggregator stops producing blocks. -func TestHealthEndpointWhenBlockProductionStops(t *testing.T) { +// TestReadinessEndpointWhenBlockProductionStops verifies that the readiness endpoint +// correctly reports UNREADY state when an aggregator stops producing blocks. +func TestReadinessEndpointWhenBlockProductionStops(t *testing.T) { require := require.New(t) - // Set up configuration with specific block time for predictable health checks + // Set up configuration with specific block time for predictable readiness checks config := getTestConfig(t, 1) config.Node.Aggregator = true config.Node.BlockTime = evconfig.DurationWrapper{Duration: 500 * time.Millisecond} @@ -448,10 +448,10 @@ func TestHealthEndpointWhenBlockProductionStops(t *testing.T) { // Create RPC client rpcClient := NewRPCClient(config.RPC.Address) - // Verify health is PASS while blocks are being produced - health, err := rpcClient.GetHealth(ctx) + // Verify readiness is READY while blocks are being produced + readiness, err := rpcClient.GetReadiness(ctx) require.NoError(err) - require.Equal("PASS", health.String(), "Health should be PASS while producing blocks") + require.Equal("READY", readiness.String(), "Readiness should be READY while producing blocks") // Wait for block production to stop (when MaxPendingHeadersAndData is reached) time.Sleep(time.Duration(config.Node.MaxPendingHeadersAndData+2) * config.Node.BlockTime.Duration) @@ -461,30 +461,20 @@ func TestHealthEndpointWhenBlockProductionStops(t *testing.T) { require.NoError(err) require.LessOrEqual(height, config.Node.MaxPendingHeadersAndData) - // Health check threshold calculations: + // Readiness check threshold for aggregators: // blockTime = 500ms - // warnThreshold = blockTime * 3 = 1500ms = 1.5s - // failThreshold = blockTime * 5 = 2500ms = 2.5s + // maxAllowedDelay = blockTime * 5 = 2500ms = 2.5s + // After 2.5s without producing a block, aggregator should be UNREADY - // Poll for health to transition away from PASS (to WARN or FAIL) + // Poll for readiness to transition to UNREADY // This is more robust than fixed time.Sleep as it handles timing variations require.Eventually(func() bool { - health, err := rpcClient.GetHealth(ctx) + readiness, err := rpcClient.GetReadiness(ctx) if err != nil { return false } - return health.String() != "PASS" - }, 5*time.Second, 100*time.Millisecond, "Health 
should transition away from PASS after block production stops") - - // Poll for health to reach FAIL state - // Timeout is set to 10 seconds to be safe, but should happen around 2.5s - require.Eventually(func() bool { - health, err := rpcClient.GetHealth(ctx) - if err != nil { - return false - } - return health.String() == "FAIL" - }, 10*time.Second, 100*time.Millisecond, "Health should be FAIL after 5x block time without new blocks") + return readiness.String() == "UNREADY" + }, 10*time.Second, 100*time.Millisecond, "Readiness should be UNREADY after aggregator stops producing blocks (5x block time)") // Stop the node and wait for shutdown shutdownAndWait(t, []context.CancelFunc{cancel}, &runningWg, 10*time.Second) diff --git a/pkg/rpc/client/client.go b/pkg/rpc/client/client.go index ee98dd6979..68dcfedf3b 100644 --- a/pkg/rpc/client/client.go +++ b/pkg/rpc/client/client.go @@ -146,6 +146,8 @@ func (c *Client) GetNetInfo(ctx context.Context) (*pb.NetInfo, error) { } // GetHealth calls the /health/live HTTP endpoint and returns the HealthStatus +// This endpoint checks liveness (is the process alive and responsive?). +// For readiness checks (can the node serve correct data?), use GetReadiness(). func (c *Client) GetHealth(ctx context.Context) (HealthStatus, error) { healthURL := fmt.Sprintf("%s/health/live", c.baseURL) req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, nil) @@ -178,6 +180,57 @@ func (c *Client) GetHealth(ctx context.Context) (HealthStatus, error) { } } +// ReadinessStatus represents the readiness state of a node +type ReadinessStatus int32 + +const ( + ReadinessStatus_UNKNOWN ReadinessStatus = 0 + ReadinessStatus_READY ReadinessStatus = 1 + ReadinessStatus_UNREADY ReadinessStatus = 2 +) + +func (s ReadinessStatus) String() string { + switch s { + case ReadinessStatus_READY: + return "READY" + case ReadinessStatus_UNREADY: + return "UNREADY" + default: + return "UNKNOWN" + } +} + +// GetReadiness calls the /health/ready HTTP endpoint and returns the ReadinessStatus +// This endpoint checks if the node can serve correct data to clients. +// For liveness checks (is the process alive?), use GetHealth(). 
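+// +// A minimal usage sketch (illustrative only; assumes a Client built with NewClient): +// +// status, err := client.GetReadiness(ctx) +// if err != nil || status != ReadinessStatus_READY { +// // take this node out of the load-balancer rotation +// }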
+func (c *Client) GetReadiness(ctx context.Context) (ReadinessStatus, error) { + readinessURL := fmt.Sprintf("%s/health/ready", c.baseURL) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, readinessURL, nil) + if err != nil { + return ReadinessStatus_UNKNOWN, fmt.Errorf("failed to create readiness request: %w", err) + } + + resp, err := c.httpClient.Do(req) + if err != nil { + return ReadinessStatus_UNKNOWN, fmt.Errorf("failed to get readiness: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return ReadinessStatus_UNKNOWN, fmt.Errorf("failed to read readiness response: %w", err) + } + + // Parse the text response + status := strings.TrimSpace(string(body)) + if strings.HasPrefix(status, "READY") { + return ReadinessStatus_READY, nil + } else if strings.HasPrefix(status, "UNREADY") { + return ReadinessStatus_UNREADY, nil + } + return ReadinessStatus_UNKNOWN, fmt.Errorf("unknown readiness status: %s", status) +} + // GetNamespace returns the namespace configuration for this network func (c *Client) GetNamespace(ctx context.Context) (*pb.GetNamespaceResponse, error) { req := connect.NewRequest(&emptypb.Empty{}) diff --git a/pkg/rpc/client/client_test.go b/pkg/rpc/client/client_test.go index f36def6700..e0189fa47a 100644 --- a/pkg/rpc/client/client_test.go +++ b/pkg/rpc/client/client_test.go @@ -24,9 +24,28 @@ import ( rpc "github.com/evstack/ev-node/types/pb/evnode/v1/v1connect" ) +// setupTestServerOptions holds optional parameters for setupTestServer +type setupTestServerOptions struct { + config *config.Config + bestKnownHeightProvider server.BestKnownHeightProvider +} + // setupTestServer creates a test server with mock store and mock p2p manager. // An optional custom config can be provided; if not provided, uses DefaultConfig with test-headers namespace. 
func setupTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.MockP2PRPC, customConfig ...config.Config) (*httptest.Server, *Client) { + return setupTestServerWithOptions(t, mockStore, mockP2P, setupTestServerOptions{ + config: func() *config.Config { + if len(customConfig) > 0 { + return &customConfig[0] + } + return nil + }(), + bestKnownHeightProvider: func() uint64 { return 100 }, // Default provider + }) +} + +// setupTestServerWithOptions creates a test server with full control over options +func setupTestServerWithOptions(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.MockP2PRPC, opts setupTestServerOptions) (*httptest.Server, *Client) { // Create a new HTTP test server mux := http.NewServeMux() @@ -35,8 +54,8 @@ func setupTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.Mo // Use custom config if provided, otherwise use default var testConfig config.Config - if len(customConfig) > 0 { - testConfig = customConfig[0] + if opts.config != nil { + testConfig = *opts.config } else { testConfig = config.DefaultConfig() testConfig.DA.Namespace = "test-headers" @@ -59,7 +78,11 @@ func setupTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.Mo mux.Handle(configPath, configHandler) // Register custom HTTP endpoints (including health) - server.RegisterCustomHTTPEndpoints(mux, mockStore, mockP2P, testConfig, nil, logger) + bestKnownHeight := opts.bestKnownHeightProvider + if bestKnownHeight == nil { + bestKnownHeight = func() uint64 { return 100 } + } + server.RegisterCustomHTTPEndpoints(mux, mockStore, mockP2P, testConfig, bestKnownHeight, logger) // Create an HTTP server with h2c for HTTP/2 support testServer := httptest.NewServer(h2c.NewHandler(mux, &http2.Server{})) @@ -300,6 +323,85 @@ func TestClientGetHealth(t *testing.T) { }) } +func TestClientGetReadiness(t *testing.T) { + t.Run("returns READY when all checks pass", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + mockP2P := mocks.NewMockP2PRPC(t) + + // Setup P2P network info + netInfo := p2p.NetworkInfo{ + ID: "test-node", + ListenAddress: []string{"/ip4/0.0.0.0/tcp/26656"}, + } + mockP2P.On("GetNetworkInfo").Return(netInfo, nil) + + // Setup peers (non-aggregator needs peers) + peers := []peer.AddrInfo{{}} + mockP2P.On("GetPeers").Return(peers, nil) + + // Setup state + state := types.State{ + LastBlockHeight: 100, + LastBlockTime: time.Now(), + } + mockStore.On("GetState", mock.Anything).Return(state, nil) + + testServer, client := setupTestServer(t, mockStore, mockP2P) + defer testServer.Close() + + readiness, err := client.GetReadiness(context.Background()) + + require.NoError(t, err) + require.Equal(t, "READY", readiness.String()) + mockStore.AssertExpectations(t) + mockP2P.AssertExpectations(t) + }) + + t.Run("returns UNREADY when P2P not listening", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + mockP2P := mocks.NewMockP2PRPC(t) + + // Setup P2P network info with no listen addresses + netInfo := p2p.NetworkInfo{ + ID: "test-node", + } + mockP2P.On("GetNetworkInfo").Return(netInfo, nil) + + testServer, client := setupTestServer(t, mockStore, mockP2P) + defer testServer.Close() + + readiness, err := client.GetReadiness(context.Background()) + + require.NoError(t, err) + require.Equal(t, "UNREADY", readiness.String()) + mockP2P.AssertExpectations(t) + }) + + t.Run("returns UNREADY when no peers (non-aggregator)", func(t *testing.T) { + mockStore := mocks.NewMockStore(t) + mockP2P := mocks.NewMockP2PRPC(t) + + // Setup P2P network info + netInfo 
:= p2p.NetworkInfo{ + ID: "test-node", + ListenAddress: []string{"/ip4/0.0.0.0/tcp/26656"}, + } + mockP2P.On("GetNetworkInfo").Return(netInfo, nil) + + // No peers + mockP2P.On("GetPeers").Return([]peer.AddrInfo{}, nil) + + testServer, client := setupTestServer(t, mockStore, mockP2P) + defer testServer.Close() + + readiness, err := client.GetReadiness(context.Background()) + + require.NoError(t, err) + require.Equal(t, "UNREADY", readiness.String()) + mockP2P.AssertExpectations(t) + }) +} + func TestClientGetNamespace(t *testing.T) { // Create mocks mockStore := mocks.NewMockStore(t) From b09df8488761b43489fd36fd82386c363aaba5d8 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Wed, 5 Nov 2025 17:42:46 +0100 Subject: [PATCH 13/21] docs: update CHANGELOG with new readiness checks and methods in Rust client feat(health): add `readyz()` and `is_ready()` methods to Rust `HealthClient` feat(health): introduce `ReadinessStatus` enum for Rust client refactor: update Rust client example to demonstrate liveness and readiness checks --- CHANGELOG.md | 3 ++ client/crates/client/examples/basic.rs | 27 +++++++++---- client/crates/client/src/health.rs | 46 +++++++++++++++++++++-- client/crates/client/src/lib.rs | 2 +- client/crates/types/tests/feature_test.rs | 2 - pkg/rpc/client/client_test.go | 2 +- 6 files changed, 67 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5edeb6317a..e27f5c2532 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added aggregator block production rate check to `/health/ready` endpoint to ensure aggregators are producing blocks within expected timeframe (5x block time) ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Added `GetReadiness()` method to Go RPC client for checking `/health/ready` endpoint ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Added `ReadinessStatus` type to Go RPC client with READY/UNREADY/UNKNOWN states ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Added `readyz()` and `is_ready()` methods to Rust `HealthClient` for checking `/health/ready` endpoint ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Added `ReadinessStatus` enum to Rust client with Ready/Unready/Unknown states ([#2800](https://github.com/evstack/ev-node/pull/2800)) ### Changed @@ -23,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Simplified `/health/live` endpoint to only check store accessibility (liveness) instead of business logic, following Kubernetes best practices ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Updated `/health/ready` endpoint to use `GetState()` instead of `Height()` to access block production timing information ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Renamed integration test from `TestHealthEndpointWhenBlockProductionStops` to `TestReadinessEndpointWhenBlockProductionStops` to correctly test readiness endpoint ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Updated Rust client example (`client/crates/client/examples/basic.rs`) to demonstrate both liveness and readiness checks ([#2800](https://github.com/evstack/ev-node/pull/2800)) ### Removed diff --git a/client/crates/client/examples/basic.rs b/client/crates/client/examples/basic.rs index 8a9cbd6ab7..7617577819 100644 --- a/client/crates/client/examples/basic.rs +++ b/client/crates/client/examples/basic.rs @@ -1,4 +1,4 @@ -use ev_client::{Client, HealthClient, 
P2PClient, StoreClient}; +use ev_client::{health::HealthClient, Client, P2PClient, StoreClient}; use std::error::Error; #[tokio::main] @@ -14,15 +14,26 @@ async fn main() -> Result<(), Box<dyn Error>> { let client = Client::connect(&endpoint).await?; println!("Successfully connected to evolve node"); - // Check health status + // Check health status (HTTP endpoints) println!("\n=== Health Check ==="); - let health = HealthClient::new(&client); - match health.get_health().await { - Ok(health_response) => { - println!("Health status: {:?}", health_response.status()); - println!("Node is healthy: {}", health.is_healthy().await?); + let health = HealthClient::with_base_url(endpoint.clone()); + + // Liveness check - is the process alive? + match health.livez().await { + Ok(status) => { + println!("Liveness status: {:?}", status); + println!("Node is alive: {}", health.is_healthy().await?); + } + Err(e) => println!("Failed to get liveness status: {e}"), + } + + // Readiness check - can it serve correct data? + match health.readyz().await { + Ok(status) => { + println!("Readiness status: {:?}", status); + println!("Node is ready: {}", health.is_ready().await?); } - Err(e) => println!("Failed to get health status: {e}"), + Err(e) => println!("Failed to get readiness status: {e}"), } // Get P2P information diff --git a/client/crates/client/src/health.rs b/client/crates/client/src/health.rs index ac499a44be..e60e1462af 100644 --- a/client/crates/client/src/health.rs +++ b/client/crates/client/src/health.rs @@ -1,6 +1,6 @@ use crate::{client::Client, error::Result}; -/// Health status of the node +/// Health status of the node (liveness check) #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum HealthStatus { /// Node is operating normally @@ -13,6 +13,17 @@ pub enum HealthStatus { Unknown, } +/// Readiness status of the node (can serve correct data) +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ReadinessStatus { + /// Node is ready to serve traffic + Ready, + /// Node is not ready to serve traffic + Unready, + /// Unknown readiness status + Unknown, +} + pub struct HealthClient { base_url: String, http_client: reqwest::Client, @@ -37,7 +48,10 @@ impl HealthClient { } } - /// Check if the node is alive and get its health status + /// Check if the node is alive and get its health status (liveness check) + /// + /// This endpoint checks if the process is alive and responsive. + /// A failing liveness check should result in killing/restarting the process. pub async fn livez(&self) -> Result<HealthStatus> { let url = format!("{}/health/live", self.base_url); let response = self.http_client.get(&url).send().await?; @@ -52,9 +66,35 @@ impl HealthClient { } } - /// Check if the node is healthy (status is PASS) + /// Check if the node is ready to serve correct data (readiness check) + /// + /// This endpoint checks if the node can serve correct data to clients. + /// A failing readiness check should remove the node from the load balancer + /// but NOT kill the process.
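+ /// + /// A minimal usage sketch (illustrative only; `health` is a `HealthClient` built with `with_base_url`): + /// + /// if health.readyz().await? == ReadinessStatus::Ready { + /// // node can keep serving traffic + /// }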
+ pub async fn readyz(&self) -> Result<ReadinessStatus> { + let url = format!("{}/health/ready", self.base_url); + let response = self.http_client.get(&url).send().await?; + + let status_text = response.text().await?.trim().to_string(); + + if status_text.starts_with("READY") { + Ok(ReadinessStatus::Ready) + } else if status_text.starts_with("UNREADY") { + Ok(ReadinessStatus::Unready) + } else { + Ok(ReadinessStatus::Unknown) + } + } + + /// Check if the node is healthy (liveness status is PASS) pub async fn is_healthy(&self) -> Result<bool> { let status = self.livez().await?; Ok(status == HealthStatus::Pass) } + + /// Check if the node is ready (readiness status is READY) + pub async fn is_ready(&self) -> Result<bool> { + let status = self.readyz().await?; + Ok(status == ReadinessStatus::Ready) + } } diff --git a/client/crates/client/src/lib.rs b/client/crates/client/src/lib.rs index a75101c7c9..7d6771ae9d 100644 --- a/client/crates/client/src/lib.rs +++ b/client/crates/client/src/lib.rs @@ -88,7 +88,7 @@ pub mod store; pub use client::{Client, ClientBuilder}; pub use config::ConfigClient; pub use error::{ClientError, Result}; -pub use health::HealthClient; +pub use health::{HealthClient, HealthStatus, ReadinessStatus}; pub use p2p::P2PClient; pub use signer::SignerClient; pub use store::StoreClient; diff --git a/client/crates/types/tests/feature_test.rs b/client/crates/types/tests/feature_test.rs index 76c4b6f039..e7139eccd9 100644 --- a/client/crates/types/tests/feature_test.rs +++ b/client/crates/types/tests/feature_test.rs @@ -10,13 +10,11 @@ fn test_message_types_available() { #[test] fn test_grpc_types_available() { // These should only be available with the grpc feature - use ev_types::v1::health_service_client::HealthServiceClient; use ev_types::v1::p2p_service_client::P2pServiceClient; use ev_types::v1::signer_service_client::SignerServiceClient; use ev_types::v1::store_service_client::StoreServiceClient; // Just verify the types exist - let _ = std::any::type_name::<HealthServiceClient<tonic::transport::Channel>>(); let _ = std::any::type_name::<P2pServiceClient<tonic::transport::Channel>>(); let _ = std::any::type_name::<SignerServiceClient<tonic::transport::Channel>>(); let _ = std::any::type_name::<StoreServiceClient<tonic::transport::Channel>>(); diff --git a/pkg/rpc/client/client_test.go b/pkg/rpc/client/client_test.go index e0189fa47a..680c6ef82e 100644 --- a/pkg/rpc/client/client_test.go +++ b/pkg/rpc/client/client_test.go @@ -26,7 +26,7 @@ import ( // setupTestServerOptions holds optional parameters for setupTestServer type setupTestServerOptions struct { - config *config.Config + config *config.Config bestKnownHeightProvider server.BestKnownHeightProvider } From 0d9c1e585e986ae414ee50e2258db4d409979469 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Thu, 6 Nov 2025 08:36:44 +0100 Subject: [PATCH 14/21] docs: update CHANGELOG with enhancements to health and readiness checks, including E2E test updates --- CHANGELOG.md | 1 + node/single_sequencer_integration_test.go | 5 +++-- pkg/rpc/client/client_test.go | 12 ++++++------ test/e2e/sut_helper.go | 12 ++++++++++-- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e27f5c2532..24e2df3c12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Use cache instead of in memory store for reaper. Persist cache on reload. Autoclean after 24 hours.
([#2811](https://github.com/evstack/ev-node/pull/2811)) - Simplified `/health/live` endpoint to only check store accessibility (liveness) instead of business logic, following Kubernetes best practices ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Updated `/health/ready` endpoint to use `GetState()` instead of `Height()` to access block production timing information ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Updated E2E test helper `AwaitNodeUp()` to check both liveness and readiness endpoints, ensuring nodes are not just alive but ready to serve traffic ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Renamed integration test from `TestHealthEndpointWhenBlockProductionStops` to `TestReadinessEndpointWhenBlockProductionStops` to correctly test readiness endpoint ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Updated Rust client example (`client/crates/client/examples/basic.rs`) to demonstrate both liveness and readiness checks ([#2800](https://github.com/evstack/ev-node/pull/2800)) diff --git a/node/single_sequencer_integration_test.go b/node/single_sequencer_integration_test.go index 070c95b771..4c8214665d 100644 --- a/node/single_sequencer_integration_test.go +++ b/node/single_sequencer_integration_test.go @@ -16,6 +16,7 @@ import ( coreda "github.com/evstack/ev-node/core/da" coreexecutor "github.com/evstack/ev-node/core/execution" evconfig "github.com/evstack/ev-node/pkg/config" + "github.com/evstack/ev-node/pkg/rpc/client" ) // FullNodeTestSuite is a test suite for full node integration tests @@ -451,7 +452,7 @@ func TestReadinessEndpointWhenBlockProductionStops(t *testing.T) { // Verify readiness is READY while blocks are being produced readiness, err := rpcClient.GetReadiness(ctx) require.NoError(err) - require.Equal("READY", readiness.String(), "Readiness should be READY while producing blocks") + require.Equal(client.ReadinessStatus_READY, readiness, "Readiness should be READY while producing blocks") // Wait for block production to stop (when MaxPendingHeadersAndData is reached) time.Sleep(time.Duration(config.Node.MaxPendingHeadersAndData+2) * config.Node.BlockTime.Duration) @@ -473,7 +474,7 @@ func TestReadinessEndpointWhenBlockProductionStops(t *testing.T) { if err != nil { return false } - return readiness.String() == "UNREADY" + return readiness == client.ReadinessStatus_UNREADY }, 10*time.Second, 100*time.Millisecond, "Readiness should be UNREADY after aggregator stops producing blocks (5x block time)") // Stop the node and wait for shutdown diff --git a/pkg/rpc/client/client_test.go b/pkg/rpc/client/client_test.go index 680c6ef82e..eaf1d45abb 100644 --- a/pkg/rpc/client/client_test.go +++ b/pkg/rpc/client/client_test.go @@ -284,7 +284,7 @@ func TestClientGetHealth(t *testing.T) { healthStatus, err := client.GetHealth(context.Background()) require.NoError(t, err) - require.Equal(t, "PASS", healthStatus.String()) + require.Equal(t, HealthStatus_PASS, healthStatus) mockStore.AssertExpectations(t) }) @@ -301,7 +301,7 @@ func TestClientGetHealth(t *testing.T) { healthStatus, err := client.GetHealth(context.Background()) require.NoError(t, err) - require.Equal(t, "FAIL", healthStatus.String()) + require.Equal(t, HealthStatus_FAIL, healthStatus) mockStore.AssertExpectations(t) }) @@ -318,7 +318,7 @@ func TestClientGetHealth(t *testing.T) { healthStatus, err := client.GetHealth(context.Background()) require.NoError(t, err) - require.Equal(t, "PASS", healthStatus.String()) + require.Equal(t, HealthStatus_PASS, healthStatus) 
mockStore.AssertExpectations(t) }) } @@ -352,7 +352,7 @@ func TestClientGetReadiness(t *testing.T) { readiness, err := client.GetReadiness(context.Background()) require.NoError(t, err) - require.Equal(t, "READY", readiness.String()) + require.Equal(t, ReadinessStatus_READY, readiness) mockStore.AssertExpectations(t) mockP2P.AssertExpectations(t) }) @@ -373,7 +373,7 @@ func TestClientGetReadiness(t *testing.T) { readiness, err := client.GetReadiness(context.Background()) require.NoError(t, err) - require.Equal(t, "UNREADY", readiness.String()) + require.Equal(t, ReadinessStatus_UNREADY, readiness) mockP2P.AssertExpectations(t) }) @@ -397,7 +397,7 @@ func TestClientGetReadiness(t *testing.T) { readiness, err := client.GetReadiness(context.Background()) require.NoError(t, err) - require.Equal(t, "UNREADY", readiness.String()) + require.Equal(t, ReadinessStatus_UNREADY, readiness) mockP2P.AssertExpectations(t) }) } diff --git a/test/e2e/sut_helper.go b/test/e2e/sut_helper.go index 226e02a0cb..8d6745e0bb 100644 --- a/test/e2e/sut_helper.go +++ b/test/e2e/sut_helper.go @@ -93,7 +93,8 @@ func (s *SystemUnderTest) ExecCmd(cmd string, args ...string) { s.awaitProcessCleanup(c) } -// AwaitNodeUp waits until a node is operational by validating it produces blocks. +// AwaitNodeUp waits until a node is operational by checking both liveness and readiness. +// This verifies the process is alive (liveness) and ready to serve traffic (readiness). func (s *SystemUnderTest) AwaitNodeUp(t *testing.T, rpcAddr string, timeout time.Duration) { t.Helper() t.Logf("Await node is up: %s", rpcAddr) @@ -102,8 +103,15 @@ func (s *SystemUnderTest) AwaitNodeUp(t *testing.T, rpcAddr string, timeout time require.EventuallyWithT(t, func(t *assert.CollectT) { c := client.NewClient(rpcAddr) require.NotNil(t, c) + + // Check liveness: is the process alive? _, err := c.GetHealth(ctx) - require.NoError(t, err) + require.NoError(t, err, "liveness check failed") + + // Check readiness: is the node ready to serve traffic? 
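+ // Readiness covers P2P listening, sync distance against the best-known head, + // peer connectivity, and (for aggregators) block production rate.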
+ readiness, err := c.GetReadiness(ctx) + require.NoError(t, err, "readiness check failed") + require.Equal(t, client.ReadinessStatus_READY, readiness, "node is not ready") }, timeout, min(timeout/10, 200*time.Millisecond), "node is not up") } From 6bc650fb1b20c757812cfd0e2f002a025562555b Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Thu, 6 Nov 2025 09:25:00 +0100 Subject: [PATCH 15/21] deps: update tastora to v0.7.6 with additional changes --- test/docker-e2e/go.mod | 2 +- test/docker-e2e/go.sum | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/docker-e2e/go.mod b/test/docker-e2e/go.mod index cb3d48aa92..020f6e2575 100644 --- a/test/docker-e2e/go.mod +++ b/test/docker-e2e/go.mod @@ -4,7 +4,7 @@ go 1.24.6 require ( cosmossdk.io/math v1.5.3 - github.com/celestiaorg/tastora v0.7.5 + github.com/celestiaorg/tastora v0.7.6-0.20251106081541-3ec2da2f1f7f github.com/ethereum/go-ethereum v1.16.6 github.com/evstack/ev-node/execution/evm v1.0.0-beta.3 github.com/stretchr/testify v1.11.1 diff --git a/test/docker-e2e/go.sum b/test/docker-e2e/go.sum index b54154c75b..95956f854a 100644 --- a/test/docker-e2e/go.sum +++ b/test/docker-e2e/go.sum @@ -135,6 +135,8 @@ github.com/celestiaorg/nmt v0.24.2 h1:LlpJSPOd6/Lw1Ig6HUhZuqiINHLka/ZSRTBzlNJpch github.com/celestiaorg/nmt v0.24.2/go.mod h1:vgLBpWBi8F5KLxTdXSwb7AU4NhiIQ1AQRGa+PzdcLEA= github.com/celestiaorg/tastora v0.7.5 h1:LT1MPpRB7Jd2LcBBoVwtimBh1NIxueG7c5DQwfTpZ0g= github.com/celestiaorg/tastora v0.7.5/go.mod h1:Xw44XeRN2T/kSdopVCJjNhwFwRSO58wTW8GrVP7OWFI= +github.com/celestiaorg/tastora v0.7.6-0.20251106081541-3ec2da2f1f7f h1:vTcOubI4Bab0RCE1i9APbbmhJviFA52wv/XV43NwE9M= +github.com/celestiaorg/tastora v0.7.6-0.20251106081541-3ec2da2f1f7f/go.mod h1:Xw44XeRN2T/kSdopVCJjNhwFwRSO58wTW8GrVP7OWFI= github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= github.com/cenkalti/backoff v2.2.1+incompatible/go.mod h1:90ReRw6GdpyfrHakVjL/QHaoyV4aDUVVkXQJJJ3NXXM= github.com/cenkalti/backoff/v4 v4.1.1/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw= From 015cb76bd2b334dca6fb04fc8a2a0b5831811987 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Thu, 6 Nov 2025 09:31:20 +0100 Subject: [PATCH 16/21] deps: update tastora to v0.7.6 --- test/docker-e2e/go.sum | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/docker-e2e/go.sum b/test/docker-e2e/go.sum index 95956f854a..80d6dd6a6c 100644 --- a/test/docker-e2e/go.sum +++ b/test/docker-e2e/go.sum @@ -133,8 +133,6 @@ github.com/celestiaorg/go-square/v3 v3.0.2 h1:eSQOgNII8inK9IhiBZ+6GADQeWbRq4HYY7 github.com/celestiaorg/go-square/v3 v3.0.2/go.mod h1:oFReMLsSDMRs82ICFEeFQFCqNvwdsbIM1BzCcb0f7dM= github.com/celestiaorg/nmt v0.24.2 h1:LlpJSPOd6/Lw1Ig6HUhZuqiINHLka/ZSRTBzlNJpchg= github.com/celestiaorg/nmt v0.24.2/go.mod h1:vgLBpWBi8F5KLxTdXSwb7AU4NhiIQ1AQRGa+PzdcLEA= -github.com/celestiaorg/tastora v0.7.5 h1:LT1MPpRB7Jd2LcBBoVwtimBh1NIxueG7c5DQwfTpZ0g= -github.com/celestiaorg/tastora v0.7.5/go.mod h1:Xw44XeRN2T/kSdopVCJjNhwFwRSO58wTW8GrVP7OWFI= github.com/celestiaorg/tastora v0.7.6-0.20251106081541-3ec2da2f1f7f h1:vTcOubI4Bab0RCE1i9APbbmhJviFA52wv/XV43NwE9M= github.com/celestiaorg/tastora v0.7.6-0.20251106081541-3ec2da2f1f7f/go.mod h1:Xw44XeRN2T/kSdopVCJjNhwFwRSO58wTW8GrVP7OWFI= github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= From ea5219b0527835008f86ef354c4fa7f08fdaa124 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Thu, 6 Nov 2025 10:13:20 +0100 Subject: [PATCH 17/21] feat: add liveness check for nodes in e2e 
tests and update node startup parameters --- test/e2e/base_test.go | 12 ++++++++---- test/e2e/sut_helper.go | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/test/e2e/base_test.go b/test/e2e/base_test.go index be521a95d5..c5437ba136 100644 --- a/test/e2e/base_test.go +++ b/test/e2e/base_test.go @@ -65,6 +65,7 @@ func TestBasic(t *testing.T) { require.NoError(t, err, "failed to init aggregator", output) // start aggregator + node1P2P := "/ip4/0.0.0.0/tcp/26656" sut.ExecCmd(binaryPath, "start", "--home="+node1Home, @@ -72,13 +73,14 @@ func TestBasic(t *testing.T) { "--evnode.signer.passphrase_file="+passphraseFile, "--evnode.node.block_time=5ms", "--evnode.da.block_time=15ms", + "--evnode.p2p.listen_address="+node1P2P, "--kv-endpoint=127.0.0.1:9090", ) sut.AwaitNodeUp(t, "http://127.0.0.1:7331", 2*time.Second) // Give aggregator more time before starting the next node - time.Sleep(1 * time.Second) // Increased wait time + time.Sleep(2 * time.Second) // Init the second node (full node) output, err = sut.RunCmd(binaryPath, @@ -91,7 +93,7 @@ func TestBasic(t *testing.T) { // Copy genesis file from aggregator to full node MustCopyFile(t, filepath.Join(node1Home, "config", "genesis.json"), filepath.Join(node2Home, "config", "genesis.json")) - // Start the full node + // Start the full node - will discover aggregator via DHT node2RPC := "127.0.0.1:7332" node2P2P := "/ip4/0.0.0.0/tcp/7676" sut.ExecCmd( @@ -100,11 +102,13 @@ func TestBasic(t *testing.T) { "--home="+node2Home, "--evnode.log.level=debug", "--evnode.p2p.listen_address="+node2P2P, + "--evnode.node.readiness_max_blocks_behind=100", // Allow more blocks behind during bootstrap fmt.Sprintf("--evnode.rpc.address=%s", node2RPC), ) - sut.AwaitNodeUp(t, "http://"+node2RPC, 2*time.Second) - t.Logf("Full node (node 2) is up.") + // For local e2e tests, only check liveness (not full readiness with peers) + sut.AwaitNodeLive(t, "http://"+node2RPC, 10*time.Second) + t.Logf("Full node (node 2) is live.") // when a client TX for state update is executed const myKey = "foo" diff --git a/test/e2e/sut_helper.go b/test/e2e/sut_helper.go index 8d6745e0bb..302cfe4b77 100644 --- a/test/e2e/sut_helper.go +++ b/test/e2e/sut_helper.go @@ -115,6 +115,24 @@ func (s *SystemUnderTest) AwaitNodeUp(t *testing.T, rpcAddr string, timeout time }, timeout, min(timeout/10, 200*time.Millisecond), "node is not up") } +// AwaitNodeLive waits until a node is alive (liveness check only). +// This only verifies the process is alive and responsive, not that it's ready to serve traffic. +// Use this for local tests where nodes may not have peers configured. +func (s *SystemUnderTest) AwaitNodeLive(t *testing.T, rpcAddr string, timeout time.Duration) { + t.Helper() + t.Logf("Await node is live: %s", rpcAddr) + ctx, done := context.WithTimeout(context.Background(), timeout) + defer done() + require.EventuallyWithT(t, func(t *assert.CollectT) { + c := client.NewClient(rpcAddr) + require.NotNil(t, c) + + // Check liveness only: is the process alive? + _, err := c.GetHealth(ctx) + require.NoError(t, err, "liveness check failed") + }, timeout, min(timeout/10, 200*time.Millisecond), "node is not live") +} + // AwaitNBlocks waits until the node has produced at least `n` blocks. 
func (s *SystemUnderTest) AwaitNBlocks(t *testing.T, n uint64, rpcAddr string, timeout time.Duration) { t.Helper() From d2d43932144e15892f95ed4c7f3fec3bf2378f81 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Thu, 6 Nov 2025 11:36:27 +0100 Subject: [PATCH 18/21] docs: update CHANGELOG with new health and readiness checks, including endpoint modifications and test updates --- CHANGELOG.md | 4 +- node/single_sequencer_integration_test.go | 19 -- pkg/rpc/client/client.go | 120 ------------ pkg/rpc/client/client_test.go | 224 +--------------------- pkg/rpc/server/http.go | 40 +--- pkg/rpc/server/http_test.go | 11 +- test/e2e/sut_helper.go | 21 +- 7 files changed, 23 insertions(+), 416 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 24e2df3c12..667c2acb2f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,8 +14,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added comprehensive health endpoint documentation in `docs/learn/config.md#health-endpoints` explaining liveness vs readiness checks, Kubernetes probe configuration, and usage examples ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Added P2P listening check to `/health/ready` endpoint to verify P2P network is ready to accept connections ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Added aggregator block production rate check to `/health/ready` endpoint to ensure aggregators are producing blocks within expected timeframe (5x block time) ([#2800](https://github.com/evstack/ev-node/pull/2800)) -- Added `GetReadiness()` method to Go RPC client for checking `/health/ready` endpoint ([#2800](https://github.com/evstack/ev-node/pull/2800)) -- Added `ReadinessStatus` type to Go RPC client with READY/UNREADY/UNKNOWN states ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Added `readyz()` and `is_ready()` methods to Rust `HealthClient` for checking `/health/ready` endpoint ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Added `ReadinessStatus` enum to Rust client with Ready/Unready/Unknown states ([#2800](https://github.com/evstack/ev-node/pull/2800)) @@ -33,7 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **BREAKING:** Removed `evnode.v1.HealthService` gRPC endpoint in favor of HTTP health endpoints ([#2800](https://github.com/evstack/ev-node/pull/2800)) - Migration: Use `GET /health/live` instead of `HealthService.Livez()` gRPC call - See migration guide: `docs/learn/config.md#health-endpoints` - - Affected clients: Go client (`pkg/rpc/client`), Rust client (`client/crates/client`), and any external services using the gRPC health endpoint + - Affected clients: Rust client (`client/crates/client`) and any external services using the gRPC health endpoint - Removed `proto/evnode/v1/health.proto` and generated protobuf files ([#2800](https://github.com/evstack/ev-node/pull/2800)) ## v1.0.0-beta.9 diff --git a/node/single_sequencer_integration_test.go b/node/single_sequencer_integration_test.go index 4c8214665d..414a706af1 100644 --- a/node/single_sequencer_integration_test.go +++ b/node/single_sequencer_integration_test.go @@ -419,19 +419,13 @@ func waitForBlockN(t *testing.T, n uint64, node *FullNode, blockInterval time.Du return got >= n }, timeout[0], blockInterval/2) } -// TestReadinessEndpointWhenBlockProductionStops verifies that the readiness endpoint -// correctly reports UNREADY state when an aggregator stops producing blocks. 
func TestReadinessEndpointWhenBlockProductionStops(t *testing.T) { require := require.New(t) - // Set up configuration with specific block time for predictable readiness checks config := getTestConfig(t, 1) config.Node.Aggregator = true config.Node.BlockTime = evconfig.DurationWrapper{Duration: 500 * time.Millisecond} config.Node.MaxPendingHeadersAndData = 2 - - // Set DA block time large enough to avoid header submission to DA layer - // This will cause block production to stop once MaxPendingHeadersAndData is reached config.DA.BlockTime = evconfig.DurationWrapper{Duration: 100 * time.Second} node, cleanup := createNodeWithCleanup(t, config) @@ -443,32 +437,20 @@ func TestReadinessEndpointWhenBlockProductionStops(t *testing.T) { var runningWg sync.WaitGroup startNodeInBackground(t, []*FullNode{node}, []context.Context{ctx}, &runningWg, 0, nil) - // Wait for first block to be produced waitForBlockN(t, 1, node, config.Node.BlockTime.Duration) - // Create RPC client rpcClient := NewRPCClient(config.RPC.Address) - // Verify readiness is READY while blocks are being produced readiness, err := rpcClient.GetReadiness(ctx) require.NoError(err) require.Equal(client.ReadinessStatus_READY, readiness, "Readiness should be READY while producing blocks") - // Wait for block production to stop (when MaxPendingHeadersAndData is reached) time.Sleep(time.Duration(config.Node.MaxPendingHeadersAndData+2) * config.Node.BlockTime.Duration) - // Get the height to confirm blocks stopped height, err := getNodeHeight(node, Store) require.NoError(err) require.LessOrEqual(height, config.Node.MaxPendingHeadersAndData) - // Readiness check threshold for aggregators: - // blockTime = 500ms - // maxAllowedDelay = blockTime * 5 = 2500ms = 2.5s - // After 2.5s without producing a block, aggregator should be UNREADY - - // Poll for readiness to transition to UNREADY - // This is more robust than fixed time.Sleep as it handles timing variations require.Eventually(func() bool { readiness, err := rpcClient.GetReadiness(ctx) if err != nil { @@ -477,6 +459,5 @@ func TestReadinessEndpointWhenBlockProductionStops(t *testing.T) { return readiness == client.ReadinessStatus_UNREADY }, 10*time.Second, 100*time.Millisecond, "Readiness should be UNREADY after aggregator stops producing blocks (5x block time)") - // Stop the node and wait for shutdown shutdownAndWait(t, []context.CancelFunc{cancel}, &runningWg, 10*time.Second) } diff --git a/pkg/rpc/client/client.go b/pkg/rpc/client/client.go index 68dcfedf3b..f7a03d536f 100644 --- a/pkg/rpc/client/client.go +++ b/pkg/rpc/client/client.go @@ -2,10 +2,7 @@ package client import ( "context" - "fmt" - "io" "net/http" - "strings" "connectrpc.com/connect" "google.golang.org/protobuf/types/known/emptypb" @@ -14,40 +11,11 @@ import ( rpc "github.com/evstack/ev-node/types/pb/evnode/v1/v1connect" ) -// HealthStatus represents the health status of a node -type HealthStatus int32 - -const ( - // HealthStatus_UNKNOWN represents an unknown health status - HealthStatus_UNKNOWN HealthStatus = 0 - // HealthStatus_PASS represents a healthy node - HealthStatus_PASS HealthStatus = 1 - // HealthStatus_WARN represents a degraded but still serving node - HealthStatus_WARN HealthStatus = 2 - // HealthStatus_FAIL represents a failed node - HealthStatus_FAIL HealthStatus = 3 -) - -func (h HealthStatus) String() string { - switch h { - case HealthStatus_PASS: - return "PASS" - case HealthStatus_WARN: - return "WARN" - case HealthStatus_FAIL: - return "FAIL" - default: - return "UNKNOWN" - } -} - // Client is the 
client for StoreService, P2PService, and ConfigService type Client struct { storeClient rpc.StoreServiceClient p2pClient rpc.P2PServiceClient configClient rpc.ConfigServiceClient - baseURL string - httpClient *http.Client } // NewClient creates a new RPC client @@ -61,8 +29,6 @@ func NewClient(baseURL string) *Client { storeClient: storeClient, p2pClient: p2pClient, configClient: configClient, - baseURL: baseURL, - httpClient: httpClient, } } @@ -145,92 +111,6 @@ func (c *Client) GetNetInfo(ctx context.Context) (*pb.NetInfo, error) { return resp.Msg.NetInfo, nil } -// GetHealth calls the /health/live HTTP endpoint and returns the HealthStatus -// This endpoint checks liveness (is the process alive and responsive?). -// For readiness checks (can the node serve correct data?), use GetReadiness(). -func (c *Client) GetHealth(ctx context.Context) (HealthStatus, error) { - healthURL := fmt.Sprintf("%s/health/live", c.baseURL) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, nil) - if err != nil { - return HealthStatus_UNKNOWN, fmt.Errorf("failed to create health request: %w", err) - } - - resp, err := c.httpClient.Do(req) - if err != nil { - return HealthStatus_UNKNOWN, fmt.Errorf("failed to get health: %w", err) - } - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - if err != nil { - return HealthStatus_UNKNOWN, fmt.Errorf("failed to read health response: %w", err) - } - - // Parse the text response - status := strings.TrimSpace(string(body)) - switch status { - case "OK": - return HealthStatus_PASS, nil - case "WARN": - return HealthStatus_WARN, nil - case "FAIL": - return HealthStatus_FAIL, nil - default: - return HealthStatus_UNKNOWN, fmt.Errorf("unknown health status: %s", status) - } -} - -// ReadinessStatus represents the readiness state of a node -type ReadinessStatus int32 - -const ( - ReadinessStatus_UNKNOWN ReadinessStatus = 0 - ReadinessStatus_READY ReadinessStatus = 1 - ReadinessStatus_UNREADY ReadinessStatus = 2 -) - -func (s ReadinessStatus) String() string { - switch s { - case ReadinessStatus_READY: - return "READY" - case ReadinessStatus_UNREADY: - return "UNREADY" - default: - return "UNKNOWN" - } -} - -// GetReadiness calls the /health/ready HTTP endpoint and returns the ReadinessStatus -// This endpoint checks if the node can serve correct data to clients. -// For liveness checks (is the process alive?), use GetHealth(). 
-func (c *Client) GetReadiness(ctx context.Context) (ReadinessStatus, error) { - readinessURL := fmt.Sprintf("%s/health/ready", c.baseURL) - req, err := http.NewRequestWithContext(ctx, http.MethodGet, readinessURL, nil) - if err != nil { - return ReadinessStatus_UNKNOWN, fmt.Errorf("failed to create readiness request: %w", err) - } - - resp, err := c.httpClient.Do(req) - if err != nil { - return ReadinessStatus_UNKNOWN, fmt.Errorf("failed to get readiness: %w", err) - } - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - if err != nil { - return ReadinessStatus_UNKNOWN, fmt.Errorf("failed to read readiness response: %w", err) - } - - // Parse the text response - status := strings.TrimSpace(string(body)) - if strings.HasPrefix(status, "READY") { - return ReadinessStatus_READY, nil - } else if strings.HasPrefix(status, "UNREADY") { - return ReadinessStatus_UNREADY, nil - } - return ReadinessStatus_UNKNOWN, fmt.Errorf("unknown readiness status: %s", status) -} - // GetNamespace returns the namespace configuration for this network func (c *Client) GetNamespace(ctx context.Context) (*pb.GetNamespaceResponse, error) { req := connect.NewRequest(&emptypb.Empty{}) diff --git a/pkg/rpc/client/client_test.go b/pkg/rpc/client/client_test.go index eaf1d45abb..db375dee5b 100644 --- a/pkg/rpc/client/client_test.go +++ b/pkg/rpc/client/client_test.go @@ -10,7 +10,6 @@ import ( "github.com/libp2p/go-libp2p/core/peer" "github.com/multiformats/go-multiaddr" "github.com/rs/zerolog" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" "golang.org/x/net/http2" @@ -24,81 +23,36 @@ import ( rpc "github.com/evstack/ev-node/types/pb/evnode/v1/v1connect" ) -// setupTestServerOptions holds optional parameters for setupTestServer -type setupTestServerOptions struct { - config *config.Config - bestKnownHeightProvider server.BestKnownHeightProvider -} - -// setupTestServer creates a test server with mock store and mock p2p manager. -// An optional custom config can be provided; if not provided, uses DefaultConfig with test-headers namespace. 
-func setupTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.MockP2PRPC, customConfig ...config.Config) (*httptest.Server, *Client) { - return setupTestServerWithOptions(t, mockStore, mockP2P, setupTestServerOptions{ - config: func() *config.Config { - if len(customConfig) > 0 { - return &customConfig[0] - } - return nil - }(), - bestKnownHeightProvider: func() uint64 { return 100 }, // Default provider - }) -} - -// setupTestServerWithOptions creates a test server with full control over options -func setupTestServerWithOptions(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.MockP2PRPC, opts setupTestServerOptions) (*httptest.Server, *Client) { - // Create a new HTTP test server +func setupTestServer(t *testing.T, mockStore *mocks.MockStore, mockP2P *mocks.MockP2PRPC) (*httptest.Server, *Client) { mux := http.NewServeMux() - // Create the servers logger := zerolog.Nop() - - // Use custom config if provided, otherwise use default - var testConfig config.Config - if opts.config != nil { - testConfig = *opts.config - } else { - testConfig = config.DefaultConfig() - testConfig.DA.Namespace = "test-headers" - } - storeServer := server.NewStoreServer(mockStore, logger) p2pServer := server.NewP2PServer(mockP2P) + + testConfig := config.DefaultConfig() + testConfig.DA.Namespace = "test-headers" configServer := server.NewConfigServer(testConfig, nil, logger) - // Register the store service storePath, storeHandler := rpc.NewStoreServiceHandler(storeServer) mux.Handle(storePath, storeHandler) - // Register the p2p service p2pPath, p2pHandler := rpc.NewP2PServiceHandler(p2pServer) mux.Handle(p2pPath, p2pHandler) - // Register the config service configPath, configHandler := rpc.NewConfigServiceHandler(configServer) mux.Handle(configPath, configHandler) - // Register custom HTTP endpoints (including health) - bestKnownHeight := opts.bestKnownHeightProvider - if bestKnownHeight == nil { - bestKnownHeight = func() uint64 { return 100 } - } - server.RegisterCustomHTTPEndpoints(mux, mockStore, mockP2P, testConfig, bestKnownHeight, logger) - - // Create an HTTP server with h2c for HTTP/2 support testServer := httptest.NewServer(h2c.NewHandler(mux, &http2.Server{})) - - // Create a client that connects to the test server client := NewClient(testServer.URL) return testServer, client } func TestClientGetState(t *testing.T) { - // Create mocks mockStore := mocks.NewMockStore(t) mockP2P := mocks.NewMockP2PRPC(t) - // Create test data state := types.State{ AppHash: []byte("app_hash"), InitialHeight: 10, @@ -106,17 +60,13 @@ func TestClientGetState(t *testing.T) { LastBlockTime: time.Now(), } - // Setup mock expectations mockStore.On("GetState", mock.Anything).Return(state, nil) - // Setup test server and client testServer, client := setupTestServer(t, mockStore, mockP2P) defer testServer.Close() - // Call GetState resultState, err := client.GetState(context.Background()) - // Assert expectations require.NoError(t, err) require.Equal(t, state.AppHash, resultState.AppHash) require.Equal(t, state.InitialHeight, resultState.InitialHeight) @@ -126,91 +76,71 @@ func TestClientGetState(t *testing.T) { } func TestClientGetMetadata(t *testing.T) { - // Create mocks mockStore := mocks.NewMockStore(t) mockP2P := mocks.NewMockP2PRPC(t) - // Create test data key := "test_key" value := []byte("test_value") - // Setup mock expectations mockStore.On("GetMetadata", mock.Anything, key).Return(value, nil) - // Setup test server and client testServer, client := setupTestServer(t, mockStore, mockP2P) defer 
testServer.Close() - // Call GetMetadata resultValue, err := client.GetMetadata(context.Background(), key) - // Assert expectations require.NoError(t, err) require.Equal(t, value, resultValue) mockStore.AssertExpectations(t) } func TestClientGetBlockByHeight(t *testing.T) { - // Create mocks mockStore := mocks.NewMockStore(t) mockP2P := mocks.NewMockP2PRPC(t) - // Create test data height := uint64(10) header := &types.SignedHeader{} data := &types.Data{} - // Setup mock expectations mockStore.On("GetBlockData", mock.Anything, height).Return(header, data, nil) - // Setup test server and client testServer, client := setupTestServer(t, mockStore, mockP2P) defer testServer.Close() - // Call GetBlockByHeight block, err := client.GetBlockByHeight(context.Background(), height) - // Assert expectations require.NoError(t, err) require.NotNil(t, block) mockStore.AssertExpectations(t) } func TestClientGetBlockByHash(t *testing.T) { - // Create mocks mockStore := mocks.NewMockStore(t) mockP2P := mocks.NewMockP2PRPC(t) - // Create test data hash := []byte("block_hash") header := &types.SignedHeader{} data := &types.Data{} - // Setup mock expectations mockStore.On("GetBlockByHash", mock.Anything, hash).Return(header, data, nil) - // Setup test server and client testServer, client := setupTestServer(t, mockStore, mockP2P) defer testServer.Close() - // Call GetBlockByHash block, err := client.GetBlockByHash(context.Background(), hash) - // Assert expectations require.NoError(t, err) require.NotNil(t, block) mockStore.AssertExpectations(t) } func TestClientGetPeerInfo(t *testing.T) { - // Create mocks mockStore := mocks.NewMockStore(t) mockP2P := mocks.NewMockP2PRPC(t) addr, err := multiaddr.NewMultiaddr("/ip4/0.0.0.0/tcp/8000") require.NoError(t, err) - // Create test data peers := []peer.AddrInfo{ { ID: "3bM8hezDN5", @@ -222,17 +152,13 @@ func TestClientGetPeerInfo(t *testing.T) { }, } - // Setup mock expectations mockP2P.On("GetPeers").Return(peers, nil) - // Setup test server and client testServer, client := setupTestServer(t, mockStore, mockP2P) defer testServer.Close() - // Call GetPeerInfo resultPeers, err := client.GetPeerInfo(context.Background()) - // Assert expectations require.NoError(t, err) require.Len(t, resultPeers, 2) require.Equal(t, "3tSMH9AUGpeoe4", resultPeers[0].Id) @@ -243,178 +169,36 @@ func TestClientGetPeerInfo(t *testing.T) { } func TestClientGetNetInfo(t *testing.T) { - // Create mocks mockStore := mocks.NewMockStore(t) mockP2P := mocks.NewMockP2PRPC(t) - // Create test data netInfo := p2p.NetworkInfo{ ID: "node1", ListenAddress: []string{"0.0.0.0:26656"}, } - // Setup mock expectations mockP2P.On("GetNetworkInfo").Return(netInfo, nil) - // Setup test server and client testServer, client := setupTestServer(t, mockStore, mockP2P) defer testServer.Close() - // Call GetNetInfo resultNetInfo, err := client.GetNetInfo(context.Background()) - // Assert expectations require.NoError(t, err) require.Equal(t, "node1", resultNetInfo.Id) require.Equal(t, "0.0.0.0:26656", resultNetInfo.ListenAddresses[0]) mockP2P.AssertExpectations(t) } -func TestClientGetHealth(t *testing.T) { - t.Run("returns PASS when store is accessible", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2P := mocks.NewMockP2PRPC(t) - - // Mock Height to return successfully - mockStore.On("Height", mock.Anything).Return(uint64(100), nil) - - testServer, client := setupTestServer(t, mockStore, mockP2P) - defer testServer.Close() - - healthStatus, err := client.GetHealth(context.Background()) - - 
require.NoError(t, err) - require.Equal(t, HealthStatus_PASS, healthStatus) - mockStore.AssertExpectations(t) - }) - - t.Run("returns FAIL when store is not accessible", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2P := mocks.NewMockP2PRPC(t) - - // Mock Height to return an error - mockStore.On("Height", mock.Anything).Return(uint64(0), assert.AnError) - - testServer, client := setupTestServer(t, mockStore, mockP2P) - defer testServer.Close() - - healthStatus, err := client.GetHealth(context.Background()) - - require.NoError(t, err) - require.Equal(t, HealthStatus_FAIL, healthStatus) - mockStore.AssertExpectations(t) - }) - - t.Run("returns PASS even at height 0", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2P := mocks.NewMockP2PRPC(t) - - // Mock Height to return 0 successfully (genesis state) - mockStore.On("Height", mock.Anything).Return(uint64(0), nil) - - testServer, client := setupTestServer(t, mockStore, mockP2P) - defer testServer.Close() - - healthStatus, err := client.GetHealth(context.Background()) - - require.NoError(t, err) - require.Equal(t, HealthStatus_PASS, healthStatus) - mockStore.AssertExpectations(t) - }) -} - -func TestClientGetReadiness(t *testing.T) { - t.Run("returns READY when all checks pass", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2P := mocks.NewMockP2PRPC(t) - - // Setup P2P network info - netInfo := p2p.NetworkInfo{ - ID: "test-node", - ListenAddress: []string{"/ip4/0.0.0.0/tcp/26656"}, - } - mockP2P.On("GetNetworkInfo").Return(netInfo, nil) - - // Setup peers (non-aggregator needs peers) - peers := []peer.AddrInfo{{}} - mockP2P.On("GetPeers").Return(peers, nil) - - // Setup state - state := types.State{ - LastBlockHeight: 100, - LastBlockTime: time.Now(), - } - mockStore.On("GetState", mock.Anything).Return(state, nil) - - testServer, client := setupTestServer(t, mockStore, mockP2P) - defer testServer.Close() - - readiness, err := client.GetReadiness(context.Background()) - - require.NoError(t, err) - require.Equal(t, ReadinessStatus_READY, readiness) - mockStore.AssertExpectations(t) - mockP2P.AssertExpectations(t) - }) - - t.Run("returns UNREADY when P2P not listening", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2P := mocks.NewMockP2PRPC(t) - - // Setup P2P network info with no listen addresses - netInfo := p2p.NetworkInfo{ - ID: "test-node", - } - mockP2P.On("GetNetworkInfo").Return(netInfo, nil) - - testServer, client := setupTestServer(t, mockStore, mockP2P) - defer testServer.Close() - - readiness, err := client.GetReadiness(context.Background()) - - require.NoError(t, err) - require.Equal(t, ReadinessStatus_UNREADY, readiness) - mockP2P.AssertExpectations(t) - }) - - t.Run("returns UNREADY when no peers (non-aggregator)", func(t *testing.T) { - mockStore := mocks.NewMockStore(t) - mockP2P := mocks.NewMockP2PRPC(t) - - // Setup P2P network info - netInfo := p2p.NetworkInfo{ - ID: "test-node", - ListenAddress: []string{"/ip4/0.0.0.0/tcp/26656"}, - } - mockP2P.On("GetNetworkInfo").Return(netInfo, nil) - - // No peers - mockP2P.On("GetPeers").Return([]peer.AddrInfo{}, nil) - - testServer, client := setupTestServer(t, mockStore, mockP2P) - defer testServer.Close() - - readiness, err := client.GetReadiness(context.Background()) - - require.NoError(t, err) - require.Equal(t, ReadinessStatus_UNREADY, readiness) - mockP2P.AssertExpectations(t) - }) -} - func TestClientGetNamespace(t *testing.T) { - // Create mocks mockStore := mocks.NewMockStore(t) mockP2P := 
mocks.NewMockP2PRPC(t) - // Setup test server and client testServer, client := setupTestServer(t, mockStore, mockP2P) defer testServer.Close() - // Call GetNamespace namespaceResp, err := client.GetNamespace(context.Background()) - // Assert expectations require.NoError(t, err) require.NotNil(t, namespaceResp) // The namespace should be derived from the config we set in setupTestServer diff --git a/pkg/rpc/server/http.go b/pkg/rpc/server/http.go index 5a2e8c0626..784a789c57 100644 --- a/pkg/rpc/server/http.go +++ b/pkg/rpc/server/http.go @@ -1,9 +1,3 @@ -// Package server provides HTTP endpoint handlers for the RPC server. -// -// Health Endpoints: -// This file implements health check endpoints following Kubernetes best practices. -// For comprehensive documentation on health endpoints, their differences, and usage examples, -// see: docs/learn/config.md#health-endpoints package server import ( @@ -17,25 +11,17 @@ import ( "github.com/rs/zerolog" ) -// BestKnownHeightProvider should return the best-known network height observed by the node -// (e.g. min(headerSyncHeight, dataSyncHeight) for full nodes, or header height for light nodes). +// BestKnownHeightProvider returns the best-known network height observed by the node type BestKnownHeightProvider func() uint64 -// RegisterCustomHTTPEndpoints is the designated place to add new, non-gRPC, plain HTTP handlers. -// Additional custom HTTP endpoints can be registered on the mux here. -// -// For detailed documentation on health endpoints, see: docs/learn/config.md#health-endpoints +// RegisterCustomHTTPEndpoints registers custom HTTP handlers on the mux. +// See docs/learn/config.md#health-endpoints for health endpoint documentation. func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRPC, cfg config.Config, bestKnownHeightProvider BestKnownHeightProvider, logger zerolog.Logger) { - // Liveness endpoint - checks if the service process is alive and responsive - // A failing liveness check should result in killing/restarting the process - // This endpoint should NOT check business logic (like block production or sync status) - // - // See docs/learn/config.md#healthlive---liveness-probe for details + // /health/live - Liveness probe: checks if process is alive and responsive + // Should NOT check business logic (block production, sync status, etc.) mux.HandleFunc("/health/live", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") - // Basic liveness check: Can we access the store? - // This verifies the process is alive and core dependencies are accessible _, err := s.Height(r.Context()) if err != nil { logger.Error().Err(err).Msg("Liveness check failed: cannot access store") @@ -43,20 +29,15 @@ func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRP return } - // Process is alive and responsive w.WriteHeader(http.StatusOK) fmt.Fprintln(w, "OK") }) - // Readiness endpoint - checks if the node can serve correct data to clients - // A failing readiness check should result in removing the node from load balancer - // but NOT killing the process (e.g., node is syncing, no peers, etc.) 
- // - // See docs/learn/config.md#healthready---readiness-probe for details + // /health/ready - Readiness probe: checks if node can serve correct data + // Failing readiness removes node from load balancer but doesn't kill process mux.HandleFunc("/health/ready", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") - // P2P readiness: if P2P is enabled, verify it's ready to accept connections if pm != nil { netInfo, err := pm.GetNetworkInfo() if err != nil { @@ -68,7 +49,6 @@ func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRP return } - // Peer readiness: non-aggregator nodes should have at least 1 peer if !cfg.Node.Aggregator { peers, err := pm.GetPeers() if err != nil { @@ -82,7 +62,6 @@ func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRP } } - // Get current state state, err := s.GetState(r.Context()) if err != nil { http.Error(w, "UNREADY: state unavailable", http.StatusServiceUnavailable) @@ -90,14 +69,11 @@ func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRP } localHeight := state.LastBlockHeight - - // If no blocks yet, consider unready if localHeight == 0 { http.Error(w, "UNREADY: no blocks yet", http.StatusServiceUnavailable) return } - // Aggregator block production check: verify blocks are being produced at expected rate if cfg.Node.Aggregator { timeSinceLastBlock := time.Since(state.LastBlockTime) maxAllowedDelay := 5 * cfg.Node.BlockTime.Duration @@ -108,7 +84,6 @@ func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRP } } - // Require best-known height to make the readiness decision if bestKnownHeightProvider == nil { http.Error(w, "UNREADY: best-known height unavailable", http.StatusServiceUnavailable) return @@ -122,7 +97,6 @@ func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRP allowedBlocksBehind := cfg.Node.ReadinessMaxBlocksBehind if bestKnownHeight <= localHeight { - // local is ahead of our observed best-known consider ready w.WriteHeader(http.StatusOK) fmt.Fprintln(w, "READY") return diff --git a/pkg/rpc/server/http_test.go b/pkg/rpc/server/http_test.go index c259012692..9f1a5c7419 100644 --- a/pkg/rpc/server/http_test.go +++ b/pkg/rpc/server/http_test.go @@ -14,36 +14,27 @@ import ( ) func TestRegisterCustomHTTPEndpoints(t *testing.T) { - // Create a new ServeMux mux := http.NewServeMux() logger := zerolog.Nop() - // Create mock store mockStore := mocks.NewMockStore(t) mockStore.On("Height", mock.Anything).Return(uint64(100), nil) - // Register custom HTTP endpoints RegisterCustomHTTPEndpoints(mux, mockStore, nil, config.DefaultConfig(), nil, logger) - // Create a new HTTP test server with the mux testServer := httptest.NewServer(mux) defer testServer.Close() - // Make an HTTP GET request to the /health/live endpoint resp, err := http.Get(testServer.URL + "/health/live") assert.NoError(t, err) defer resp.Body.Close() - // Check the status code assert.Equal(t, http.StatusOK, resp.StatusCode) - // Read the response body body, err := io.ReadAll(resp.Body) assert.NoError(t, err) - // Check the response body content - assert.Equal(t, "OK\n", string(body)) // fmt.Fprintln adds a newline + assert.Equal(t, "OK\n", string(body)) - // Verify mock expectations mockStore.AssertExpectations(t) } diff --git a/test/e2e/sut_helper.go b/test/e2e/sut_helper.go index 302cfe4b77..9071bf5d93 100644 --- a/test/e2e/sut_helper.go +++ b/test/e2e/sut_helper.go @@ -8,6 +8,7 @@ import ( "io" "iter" "maps" + 
"net/http" "os" "os/exec" "path/filepath" @@ -98,20 +99,19 @@ func (s *SystemUnderTest) ExecCmd(cmd string, args ...string) { func (s *SystemUnderTest) AwaitNodeUp(t *testing.T, rpcAddr string, timeout time.Duration) { t.Helper() t.Logf("Await node is up: %s", rpcAddr) - ctx, done := context.WithTimeout(context.Background(), timeout) - defer done() require.EventuallyWithT(t, func(t *assert.CollectT) { c := client.NewClient(rpcAddr) require.NotNil(t, c) - // Check liveness: is the process alive? - _, err := c.GetHealth(ctx) + resp, err := http.Get(rpcAddr + "/health/live") require.NoError(t, err, "liveness check failed") + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "liveness check failed") - // Check readiness: is the node ready to serve traffic? - readiness, err := c.GetReadiness(ctx) + resp, err = http.Get(rpcAddr + "/health/ready") require.NoError(t, err, "readiness check failed") - require.Equal(t, client.ReadinessStatus_READY, readiness, "node is not ready") + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "node is not ready") }, timeout, min(timeout/10, 200*time.Millisecond), "node is not up") } @@ -121,15 +121,14 @@ func (s *SystemUnderTest) AwaitNodeUp(t *testing.T, rpcAddr string, timeout time func (s *SystemUnderTest) AwaitNodeLive(t *testing.T, rpcAddr string, timeout time.Duration) { t.Helper() t.Logf("Await node is live: %s", rpcAddr) - ctx, done := context.WithTimeout(context.Background(), timeout) - defer done() require.EventuallyWithT(t, func(t *assert.CollectT) { c := client.NewClient(rpcAddr) require.NotNil(t, c) - // Check liveness only: is the process alive? - _, err := c.GetHealth(ctx) + resp, err := http.Get(rpcAddr + "/health/live") require.NoError(t, err, "liveness check failed") + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "liveness check failed") }, timeout, min(timeout/10, 200*time.Millisecond), "node is not live") } From 8458fa098fb75139627511e22983131855b68b7d Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Thu, 6 Nov 2025 12:40:21 +0100 Subject: [PATCH 19/21] feat: enhance health check system with separate liveness and readiness endpoints, update documentation and remove deprecated health client --- CHANGELOG.md | 17 +--- buf.yaml | 2 - client/crates/client/examples/basic.rs | 27 +----- client/crates/client/src/health.rs | 100 --------------------- client/crates/client/src/lib.rs | 17 ++-- docs/learn/config.md | 119 +++---------------------- pkg/rpc/server/http.go | 7 +- test/e2e/sut_helper.go | 9 -- 8 files changed, 22 insertions(+), 276 deletions(-) delete mode 100644 client/crates/client/src/health.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 667c2acb2f..c5eeb0e2f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,28 +11,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Added comprehensive health endpoint documentation in `docs/learn/config.md#health-endpoints` explaining liveness vs readiness checks, Kubernetes probe configuration, and usage examples ([#2800](https://github.com/evstack/ev-node/pull/2800)) -- Added P2P listening check to `/health/ready` endpoint to verify P2P network is ready to accept connections ([#2800](https://github.com/evstack/ev-node/pull/2800)) -- Added aggregator block production rate check to `/health/ready` endpoint to ensure aggregators are producing blocks within expected timeframe (5x block time) ([#2800](https://github.com/evstack/ev-node/pull/2800)) -- Added `readyz()` and 
`is_ready()` methods to Rust `HealthClient` for checking `/health/ready` endpoint ([#2800](https://github.com/evstack/ev-node/pull/2800)) -- Added `ReadinessStatus` enum to Rust client with Ready/Unready/Unknown states ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- Enhanced health check system with separate liveness (`/health/live`) and readiness (`/health/ready`) HTTP endpoints. Readiness endpoint includes P2P listening check and aggregator block production rate validation (5x block time threshold). ([#2800](https://github.com/evstack/ev-node/pull/2800)) ### Changed - Use cache instead of in memory store for reaper. Persist cache on reload. Autoclean after 24 hours. ([#2811](https://github.com/evstack/ev-node/pull/2811)) -- Simplified `/health/live` endpoint to only check store accessibility (liveness) instead of business logic, following Kubernetes best practices ([#2800](https://github.com/evstack/ev-node/pull/2800)) -- Updated `/health/ready` endpoint to use `GetState()` instead of `Height()` to access block production timing information ([#2800](https://github.com/evstack/ev-node/pull/2800)) -- Updated E2E test helper `AwaitNodeUp()` to check both liveness and readiness endpoints, ensuring nodes are not just alive but ready to serve traffic ([#2800](https://github.com/evstack/ev-node/pull/2800)) -- Renamed integration test from `TestHealthEndpointWhenBlockProductionStops` to `TestReadinessEndpointWhenBlockProductionStops` to correctly test readiness endpoint ([#2800](https://github.com/evstack/ev-node/pull/2800)) -- Updated Rust client example (`client/crates/client/examples/basic.rs`) to demonstrate both liveness and readiness checks ([#2800](https://github.com/evstack/ev-node/pull/2800)) ### Removed -- **BREAKING:** Removed `evnode.v1.HealthService` gRPC endpoint in favor of HTTP health endpoints ([#2800](https://github.com/evstack/ev-node/pull/2800)) - - Migration: Use `GET /health/live` instead of `HealthService.Livez()` gRPC call - - See migration guide: `docs/learn/config.md#health-endpoints` - - Affected clients: Rust client (`client/crates/client`) and any external services using the gRPC health endpoint -- Removed `proto/evnode/v1/health.proto` and generated protobuf files ([#2800](https://github.com/evstack/ev-node/pull/2800)) +- **BREAKING:** Removed `evnode.v1.HealthService` gRPC endpoint. Use HTTP endpoints: `GET /health/live` and `GET /health/ready`. 
([#2800](https://github.com/evstack/ev-node/pull/2800)) ## v1.0.0-beta.9 diff --git a/buf.yaml b/buf.yaml index 940e2630f5..4488fbf4d7 100644 --- a/buf.yaml +++ b/buf.yaml @@ -15,6 +15,4 @@ breaking: use: - FILE ignore: - # health.proto was intentionally removed - health checks migrated to HTTP endpoints - # See: docs/learn/config.md#health-endpoints - evnode/v1/health.proto diff --git a/client/crates/client/examples/basic.rs b/client/crates/client/examples/basic.rs index 7617577819..fc3114b96c 100644 --- a/client/crates/client/examples/basic.rs +++ b/client/crates/client/examples/basic.rs @@ -1,12 +1,10 @@ -use ev_client::{health::HealthClient, Client, P2PClient, StoreClient}; +use ev_client::{Client, P2PClient, StoreClient}; use std::error::Error; #[tokio::main] async fn main() -> Result<(), Box> { - // Initialize tracing for better debugging tracing_subscriber::fmt::init(); - // Connect to a Evolve node let endpoint = std::env::var("EVOLVE_ENDPOINT").unwrap_or_else(|_| "http://localhost:50051".to_string()); println!("Connecting to evolve node at: {endpoint}"); @@ -14,29 +12,6 @@ async fn main() -> Result<(), Box> { let client = Client::connect(&endpoint).await?; println!("Successfully connected to evolve node"); - // Check health status (HTTP endpoints) - println!("\n=== Health Check ==="); - let health = HealthClient::with_base_url(endpoint.clone()); - - // Liveness check - is the process alive? - match health.livez().await { - Ok(status) => { - println!("Liveness status: {:?}", status); - println!("Node is alive: {}", health.is_healthy().await?); - } - Err(e) => println!("Failed to get liveness status: {e}"), - } - - // Readiness check - can it serve correct data? - match health.readyz().await { - Ok(status) => { - println!("Readiness status: {:?}", status); - println!("Node is ready: {}", health.is_ready().await?); - } - Err(e) => println!("Failed to get readiness status: {e}"), - } - - // Get P2P information println!("\n=== P2P Information ==="); let p2p = P2PClient::new(&client); match p2p.get_net_info().await { diff --git a/client/crates/client/src/health.rs b/client/crates/client/src/health.rs deleted file mode 100644 index e60e1462af..0000000000 --- a/client/crates/client/src/health.rs +++ /dev/null @@ -1,100 +0,0 @@ -use crate::{client::Client, error::Result}; - -/// Health status of the node (liveness check) -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum HealthStatus { - /// Node is operating normally - Pass, - /// Node is degraded but still serving - Warn, - /// Node has failed health checks - Fail, - /// Unknown health status - Unknown, -} - -/// Readiness status of the node (can serve correct data) -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum ReadinessStatus { - /// Node is ready to serve traffic - Ready, - /// Node is not ready to serve traffic - Unready, - /// Unknown readiness status - Unknown, -} - -pub struct HealthClient { - base_url: String, - http_client: reqwest::Client, -} - -impl HealthClient { - /// Create a new HealthClient from a Client - /// - /// Note: The base_url should be the HTTP endpoint (e.g., "http://localhost:9090") - pub fn new(_client: &Client) -> Self { - // For now, we'll need to construct the base URL from the client - // This is a workaround since we're mixing gRPC and HTTP endpoints - // TODO: Consider adding a method to Client to get the base URL - Self::with_base_url("http://localhost:9090".to_string()) - } - - /// Create a new HealthClient with an explicit base URL - pub fn with_base_url(base_url: String) -> Self { - 
Self { - base_url: base_url.trim_end_matches('/').to_string(), - http_client: reqwest::Client::new(), - } - } - - /// Check if the node is alive and get its health status (liveness check) - /// - /// This endpoint checks if the process is alive and responsive. - /// A failing liveness check should result in killing/restarting the process. - pub async fn livez(&self) -> Result { - let url = format!("{}/health/live", self.base_url); - let response = self.http_client.get(&url).send().await?; - - let status_text = response.text().await?.trim().to_string(); - - match status_text.as_str() { - "OK" => Ok(HealthStatus::Pass), - "WARN" => Ok(HealthStatus::Warn), - "FAIL" => Ok(HealthStatus::Fail), - _ => Ok(HealthStatus::Unknown), - } - } - - /// Check if the node is ready to serve correct data (readiness check) - /// - /// This endpoint checks if the node can serve correct data to clients. - /// A failing readiness check should remove the node from load balancer - /// but NOT kill the process. - pub async fn readyz(&self) -> Result { - let url = format!("{}/health/ready", self.base_url); - let response = self.http_client.get(&url).send().await?; - - let status_text = response.text().await?.trim().to_string(); - - if status_text.starts_with("READY") { - Ok(ReadinessStatus::Ready) - } else if status_text.starts_with("UNREADY") { - Ok(ReadinessStatus::Unready) - } else { - Ok(ReadinessStatus::Unknown) - } - } - - /// Check if the node is healthy (liveness status is PASS) - pub async fn is_healthy(&self) -> Result { - let status = self.livez().await?; - Ok(status == HealthStatus::Pass) - } - - /// Check if the node is ready (readiness status is READY) - pub async fn is_ready(&self) -> Result { - let status = self.readyz().await?; - Ok(status == ReadinessStatus::Ready) - } -} diff --git a/client/crates/client/src/lib.rs b/client/crates/client/src/lib.rs index 7d6771ae9d..1c2467e342 100644 --- a/client/crates/client/src/lib.rs +++ b/client/crates/client/src/lib.rs @@ -5,24 +5,23 @@ //! # Example //! //! ```no_run -//! use ev_client::{Client, HealthClient, ConfigClient}; +//! use ev_client::{Client, ConfigClient}; //! //! #[tokio::main] //! async fn main() -> Result<(), Box> { //! // Connect to a Evolve node //! let client = Client::connect("http://localhost:50051").await?; -//! -//! // Check health -//! let health = HealthClient::new(&client); -//! let is_healthy = health.is_healthy().await?; -//! println!("Node healthy: {}", is_healthy); -//! +//! //! // Get namespace configuration //! let config = ConfigClient::new(&client); //! let namespace = config.get_namespace().await?; //! println!("Header namespace: {}", namespace.header_namespace); //! println!("Data namespace: {}", namespace.data_namespace); -//! +//! +//! // For health checks, use HTTP endpoints directly: +//! // curl http://localhost:9090/health/live +//! // curl http://localhost:9090/health/ready +//! //! Ok(()) //! } //! 
``` @@ -79,7 +78,6 @@ pub mod client; pub mod config; pub mod error; -pub mod health; pub mod p2p; pub mod signer; pub mod store; @@ -88,7 +86,6 @@ pub mod store; pub use client::{Client, ClientBuilder}; pub use config::ConfigClient; pub use error::{ClientError, Result}; -pub use health::{HealthClient, HealthStatus, ReadinessStatus}; pub use p2p::P2PClient; pub use signer::SignerClient; pub use store::StoreClient; diff --git a/docs/learn/config.md b/docs/learn/config.md index b308a4b1c9..0321b12e77 100644 --- a/docs/learn/config.md +++ b/docs/learn/config.md @@ -609,132 +609,33 @@ See the [DA Visualizer Guide](../guides/da/visualizer.md) for detailed informati ### Health Endpoints -Evolve exposes two HTTP health check endpoints that serve different purposes in production environments. These endpoints are automatically available on the RPC server and follow Kubernetes health check best practices. +#### `/health/live` -#### `/health/live` - Liveness Probe +Returns `200 OK` if the process is alive and can access the store. -**Purpose:** Determines if the node process is alive and responsive. - -**What it checks:** -- Store accessibility (can the process access its database?) - -**Typical usage:** -- Kubernetes liveness probes -- Process monitoring systems -- Container orchestration health checks - -**Response:** -- `200 OK` with body `OK` - Process is alive and responsive -- `503 Service Unavailable` with body `FAIL` - Process is dead or unresponsive - -**Failure action:** If this endpoint fails, the process should be **killed and restarted**. A failing liveness check indicates the process is in an unrecoverable state (e.g., database connection lost, deadlock, etc.). - -**Example:** ```bash curl http://localhost:7331/health/live -# Response: OK (HTTP 200) -``` - -**Kubernetes liveness probe configuration:** -```yaml -livenessProbe: - httpGet: - path: /health/live - port: 7331 - initialDelaySeconds: 10 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 3 ``` ---- - -#### `/health/ready` - Readiness Probe - -**Purpose:** Determines if the node can serve correct data to clients. - -**What it checks:** - -**For all nodes:** -1. **P2P listening** (if P2P enabled): P2P network is accepting connections -2. **Store accessible**: Can read state from the database -3. **Has blocks**: Node has synced at least one block (height > 0) -4. **Best-known height available**: Can determine network height -5. **Sync status**: Node is not too far behind the network (within `readiness_max_blocks_behind`) - -**Additional checks for non-aggregators:** -6. **Has peers**: Connected to at least one peer (for receiving blocks) - -**Additional checks for aggregators:** -7. **Block production rate**: Producing blocks at expected rate (within 5x `block_time`) - -**Typical usage:** -- Kubernetes readiness probes -- Load balancer health checks -- Service mesh routing decisions +#### `/health/ready` -**Response:** -- `200 OK` with body `READY` - Node can serve correct data -- `503 Service Unavailable` with body `UNREADY: ` - Node should not receive traffic +Returns `200 OK` if the node can serve correct data. Checks: +- P2P is listening (if enabled) +- Has synced blocks +- Not too far behind network +- Non-aggregators: has peers +- Aggregators: producing blocks at expected rate -**Failure action:** If this endpoint fails, the node should be **removed from the load balancer** but **NOT killed**. The process is alive but temporarily unable to serve correct data (e.g., syncing, no peers, behind network head). 
- -**Example:** ```bash curl http://localhost:7331/health/ready -# Response: READY (HTTP 200) -# or -# Response: UNREADY: behind best-known head (HTTP 503) -``` - -**Kubernetes readiness probe configuration:** -```yaml -readinessProbe: - httpGet: - path: /health/ready - port: 7331 - initialDelaySeconds: 30 - periodSeconds: 10 - timeoutSeconds: 5 - failureThreshold: 3 ``` ---- - -#### Key Differences: Liveness vs Readiness - -| Aspect | Liveness (`/health/live`) | Readiness (`/health/ready`) | -|--------|---------------------------|----------------------------| -| **Purpose** | Is the process alive? | Can it serve correct data? | -| **Checks** | Infrastructure (store accessible) | Business logic (synced, peers, block production) | -| **Failure means** | Process is broken/deadlocked | Temporarily unable to serve | -| **Action on failure** | **Kill and restart** process | **Remove from load balancer** | -| **Check frequency** | Less frequent (every 10-30s) | More frequent (every 5-10s) | -| **Example failure** | Database corruption, deadlock | Syncing from genesis, no peers | - -**Important:** A node can be **live but not ready**. For example, a newly started full node is alive (process running, database accessible) but not ready (still syncing blocks from peers). In this state: -- `/health/live` returns `200 OK` (don't kill the process) -- `/health/ready` returns `503 UNREADY: behind best-known head` (don't route traffic yet) - -#### Configuration - -**Readiness max blocks behind:** - -The readiness endpoint uses the `readiness_max_blocks_behind` configuration to determine if a node is too far behind the network. - -**YAML:** +Configure max blocks behind: ```yaml node: readiness_max_blocks_behind: 15 ``` -**Command-line Flag:** -`--rollkit.node.readiness_max_blocks_behind ` -_Example:_ `--rollkit.node.readiness_max_blocks_behind 20` -_Default:_ `15` - -This value determines how many blocks behind the best-known network height a node can be before being considered unready. Lower values ensure tighter consistency but may cause nodes to be marked unready more frequently during network hiccups. - ## Instrumentation Configuration (`instrumentation`) Settings for enabling and configuring metrics and profiling endpoints, useful for monitoring node performance and debugging. diff --git a/pkg/rpc/server/http.go b/pkg/rpc/server/http.go index 784a789c57..4feea0fa89 100644 --- a/pkg/rpc/server/http.go +++ b/pkg/rpc/server/http.go @@ -15,10 +15,8 @@ import ( type BestKnownHeightProvider func() uint64 // RegisterCustomHTTPEndpoints registers custom HTTP handlers on the mux. -// See docs/learn/config.md#health-endpoints for health endpoint documentation. func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRPC, cfg config.Config, bestKnownHeightProvider BestKnownHeightProvider, logger zerolog.Logger) { - // /health/live - Liveness probe: checks if process is alive and responsive - // Should NOT check business logic (block production, sync status, etc.) 
+ // /health/live checks if the process is alive and responsive mux.HandleFunc("/health/live", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") @@ -33,8 +31,7 @@ func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRP fmt.Fprintln(w, "OK") }) - // /health/ready - Readiness probe: checks if node can serve correct data - // Failing readiness removes node from load balancer but doesn't kill process + // /health/ready checks if the node can serve correct data mux.HandleFunc("/health/ready", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") diff --git a/test/e2e/sut_helper.go b/test/e2e/sut_helper.go index 9071bf5d93..fca77a122c 100644 --- a/test/e2e/sut_helper.go +++ b/test/e2e/sut_helper.go @@ -95,14 +95,10 @@ func (s *SystemUnderTest) ExecCmd(cmd string, args ...string) { } // AwaitNodeUp waits until a node is operational by checking both liveness and readiness. -// This verifies the process is alive (liveness) and ready to serve traffic (readiness). func (s *SystemUnderTest) AwaitNodeUp(t *testing.T, rpcAddr string, timeout time.Duration) { t.Helper() t.Logf("Await node is up: %s", rpcAddr) require.EventuallyWithT(t, func(t *assert.CollectT) { - c := client.NewClient(rpcAddr) - require.NotNil(t, c) - resp, err := http.Get(rpcAddr + "/health/live") require.NoError(t, err, "liveness check failed") defer resp.Body.Close() @@ -116,15 +112,10 @@ func (s *SystemUnderTest) AwaitNodeUp(t *testing.T, rpcAddr string, timeout time } // AwaitNodeLive waits until a node is alive (liveness check only). -// This only verifies the process is alive and responsive, not that it's ready to serve traffic. -// Use this for local tests where nodes may not have peers configured. 
func (s *SystemUnderTest) AwaitNodeLive(t *testing.T, rpcAddr string, timeout time.Duration) { t.Helper() t.Logf("Await node is live: %s", rpcAddr) require.EventuallyWithT(t, func(t *assert.CollectT) { - c := client.NewClient(rpcAddr) - require.NotNil(t, c) - resp, err := http.Get(rpcAddr + "/health/live") require.NoError(t, err, "liveness check failed") defer resp.Body.Close() From 9aab503a759121e9e362104915b8c8d23a92a437 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Thu, 6 Nov 2025 15:04:45 +0100 Subject: [PATCH 20/21] feat: update integration tests to use HTTP health checks and AwaitNodeLive for lazy mode scenarios --- node/single_sequencer_integration_test.go | 14 +++++++------- test/e2e/evm_test_common.go | 10 +++++++--- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/node/single_sequencer_integration_test.go b/node/single_sequencer_integration_test.go index 414a706af1..22b2fd4506 100644 --- a/node/single_sequencer_integration_test.go +++ b/node/single_sequencer_integration_test.go @@ -6,6 +6,7 @@ import ( "context" "errors" "fmt" + "net/http" "sync" "testing" "time" @@ -16,7 +17,6 @@ import ( coreda "github.com/evstack/ev-node/core/da" coreexecutor "github.com/evstack/ev-node/core/execution" evconfig "github.com/evstack/ev-node/pkg/config" - "github.com/evstack/ev-node/pkg/rpc/client" ) // FullNodeTestSuite is a test suite for full node integration tests @@ -439,11 +439,10 @@ func TestReadinessEndpointWhenBlockProductionStops(t *testing.T) { waitForBlockN(t, 1, node, config.Node.BlockTime.Duration) - rpcClient := NewRPCClient(config.RPC.Address) - - readiness, err := rpcClient.GetReadiness(ctx) + resp, err := http.Get("http://" + config.RPC.Address + "/health/ready") require.NoError(err) - require.Equal(client.ReadinessStatus_READY, readiness, "Readiness should be READY while producing blocks") + require.Equal(http.StatusOK, resp.StatusCode, "Readiness should be READY while producing blocks") + resp.Body.Close() time.Sleep(time.Duration(config.Node.MaxPendingHeadersAndData+2) * config.Node.BlockTime.Duration) @@ -452,11 +451,12 @@ func TestReadinessEndpointWhenBlockProductionStops(t *testing.T) { require.LessOrEqual(height, config.Node.MaxPendingHeadersAndData) require.Eventually(func() bool { - readiness, err := rpcClient.GetReadiness(ctx) + resp, err := http.Get("http://" + config.RPC.Address + "/health/ready") if err != nil { return false } - return readiness == client.ReadinessStatus_UNREADY + defer resp.Body.Close() + return resp.StatusCode == http.StatusServiceUnavailable }, 10*time.Second, 100*time.Millisecond, "Readiness should be UNREADY after aggregator stops producing blocks (5x block time)") shutdownAndWait(t, []context.CancelFunc{cancel}, &runningWg, 10*time.Second) diff --git a/test/e2e/evm_test_common.go b/test/e2e/evm_test_common.go index 756d836c3d..116ece6757 100644 --- a/test/e2e/evm_test_common.go +++ b/test/e2e/evm_test_common.go @@ -367,7 +367,8 @@ func setupSequencerNodeLazy(t *testing.T, sut *SystemUnderTest, sequencerHome, j "--evm.eth-url", endpoints.GetSequencerEthURL(), } sut.ExecCmd(evmSingleBinaryPath, args...) - sut.AwaitNodeUp(t, endpoints.GetRollkitRPCAddress(), NodeStartupTimeout) + // Use AwaitNodeLive for lazy mode since the node won't be ready (producing blocks) immediately + sut.AwaitNodeLive(t, endpoints.GetRollkitRPCAddress(), NodeStartupTimeout) } // setupFullNode initializes and starts the full node with P2P connection to sequencer. 
@@ -421,7 +422,9 @@ func setupFullNode(t *testing.T, sut *SystemUnderTest, fullNodeHome, sequencerHo "--rollkit.p2p.listen_address", endpoints.GetFullNodeP2PAddress(), } sut.ExecCmd(evmSingleBinaryPath, args...) - sut.AwaitNodeUp(t, endpoints.GetFullNodeRPCAddress(), NodeStartupTimeout) + // Use AwaitNodeLive instead of AwaitNodeUp because in lazy mode scenarios, + // the full node may not become ready until the sequencer produces blocks + sut.AwaitNodeLive(t, endpoints.GetFullNodeRPCAddress(), NodeStartupTimeout) } // Global nonce counter to ensure unique nonces across multiple transaction submissions @@ -676,7 +679,8 @@ func restartDAAndSequencerLazy(t *testing.T, sut *SystemUnderTest, sequencerHome time.Sleep(SlowPollingInterval) - sut.AwaitNodeUp(t, endpoints.GetRollkitRPCAddress(), NodeStartupTimeout) + // Use AwaitNodeLive for lazy mode since the node won't be ready (producing blocks) immediately + sut.AwaitNodeLive(t, endpoints.GetRollkitRPCAddress(), NodeStartupTimeout) } // restartSequencerNode starts an existing sequencer node without initialization. From 77cc822a1eab675bd747658747599b84c8dd0a29 Mon Sep 17 00:00:00 2001 From: Randy Grok Date: Thu, 6 Nov 2025 18:30:58 +0100 Subject: [PATCH 21/21] feat: refine health check endpoints and update related documentation; remove unused RPC client code and adjust e2e test parameters --- buf.yaml | 2 -- node/helpers_test.go | 6 ------ pkg/rpc/server/http.go | 22 ++++++++++++++++++++-- test/docker-e2e/go.mod | 1 + test/e2e/base_test.go | 5 +---- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/buf.yaml b/buf.yaml index 4488fbf4d7..bf7debf7cc 100644 --- a/buf.yaml +++ b/buf.yaml @@ -14,5 +14,3 @@ lint: breaking: use: - FILE - ignore: - - evnode/v1/health.proto diff --git a/node/helpers_test.go b/node/helpers_test.go index e9b2719fb8..06d789060f 100644 --- a/node/helpers_test.go +++ b/node/helpers_test.go @@ -24,7 +24,6 @@ import ( evconfig "github.com/evstack/ev-node/pkg/config" "github.com/evstack/ev-node/pkg/p2p" "github.com/evstack/ev-node/pkg/p2p/key" - rpcclient "github.com/evstack/ev-node/pkg/rpc/client" remote_signer "github.com/evstack/ev-node/pkg/signer/noop" "github.com/evstack/ev-node/types" ) @@ -318,8 +317,3 @@ func verifyNodesSynced(node1, syncingNode Node, source Source) error { return fmt.Errorf("nodes not synced: sequencer at height %v, syncing node at height %v", sequencerHeight, syncingHeight) }) } - -// NewRPCClient creates a new RPC client for testing -func NewRPCClient(address string) *rpcclient.Client { - return rpcclient.NewClient("http://" + address) -} diff --git a/pkg/rpc/server/http.go b/pkg/rpc/server/http.go index 4feea0fa89..47021af0f2 100644 --- a/pkg/rpc/server/http.go +++ b/pkg/rpc/server/http.go @@ -16,7 +16,9 @@ type BestKnownHeightProvider func() uint64 // RegisterCustomHTTPEndpoints registers custom HTTP handlers on the mux. func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRPC, cfg config.Config, bestKnownHeightProvider BestKnownHeightProvider, logger zerolog.Logger) { - // /health/live checks if the process is alive and responsive + // /health/live performs a basic liveness check to determine if the process is alive and responsive. + // Returns 200 if the process can access its store, 503 otherwise. + // This is a lightweight check suitable for Kubernetes liveness probes. 
mux.HandleFunc("/health/live", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") @@ -31,7 +33,23 @@ func RegisterCustomHTTPEndpoints(mux *http.ServeMux, s store.Store, pm p2p.P2PRP fmt.Fprintln(w, "OK") }) - // /health/ready checks if the node can serve correct data + // /health/ready performs a comprehensive readiness check to determine if the node can serve correct data. + // Returns 200 if all checks pass, 503 otherwise. + // Suitable for Kubernetes readiness probes and load balancer health checks. + // + // The following checks are performed: + // 1. P2P network connectivity (if P2P is enabled): + // - Verifies P2P network info is accessible + // - Confirms node is listening for P2P connections + // - For non-aggregator nodes: ensures at least one peer is connected + // 2. Block production/sync status: + // - Confirms node state is accessible + // - Verifies at least one block has been produced/synced + // 3. Aggregator-specific checks (for aggregator nodes only): + // - Validates blocks are being produced at expected rate (within 5x block_time) + // 4. Sync status (for all nodes): + // - Compares local height with best known network height + // - Ensures node is not falling behind by more than readiness_max_blocks_behind mux.HandleFunc("/health/ready", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain") diff --git a/test/docker-e2e/go.mod b/test/docker-e2e/go.mod index 020f6e2575..cc2625e48b 100644 --- a/test/docker-e2e/go.mod +++ b/test/docker-e2e/go.mod @@ -4,6 +4,7 @@ go 1.24.6 require ( cosmossdk.io/math v1.5.3 + // TODO(reviewer): Update to tagged release once tastora is merged and tagged github.com/celestiaorg/tastora v0.7.6-0.20251106081541-3ec2da2f1f7f github.com/ethereum/go-ethereum v1.16.6 github.com/evstack/ev-node/execution/evm v1.0.0-beta.3 diff --git a/test/e2e/base_test.go b/test/e2e/base_test.go index c5437ba136..7ab30a6e31 100644 --- a/test/e2e/base_test.go +++ b/test/e2e/base_test.go @@ -65,7 +65,6 @@ func TestBasic(t *testing.T) { require.NoError(t, err, "failed to init aggregator", output) // start aggregator - node1P2P := "/ip4/0.0.0.0/tcp/26656" sut.ExecCmd(binaryPath, "start", "--home="+node1Home, @@ -73,7 +72,6 @@ func TestBasic(t *testing.T) { "--evnode.signer.passphrase_file="+passphraseFile, "--evnode.node.block_time=5ms", "--evnode.da.block_time=15ms", - "--evnode.p2p.listen_address="+node1P2P, "--kv-endpoint=127.0.0.1:9090", ) @@ -102,11 +100,10 @@ func TestBasic(t *testing.T) { "--home="+node2Home, "--evnode.log.level=debug", "--evnode.p2p.listen_address="+node2P2P, - "--evnode.node.readiness_max_blocks_behind=100", // Allow more blocks behind during bootstrap fmt.Sprintf("--evnode.rpc.address=%s", node2RPC), ) - // For local e2e tests, only check liveness (not full readiness with peers) + // For local e2e tests, only check liveness since P2P discovery may take time sut.AwaitNodeLive(t, "http://"+node2RPC, 10*time.Second) t.Logf("Full node (node 2) is live.")
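For reference, here is a minimal sketch of how an external consumer might migrate off the removed `evnode.v1.HealthService` gRPC calls and poll the plain HTTP endpoints instead, following the same `http.Get` pattern the e2e helpers (`AwaitNodeUp`/`AwaitNodeLive`) use above. The base URL (port 7331, taken from the curl examples in the docs) and the `probe` helper name are illustrative assumptions, not code from the repository:

```go
// Sketch: polling the /health/live and /health/ready HTTP endpoints
// from an external service, assuming the node's RPC server listens
// on localhost:7331 (as in the documentation examples).
package main

import (
	"fmt"
	"io"
	"net/http"
	"time"
)

// probe issues a GET against baseURL+path and reports whether the
// endpoint returned 200 OK, along with the plain-text body
// ("OK", "READY", or an "UNREADY: <reason>" message).
func probe(baseURL, path string) (bool, string, error) {
	client := &http.Client{Timeout: 5 * time.Second}
	resp, err := client.Get(baseURL + path)
	if err != nil {
		return false, "", err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return false, "", err
	}
	return resp.StatusCode == http.StatusOK, string(body), nil
}

func main() {
	base := "http://localhost:7331" // assumed RPC address

	// Liveness: a failure here means the process should be restarted.
	live, body, err := probe(base, "/health/live")
	fmt.Printf("live=%v body=%q err=%v\n", live, body, err)

	// Readiness: a failure here means the node should be taken out of
	// rotation, but the process should be left running.
	ready, body, err := probe(base, "/health/ready")
	fmt.Printf("ready=%v body=%q err=%v\n", ready, body, err)
}
```

A 503 from `/health/ready` while `/health/live` returns 200 is the expected state for a node that is alive but still syncing or, for an aggregator, one that has stopped producing blocks past the 5x block-time threshold checked in the integration test above.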