Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions specifications/SPEC-UTILS-SERVICE.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,12 @@ health:
properties:
status:
type: string
enum: [UP, DOWN]
enum: [UP, DEGRADED, DOWN]
description: Service health status
reason:
type: string
nullable: true
description: Optional reason for status
description: Required reason for DOWN or DEGRADED status; must be null for UP
components:
type: object
description: Hierarchical component health
Expand Down
21 changes: 0 additions & 21 deletions specifications/SPEC_SYSTEM_SERVICE.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,27 +107,6 @@ _Note: For detailed implementation, refer to the source code in the `src/aignost

### 3.3 Data Schemas

**Health Status Schema:**

```yaml
Health:
type: object
properties:
status:
type: string
enum: [UP, DOWN]
description: "Overall system health status"
components:
type: object
description: "Health status of individual components"
additionalProperties:
$ref: "#/definitions/Health"
reason:
type: string
description: "Reason for DOWN status, null for UP"
required: [status]
```

**System Info Schema:**

```yaml
Expand Down
53 changes: 40 additions & 13 deletions src/aignostics/platform/_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,10 +192,47 @@ def info(self, mask_secrets: bool = True) -> dict[str, Any]:
else None,
}

@staticmethod
def _health_from_response(response: urllib3.BaseHTTPResponse) -> Health:
    """Map a PAPI health response to a Health status.

    Handles non-200 status codes, unparseable or non-object bodies, and the three
    recognised ``status`` values (``"UP"``, ``"DEGRADED"``, ``"DOWN"``). Any other
    ``status`` value (including a missing one) is reported as ``DOWN`` with a
    reason naming the unexpected value.

    Args:
        response: urllib3 response from the ``/health`` endpoint.

    Returns:
        Health: ``UP``, ``DEGRADED``, or ``DOWN`` derived from the response.
    """
    if response.status != HTTPStatus.OK:
        logger.error("Aignostics Platform API returned '{}'", response.status)
        return Health(
            status=Health.Code.DOWN, reason=f"Aignostics Platform API returned status '{response.status}'"
        )

    unparseable = "Aignostics Platform API returned unparseable response"
    try:
        body = json.loads(response.data)
    except Exception:
        # Deliberately broad: a health probe must yield a Health, never raise,
        # whatever the payload looks like.
        return Health(status=Health.Code.DOWN, reason=unparseable)

    # Valid JSON that is not an object (list, string, number, null) has no
    # ``status`` key to read; treat it like any other unparseable payload
    # instead of failing on ``body.get``.
    if not isinstance(body, dict):
        return Health(status=Health.Code.DOWN, reason=unparseable)

    api_status = body.get("status")
    if api_status == "UP":
        return Health(status=Health.Code.UP)
    if api_status == "DEGRADED":
        # str() guards against a non-string ``reason`` in the payload, which the
        # Health model (reason: str | None) would otherwise reject.
        reason = str(body.get("reason") or "Aignostics Platform API is DEGRADED")
        logger.warning("Aignostics Platform API is DEGRADED: {}", reason)
        return Health(status=Health.Code.DEGRADED, reason=reason)
    if api_status == "DOWN":
        # DOWN is a recognised value: surface the API's own reason rather than
        # reporting it as an "unknown status".
        reason = str(body.get("reason") or "Aignostics Platform API is DOWN")
        logger.error("Aignostics Platform API is DOWN: {}", reason)
        return Health(status=Health.Code.DOWN, reason=reason)
    return Health(
        status=Health.Code.DOWN,
        reason=f"Aignostics Platform API returned unknown status '{api_status}'",
    )

def _determine_api_public_health(self) -> Health:
"""Determine healthiness and reachability of Aignostics Platform API.

- Checks if health endpoint is reachable and returns 200 OK
- Parses the response body to detect DEGRADED status
- Uses urllib3 for a direct connection check without authentication

Returns:
Expand All @@ -209,23 +246,17 @@ def _determine_api_public_health(self) -> Health:
headers={"User-Agent": user_agent()},
timeout=urllib3.Timeout(total=self._settings.health_timeout),
)

if response.status != HTTPStatus.OK:
logger.error("Aignostics Platform API (public) returned '{}'", response.status)
return Health(
status=Health.Code.DOWN, reason=f"Aignostics Platform API returned status '{response.status}'"
)
return self._health_from_response(response)
except Exception as e:
logger.exception("Issue with Aignostics Platform API")
return Health(status=Health.Code.DOWN, reason=f"Issue with Aignostics Platform API: '{e}'")

return Health(status=Health.Code.UP)

def _determine_api_authenticated_health(self) -> Health:
"""Determine healthiness and reachability of Aignostics Platform API via authenticated request.

Uses a dedicated HTTP pool (separate from the API client's connection pool) to prevent
connection-level cross-contamination between health checks and API calls.
Parses the response body to detect DEGRADED status.

Returns:
Health: The healthiness of the Aignostics Platform API when trying to reach via authenticated request.
Expand All @@ -242,14 +273,10 @@ def _determine_api_authenticated_health(self) -> Health:
},
timeout=urllib3.Timeout(total=self._settings.health_timeout),
)

if response.status != HTTPStatus.OK:
logger.error("Aignostics Platform API (authenticated) returned '{}'", response.status)
return Health(status=Health.Code.DOWN, reason=f"Aignostics Platform API returned '{response.status}'")
return self._health_from_response(response)
except Exception as e:
logger.exception("Issue with Aignostics Platform API")
return Health(status=Health.Code.DOWN, reason=f"Issue with Aignostics Platform API: '{e}'")
return Health(status=Health.Code.UP)

def health(self) -> Health:
"""Determine health of this service.
Expand Down
12 changes: 5 additions & 7 deletions src/aignostics/system/CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,9 @@ def health(self) -> Health:
)

# Determine overall status based on ALL modules
overall = Health.Code.UP if all(
c.status == Health.Code.UP for c in components.values()
) else Health.Code.DOWN

return Health(status=overall, components=components)
# Priority: DOWN > DEGRADED > UP
# compute_health_from_components() handles this automatically
return Health(status=Health.Code.UP, components=components)
```

### Exception Hierarchy (`_exceptions.py`)
Expand Down Expand Up @@ -275,7 +273,7 @@ print(f"System status: {health.status}")

# Check specific component
platform_health = health.components.get("platform")
if platform_health.status != Health.Code.UP:
if not platform_health:  # A Health object is falsy only when DOWN; UP and DEGRADED are both truthy
print(f"Platform issue: {platform_health.reason}")
```

Expand Down Expand Up @@ -453,7 +451,7 @@ def test_health_aggregation():
service = Service()
health = service.health()

assert health.status in [Health.Code.UP, Health.Code.DOWN]
assert health.status in [Health.Code.UP, Health.Code.DEGRADED, Health.Code.DOWN]
assert "platform" in health.components
assert isinstance(health.components, dict)

Expand Down
3 changes: 2 additions & 1 deletion src/aignostics/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
)
from ._di import discover_plugin_packages, load_modules, locate_implementations, locate_subclasses
from ._fs import get_user_data_directory, open_user_data_directory, sanitize_path, sanitize_path_component
from ._health import Health
from ._health import Health, HealthStatus
from ._log import LogSettings
from ._mcp import MCP_SERVER_NAME, MCP_TRANSPORT, mcp_create_server, mcp_discover_servers, mcp_list_tools, mcp_run
from ._nav import BaseNavBuilder, NavGroup, NavItem, gui_get_nav_groups
Expand All @@ -42,6 +42,7 @@
"BaseNavBuilder",
"BaseService",
"Health",
"HealthStatus",
"LogSettings",
"NavGroup",
"NavItem",
Expand Down
83 changes: 53 additions & 30 deletions src/aignostics/utils/_health.py
Original file line number Diff line number Diff line change
@@ -1,69 +1,92 @@
"""Health models and status definitions for service health checks."""

from enum import StrEnum
from typing import ClassVar, Self
from typing import Any, ClassVar, Self

from pydantic import BaseModel, Field, model_validator


class _HealthStatus(StrEnum):
class HealthStatus(StrEnum):
"""Health status enumeration for service health checks.

Values:
UP: Service is operating normally
DEGRADED: Service is operational but with reduced functionality
DOWN: Service is not operational
"""

UP = "UP"
DEGRADED = "DEGRADED"
DOWN = "DOWN"


class Health(BaseModel):
"""Represents the health status of a service with optional components and failure reasons.

- A health object can have child components, i.e. health forms a tree.
- Any node in the tree can set itself to DOWN. In this case the node is required
to set the reason attribute. If reason is not set when DOWN,
automatic model validation of the tree will fail.
- DOWN'ness is propagated to parent health objects. I.e. the health of a parent
node is automatically set to DOWN if any of its child components are DOWN. The
child components leading to this will be listed in the reason.
- The root of the health tree is computed in the system module. The health of other
modules is automatically picked up by the system module.
- Any node in the tree can set itself to DOWN or DEGRADED. If DOWN or DEGRADED, the node
is required to set the reason attribute. If reason is not set when DOWN or DEGRADED,
automatic model validation fails.
- DOWN trumps DEGRADED, DEGRADED trumps UP. If any child is DOWN, parent is DOWN.
If none are DOWN but any are DEGRADED, parent is DEGRADED.
- The root of the health tree is computed in the system module.
The health of other modules is automatically picked up by the system module.
"""

Code: ClassVar[type[_HealthStatus]] = _HealthStatus
status: _HealthStatus
Code: ClassVar[type[HealthStatus]] = HealthStatus
status: HealthStatus
reason: str | None = None
components: dict[str, "Health"] = Field(default_factory=dict)
uptime_statistics: dict[str, dict[str, Any]] | None = None # Optional uptime stats

def compute_health_from_components(self) -> Self:
"""Recursively compute health status from components.

- If health is already DOWN, it remains DOWN with its original reason.
- If health is UP but any component is DOWN, health becomes DOWN with
a reason listing all failed components.
- If no components are DOWN but any are DEGRADED, health becomes DEGRADED with a reason.

Returns:
Self: The updated health instance with computed status.
"""
# Skip recomputation if already known to be DOWN
if self.status == _HealthStatus.DOWN:
if self.status == HealthStatus.DOWN:
return self

# No components means we keep the existing status
if not self.components:
return self

# Find all DOWN components
# Find all DOWN and DEGRADED components
down_components = []
degraded_components = []
for component_name, component in self.components.items():
# Recursively compute health for each component
component.compute_health_from_components()
if component.status == _HealthStatus.DOWN:
down_components.append(component_name)
if component.status == HealthStatus.DOWN:
down_components.append((component_name, component.reason))
elif component.status == HealthStatus.DEGRADED:
degraded_components.append((component_name, component.reason))

# If any components are DOWN, mark the parent as DOWN
if down_components:
self.status = _HealthStatus.DOWN
self.status = HealthStatus.DOWN
if len(down_components) == 1:
self.reason = f"Component '{down_components[0]}' is DOWN"
component_name, component_reason = down_components[0]
self.reason = f"Component '{component_name}' is DOWN ({component_reason})"
else:
component_list = ", ".join(f"'{name}' ({reason})" for name, reason in down_components)
self.reason = f"Components {component_list} are DOWN"
# If no components are DOWN but any are DEGRADED, mark parent as DEGRADED
elif degraded_components:
self.status = HealthStatus.DEGRADED
if len(degraded_components) == 1:
component_name, component_reason = degraded_components[0]
self.reason = f"Component '{component_name}' is DEGRADED ({component_reason})"
else:
component_list = "', '".join(down_components)
self.reason = f"Components '{component_list}' are DOWN"
component_list = ", ".join(f"'{name}' ({reason})" for name, reason in degraded_components)
self.reason = f"Components {component_list} are DEGRADED"

return self

Expand All @@ -73,7 +96,7 @@ def validate_health_state(self) -> Self:

- Compute overall health based on component health
- Ensure UP status has no associated reason
- Ensure DOWN status always has a reason
- Ensure DOWN and DEGRADED status always have a reason

Returns:
Self: The validated model instance with correct health status.
Expand All @@ -85,31 +108,31 @@ def validate_health_state(self) -> Self:
self.compute_health_from_components()

# Validate that UP status has no reason
if (self.status == _HealthStatus.UP) and self.reason:
if (self.status == HealthStatus.UP) and self.reason:
msg = f"Health {self.status} must not have reason"
raise ValueError(msg)

# Validate that DOWN status always has a reason
if (self.status == _HealthStatus.DOWN) and not self.reason:
msg = "Health DOWN must have a reason"
# Validate that DOWN and DEGRADED status always have a reason
if (self.status in {HealthStatus.DOWN, HealthStatus.DEGRADED}) and not self.reason:
msg = f"Health {self.status} must have a reason"
raise ValueError(msg)

return self

def __str__(self) -> str:
"""Return string representation of health status with optional reason for DOWN state.
"""Return string representation of health status with optional reason for DOWN/DEGRADED state.

Returns:
str: The health status value, with reason appended if status is DOWN.
str: The health status value, with reason appended if status is DOWN or DEGRADED.
"""
if self.status == _HealthStatus.DOWN and self.reason:
if self.status in {HealthStatus.DOWN, HealthStatus.DEGRADED} and self.reason:
return f"{self.status.value}: {self.reason}"
return self.status.value

def __bool__(self) -> bool:
"""Convert health status to a boolean value.

Returns:
bool: True if the status is UP, False otherwise.
bool: True if the status is UP or DEGRADED, False otherwise.
"""
return self.status == _HealthStatus.UP
return self.status in {HealthStatus.UP, HealthStatus.DEGRADED}
Loading
Loading