Skip to content

Commit e55accd

Browse files
olivermeyerclaude
andcommitted
feat(utils, platform): add DEGRADED state to Health model
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent eb1454e commit e55accd

File tree

8 files changed

+301
-85
lines changed

8 files changed

+301
-85
lines changed

specifications/SPEC-UTILS-SERVICE.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,12 @@ health:
151151
properties:
152152
status:
153153
type: string
154-
enum: [UP, DOWN]
154+
enum: [UP, DEGRADED, DOWN]
155155
description: Service health status
156156
reason:
157157
type: string
158158
nullable: true
159-
description: Optional reason for status
159+
description: Required reason for DOWN or DEGRADED status; must be null for UP
160160
components:
161161
type: object
162162
description: Hierarchical component health

specifications/SPEC_SYSTEM_SERVICE.md

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -107,27 +107,6 @@ _Note: For detailed implementation, refer to the source code in the `src/aignost
107107

108108
### 3.3 Data Schemas
109109

110-
**Health Status Schema:**
111-
112-
```yaml
113-
Health:
114-
type: object
115-
properties:
116-
status:
117-
type: string
118-
enum: [UP, DOWN]
119-
description: "Overall system health status"
120-
components:
121-
type: object
122-
description: "Health status of individual components"
123-
additionalProperties:
124-
$ref: "#/definitions/Health"
125-
reason:
126-
type: string
127-
description: "Reason for DOWN status, null for UP"
128-
required: [status]
129-
```
130-
131110
**System Info Schema:**
132111

133112
```yaml

src/aignostics/platform/_service.py

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -192,10 +192,47 @@ def info(self, mask_secrets: bool = True) -> dict[str, Any]:
192192
else None,
193193
}
194194

195+
@staticmethod
196+
def _health_from_response(response: urllib3.BaseHTTPResponse) -> Health:
197+
"""Map a PAPI health response to a Health status.
198+
199+
Handles non-200 status codes, unparseable bodies, and the ``status`` values
200+
``"UP"`` and ``"DEGRADED"``; any other value (including ``"DOWN"``) maps to ``DOWN``.
201+
202+
Args:
203+
response: urllib3 response from the ``/health`` endpoint.
204+
205+
Returns:
206+
Health: ``UP``, ``DEGRADED``, or ``DOWN`` derived from the response.
207+
"""
208+
if response.status != HTTPStatus.OK:
209+
logger.error("Aignostics Platform API returned '{}'", response.status)
210+
return Health(
211+
status=Health.Code.DOWN, reason=f"Aignostics Platform API returned status '{response.status}'"
212+
)
213+
214+
try:
215+
body = json.loads(response.data)
216+
except Exception:
217+
return Health(status=Health.Code.DOWN, reason="Aignostics Platform API returned unparseable response")
218+
219+
api_status = body.get("status")
220+
if api_status == "UP":
221+
return Health(status=Health.Code.UP)
222+
if api_status == "DEGRADED":
223+
reason = body.get("reason") or "Aignostics Platform API is DEGRADED"
224+
logger.warning("Aignostics Platform API is DEGRADED: {}", reason)
225+
return Health(status=Health.Code.DEGRADED, reason=reason)
226+
return Health(
227+
status=Health.Code.DOWN,
228+
reason=f"Aignostics Platform API returned unknown status '{api_status}'",
229+
)
230+
195231
def _determine_api_public_health(self) -> Health:
196232
"""Determine healthiness and reachability of Aignostics Platform API.
197233
198234
- Checks if health endpoint is reachable and returns 200 OK
235+
- Parses the response body to detect DEGRADED status
199236
- Uses urllib3 for a direct connection check without authentication
200237
201238
Returns:
@@ -209,23 +246,17 @@ def _determine_api_public_health(self) -> Health:
209246
headers={"User-Agent": user_agent()},
210247
timeout=urllib3.Timeout(total=self._settings.health_timeout),
211248
)
212-
213-
if response.status != HTTPStatus.OK:
214-
logger.error("Aignostics Platform API (public) returned '{}'", response.status)
215-
return Health(
216-
status=Health.Code.DOWN, reason=f"Aignostics Platform API returned status '{response.status}'"
217-
)
249+
return self._health_from_response(response)
218250
except Exception as e:
219251
logger.exception("Issue with Aignostics Platform API")
220252
return Health(status=Health.Code.DOWN, reason=f"Issue with Aignostics Platform API: '{e}'")
221253

222-
return Health(status=Health.Code.UP)
223-
224254
def _determine_api_authenticated_health(self) -> Health:
225255
"""Determine healthiness and reachability of Aignostics Platform API via authenticated request.
226256
227257
Uses a dedicated HTTP pool (separate from the API client's connection pool) to prevent
228258
connection-level cross-contamination between health checks and API calls.
259+
Parses the response body to detect DEGRADED status.
229260
230261
Returns:
231262
Health: The healthiness of the Aignostics Platform API when trying to reach via authenticated request.
@@ -242,14 +273,10 @@ def _determine_api_authenticated_health(self) -> Health:
242273
},
243274
timeout=urllib3.Timeout(total=self._settings.health_timeout),
244275
)
245-
246-
if response.status != HTTPStatus.OK:
247-
logger.error("Aignostics Platform API (authenticated) returned '{}'", response.status)
248-
return Health(status=Health.Code.DOWN, reason=f"Aignostics Platform API returned '{response.status}'")
276+
return self._health_from_response(response)
249277
except Exception as e:
250278
logger.exception("Issue with Aignostics Platform API")
251279
return Health(status=Health.Code.DOWN, reason=f"Issue with Aignostics Platform API: '{e}'")
252-
return Health(status=Health.Code.UP)
253280

254281
def health(self) -> Health:
255282
"""Determine health of this service.

src/aignostics/system/CLAUDE.md

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -123,11 +123,9 @@ def health(self) -> Health:
123123
)
124124

125125
# Determine overall status based on ALL modules
126-
overall = Health.Code.UP if all(
127-
c.status == Health.Code.UP for c in components.values()
128-
) else Health.Code.DOWN
129-
130-
return Health(status=overall, components=components)
126+
# Priority: DOWN > DEGRADED > UP
127+
# compute_health_from_components() handles this automatically
128+
return Health(status=Health.Code.UP, components=components)
131129
```
132130

133131
### Exception Hierarchy (`_exceptions.py`)
@@ -275,7 +273,7 @@ print(f"System status: {health.status}")
275273

276274
# Check specific component
277275
platform_health = health.components.get("platform")
278-
if platform_health.status != Health.Code.UP:
276+
if not platform_health:  # Health is falsy only when DOWN (DEGRADED and UP are both truthy)
279277
print(f"Platform issue: {platform_health.reason}")
280278
```
281279

@@ -453,7 +451,7 @@ def test_health_aggregation():
453451
service = Service()
454452
health = service.health()
455453

456-
assert health.status in [Health.Code.UP, Health.Code.DOWN]
454+
assert health.status in [Health.Code.UP, Health.Code.DEGRADED, Health.Code.DOWN]
457455
assert "platform" in health.components
458456
assert isinstance(health.components, dict)
459457

src/aignostics/utils/_health.py

Lines changed: 52 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,69 +1,91 @@
11
"""Health models and status definitions for service health checks."""
22

33
from enum import StrEnum
4-
from typing import ClassVar, Self
4+
from typing import Any, ClassVar, Self
55

66
from pydantic import BaseModel, Field, model_validator
77

88

9-
class _HealthStatus(StrEnum):
9+
class HealthStatus(StrEnum):
10+
"""Health status enumeration for service health checks.
11+
12+
Values:
13+
UP: Service is operating normally
14+
DEGRADED: Service is operational but with reduced functionality
15+
DOWN: Service is not operational
16+
"""
17+
1018
UP = "UP"
19+
DEGRADED = "DEGRADED"
1120
DOWN = "DOWN"
1221

1322

1423
class Health(BaseModel):
1524
"""Represents the health status of a service with optional components and failure reasons.
1625
1726
- A health object can have child components, i.e. health forms a tree.
18-
- Any node in the tree can set itself to DOWN. In this case the node is required
19-
to set the reason attribute. If reason is not set when DOWN,
20-
automatic model validation of the tree will fail.
21-
- DOWN'ness is propagated to parent health objects. I.e. the health of a parent
22-
node is automatically set to DOWN if any of its child components are DOWN. The
23-
child components leading to this will be listed in the reason.
24-
- The root of the health tree is computed in the system module. The health of other
25-
modules is automatically picked up by the system module.
27+
- Any node in the tree can set itself to DOWN or DEGRADED. In either case the node is required
28+
to set the reason attribute. If reason is not set when DOWN or DEGRADED, automatic model validation fails.
29+
- DOWN trumps DEGRADED, DEGRADED trumps UP. If any child is DOWN, parent is DOWN.
30+
If none are DOWN but any are DEGRADED, parent is DEGRADED.
31+
- The root of the health tree is computed in the system module.
32+
The health of other modules is automatically picked up by the system module.
2633
"""
2734

28-
Code: ClassVar[type[_HealthStatus]] = _HealthStatus
29-
status: _HealthStatus
35+
Code: ClassVar[type[HealthStatus]] = HealthStatus
36+
status: HealthStatus
3037
reason: str | None = None
3138
components: dict[str, "Health"] = Field(default_factory=dict)
39+
uptime_statistics: dict[str, dict[str, Any]] | None = None # Optional uptime stats
3240

3341
def compute_health_from_components(self) -> Self:
3442
"""Recursively compute health status from components.
3543
3644
- If health is already DOWN, it remains DOWN with its original reason.
3745
- If health is UP or DEGRADED but any component is DOWN, health becomes DOWN with
3846
a reason listing all failed components.
47+
- If no components are DOWN but any are DEGRADED, health becomes DEGRADED with a reason.
3948
4049
Returns:
4150
Self: The updated health instance with computed status.
4251
"""
4352
# Skip recomputation if already known to be DOWN
44-
if self.status == _HealthStatus.DOWN:
53+
if self.status == HealthStatus.DOWN:
4554
return self
4655

4756
# No components means we keep the existing status
4857
if not self.components:
4958
return self
5059

51-
# Find all DOWN components
60+
# Find all DOWN and DEGRADED components
5261
down_components = []
62+
degraded_components = []
5363
for component_name, component in self.components.items():
5464
# Recursively compute health for each component
5565
component.compute_health_from_components()
56-
if component.status == _HealthStatus.DOWN:
57-
down_components.append(component_name)
66+
if component.status == HealthStatus.DOWN:
67+
down_components.append((component_name, component.reason))
68+
elif component.status == HealthStatus.DEGRADED:
69+
degraded_components.append((component_name, component.reason))
5870

5971
# If any components are DOWN, mark the parent as DOWN
6072
if down_components:
61-
self.status = _HealthStatus.DOWN
73+
self.status = HealthStatus.DOWN
6274
if len(down_components) == 1:
63-
self.reason = f"Component '{down_components[0]}' is DOWN"
75+
component_name, component_reason = down_components[0]
76+
self.reason = f"Component '{component_name}' is DOWN ({component_reason})"
77+
else:
78+
component_list = ", ".join(f"'{name}' ({reason})" for name, reason in down_components)
79+
self.reason = f"Components {component_list} are DOWN"
80+
# If no components are DOWN but any are DEGRADED, mark parent as DEGRADED
81+
elif degraded_components:
82+
self.status = HealthStatus.DEGRADED
83+
if len(degraded_components) == 1:
84+
component_name, component_reason = degraded_components[0]
85+
self.reason = f"Component '{component_name}' is DEGRADED ({component_reason})"
6486
else:
65-
component_list = "', '".join(down_components)
66-
self.reason = f"Components '{component_list}' are DOWN"
87+
component_list = ", ".join(f"'{name}' ({reason})" for name, reason in degraded_components)
88+
self.reason = f"Components {component_list} are DEGRADED"
6789

6890
return self
6991

@@ -73,7 +95,7 @@ def validate_health_state(self) -> Self:
7395
7496
- Compute overall health based on component health
7597
- Ensure UP status has no associated reason
76-
- Ensure DOWN status always has a reason
98+
- Ensure DOWN and DEGRADED status always have a reason
7799
78100
Returns:
79101
Self: The validated model instance with correct health status.
@@ -85,31 +107,31 @@ def validate_health_state(self) -> Self:
85107
self.compute_health_from_components()
86108

87109
# Validate that UP status has no reason
88-
if (self.status == _HealthStatus.UP) and self.reason:
110+
if (self.status == HealthStatus.UP) and self.reason:
89111
msg = f"Health {self.status} must not have reason"
90112
raise ValueError(msg)
91113

92-
# Validate that DOWN status always has a reason
93-
if (self.status == _HealthStatus.DOWN) and not self.reason:
94-
msg = "Health DOWN must have a reason"
114+
# Validate that DOWN and DEGRADED status always have a reason
115+
if (self.status in {HealthStatus.DOWN, HealthStatus.DEGRADED}) and not self.reason:
116+
msg = f"Health {self.status} must have a reason"
95117
raise ValueError(msg)
96118

97119
return self
98120

99121
def __str__(self) -> str:
100-
"""Return string representation of health status with optional reason for DOWN state.
122+
"""Return string representation of health status with optional reason for DOWN/DEGRADED state.
101123
102124
Returns:
103-
str: The health status value, with reason appended if status is DOWN.
125+
str: The health status value, with reason appended if status is DOWN or DEGRADED.
104126
"""
105-
if self.status == _HealthStatus.DOWN and self.reason:
127+
if self.status in {HealthStatus.DOWN, HealthStatus.DEGRADED} and self.reason:
106128
return f"{self.status.value}: {self.reason}"
107129
return self.status.value
108130

109131
def __bool__(self) -> bool:
110132
"""Convert health status to a boolean value.
111133
112134
Returns:
113-
bool: True if the status is UP, False otherwise.
135+
bool: True if the status is UP or DEGRADED, False otherwise.
114136
"""
115-
return self.status == _HealthStatus.UP
137+
return self.status in {HealthStatus.UP, HealthStatus.DEGRADED}

tests/aignostics/application/cli_test.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def test_cli_application_run_upload_fails_on_missing_source(runner: CliRunner, t
180180
assert "Warning: Source file 'missing.file' (row 0) does not exist" in normalize_output(result.stdout)
181181

182182

183-
@pytest.mark.unit
183+
@pytest.mark.e2e
184184
@pytest.mark.timeout(timeout=10)
185185
@patch("aignostics.application._cli.SystemService.health_static")
186186
def test_cli_run_submit_fails_when_system_unhealthy_and_no_force(
@@ -211,7 +211,47 @@ def test_cli_run_submit_fails_when_system_unhealthy_and_no_force(
211211
assert result.exit_code == 1
212212

213213

214-
@pytest.mark.unit
214+
@pytest.mark.e2e
215+
@pytest.mark.timeout(timeout=10)
216+
@patch("aignostics.application._cli.SystemService.health_static")
217+
def test_cli_run_submit_succeeds_when_system_degraded_and_no_force(
218+
mock_health: MagicMock, runner: CliRunner, tmp_path: Path
219+
) -> None:
220+
"""Check run submit command succeeds when system is degraded and --force is not used."""
221+
mock_health.return_value = Health(
222+
status=Health.Code.DEGRADED,
223+
reason="Simulated degraded system for testing",
224+
)
225+
csv_content = "external_id;checksum_base64_crc32c;resolution_mpp;width_px;height_px;staining_method;tissue;disease;"
226+
csv_content += "platform_bucket_url\n"
227+
csv_content += ";5onqtA==;0.26268186053789266;7447;7196;H&E;LUNG;LUNG_CANCER;gs://bucket/test"
228+
csv_path = tmp_path / "dummy.csv"
229+
csv_path.write_text(csv_content)
230+
231+
result = runner.invoke(
232+
cli,
233+
[
234+
"application",
235+
"run",
236+
"submit",
237+
HETA_APPLICATION_ID,
238+
str(csv_path),
239+
"--deadline",
240+
(datetime.now(tz=UTC) + timedelta(minutes=5)).isoformat(),
241+
],
242+
)
243+
244+
assert result.exit_code == 0
245+
246+
# cancel the run
247+
run_id_match = re.search(r"Submitted run with id '([0-9a-f-]+)' for '", normalize_output(result.stdout))
248+
assert run_id_match, f"Failed to extract run ID from output '{normalize_output(result.stdout)}'"
249+
run_id = run_id_match.group(1)
250+
cancel_result = runner.invoke(cli, ["application", "run", "cancel", run_id])
251+
assert cancel_result.exit_code == 0
252+
253+
254+
@pytest.mark.e2e
215255
@pytest.mark.timeout(timeout=10)
216256
@patch("aignostics.application._cli.SystemService.health_static")
217257
def test_cli_run_upload_fails_when_system_unhealthy_and_no_force(

0 commit comments

Comments
 (0)