Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions judoscale/core/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,15 @@ def _run_loop(self):
# Instead, we now wait for the first interval to pass before
# reporting metrics.
time.sleep(self.config["REPORT_INTERVAL_SECONDS"])
self._report_metrics()

# Catch any Exception (BaseException subclasses such as SystemExit
# still propagate) so the reporter thread survives
# transient/unexpected errors and keeps trying on the next
# interval. Without this, a single bad cycle silently
# disables all metric reporting until the process restarts.
try:
self._report_metrics()
except Exception as e:
logger.exception(f"Reporter cycle failed, will retry: {e}")

if self._stopevent.is_set():
break
Expand All @@ -100,10 +108,19 @@ def _run_loop(self):
def all_metrics(self) -> List[Metric]:
    """
    Return a list of all metrics collected by all collectors.

    Each collector in ``self.collectors`` is asked to collect exactly once
    per invocation. A failing collector is logged and skipped so that one
    collector's error doesn't suppress metrics from the others.

    Returns:
        The metrics from every collector whose ``collect()`` succeeded,
        concatenated in collector order.
    """
    metrics = []
    for collector in self.collectors:
        # collect() must only be called inside the try: an unguarded call
        # would let one collector's exception abort the whole pass, and
        # calling it twice would duplicate metrics from healthy collectors.
        try:
            metrics.extend(collector.collect())
        except Exception as e:
            logger.exception(
                f"Collector {type(collector).__name__} failed to "
                f"collect metrics, skipping this cycle: {e}"
            )
    return metrics

TRANSIENT_ERRORS = (
Expand Down
57 changes: 57 additions & 0 deletions tests/test_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,60 @@ def test_does_not_retry_non_transient_errors(
reporter._report_metrics()

assert mock_post.call_count == 1

def test_all_metrics_continues_when_one_collector_raises(
    self, reporter, caplog
):
    """
    One collector raising must not hide the other collector's metrics.

    Registers a collector whose ``collect`` raises alongside one holding a
    single metric, then checks that exactly one metric is returned and
    that the failure was logged.
    """
    caplog.set_level(logging.ERROR, logger="judoscale")

    def register(identifier, collector):
        # Wrap the collector in an adapter and attach it to the reporter.
        reporter.add_adapter(
            Adapter(
                identifier=identifier,
                adapter_info=AdapterInfo(runtime_version="0.0.0"),
                metrics_collector=collector,
            )
        )

    broken = WebMetricsCollector(reporter.config)
    broken.collect = MagicMock(side_effect=RuntimeError("boom"))
    register("failing", broken)

    working = WebMetricsCollector(reporter.config)
    working.add(Metric(measurement="qt", timestamp=0, value=0))
    register("healthy", working)

    collected = reporter.all_metrics

    assert len(collected) == 1
    logged = (record.message for record in caplog.records)
    assert any("failed to collect metrics" in text for text in logged)

@patch.object(time, "sleep")
def test_run_loop_survives_report_metrics_exception(
    self, mock_sleep, reporter, caplog
):
    """
    The reporter loop keeps running after ``_report_metrics`` raises.

    ``time.sleep`` is patched out. ``_report_metrics`` is replaced with a
    mock that raises on its first call and sets the stop event on later
    calls; the test then checks that a second call happened and that the
    failure was logged.
    """
    caplog.set_level(logging.ERROR, logger="judoscale")

    attempts = 0

    def raise_once_then_stop():
        nonlocal attempts
        attempts += 1
        if attempts > 1:
            # Second and later cycles: ask the loop to stop.
            reporter._stopevent.set()
            return
        raise RuntimeError("kaboom")

    reporter._report_metrics = MagicMock(side_effect=raise_once_then_stop)
    reporter.start()
    # Bounded join so a loop that never stops fails the test, not hangs it.
    reporter._thread.join(timeout=2)

    assert attempts >= 2
    logged = (record.message for record in caplog.records)
    assert any("Reporter cycle failed" in text for text in logged)
Loading