Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions test/functional/bug_addnode_no_retry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""Bug reproducer — `addnode add` has no retry on transient failure.

Original issue body of #6 listed FOUR root causes:
1. silent async failures (LOG-less) — FIXED by PR #10
2. premature RPC success — FIXED by PR #10
3. NO RETRY MECHANISM — STILL OPEN (this file)
4. post-IBD stall detection disabled — STILL OPEN (see bug_post_ibd_stall_disabled.py)

`manual_addresses_` in connection_manager just remembers the address; if the
initial TCP connect fails, the node never re-attempts the manual peer. So if
you `addnode <peer> add` while <peer> is briefly down, then bring <peer> up
later, the node never connects.

This reproducer demonstrates the gap. It exits 0 if the gap is still present
(bug still open) and exits 1 if the node has learned to retry (= someone has
implemented the fix; this reproducer is now obsolete and should be inverted).

Issue: https://github.com/unicitynetwork/unicity-node/issues/6 (item 3)
Tracking: aggregator-subscription/INVESTIGATIONS.md F6a
"""
import sys
import socket
import time
import tempfile
import shutil
import threading
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent / "test_framework"))
from test_node import TestNode
from util import pick_free_port

GREEN, RED, YELLOW, BLUE, RESET = '\033[92m', '\033[91m', '\033[93m', '\033[94m', '\033[0m'


def log(msg, color=None):
print(f"{color}{msg}{RESET}" if color else msg)


def main():
test_dir = Path(tempfile.mkdtemp(prefix='cbc_bug_addnode_no_retry_'))
binary = Path(__file__).resolve().parent.parent.parent / "build" / "bin" / "unicityd"

node_port = pick_free_port()
peer_port = pick_free_port() # the "peer" we'll bring up belatedly

node = TestNode(0, test_dir / "node0", binary_path=binary,
extra_args=[f"--port={node_port}"], chain="regtest")

listener_stop = threading.Event()
listener_thread = None
accepted = []

def listener():
"""Belated TCP listener. After we addnode to peer_port (closed),
we wait a few seconds, then start accepting on peer_port. If the
node had a retry mechanism, it would connect to us."""
srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
try:
srv.bind(("127.0.0.1", peer_port))
srv.listen(8)
srv.settimeout(0.5)
except OSError as e:
log(f" listener bind failed: {e}", RED)
return
deadline = time.time() + 20
while not listener_stop.is_set() and time.time() < deadline:
try:
c, addr = srv.accept()
accepted.append(addr)
# We're not a real unicity peer; just close.
c.close()
except socket.timeout:
pass
srv.close()

try:
log("\n" + "=" * 70, BLUE)
log("BUG REPRODUCER — un-node #6, out-of-scope item: 'no retry on addnode'", BLUE)
log("=" * 70, BLUE)

log("\n[setup] start regtest node")
node.start()

log(f"\n[step 1] addnode 127.0.0.1:{peer_port} add — peer is CLOSED right now (no listener)")
rv = node.rpc("addnode", f"127.0.0.1:{peer_port}", "add")
log(f" result: {rv}")
# Expected post-PR-10: clean error, not success.

log(f"\n[step 2] start a TCP listener on 127.0.0.1:{peer_port} (5s after addnode)")
time.sleep(5)
listener_thread = threading.Thread(target=listener, daemon=True)
listener_thread.start()
log(" listener up; if the node has retry logic, it would now connect within ~10s")

log("\n[step 3] wait 15s; observe whether the node attempts the manual peer again")
time.sleep(15)
log(f" listener accepted {len(accepted)} connection attempt(s) from the node")

# Capture additional evidence from debug.log: did unicityd log any retry attempt?
debug = node.datadir / "debug.log"
retry_log_hits = 0
if debug.exists():
with debug.open() as f:
for line in f:
if str(peer_port) in line and (
"outbound connect attempt" in line.lower()
or "outbound connect failed" in line.lower()
):
retry_log_hits += 1
log(f" debug.log mentions of port {peer_port}: {retry_log_hits}")

# Verdict
log("\n" + "=" * 70, BLUE)
if len(accepted) == 0 and retry_log_hits <= 1:
log("BUG REPRODUCED — the node did not retry the manual peer.", YELLOW)
log(f" • 0 inbound connection attempts from the node within 15s of the listener opening", YELLOW)
log(f" • only the original (failed) outbound attempt is in debug.log", YELLOW)
log(" → un-node #6 item 3 (no retry mechanism) is still open. ✅ reproducer working.", YELLOW)
return 0
else:
log("BUG FIXED? Unexpected connection attempts observed:", GREEN)
log(f" • accepted {len(accepted)} connection(s) from the node", GREEN)
log(f" • {retry_log_hits} debug.log entries for port {peer_port}", GREEN)
log(" → looks like retry logic is in. This reproducer is OBSOLETE.", GREEN)
log(" Invert the assertion or delete this file, and file a CHANGELOG entry.", GREEN)
return 1
finally:
listener_stop.set()
if listener_thread:
listener_thread.join(timeout=2)
try:
node.stop()
except Exception:
pass
shutil.rmtree(test_dir, ignore_errors=True)


if __name__ == "__main__":
sys.exit(main())
69 changes: 69 additions & 0 deletions test/functional/bug_getaddednodeinfo_missing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""Bug reproducer — `getaddednodeinfo` RPC is not implemented.

Surfaced during the manual verification of un-node #6 / PR #10 (state-query
sub-check 2i). Bitcoin-Core-style nodes typically expose `getaddednodeinfo`
so operators can inspect what `addnode add` actually queued. Without it,
the only way to verify the manually-added-nodes list is by grepping
`debug.log`.

Not a regression of #10 (which didn't claim to add this RPC). Logged here
as an adjacent gap. This file exits 0 while the gap is still there (calling
the RPC returns `Unknown command`), and exits 1 when someone implements it.

Tracking: aggregator-subscription/INVESTIGATIONS.md F6c
"""
import sys
import tempfile
import shutil
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent / "test_framework"))
from test_node import TestNode
from util import pick_free_port

GREEN, RED, YELLOW, BLUE, RESET = '\033[92m', '\033[91m', '\033[93m', '\033[94m', '\033[0m'


def log(msg, color=None):
print(f"{color}{msg}{RESET}" if color else msg)


def main():
test_dir = Path(tempfile.mkdtemp(prefix='cbc_bug_getaddednodeinfo_'))
binary = Path(__file__).resolve().parent.parent.parent / "build" / "bin" / "unicityd"
node = TestNode(0, test_dir / "node0", binary_path=binary,
extra_args=[f"--port={pick_free_port()}"], chain="regtest")
try:
log("\n" + "=" * 70, BLUE)
log("BUG REPRODUCER — getaddednodeinfo RPC not implemented", BLUE)
log("=" * 70, BLUE)

node.start()
log("\n[step] call getaddednodeinfo")
rv = node.rpc("getaddednodeinfo")
log(f" result: {rv}")

unknown = isinstance(rv, dict) and "unknown command" in str(rv.get("error", "")).lower()

log("\n" + "=" * 70, BLUE)
if unknown:
log("BUG REPRODUCED — getaddednodeinfo returns 'Unknown command'.", YELLOW)
log(" → RPC is still unimplemented. ✅ reproducer working.", YELLOW)
return 0
else:
log("BUG FIXED? getaddednodeinfo returned a non-Unknown-command response:", GREEN)
log(f" {rv}", GREEN)
log(" → looks like the RPC was implemented. This reproducer is OBSOLETE;", GREEN)
log(" invert/delete it and add a positive feature test.", GREEN)
return 1
finally:
try:
node.stop()
except Exception:
pass
shutil.rmtree(test_dir, ignore_errors=True)


if __name__ == "__main__":
sys.exit(main())
132 changes: 132 additions & 0 deletions test/functional/bug_post_ibd_stall_disabled.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""Bug reproducer — no FAST post-IBD chain-sync stall detection.

The original #6 body claimed "post-IBD stall detection disabled". After
verification, the picture is more nuanced:

POST-IBD LOGIC THAT IS PRESENT (commit 83a0fae9 added it):
- ConsiderEviction(peer): post-IBD stale-chain eviction by chain-work
comparison. src/network/header_sync_manager.cpp:132-137 + :755+
- PING/PONG timeout (~1200 s) at the protocol layer.
- Inactivity timeout (~1200 s) at the protocol layer.
- Multi-peer post-IBD sync (CheckInitialSync() pulls from all new outbound
peers, not just one).

THE SPECIFIC GAP THAT REMAINS:
The IBD-only deadline-based stall check inside `ProcessTimers()` —

if (in_ibd) {
// sync_deadline check on sync peer; disconnect on miss
} else {
// ONLY ConsiderEviction, NO deadline check
}

— has no post-IBD analog. A peer that:
(a) responds to PINGs (passes the 1200 s timeouts), and
(b) has chain-work not visibly behind ours (passes ConsiderEviction), and
(c) silently drags its feet on actually delivering headers
is only caught by the 1200 s inactivity timeout — a 20-minute window
during which the node is effectively wedged on a slow peer. The IBD
path catches the same case in seconds.

This reproducer is a SOURCE-LEVEL check on that specific shape. It asserts:
- ProcessTimers() has the gate `in_ibd`
- The IBD branch has a deadline-based check (look for `sync_deadline`)
- The else (post-IBD) branch has NO deadline check (only ConsiderEviction)

When the gap is fixed (post-IBD branch gets a deadline analog), the third
assertion flips → exit code becomes 1 → time to invert/delete this file.

A runtime test of the actual stall would need a 2-node controlled-silence
scenario (peer that keeps PINGing but ignores GETHEADERS); doable but
fragile. Source-level is canonical here.

Issue: https://github.com/unicitynetwork/unicity-node/issues/6 (item 4, scoped down)
Tracking: aggregator-subscription/INVESTIGATIONS.md F6b
"""
import re
import sys
from pathlib import Path

GREEN, RED, YELLOW, BLUE, RESET = '\033[92m', '\033[91m', '\033[93m', '\033[94m', '\033[0m'


def log(msg, color=None):
print(f"{color}{msg}{RESET}" if color else msg)


def main():
src_hsm = Path(__file__).resolve().parent.parent.parent / "src" / "network" / "header_sync_manager.cpp"
if not src_hsm.exists():
log(f"source not found: {src_hsm}", RED)
return 2

body = src_hsm.read_text()
m = re.search(r"void\s+HeaderSyncManager::ProcessTimers\s*\(\s*\)\s*\{(.+?)\n\}\n",
body, re.DOTALL)
if not m:
log("ProcessTimers() not found — has the function been renamed?", RED)
return 2
pt_body = m.group(1)

log("\n" + "=" * 70, BLUE)
log("BUG REPRODUCER — no FAST post-IBD chain-sync stall detection", BLUE)
log("=" * 70, BLUE)

# Split into `if (in_ibd) { ... }` IBD branch and `else { ... }` post-IBD branch.
ibd_match = re.search(
r"if\s*\(\s*in_ibd\s*\)\s*\{(.*?)\}\s*else\s*\{(.*?)\}\s*$",
pt_body.strip(), re.DOTALL,
)
if not ibd_match:
log("Couldn't split ProcessTimers into if(in_ibd)/else branches.", YELLOW)
log("Either the gate was removed (good!) or restructured (re-check the assertion).", YELLOW)
log(f"ProcessTimers body:\n{pt_body}", YELLOW)
return 1
ibd_branch = ibd_match.group(1)
post_ibd_branch = ibd_match.group(2)

# [1] IBD branch HAS a deadline-based check (the gold standard)
log("\n[1] IBD branch contains a deadline-based stall check")
ibd_has_deadline = bool(re.search(
r"sync_deadline|deadline\s*[!<>=]|timeout.*exceeded", ibd_branch, re.IGNORECASE))
log(f" IBD branch references sync_deadline / deadline / timeout: {ibd_has_deadline}",
GREEN if ibd_has_deadline else RED)

# [2] Post-IBD branch has ConsiderEviction (the existing partial mitigation)
log("\n[2] post-IBD branch calls ConsiderEviction (existing partial mitigation)")
post_has_eviction = "ConsiderEviction" in post_ibd_branch
log(f" post-IBD calls ConsiderEviction: {post_has_eviction}",
GREEN if post_has_eviction else RED)

# [3] Post-IBD branch has NO deadline check — this is the SPECIFIC gap
log("\n[3] post-IBD branch does NOT have a deadline-based stall check (the gap)")
post_has_deadline = bool(re.search(
r"sync_deadline|deadline\s*[!<>=]|\btimeout\b.*exceeded", post_ibd_branch, re.IGNORECASE))
log(f" post-IBD references sync_deadline / deadline / timeout: {post_has_deadline}",
YELLOW if not post_has_deadline else GREEN)

log("\n" + "=" * 70, BLUE)
if ibd_has_deadline and post_has_eviction and not post_has_deadline:
log("BUG REPRODUCED — the specific gap is still present:", YELLOW)
log(" • IBD branch has the deadline-based stall check", YELLOW)
log(" • post-IBD branch has ConsiderEviction (chain-work eviction only)", YELLOW)
log(" • post-IBD branch has NO deadline-based stall check", YELLOW)
log(" → fast post-IBD stall detection is still missing. ✅ reproducer working.", YELLOW)
return 0
elif post_has_deadline:
log("BUG FIXED? post-IBD branch now references a deadline / timeout check:", GREEN)
log(" → looks like the fast post-IBD stall check was implemented.", GREEN)
log(" This reproducer is OBSOLETE; invert / delete it.", GREEN)
return 1
else:
log("UNEXPECTED — IBD-branch mechanism changed, can't characterise:", RED)
log(f" ibd_has_deadline={ibd_has_deadline}", RED)
log(f" post_has_eviction={post_has_eviction}", RED)
log(f" post_has_deadline={post_has_deadline}", RED)
log(" → re-inspect the source manually.", RED)
return 2


if __name__ == "__main__":
sys.exit(main())
Loading
Loading