diff --git a/.gitignore b/.gitignore index 88c4928..813135a 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ web/management/node_modules/ web/management/dist/ web/management/test-results/ web/management/playwright-report/ +docs/.plans/ SECURITY_AUDIT.md diff --git a/Makefile b/Makefile index 94e8e28..3e54f17 100644 --- a/Makefile +++ b/Makefile @@ -110,17 +110,21 @@ kill: @-repo=$$(pwd); \ pkill -TERM -f '[g]o tool air -c .air.toml|/go-build/.*/[a]ir -c .air.toml|[b]un run --bun vite --host 127.0.0.1 --port 5173|[n]ode .*vite --host 127.0.0.1 --port 5173' 2>/dev/null || true; \ pkill -TERM -f "$$repo/bin/p2pstream|$$repo/tmp/p2pstream-dev|$$repo/tmp/p2pstream-agent-dev|[g]o run main.go agent|/go-build/.*/[m]ain agent" 2>/dev/null || true; \ - for port in 8081 8088 8089 5173; do \ - pids=$$(ss -H -ltnp "sport = :$$port" 2>/dev/null | sed -n 's/.*pid=\([0-9][0-9]*\).*/\1/p' | sort -u); \ - [ -z "$$pids" ] || kill -TERM $$pids 2>/dev/null || true; \ - done; \ + if command -v ss >/dev/null 2>&1; then \ + for port in 8081 8088 8089 5173; do \ + pids=$$(ss -H -ltnp "sport = :$$port" 2>/dev/null | sed -n 's/.*pid=\([0-9][0-9]*\).*/\1/p' | sort -u); \ + [ -z "$$pids" ] || kill -TERM $$pids 2>/dev/null || true; \ + done; \ + fi; \ sleep 0.5; \ pkill -KILL -f '[g]o tool air -c .air.toml|/go-build/.*/[a]ir -c .air.toml|[b]un run --bun vite --host 127.0.0.1 --port 5173|[n]ode .*vite --host 127.0.0.1 --port 5173' 2>/dev/null || true; \ pkill -KILL -f "$$repo/bin/p2pstream|$$repo/tmp/p2pstream-dev|$$repo/tmp/p2pstream-agent-dev|[g]o run main.go agent|/go-build/.*/[m]ain agent" 2>/dev/null || true; \ - for port in 8081 8088 8089 5173; do \ - pids=$$(ss -H -ltnp "sport = :$$port" 2>/dev/null | sed -n 's/.*pid=\([0-9][0-9]*\).*/\1/p' | sort -u); \ - [ -z "$$pids" ] || kill -KILL $$pids 2>/dev/null || true; \ - done + if command -v ss >/dev/null 2>&1; then \ + for port in 8081 8088 8089 5173; do \ + pids=$$(ss -H -ltnp "sport = :$$port" 2>/dev/null | sed -n 's/.*pid=\([0-9][0-9]*\).*/\1/p' | sort -u); \ + [ -z "$$pids" ] || kill -KILL $$pids 2>/dev/null || true; \ + done; \ + fi clean: @echo "Cleaning up..." diff --git a/agent_tunnel_test.go b/agent_tunnel_test.go index 73007a2..79a909c 100644 --- a/agent_tunnel_test.go +++ b/agent_tunnel_test.go @@ -53,6 +53,7 @@ func dialAgentTunnel(ctx context.Context, managementURL string, publicID string, return nil, resp, err } if resp.StatusCode != http.StatusSwitchingProtocols { + _ = resp.Body.Close() return nil, resp, fmt.Errorf("expected tunnel upgrade status 101, got %d", resp.StatusCode) } body, ok := resp.Body.(io.ReadWriteCloser) diff --git a/docs/.plans/agent-yamux-overhaul.md b/docs/.plans/agent-yamux-overhaul.md deleted file mode 100644 index 12a2e1b..0000000 --- a/docs/.plans/agent-yamux-overhaul.md +++ /dev/null @@ -1,277 +0,0 @@ -# Agent Yamux Transport Overhaul - -## Current State - -- Branch: `dev` (merged from `rewrite/agent-yamux-transport`) -- Base commit: `ddb6c09f5eb5aa2e7e7335dd11ff8fccaa0eb0d2` -- Last updated: `2026-06-06T19:24:56+02:00` -- Current phase: PR merge path open because `dev` is protected; CodeQL false-positive dismissal pending -- Current blocker: PR #45 CodeQL alert must be dismissed as intentional discovery-only certificate collection. - -## Decisions - -- Use Yamux over the existing management TLS connection. -- Use hard cutover; no WebSocket compatibility layer. -- Use raw TCP streams, not HTTP-object messages. -- Server owns HTTP semantics through `httputil.ReverseProxy` and `http.Transport`. -- Agent only dials TCP origins and relays bytes. -- Agent and server versions must match during this rewrite. -- Management reverse proxies must allow HTTP/1.1 upgrade streaming for `/agent/tunnel` with `Upgrade: p2pstream-yamux`. -- Origin TLS verification for agent-pool backends is performed by the server-side transport. - -## Phase Checklist - -- [x] Phase 1: Tunnel package -- [x] Phase 2: Server tunnel bootstrap -- [x] Phase 3: Agent tunnel client -- [x] Phase 4: Unified proxy transport -- [x] Phase 5: Health checks, certificate discovery, environment proxy -- [x] Phase 6: Remove legacy WebSocket/msg/httpmsg runtime -- [x] Phase 7: Docs and setup snippets -- [x] Phase 8: Final verification -- [x] Phase 9: Documentation updates -- [x] Phase 10: Smoke and regression testing - -## Phase 2 Hardening Checklist - -- [x] Baseline validation -- [x] Docker smoke upstream replacement -- [x] Agent-pool smoke scenario expansion -- [x] Tunnel lifecycle unit/integration tests -- [x] Proxy parity tests -- [x] TLS and certificate discovery hardening -- [x] Observability and logging improvements -- [x] Dev ergonomics cleanup -- [x] Final validation - -## Phase 3 Pooling And Browser E2E Checklist - -- [x] Baseline validation -- [x] Agent transport pool implementation -- [x] Public proxy pooling integration -- [x] Health check pooling integration -- [x] Environment proxy pooling integration -- [x] Pool invalidation hooks -- [x] Pool lifecycle/concurrency tests -- [x] Playwright setup -- [x] Environment switch browser tests -- [x] Smoke and regression validation -- [x] Final handoff - -## Phase 4 Pre-Merge Checklist - -- [x] Branch hygiene and remote sync -- [x] High-risk implementation review -- [x] Generated file check -- [x] Full local validation -- [x] Docker smoke validation -- [x] Browser E2E validation -- [x] Legacy runtime reference scan -- [x] Merge decision recorded -- [x] Merge to dev -- [x] Post-merge validation - -## Files Changed - -| File | Purpose | -| --- | --- | -| `docs/.plans/agent-yamux-overhaul.md` | Persistent handoff ledger for this rewrite. | -| `go.mod`, `go.sum` | Added `github.com/hashicorp/yamux v0.1.2`; removed `github.com/coder/websocket`. | -| `internal/tunnel/*` | New tunnel protocol package: constants, OpenRequest/OpenResponse validation, length-prefixed JSON frames, Yamux config, tests. | -| `internal/server/server.go` | Replaced `/ws` with `/agent/tunnel`; added authenticated HTTP/1.1 upgrade, hijack, Yamux server session lifecycle, DB connected/disconnected state, duplicate rejection. | -| `internal/server/agent_hub.go` | Simplified agent hub to connected sessions only; removed pending request maps and late response tracker. | -| `internal/server/public_routing.go` | Agent-pool proxying now uses `httputil.ReverseProxy` and `http.Transport.DialContext` through Yamux streams. | -| `internal/server/public_backend_health.go` | Agent health checks now use the same server-side HTTP transport over `dialViaAgent`. | -| `internal/server/environment_proxy.go` | Environment proxy and certificate discovery now use server-side TLS over agent tunnel streams. | -| `internal/agent/agent.go` | Replaced WebSocket/message loop with tunnel bootstrap, Yamux client, stream accept loop, TCP dial, and bidirectional byte relay. | -| `internal/agent/management_tls_test.go` | Added regression coverage that tunnel bootstrap forces HTTP/1.1 ALPN even when the management TLS server supports HTTP/2. | -| `msg/*`, `httpmsg/*` | Deleted legacy custom HTTP-message protocol packages. | -| `internal/server/*test.go`, `internal/agent/*test.go`, root `*_test.go` | Migrated tests from WebSocket/message helpers to Yamux tunnel helpers and production-like stream relay. | -| `agent_tunnel_test.go` | Shared root test helper for HTTP/1.1 tunnel upgrade and Yamux agent relay. | -| `cmd/root.go`, `cmd/server.go` | Updated CLI description and management startup log from WebSocket to agent tunnel. | -| `Makefile` | Hardened `dev`, `run`, and `kill` cleanup so Air wrappers, repo-local server/agent child processes, Vite, and stale dev-port listeners are terminated on interrupt/restart. | -| `.air.toml` | Configured Air to send interrupts, wait briefly before killing the dev server child, and ignore noisy dependency/runtime directories. | -| `Dockerfile`, `docker-compose.test.yml` | Added a repo-owned Docker smoke upstream image and replaced the Python static upstream. | -| `internal/smoketest/upstream/main.go` | New smoke upstream with GET, POST echo, headers, streaming, slow-header, close-early, health, and minimal WebSocket endpoints. | -| `internal/smoketest/docker_smoke_test.go` | Expanded Docker smoke coverage for direct and agent-pool GET, POST body, streaming, forwarded headers, timeout, close-early, and WebSocket upgrade scenarios. | -| `internal/agent/agent.go` | Added unsupported-version stream error mapping, tunnel stream debug logs, relay byte/duration logging, and accept-loop exit logs. | -| `internal/agent/agent_test.go` | Added malformed/unsupported OpenRequest recovery and session-close tests. | -| `internal/server/agent_registry_test.go` | Strengthened token rotation tunnel test with DB disconnect, old-token rejection, and new-token reconnect assertions. | -| `internal/server/public_routing.go` | Added agent dial/open failure logs and closed late-opened Yamux streams when request context cancels. | -| `internal/server/server.go` | Added tunnel version, duration, and active request fields to connect/disconnect logs. | -| `internal/server/public_routing_timeout_test.go` | Added upload cancellation, mid-response agent disconnect, and agent-backed HTTPS verification tests. | -| `internal/server/environments_test.go` | Added agent environment certificate discovery, trust, proxy, changed-certificate rejection, and disconnected-agent discovery tests. | -| `internal/server/public_cache_test.go` | Added agent-pool cache miss/store/hit parity coverage with selected agent event recording. | -| `web/management/vite.config.ts` | Proxies environment-scoped Connect calls under `/environments/` during Vite dev sessions. | -| `internal/server/agent_transport_pool.go` | Added keyed per-agent `http.Transport` pooling with request-ID context propagation, close-by-agent/backend/environment invalidation, and shutdown cleanup. | -| `internal/server/agent_transport_pool_test.go` | Added pool reuse, separation, invalidation, reconnect, concurrency, and request-ID fallback coverage. | -| `internal/server/server.go` | Added `App.AgentTransports`, disconnect-hook pool cleanup, and app-level transport cleanup helper. | -| `internal/server/agent_hub.go` | Added disconnect hook execution outside the hub mutex for pool invalidation. | -| `internal/server/public_routing.go` | Switched agent-backed public proxying from per-request transports to pooled per-agent/backend transports. | -| `internal/server/public_backend_health.go` | Switched agent-backed health checks to the public backend transport pool and request-ID context propagation. | -| `internal/server/environment_proxy.go` | Switched agent-backed environment proxy calls to pooled per-agent/environment transports while keeping auth wrapping outside the pool. | -| `internal/server/agent_registry.go`, `internal/server/public_config.go`, `internal/server/environments.go` | Added defensive pool invalidation after agent, backend, environment, token, and trust changes. | -| `internal/server/agent_registry_test.go`, `internal/server/environments_test.go`, `internal/server/public_routing_timeout_test.go` | Extended existing lifecycle/environment fixtures to assert pooling and invalidation behavior. | -| `web/management/package.json`, `web/management/bun.lock` | Added Playwright as an explicit frontend dev dependency and added E2E scripts. | -| `web/management/playwright.config.ts` | Added deterministic browser E2E server orchestration with isolated SQLite/cache directories and management/Vite projects. | -| `web/management/e2e/environment-switch.spec.ts`, `web/management/e2e/helpers/connect.ts` | Added browser coverage for switching to a trusted loopback environment through management proxy and direct Vite. | -| `Makefile` | Added `frontend-e2e` target for explicit Playwright execution. | -| `.air.toml`, `.gitignore` | Excluded Playwright artifact directories from Air and Git. | -| `README.md`, `docs/**` | Updated transport, reverse proxy, TLS ownership, and compatibility documentation. | -| `docs/public/architecture.svg` | Updated architecture diagram text from WSS to Yamux tunnel. | - -## Validation Log - -| Date | Command | Result | Notes | -| --- | --- | --- | --- | -| `2026-06-05T22:38:56Z` | `git pull --ff-only` | passed | `dev` was already up to date before branch creation. | -| `2026-06-05T22:40:00Z` | `go test ./internal/tunnel` | passed | Phase 1 package tests. | -| `2026-06-05T22:45:00Z` | `go test ./internal/server` | passed | Phase 2 server bootstrap checkpoint. | -| `2026-06-05T22:50:00Z` | `go test ./internal/agent` | passed | Phase 3 agent tunnel client checkpoint. | -| `2026-06-06T01:04:00+02:00` | `go test ./...` | passed | Full Go suite after deleting legacy protocol packages. | -| `2026-06-06T01:10:00+02:00` | `make test` | passed | Includes `go test ./...` and `vue-tsc --noEmit`. | -| `2026-06-06T01:11:00+02:00` | `make build` | passed | Frontend/backend build passed; Vite emitted existing large-chunk warning. | -| `2026-06-06T01:16:00+02:00` | `make docker-smoke` | failed | Docker build failed before app smoke checks: temporary DNS failure resolving `deb.debian.org` during `apt-get update`. | -| `2026-06-06T01:16:00+02:00` | `make docker-smoke-clean` | passed | Cleanup completed after failed Docker build. | -| `2026-06-06T01:17:00+02:00` | `rg -n "httpmsg\|p2pstream/msg\|github.com/coder/websocket\|PendingRequests\|LateAgentResponses\|pendingAgentRequest\|WriteCh\|/ws\\b" -S --glob '!docs/.plans/**'` | passed | No legacy runtime refs; matches only unrelated `PendingRequestsPerFlush` naming in web trace store. | -| `2026-06-06T13:55:00+02:00` | `go test ./internal/agent` | passed | Covers the HTTP/1.1 ALPN tunnel bootstrap regression. | -| `2026-06-06T13:56:00+02:00` | `go test ./...` | passed | Full Go suite after ALPN fix. | -| `2026-06-06T13:57:00+02:00` | `timeout 35s make dev; status=$?; make kill; if [ "$status" = 124 ]; then exit 0; else exit "$status"; fi` | passed | Bounded dev smoke reached `Connected tunnel successfully`; local ports `8081`, `8088`, and `8089` were already occupied by another dev instance before cleanup. | -| `2026-06-06T14:02:00+02:00` | `make kill && ss -ltnp '( sport = :8081 or sport = :8088 or sport = :8089 or sport = :5173 )'` | passed | Cleanup removed stale dev listeners; `ss` showed no bound dev ports. | -| `2026-06-06T14:03:00+02:00` | `timeout -s INT 30s make dev; status=$?; sleep 1; ss -ltnp '( sport = :8081 or sport = :8088 or sport = :8089 or sport = :5173 )'; ...` | passed | Restarted from clean state, connected the agent tunnel, handled interrupt, logged graceful shutdown, and released all dev ports. | -| `2026-06-06T14:04:00+02:00` | `git diff --check` | passed | No whitespace errors in the pending diff. | -| `2026-06-06T14:10:00+02:00` | `git status --short --branch` | passed | Clean phase 2 starting point on `rewrite/agent-yamux-transport`. | -| `2026-06-06T14:10:00+02:00` | `git diff --check` | passed | Baseline diff check before phase 2 edits. | -| `2026-06-06T14:11:00+02:00` | `go test ./...` | passed | Baseline full Go suite before phase 2 edits. | -| `2026-06-06T14:12:00+02:00` | `make test` | passed | Baseline `go test ./...` plus management UI typecheck. | -| `2026-06-06T14:13:00+02:00` | `make build` | passed | Baseline frontend/backend build; Vite emitted existing large-chunk warning. | -| `2026-06-06T14:13:00+02:00` | `make kill && ss -ltnp '( sport = :8081 or sport = :8088 or sport = :8089 or sport = :5173 )'` | passed | Baseline cleanup left no dev listeners. | -| `2026-06-06T14:17:00+02:00` | `make docker-smoke` | passed | Existing Docker smoke reached app-level checks and passed after Docker DNS was available. | -| `2026-06-06T14:31:00+02:00` | `go test ./internal/agent ./internal/server ./internal/smoketest ./internal/smoketest/upstream` | failed | First targeted run exposed a test fixture issue: the new agent cache test needed a matching DB agent row for the event FK. | -| `2026-06-06T14:32:00+02:00` | `go test ./internal/server` | passed | Passed after seeding the cache test agent row. | -| `2026-06-06T14:32:00+02:00` | `go test ./internal/agent ./internal/server ./internal/smoketest ./internal/smoketest/upstream` | passed | Targeted phase 2 package validation passed. | -| `2026-06-06T14:33:00+02:00` | `git diff --check` | passed | No whitespace errors after the phase 2 implementation. | -| `2026-06-06T14:34:00+02:00` | `go test ./...` | passed | Full Go suite after phase 2 implementation. | -| `2026-06-06T14:35:00+02:00` | `make test` | passed | Includes full Go suite and management UI typecheck. | -| `2026-06-06T14:36:00+02:00` | `make build` | passed | Frontend/backend build passed; Vite emitted the existing large-chunk warning. | -| `2026-06-06T14:37:00+02:00` | `timeout -s INT 30s make dev` | failed | Exposed stale listeners on `8081`, `8088`, and `8089` from a previous dev run; fixed by extending `make kill` to terminate listener PIDs found via `ss`. | -| `2026-06-06T14:38:00+02:00` | `make kill && ss -ltnp '( sport = :8081 or sport = :8088 or sport = :8089 or sport = :5173 )'` | passed | Cleanup left no dev listeners after the listener-PID fallback. | -| `2026-06-06T14:39:00+02:00` | `timeout -s INT 30s make dev; make kill; ss -ltnp '( sport = :8081 or sport = :8088 or sport = :8089 or sport = :5173 )'` | passed | Dev started, Air ignored noisy dirs, agent tunnel connected, interrupt shut services down, and no dev ports remained bound. | -| `2026-06-06T14:40:00+02:00` | `make docker-smoke-clean && make docker-smoke` | passed | Expanded Docker smoke passed direct GET/POST/stream and agent-pool GET/POST/stream/headers/timeout/close-early/WebSocket scenarios. | -| `2026-06-06T14:41:00+02:00` | `make docker-smoke-clean` | passed | Removed smoke containers, network, and data volume after the successful run. | -| `2026-06-06T14:41:00+02:00` | `go test -race ./internal/tunnel ./internal/agent ./internal/server` | passed | Race run passed for the tunnel protocol, agent relay, and server proxy/lifecycle packages. | -| `2026-06-06T14:48:00+02:00` | `go test ./internal/server -run 'TestEnvironmentRequiresHTTPSAndTrustedCertificateBeforeProxy\|TestAgentEnvironmentProxyDiscoversAndPinsCertificate'` | passed | Added `GetPublicProxyConfig` coverage through direct and agent environment proxies. | -| `2026-06-06T14:49:00+02:00` | `bun run --cwd web/management typecheck` | passed | Verified management UI types after adding the Vite `/environments/` dev proxy. | -| `2026-06-06T14:50:00+02:00` | `bun run --cwd web/management build` | passed | Vite build passed; emitted the existing large-chunk warning. | -| `2026-06-06T15:09:00+02:00` | `git status --short --branch` | passed | Clean phase 3 starting point on `rewrite/agent-yamux-transport`. | -| `2026-06-06T15:09:00+02:00` | `git diff --check` | passed | Baseline diff check before phase 3 edits. | -| `2026-06-06T15:11:00+02:00` | `go test ./...` | passed | Baseline full Go suite before pooling changes. | -| `2026-06-06T15:12:00+02:00` | `make test` | passed | Baseline full Go suite plus management UI typecheck. | -| `2026-06-06T15:13:00+02:00` | `make build` | passed | Baseline frontend/backend build; Vite emitted the existing large-chunk warning. | -| `2026-06-06T15:16:00+02:00` | `make docker-smoke-clean && make docker-smoke` | passed | Baseline Docker smoke passed before pooling changes. | -| `2026-06-06T15:16:00+02:00` | `make docker-smoke-clean` | passed | Removed smoke containers, network, and data volume after the baseline run. | -| `2026-06-06T15:24:00+02:00` | `go test ./internal/server -run 'TestAgentPoolHealthCheckRunsThroughAssignedAgent\|TestAgentEnvironmentProxyDiscoversAndPinsCertificate\|TestAgentProxy'` | passed | Initial pooling integration checkpoint. | -| `2026-06-06T15:26:00+02:00` | `go test ./internal/server -run 'TestAgentTransportPool\|TestRotateAgentTokenDisconnectsActiveAgent\|TestAgentEnvironmentProxyDiscoversAndPinsCertificate\|TestAgentProxy\|TestPublicBackendHealth'` | passed | Pool unit/integration tests, token rotation invalidation, and environment reuse checkpoint. | -| `2026-06-06T15:27:00+02:00` | `bun run --cwd web/management typecheck` | passed | Frontend typecheck after adding Playwright files. | -| `2026-06-06T15:28:00+02:00` | `bun run --cwd web/management e2e` | failed | Playwright browser binary was not installed in the local environment. | -| `2026-06-06T15:29:00+02:00` | `bun run --cwd web/management e2e:install` | passed | Installed Chromium for local Playwright execution. | -| `2026-06-06T15:31:00+02:00` | `bun run --cwd web/management e2e` | failed | Exposed Playwright setup isolation issue: implicit config DB migrated the repo-root legacy `p2pstream.db`. | -| `2026-06-06T15:42:00+02:00` | `bun run --cwd web/management e2e` | passed | Browser environment switch passed through both `management-proxy` and `vite-direct` projects after using an explicit isolated SQLite URL and deterministic E2E server startup. | -| `2026-06-06T15:43:00+02:00` | `git diff --check` | passed | No whitespace errors after Phase 3 implementation. | -| `2026-06-06T15:43:00+02:00` | `go test ./internal/server -run 'TestAgentTransportPool\|TestEnvironment\|TestAgentProxy\|TestPublicBackendHealth'` | passed | Focused server regression set passed. | -| `2026-06-06T15:43:00+02:00` | `bun run --cwd web/management typecheck` | passed | Frontend typecheck passed after Playwright config fixes. | -| `2026-06-06T15:44:00+02:00` | `go test ./internal/server` | passed | Full server package passed with pooling enabled. | -| `2026-06-06T15:44:00+02:00` | `bun run --cwd web/management build` | passed | Frontend production build passed; Vite emitted the existing large-chunk warning. | -| `2026-06-06T15:45:00+02:00` | `go test ./...` | passed | Full Go suite passed after pooling and E2E changes. | -| `2026-06-06T15:45:00+02:00` | `make test` | passed | Full Makefile test target passed, including management UI typecheck. | -| `2026-06-06T15:46:00+02:00` | `make build` | passed | Frontend/backend build passed; Vite emitted the existing large-chunk warning. | -| `2026-06-06T15:47:00+02:00` | `bun run --cwd web/management e2e` | passed | Final browser E2E rerun passed both management proxy and direct Vite projects. | -| `2026-06-06T15:48:00+02:00` | `make docker-smoke-clean && make docker-smoke && make docker-smoke-clean` | passed | Docker smoke passed direct and agent-pool GET/POST/stream/header/timeout/close-early/WebSocket scenarios and cleaned up containers/volume/network. | -| `2026-06-06T15:49:00+02:00` | `go test -race ./internal/server` | passed | Race coverage passed for server pooling, proxying, environment, and lifecycle tests. | -| `2026-06-06T15:49:00+02:00` | `make kill && ss -ltnp '( sport = :8081 or sport = :8088 or sport = :8089 or sport = :5173 or sport = :19081 )'` | passed | Final cleanup left no dev/E2E listeners bound. | -| `2026-06-06T18:58:00+02:00` | `git switch rewrite/agent-yamux-transport && git status --short --branch && git fetch origin && git rev-list --left-right --count origin/dev...rewrite/agent-yamux-transport && git log --oneline origin/dev..rewrite/agent-yamux-transport` | passed | Branch clean; `origin/dev` had not moved; branch was `0` behind and `4` commits ahead. | -| `2026-06-06T18:59:00+02:00` | Manual high-risk review | passed | Reviewed pool keys, invalidation hooks, request-ID context propagation, Playwright isolated `DATABASE_URL`, and legacy transport removal points. No code changes needed. | -| `2026-06-06T19:00:00+02:00` | `make generate && git diff --exit-code -- . ':(exclude)docs/.plans/agent-yamux-overhaul.md'` | passed | No generated-file changes; ledger was intentionally dirty for Phase 4 bookkeeping. | -| `2026-06-06T19:00:00+02:00` | `bash -n scripts/install-agent.sh` | passed | Installer shell syntax check passed. | -| `2026-06-06T19:01:00+02:00` | `go test ./...` | passed | Full Go suite passed. | -| `2026-06-06T19:01:00+02:00` | `go vet ./...` | passed | Go vet passed. | -| `2026-06-06T19:01:00+02:00` | `bun test --cwd web/management src/lib/*.test.ts` | failed | This invocation treats the glob as a Bun filter; reran using the CI workflow form below. | -| `2026-06-06T19:01:00+02:00` | `cd web/management && bun test src/lib/*.test.ts` | passed | Frontend unit tests passed: 92 tests across 9 files. | -| `2026-06-06T19:01:00+02:00` | `bun run --cwd web/management typecheck` | passed | Management UI typecheck passed. | -| `2026-06-06T19:01:00+02:00` | `bun run --cwd web/management build` | passed | Frontend build passed; Vite emitted the existing large-chunk warning. | -| `2026-06-06T19:02:00+02:00` | `docker build --target runtime -t p2pstream:premerge .` | passed | Runtime image build passed; frontend build emitted the existing large-chunk warning. | -| `2026-06-06T19:02:00+02:00` | `go test -race ./internal/tunnel ./internal/agent ./internal/server` | passed | Race checks passed for tunnel, agent, and server packages. | -| `2026-06-06T19:02:00+02:00` | `bun run --cwd web/management e2e` | failed | A stale previous dev instance was bound to `5173`, `8081`, `8088`, and `8089`; cleaned with `make kill`. | -| `2026-06-06T19:04:00+02:00` | `bun run --cwd web/management e2e` | failed | Vite process stayed alive without binding after stale-state cleanup; terminated and removed `web/management/test-results`, `web/management/playwright-report`, and Playwright tmp dirs. | -| `2026-06-06T19:05:00+02:00` | `bun run --cwd web/management e2e` | passed | Browser environment switch passed in both `management-proxy` and `vite-direct` projects after clean artifact state. | -| `2026-06-06T19:06:00+02:00` | `make docker-smoke-clean && make docker-smoke && make docker-smoke-clean` | passed | Docker smoke passed direct and agent-pool GET/POST/stream/header/timeout/close-early/WebSocket scenarios and cleaned up containers/volume/network. | -| `2026-06-06T19:07:00+02:00` | `timeout -s INT 30s make dev; make kill; ss -ltnp '( sport = :8081 or sport = :8088 or sport = :8089 or sport = :5173 or sport = :19081 )'` | passed | Dev server started, Vite bound, agent tunnel connected, interrupt shut services down, and no checked ports remained bound. | -| `2026-06-06T19:07:00+02:00` | `rg -n "github.com/coder/websocket\|p2pstream/msg\|httpmsg\|PendingRequests\|LateAgentResponses\|pendingAgentRequest\|WriteCh\|/ws\\b" -S --glob '!docs/.plans/**' --glob '!web/management/bun.lock'` | passed | Only allowed matches: smoke `/ws` endpoint/test and unrelated frontend `PendingRequestsPerFlush` naming. | -| `2026-06-06T19:08:00+02:00` | `git switch dev && git pull --ff-only origin dev && git merge --no-ff rewrite/agent-yamux-transport -m "Merge yamux agent transport rewrite"` | passed | Created merge commit `692f598`; merge was clean. | -| `2026-06-06T19:08:00+02:00` | `git status --short --branch && git log --oneline --decorate -6` | passed | `dev` was clean and ahead of `origin/dev` by the merge history; recent log showed `692f598` plus the five feature-branch commits. | -| `2026-06-06T19:09:00+02:00` | `make generate && git diff --exit-code` | passed | No generated files changed after the merge. | -| `2026-06-06T19:09:00+02:00` | `bash -n scripts/install-agent.sh` | passed | Installer shell syntax still valid on merged `dev`. | -| `2026-06-06T19:10:00+02:00` | `go test ./...` | passed | Full Go suite passed on merged `dev`. | -| `2026-06-06T19:10:00+02:00` | `go vet ./...` | passed | Go vet passed on merged `dev`. | -| `2026-06-06T19:10:00+02:00` | `cd web/management && bun test src/lib/*.test.ts` | passed | Frontend unit tests passed: 92 tests across 9 files. | -| `2026-06-06T19:10:00+02:00` | `bun run --cwd web/management typecheck` | passed | Management UI typecheck passed. | -| `2026-06-06T19:10:00+02:00` | `bun run --cwd web/management build` | passed | Frontend production build passed; Vite emitted the existing large-chunk warning. | -| `2026-06-06T19:11:00+02:00` | `bun run --cwd web/management e2e` | passed | Browser environment switch passed in both `management-proxy` and `vite-direct` projects after cleaning stale Playwright artifacts. | -| `2026-06-06T19:12:00+02:00` | `make docker-smoke-clean && make docker-smoke && make docker-smoke-clean` | passed | Docker smoke passed direct and agent-pool GET/POST/stream/header/timeout/close-early/WebSocket scenarios and cleaned up containers/volume/network. | -| `2026-06-06T19:12:00+02:00` | `docker build --target runtime -t p2pstream:dev-merged .` | passed | Optional merged runtime image build passed. | -| `2026-06-06T19:13:00+02:00` | `go test -race ./internal/tunnel ./internal/agent ./internal/server` | passed | Optional race checks passed for tunnel, agent, and server packages. | -| `2026-06-06T19:14:00+02:00` | `make kill && ss -ltnp '( sport = :8081 or sport = :8088 or sport = :8089 or sport = :5173 or sport = :19081 )'` | passed | Final cleanup left no dev/E2E listeners bound. | -| `2026-06-06T19:16:00+02:00` | `git push origin dev` | failed | GitHub rejected direct `dev` update because branch protection requires pull requests and the required `Verify` status check. | -| `2026-06-06T19:17:00+02:00` | `git push origin HEAD:refs/heads/merge/yamux-agent-transport-dev && gh pr create --base dev --head merge/yamux-agent-transport-dev` | passed | Opened PR #45 with the locally validated merge state. | -| `2026-06-06T19:23:00+02:00` | `gh pr checks 45 --watch` | failed | Required `Verify` passed; CodeQL raised one new high alert for intentional certificate-discovery `InsecureSkipVerify` in agent-backed environment discovery. | -| `2026-06-06T19:25:00+02:00` | CodeQL fix attempt | failed | A narrow `codeql[go/disabled-certificate-check]` suppression did not clear the code-scanning status. | -| `2026-06-06T19:31:00+02:00` | CodeQL structural fix attempt | failed | A normal TLS handshake does not expose the peer certificate after unknown-authority failure in this path, so agent-backed discovery could no longer trust a self-signed environment. | -| `2026-06-06T19:35:00+02:00` | CodeQL false-positive handling | pending | Restored the working discovery-only `InsecureSkipVerify` path with a source-code rationale; the alert needs repository code-scanning dismissal. | - -## Deferred: Per-Agent Transport Pool Refinements - -Current state: -- Agent-backed public proxy, health check, and environment proxy paths use pooled `http.Transport` instances. -- Pools are keyed by agent plus backend/environment/TLS/timeout/origin inputs. -- Idle connections are closed on agent disconnect, token rotation, backend update/delete, environment update/delete/trust, and app shutdown. - -Future design: -- Add per-agent/backend pool observability if operators need runtime visibility into idle connection reuse. -- Tune idle connection limits from production data. -- Consider closing environment pools on management access-token changes if token lifecycle grows beyond current auth wrapping. - -## Phase 4 Merge Decision - -Final branch commits: -- `131ba39 Rewrite agent transport with yamux tunnel` -- `4881478 Harden yamux transport smoke and lifecycle coverage` -- `0bda8c0 Fix environment-scoped management proxy in dev` -- `7cf9677 Add agent transport pooling and browser e2e coverage` -- `566ec5c Record yamux rewrite pre-merge validation` -- `692f598 Merge yamux agent transport rewrite` -- Follow-up commit: document the protected-branch PR path and the intentional discovery-only CodeQL certificate-check false positive. - -Merge decision: -- Merged `rewrite/agent-yamux-transport` into `dev` with `git merge --no-ff`. -- Direct push to `origin/dev` is blocked by branch protection; PR #45 carries the locally validated merge state. -- Preserve branch commits; do not squash. -- Add no new protocol, API, proto, or production metrics changes in Phase 4. -- Created only ledger validation commits during Phase 4. - -## Handoff Notes - -- Next task: monitor PR #45, merge it through GitHub after CodeQL rerun passes, then plan the separate `main` release when `dev` is stable. -- Known failing tests: none. -- Known risks: pooled transports intentionally reuse Yamux streams for sequential same-key HTTP requests; invalidation coverage is broad, but future config surfaces must close the relevant pool entries when they affect dialing, TLS, origin, or timeout behavior. -- Important implementation details: - - `/agent/tunnel` requires HTTP/1.1 hijack; tunnel clients disable HTTP/2 for bootstrap and force TLS ALPN to `http/1.1`. - - `make dev` now traps `INT`, `TERM`, and `EXIT` and cleans up both wrapper PIDs and repo-local child binaries so restarts do not leave `8081`, `8088`, or `8089` bound. - - Playwright E2E uses an explicit isolated `DATABASE_URL`; relying only on `CONFIG_DIR` would migrate the repo-root legacy `p2pstream.db`. - - Playwright starts the backend, waits for `GetSetupState`, starts the agent, then starts Vite so both `management-proxy` and `vite-direct` projects are ready deterministically. - - `web/management/test-results` and `web/management/playwright-report` are ignored by Git and Air. - - Each upstream TCP connection maps to one Yamux stream. - - `OpenRequest` and `OpenResponse` are length-prefixed JSON control frames capped at 16 KiB. - - Agent decode/open failures close only the stream, not the session. - - Server-side proxy, health check, environment proxy, and certificate discovery all use `dialViaAgent`. - - Pooled agent transports pull the request ID from context; if absent, `dialViaAgent` receives a generated UUID fallback. - - Old WebSocket agents are intentionally incompatible. diff --git a/internal/agent/agent.go b/internal/agent/agent.go index f568457..3dc7226 100644 --- a/internal/agent/agent.go +++ b/internal/agent/agent.go @@ -45,6 +45,8 @@ var ( agentReconnectBackoffMax = 30 * time.Second ) +const agentTunnelResponseHeaderTimeout = 15 * time.Second + type Options struct { ManagementURL string PublicID string @@ -248,6 +250,9 @@ func managementTunnelHTTPClient(base *http.Client) (*http.Client, error) { } transport.ForceAttemptHTTP2 = false transport.TLSNextProto = map[string]func(string, *tls.Conn) http.RoundTripper{} + if transport.ResponseHeaderTimeout == 0 { + transport.ResponseHeaderTimeout = agentTunnelResponseHeaderTimeout + } protocols := new(http.Protocols) protocols.SetHTTP1(true) transport.Protocols = protocols @@ -288,7 +293,7 @@ func connectAndServe(client *http.Client, tunnelURL string, agentPublicID string if resp.Body != nil { data, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) body = strings.TrimSpace(string(data)) - resp.Body.Close() + _ = resp.Body.Close() } if body != "" { return fmt.Errorf("agent tunnel upgrade failed: status %d: %s", resp.StatusCode, body) @@ -296,17 +301,17 @@ func connectAndServe(client *http.Client, tunnelURL string, agentPublicID string return fmt.Errorf("agent tunnel upgrade failed: status %d", resp.StatusCode) } if got := resp.Header.Get("Upgrade"); !strings.EqualFold(got, tunnel.UpgradeToken) { - resp.Body.Close() + _ = resp.Body.Close() return fmt.Errorf("agent tunnel upgrade response header = %q", got) } rwc, ok := resp.Body.(io.ReadWriteCloser) if !ok { - resp.Body.Close() + _ = resp.Body.Close() return fmt.Errorf("agent tunnel response body is %T, want io.ReadWriteCloser", resp.Body) } session, err := yamux.Client(rwc, tunnel.DefaultYamuxConfig(nil)) if err != nil { - rwc.Close() + _ = rwc.Close() return fmt.Errorf("failed to initialize tunnel session: %w", err) } defer session.Close() diff --git a/internal/server/agent_registry.go b/internal/server/agent_registry.go index 1a60b18..8bd5105 100644 --- a/internal/server/agent_registry.go +++ b/internal/server/agent_registry.go @@ -91,12 +91,12 @@ func (a *App) UpdateAgent( if err != nil { return nil, publicDBError(err) } - if err := a.refreshPublicProxySnapshot(ctx); err != nil { - return nil, err - } if a.AgentTransports != nil { a.AgentTransports.closeAgent(req.Msg.Id) } + if err := a.refreshPublicProxySnapshot(ctx); err != nil { + return nil, err + } return connect.NewResponse(&p2pstreamv1.UpdateAgentResponse{Agent: a.agentToProto(ctx, agent)}), nil } diff --git a/internal/server/public_backend_health_test.go b/internal/server/public_backend_health_test.go index 2705f51..fdada00 100644 --- a/internal/server/public_backend_health_test.go +++ b/internal/server/public_backend_health_test.go @@ -95,6 +95,7 @@ func TestAgentPoolHealthCheckRunsThroughAssignedAgent(t *testing.T) { t.Fatalf("connect agent: %v", err) } defer app.AgentHub.disconnect(agent) + defer fake.close() snap := &publicProxySnapshot{ Backends: map[int64]publicBackendConfig{backend.ID: backend}, @@ -231,12 +232,13 @@ func TestAgentHealthTraceRecordsSuccessAndDebugAttributes(t *testing.T) { backend := testHealthBackend(t, 102, publicBackendForwardModeAgentPool, upstream.URL) backend.AgentAssignments = []publicBackendAgentConfig{{BackendID: backend.ID, AgentID: 7, Position: 0, Weight: 100, Enabled: true}} - agent, _ := newFakeYamuxAgent(t, 7, "agent-7") + agent, fake := newFakeYamuxAgent(t, 7, "agent-7") agent.Name = "Agent Seven" if err := app.AgentHub.connect(agent); err != nil { t.Fatalf("connect agent: %v", err) } defer app.AgentHub.disconnect(agent) + defer fake.close() app.BackendHealth.reconcile(app, &publicProxySnapshot{ Backends: map[int64]publicBackendConfig{backend.ID: backend}, Agents: map[int64]publicAgentConfig{7: {ID: 7, PublicID: "agent-7", Name: "Agent Seven", Enabled: true}}, diff --git a/internal/server/public_routing.go b/internal/server/public_routing.go index 8989573..8727bb1 100644 --- a/internal/server/public_routing.go +++ b/internal/server/public_routing.go @@ -974,9 +974,27 @@ func (a *App) dialViaAgent(ctx context.Context, agent *AgentConn, network string openCh := make(chan struct { conn net.Conn err error - }) + }, 1) + openDone := make(chan struct{}) + stopOpenWatch := func() { + select { + case <-openDone: + default: + close(openDone) + } + } + session := agent.Session + go func() { + select { + case <-ctx.Done(): + _ = session.Close() + case <-agent.Done: + _ = session.Close() + case <-openDone: + } + }() go func() { - conn, err := agent.Session.Open() + conn, err := session.Open() result := struct { conn net.Conn err error @@ -997,6 +1015,7 @@ func (a *App) dialViaAgent(ctx context.Context, agent *AgentConn, network string var conn net.Conn select { case result := <-openCh: + stopOpenWatch() if result.err != nil { if agent != nil { log.Debug(). @@ -1010,6 +1029,8 @@ func (a *App) dialViaAgent(ctx context.Context, agent *AgentConn, network string } conn = result.conn case <-ctx.Done(): + _ = agent.Session.Close() + stopOpenWatch() if agent != nil { log.Debug(). Err(ctx.Err()). @@ -1020,6 +1041,8 @@ func (a *App) dialViaAgent(ctx context.Context, agent *AgentConn, network string } return nil, ctx.Err() case <-agent.Done: + _ = agent.Session.Close() + stopOpenWatch() log.Debug(). Str("request_id", requestID). Str("agent", agent.PublicID). @@ -1031,6 +1054,24 @@ func (a *App) dialViaAgent(ctx context.Context, agent *AgentConn, network string if deadline, ok := ctx.Deadline(); ok { _ = conn.SetDeadline(deadline) } + handshakeDone := make(chan struct{}) + stopHandshakeWatch := func() { + select { + case <-handshakeDone: + default: + close(handshakeDone) + } + } + go func() { + select { + case <-ctx.Done(): + _ = conn.Close() + case <-agent.Done: + _ = conn.Close() + case <-handshakeDone: + } + }() + defer stopHandshakeWatch() req := tunnel.NewOpenRequest(requestID, network, address) if err := tunnel.WriteOpenRequest(conn, req); err != nil { _ = conn.Close() diff --git a/internal/server/server.go b/internal/server/server.go index 0e13e68..708a166 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "net/http" + "strconv" "strings" "sync" "sync/atomic" @@ -284,7 +285,8 @@ func (a *App) agentTunnelHandler(w http.ResponseWriter, r *http.Request) { http.Error(w, "agent tunnel upgrade required", http.StatusBadRequest) return } - if r.Header.Get(tunnel.TunnelVersionHeader) != "1" { + version := strconv.Itoa(tunnel.ProtocolVersion) + if r.Header.Get(tunnel.TunnelVersionHeader) != version { http.Error(w, "unsupported tunnel version", http.StatusUpgradeRequired) return } @@ -294,43 +296,22 @@ func (a *App) agentTunnelHandler(w http.ResponseWriter, r *http.Request) { return } - var connID int64 - if a.DB != nil { - id, err := a.DB.InsertConnection(r.Context(), sql.NullInt64{Int64: agentRow.ID, Valid: true}) - if err == nil { - connID = id - if err := a.DB.MarkAgentConnected(r.Context(), agentRow.ID); err != nil { - log.Warn().Err(err).Str("agent", agentRow.PublicID).Msg("Failed to update agent connected timestamp") - } - } else { - log.Warn().Err(err).Msg("Failed to insert connection into DB") - } + if existing := a.AgentHub.connectedByID(agentRow.ID); existing != nil { + log.Warn().Str("agent", agentRow.PublicID).Msg("Rejecting duplicate agent connection") + http.Error(w, "agent is already connected", http.StatusConflict) + return } agent := &AgentConn{ - AgentID: agentRow.ID, - PublicID: agentRow.PublicID, - Name: agentRow.Name, - Done: make(chan struct{}), - ConnectedAt: time.Now(), - ConnectionDBID: connID, - } - - if err := a.AgentHub.connect(agent); err != nil { - log.Warn().Err(err).Str("agent", agent.PublicID).Msg("Rejecting duplicate agent connection") - http.Error(w, err.Error(), http.StatusConflict) - return - } - cleanupAgent := func() { - a.AgentHub.disconnect(agent) - if a.BackendHealth != nil { - a.BackendHealth.recordAgentDisconnectedForAll(agent.AgentID) - } + AgentID: agentRow.ID, + PublicID: agentRow.PublicID, + Name: agentRow.Name, + Done: make(chan struct{}), + ConnectedAt: time.Now(), } rawConn, rw, err := hijacker.Hijack() if err != nil { - cleanupAgent() log.Error().Err(err).Str("agent", agent.PublicID).Msg("Failed to hijack agent tunnel") return } @@ -338,17 +319,15 @@ func (a *App) agentTunnelHandler(w http.ResponseWriter, r *http.Request) { _, _ = rw.WriteString("HTTP/1.1 400 Bad Request\r\nConnection: close\r\n\r\nunexpected buffered tunnel data\n") _ = rw.Flush() _ = rawConn.Close() - cleanupAgent() return } _, _ = rw.WriteString("HTTP/1.1 101 Switching Protocols\r\n") _, _ = rw.WriteString("Connection: Upgrade\r\n") _, _ = rw.WriteString("Upgrade: " + tunnel.UpgradeToken + "\r\n") - _, _ = rw.WriteString(tunnel.TunnelVersionHeader + ": 1\r\n") + _, _ = rw.WriteString(tunnel.TunnelVersionHeader + ": " + version + "\r\n") _, _ = rw.WriteString("\r\n") if err := rw.Flush(); err != nil { _ = rawConn.Close() - cleanupAgent() log.Error().Err(err).Str("agent", agent.PublicID).Msg("Failed to write agent tunnel upgrade response") return } @@ -356,11 +335,41 @@ func (a *App) agentTunnelHandler(w http.ResponseWriter, r *http.Request) { session, err := yamux.Server(rawConn, tunnel.DefaultYamuxConfig(nil)) if err != nil { _ = rawConn.Close() - cleanupAgent() log.Error().Err(err).Str("agent", agent.PublicID).Msg("Failed to initialize agent tunnel session") return } agent.Session = session + + if a.DB != nil { + id, err := a.DB.InsertConnection(r.Context(), sql.NullInt64{Int64: agentRow.ID, Valid: true}) + if err == nil { + agent.ConnectionDBID = id + if err := a.DB.MarkAgentConnected(r.Context(), agentRow.ID); err != nil { + log.Warn().Err(err).Str("agent", agentRow.PublicID).Msg("Failed to update agent connected timestamp") + } + } else { + log.Warn().Err(err).Msg("Failed to insert connection into DB") + } + } + if err := a.AgentHub.connect(agent); err != nil { + _ = session.Close() + if a.DB != nil && agent.ConnectionDBID > 0 { + if err := a.DB.UpdateConnectionDisconnected(context.Background(), agent.ConnectionDBID); err != nil { + log.Warn().Err(err).Msg("Failed to update rejected connection disconnection time") + } + if err := a.DB.MarkAgentDisconnected(context.Background(), agent.AgentID); err != nil { + log.Warn().Err(err).Str("agent", agent.PublicID).Msg("Failed to update rejected agent disconnected timestamp") + } + } + log.Warn().Err(err).Str("agent", agent.PublicID).Msg("Rejecting duplicate agent connection") + return + } + cleanupAgent := func() { + a.AgentHub.disconnect(agent) + if a.BackendHealth != nil { + a.BackendHealth.recordAgentDisconnectedForAll(agent.AgentID) + } + } if a.BackendHealth != nil { a.BackendHealth.recordAgentConnectedForAll(agent.AgentID, agent.PublicID) } @@ -383,8 +392,8 @@ func (a *App) agentTunnelHandler(w http.ResponseWriter, r *http.Request) { Int64("duration_ms", time.Since(agent.ConnectedAt).Milliseconds()). Int64("active_requests", agent.ActiveRequests.Load()). Msg("Agent tunnel disconnected") - if a.DB != nil && connID > 0 { - if err := a.DB.UpdateConnectionDisconnected(context.Background(), connID); err != nil { + if a.DB != nil && agent.ConnectionDBID > 0 { + if err := a.DB.UpdateConnectionDisconnected(context.Background(), agent.ConnectionDBID); err != nil { log.Warn().Err(err).Msg("Failed to update disconnection time") } if err := a.DB.MarkAgentDisconnected(context.Background(), agent.AgentID); err != nil { diff --git a/internal/smoketest/docker_smoke_test.go b/internal/smoketest/docker_smoke_test.go index 55efbd3..bb3957a 100644 --- a/internal/smoketest/docker_smoke_test.go +++ b/internal/smoketest/docker_smoke_test.go @@ -13,6 +13,7 @@ import ( "encoding/binary" "encoding/hex" "encoding/json" + "errors" "fmt" "io" "net" @@ -75,7 +76,7 @@ func TestDockerSmoke(t *testing.T) { waitHTTPBody(t, httpClient(), smokeURL(publicAgentURL, "/"), http.StatusOK, "smoke upstream ok", "agent GET") smokePostEcho(t, smokeURL(publicAgentURL, "/echo")) smokeStream(t, smokeURL(publicAgentURL, "/stream")) - smokeHeaders(t, smokeURL(publicAgentURL, "/headers")) + smokeHeaders(t, smokeURL(publicAgentURL, "/headers"), mustURLHost(t, upstreamURL), mustURLHost(t, publicAgentURL)) smokeWebSocketEcho(t, smokeURL(publicAgentURL, "/ws")) agentBackend = upsertAgentBackend(ctx, t, client, cookie, upstreamURL, dockerAgent.Id, 1000) @@ -577,7 +578,7 @@ func smokeStream(t *testing.T, requestURL string) { } } -func smokeHeaders(t *testing.T, requestURL string) { +func smokeHeaders(t *testing.T, requestURL string, expectedUpstreamHost string, expectedForwardedHost string) { t.Helper() req, err := http.NewRequest(http.MethodGet, requestURL, nil) @@ -599,8 +600,8 @@ func smokeHeaders(t *testing.T, requestURL string) { if err := json.NewDecoder(resp.Body).Decode(&headers); err != nil { t.Fatalf("decode headers response: %v", err) } - assertSmokeHeader(t, headers, "host", "upstream:9000") - assertSmokeHeader(t, headers, "x_forwarded_host", "server:8089") + assertSmokeHeader(t, headers, "host", expectedUpstreamHost) + assertSmokeHeader(t, headers, "x_forwarded_host", expectedForwardedHost) assertSmokeHeader(t, headers, "x_forwarded_proto", "http") assertSmokeHeader(t, headers, "x_request_method", http.MethodGet) assertSmokeHeader(t, headers, "x_smoke_request", "agent-header-check") @@ -616,23 +617,51 @@ func assertSmokeHeader(t *testing.T, headers map[string]string, name string, wan } } +func mustURLHost(t *testing.T, raw string) string { + t.Helper() + parsed, err := url.Parse(raw) + if err != nil || parsed.Host == "" { + t.Fatalf("invalid URL %q: %v", raw, err) + } + return parsed.Host +} + func smokeCloseEarly(t *testing.T, requestURL string) { t.Helper() resp, err := httpClient().Get(requestURL) if err != nil { - return + if isExpectedCloseEarlyError(err) { + return + } + t.Fatalf("close-early request failed unexpectedly: %v", err) } defer resp.Body.Close() body, readErr := io.ReadAll(resp.Body) if readErr != nil { - return + if isExpectedCloseEarlyError(readErr) { + return + } + t.Fatalf("close-early read failed unexpectedly: %v", readErr) } if resp.StatusCode != http.StatusBadGateway { t.Fatalf("close-early status = %d body=%q, want 502 or client read error", resp.StatusCode, string(body)) } } +func isExpectedCloseEarlyError(err error) bool { + if err == nil { + return false + } + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { + return true + } + message := strings.ToLower(err.Error()) + return strings.Contains(message, "unexpected eof") || + strings.Contains(message, "server closed idle connection") || + strings.Contains(message, "connection reset by peer") +} + func smokeWebSocketEcho(t *testing.T, requestURL string) { t.Helper() diff --git a/internal/smoketest/upstream/main.go b/internal/smoketest/upstream/main.go index b4a19f5..ea4dc29 100644 --- a/internal/smoketest/upstream/main.go +++ b/internal/smoketest/upstream/main.go @@ -142,7 +142,11 @@ func closeEarlyHandler(w http.ResponseWriter, r *http.Request) { if err != nil { return } - defer conn.Close() + defer func() { + if err := conn.Close(); err != nil { + log.Printf("close early connection close: %v", err) + } + }() _, _ = rw.WriteString("HTTP/1.1 200 OK\r\nContent-Type: text/plain\r\nContent-Length: 64\r\n\r\npartial") _ = rw.Flush() } @@ -182,7 +186,11 @@ func websocketHandler(w http.ResponseWriter, r *http.Request) { if err != nil { return } - defer conn.Close() + defer func() { + if err := conn.Close(); err != nil { + log.Printf("websocket connection close: %v", err) + } + }() accept := websocketAccept(key) _, _ = rw.WriteString("HTTP/1.1 101 Switching Protocols\r\n") @@ -194,7 +202,7 @@ func websocketHandler(w http.ResponseWriter, r *http.Request) { return } - reader := bufio.NewReader(conn) + reader := rw.Reader for { opcode, payload, err := readWebSocketFrame(reader) if err != nil {