diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index b449e69..2557290 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -50,7 +50,7 @@ The agent acts as a frontend on top of Azure Cost Management, Billing, ARM REST - **Backend**: .NET 10 minimal API (`src/Dashboard/`) - **Frontend**: Vue 3 + Vite SPA (`src/Dashboard/frontend/`) with ECharts for data visualization -- **AI**: GitHub Copilot SDK (`GitHub.Copilot.SDK` 1.0.0-beta.3) with BYOK (Bring Your Own Key) using Azure OpenAI via Entra ID bearer tokens. Sessions managed via `CopilotClient` / `CopilotSession`. Reasoning effort set to `xhigh`. The Copilot CLI provides built-in tools (file operations, bash, grep, glob, web fetch, memory) — custom tools handle Azure-specific APIs. Multi-session per user: each user can keep many conversations, listed in the right sidebar; the SDK auto-disconnects idle sessions after 30 min (`SessionIdleTimeoutSeconds = 1800`) while preserving on-disk state for resume. +- **AI**: GitHub Copilot SDK (`GitHub.Copilot.SDK` 1.0.0-beta.4) with BYOK (Bring Your Own Key) using Azure OpenAI via Entra ID bearer tokens. Sessions managed via `CopilotClient` / `CopilotSession`. Reasoning effort set to `xhigh`. The Copilot CLI provides built-in tools (file operations, bash, grep, glob, web fetch, memory) — custom tools handle Azure-specific APIs. Multi-session per user: each user can keep many conversations, listed in the right sidebar; the SDK auto-disconnects idle sessions after 30 min (`SessionIdleTimeoutSeconds = 1800`) while preserving on-disk state for resume. - **Auth**: Auto-assigned anonymous sessions (no login required for chat); Microsoft Entra ID OAuth (multi-tenant) for Azure ARM, Microsoft Graph, and Log Analytics APIs - **Data Sources**: Azure Retail Prices API (no auth), Azure Service Health (no auth), Azure Cost Management APIs, Microsoft Graph APIs, Azure Monitor / Log Analytics APIs, ECharts visualization - **Observability**: OpenTelemetry end-to-end. The .NET app uses `UseAzureMonitor()` (auto-instruments HttpClient, ASP.NET Core, custom `ActivitySource("AzureFinOps.AI")` + `Meter("AzureFinOps.AI")`). The Copilot CLI subprocess emits OTLP via the SDK's built-in `TelemetryConfig` (GenAI + MCP semantic conventions — every tool call, LLM round-trip, prompt, tool args, result, token usage). Both feeds reach Application Insights via an in-container **OpenTelemetry Collector** (`otel/opentelemetry-collector-contrib`) using the `azuremonitor` exporter — config at `src/Dashboard/otel-collector-config.yaml`, launched by `entrypoint.sh` before the .NET app. Trace context (W3C `traceparent`) is auto-propagated SDK→CLI so Application Map shows one continuous transaction. Custom metrics (`finops.chat.requests`, `finops.tool.calls`, `finops.sessions.active`, etc.) keep flowing through the .NET exporter. Frontend telemetry in `frontend/src/main.js` captures page views, failed browser dependencies, uncaught JS errors, unhandled promise rejections, Vue component errors, and CSP violations. Third-party correlation headers are excluded for `cdn.jsdelivr.net` and `js.monitor.azure.com`. diff --git a/.github/prompts/check-code-changes.prompt.md b/.github/prompts/check-code-changes.prompt.md new file mode 100644 index 0000000..e013c0f --- /dev/null +++ b/.github/prompts/check-code-changes.prompt.md @@ -0,0 +1 @@ +go trhough all th enot comitted code does it make sense? analyze evey single line \ No newline at end of file diff --git a/.github/prompts/time-test.prompt.md b/.github/prompts/time-test.prompt.md new file mode 100644 index 0000000..efa4b68 --- /dev/null +++ b/.github/prompts/time-test.prompt.md @@ -0,0 +1 @@ +Start the app with browser tools and ask the user to log in. Run a timer that checks every second until the user is logged in, then trigger Crawl. After it completes, look into the logs and produce a granular timetable outlining how long each step took and where the bottlenecks are. \ No newline at end of file diff --git a/src/Dashboard/AI/ChatEndpoints.cs b/src/Dashboard/AI/ChatEndpoints.cs index 1f2c49d..9925590 100644 --- a/src/Dashboard/AI/ChatEndpoints.cs +++ b/src/Dashboard/AI/ChatEndpoints.cs @@ -82,14 +82,65 @@ public static void MapChatEndpoints( logger.LogInformation("Chat request from {User} model={Model} promptLen={PromptLen}", userLogin, copilotFactory.Deployment, prompt.Length); + // === TIMING HOOKS === + // Per-phase stopwatch buffer flushed as SSE `timing` events once + // the response headers are open. We can't emit before + // ctx.Response.Headers are written (SSE preamble below), so we + // buffer here and replay after the headers go out. + var timingBuf = new List<(string Phase, double Ms, object? Extra)>(); + void RecordPhase(string phase, double ms, object? extra = null) + { + timingBuf.Add((phase, ms, extra)); + logger.LogInformation("timing phase={Phase} ms={Ms:F0} user={User}", phase, ms, userLogin); + } + var tokens = telemetry.UserTokens.GetOrAdd(userId, uid => new UserTokens { UserId = uid }); + // Fast path: anonymous user (no Entra OID) has nothing to refresh. + // Skipping the lock + 4 fetches saves ~5-30ms per Pricing/Estimates + // click and avoids touching the DNS-poisoned token endpoint at all. + if (entraOid is null) + { + RecordPhase("token.skipped_anonymous", 0); + } + else + { + var tokenLockSw = Stopwatch.StartNew(); await tokens.RefreshLock.WaitAsync(ctx.RequestAborted); + tokenLockSw.Stop(); + RecordPhase("token.lock_wait", tokenLockSw.Elapsed.TotalMilliseconds); try { - tokens.AzureToken = await tokenStore.GetAzureTokenAsync(ctx, httpFactory); - tokens.GraphToken = await tokenStore.GetGraphTokenAsync(ctx, httpFactory); - tokens.LogAnalyticsToken = await tokenStore.GetLogAnalyticsTokenAsync(ctx, httpFactory); - tokens.StorageToken = await tokenStore.GetStorageTokenAsync(ctx, httpFactory); + // Fan out all four token fetches in parallel — they hit different + // Entra scopes and are independent. Each task wraps a try/catch so + // a single scope failure (e.g. user never consented to Storage) + // doesn't drag down the others. Saves ~1.2 s on warm turns. + async Task<(string Name, string? Value, double Ms, string? Error)> Fetch(string name, Func> fn) + { + var sw = Stopwatch.StartNew(); + try { var v = await fn(); sw.Stop(); return (name, v, sw.Elapsed.TotalMilliseconds, null); } + catch (OperationCanceledException) { sw.Stop(); throw; } + catch (HttpRequestException ex) { sw.Stop(); return (name, null, sw.Elapsed.TotalMilliseconds, ex.Message); } + catch (UnauthorizedAccessException ex) { sw.Stop(); return (name, null, sw.Elapsed.TotalMilliseconds, ex.Message); } + catch (InvalidOperationException ex) { sw.Stop(); return (name, null, sw.Elapsed.TotalMilliseconds, ex.Message); } + } + var fetchSw = Stopwatch.StartNew(); + var results = await Task.WhenAll( + Fetch("azure", () => tokenStore.GetAzureTokenAsync(ctx, httpFactory)), + Fetch("graph", () => tokenStore.GetGraphTokenAsync(ctx, httpFactory)), + Fetch("loganalytics", () => tokenStore.GetLogAnalyticsTokenAsync(ctx, httpFactory)), + Fetch("storage", () => tokenStore.GetStorageTokenAsync(ctx, httpFactory))); + fetchSw.Stop(); + foreach (var r in results) + { + RecordPhase($"token.{r.Name}", r.Ms, new { hit = r.Value is not null, error = r.Error }); + if (r.Error is not null) + logger.LogWarning("Token fetch failed scope={Scope} ms={Ms:F0} err={Err}", r.Name, r.Ms, r.Error); + } + RecordPhase("token.parallel_total", fetchSw.Elapsed.TotalMilliseconds); + tokens.AzureToken = results[0].Value; + tokens.GraphToken = results[1].Value; + tokens.LogAnalyticsToken = results[2].Value; + tokens.StorageToken = results[3].Value; // Mirror expiry from session into the volatile bag so the // TenantTokenRefresher background service can refresh proactively @@ -103,6 +154,7 @@ public static void MapChatEndpoints( { tokens.RefreshLock.Release(); } + } // end if (entraOid is not null) logger.LogInformation("Chat tokens: azure={HasAzure} graph={HasGraph} la={HasLA} storage={HasStorage}", tokens.AzureToken is not null, tokens.GraphToken is not null, @@ -143,9 +195,15 @@ public static void MapChatEndpoints( ctx.Response.Headers.Connection = "keep-alive"; ctx.Response.Headers["X-Accel-Buffering"] = "no"; + // Declared outside the try so the finally block can deterministically + // remove this exact turn's reporter (sweeping by userId prefix would + // clobber a concurrent turn in another tab). + string? turnKey = null; try { CopilotSession session; + var sessionSw = Stopwatch.StartNew(); + string sessionAcquireMode; if (!string.IsNullOrEmpty(requestedSessionId)) { // IDOR guard: a requested sessionId must belong to this @@ -157,21 +215,27 @@ public static void MapChatEndpoints( { logger.LogInformation("Requested sessionId {Sid} not owned by user {Uid}; falling back to current session", requestedSessionId, userId); session = await copilotFactory.GetCurrentOrCreateAsync(userId, userLogin!, entraOid); + sessionAcquireMode = "fallback_current"; } else { session = await copilotFactory.GetOrResumeAsync(userId, requestedSessionId, userLogin!, entraOid); + sessionAcquireMode = "resume"; } } else { session = await copilotFactory.GetCurrentOrCreateAsync(userId, userLogin!, entraOid); + sessionAcquireMode = "current_or_new"; } + sessionSw.Stop(); + RecordPhase($"session.{sessionAcquireMode}", sessionSw.Elapsed.TotalMilliseconds); var activeSessionId = session.SessionId; var done = new TaskCompletionSource(); var cancelled = false; var toolTracker = new ConcurrentDictionary(); + var firstEventLogged = 0; // Browser disconnect releases this SSE handler but does NOT // abort the running turn. The Copilot CLI keeps generating @@ -190,9 +254,47 @@ public static void MapChatEndpoints( } }); + // SSE write lock + emit helper — declared up here so the + // session.On callback below can use SafeEmit for the + // sdk.first_event timing ping. + using var sseLock = new SemaphoreSlim(1, 1); + async Task SafeEmit(string sseData) + { + await sseLock.WaitAsync(); + try { await EmitAsync(ctx, sseData); } + finally { sseLock.Release(); } + } + // sdkSw measures time from subscription registration to the + // first SDK event arrival (time-to-first-byte from model). + // Started immediately before the subscription so a fast first + // event can't be observed before the stopwatch is running + // (which would log a misleading ms=0). + var sdkSw = Stopwatch.StartNew(); + using var subscription = session.On(async (SessionEvent evt) => { if (cancelled) return; + if (System.Threading.Interlocked.Exchange(ref firstEventLogged, 1) == 0) + { + try + { + var firstMs = sdkSw.Elapsed.TotalMilliseconds; + logger.LogInformation("timing phase=sdk.first_event ms={Ms:F0} user={User}", firstMs, userLogin); + await SafeEmit(JsonSerializer.Serialize(new { type = "timing", phase = "sdk.first_event", ms = Math.Round(firstMs, 1), extra = new { evt = evt.GetType().Name } })); + } + catch (OperationCanceledException) + { + logger.LogDebug("First-event timing emit canceled for user={User}", userLogin); + } + catch (ObjectDisposedException) + { + logger.LogDebug("First-event timing emit skipped because stream was disposed for user={User}", userLogin); + } + catch (InvalidOperationException) + { + logger.LogDebug("First-event timing emit skipped due to invalid stream state for user={User}", userLogin); + } + } try { await HandleSessionEventAsync(evt, ctx, toolTracker, telemetry, copilotFactory.Deployment, @@ -228,19 +330,39 @@ await HandleSessionEventAsync(evt, ctx, toolTracker, telemetry, copilotFactory.D await ctx.Response.WriteAsync($"data: {JsonSerializer.Serialize(new { type = "session", id = activeSessionId })}\n\n"); await ctx.Response.Body.FlushAsync(); - // Wire the retry hook so HttpHelper can push "Cooling down" pings - // to this SSE stream during 429 backoff. Serialize writes against - // the SDK event handler with a per-request lock so retry pings - // can't interleave bytes with delta/tool_done frames. - var sseLock = new SemaphoreSlim(1, 1); - async Task SafeEmit(string sseData) + // Flush buffered timing phases (token refresh + session acquire) + // now that the SSE stream is open. The frontend collects these + // alongside its own perf marks to build the timing table. + foreach (var t in timingBuf) { - await sseLock.WaitAsync(); - try { await EmitAsync(ctx, sseData); } - finally { sseLock.Release(); } + var p = t.Extra is null + ? JsonSerializer.Serialize(new { type = "timing", phase = t.Phase, ms = Math.Round(t.Ms, 1) }) + : JsonSerializer.Serialize(new { type = "timing", phase = t.Phase, ms = Math.Round(t.Ms, 1), extra = t.Extra }); + await ctx.Response.WriteAsync($"data: {p}\n\n"); } - Infrastructure.HttpHelper.RetryReporter.Value = (attempt, waitSec) => - SafeEmit(JsonSerializer.Serialize(new { type = "cooling_down", attempt, waitSeconds = waitSec })); + await ctx.Response.Body.FlushAsync(); + + // Wire the retry hook so HttpHelper can push "Cooling down" pings + // to this SSE stream during 429 backoff. The sseLock / SafeEmit + // were declared above so the subscription callback can share them. + // Register the SSE retry hook keyed by *turn id* (userId:sessionId) + // — NOT just userId — so concurrent turns from the same user + // (two tabs, sidebar score racing chat) don't clobber each + // other's reporter. Propagated to all child activities (incl. + // across the Copilot CLI JSON-RPC tool-callback boundary) via + // Activity Baggage. Earlier we tried AsyncLocal and Activity.RootId + // — both failed to flow through that boundary; baggage does. + turnKey = $"{userId}:{activeSessionId}"; + chatActivity?.SetBaggage("finops.turn.id", turnKey); + Infrastructure.HttpHelper.RetryReporters[turnKey] = (attempt, waitSec, url, tool, status) => + { + logger.LogInformation("EMIT cooling_down sse turn={Turn} attempt={Attempt} status={Status} tool={Tool} waitSec={Wait:F1}", + turnKey, attempt, status, tool, waitSec); + return SafeEmit(JsonSerializer.Serialize(new { type = "cooling_down", attempt, waitSeconds = waitSec, url, tool, status })); + }; + // Belt-and-braces cleanup on request abort. + var turnKeyForAbort = turnKey; + ctx.RequestAborted.Register(() => Infrastructure.HttpHelper.RetryReporters.TryRemove(turnKeyForAbort, out _)); try { @@ -290,26 +412,53 @@ async Task SafeEmit(string sseData) || existing.StartsWith("Untitled", StringComparison.OrdinalIgnoreCase); if (needsTitle) { - // Bound the wait so a slow title call never holds the SSE - // stream open for more than ~10s. - using var titleCts = CancellationTokenSource.CreateLinkedTokenSource(ctx.RequestAborted); - titleCts.CancelAfter(TimeSpan.FromSeconds(10)); - try + // Fire-and-forget: a fresh title is nice-to-have, not + // worth blocking the SSE close for. The next turn (or a + // sidebar refresh) will pick up the saved title via the + // session_title re-emit path above. Capture references + // so the background task is independent of the request. + var bgPrompt = prompt; + var bgReply = assistantReply; + var bgSessionId = activeSessionId; + // Race the title call against the SSE close so a fast title + // (~150ms p50) still gets pushed to the live stream. If it + // misses the window, the next turn's re-emit path picks it up. + var titleTask = Task.Run(async () => { - var generated = await copilotFactory.GenerateTitleAsync(prompt, assistantReply, titleCts.Token); + try + { + using var bgCts = new CancellationTokenSource(TimeSpan.FromSeconds(15)); + var generated = await copilotFactory.GenerateTitleAsync(bgPrompt, bgReply, bgCts.Token); + if (!string.IsNullOrWhiteSpace(generated)) + telemetry.SaveTitle(bgSessionId, generated); + return generated; + } + catch (OperationCanceledException bgEx) + { + logger.LogWarning(bgEx, "Background title generation timed out or was canceled for session {Sid}", bgSessionId); + return null; + } + catch (InvalidOperationException bgEx) + { + logger.LogWarning(bgEx, "Background title generation failed for session {Sid}", bgSessionId); + return null; + } + }); + var winner = await Task.WhenAny(titleTask, Task.Delay(1500)); + if (winner == titleTask && !ctx.RequestAborted.IsCancellationRequested) + { + var generated = await titleTask; if (!string.IsNullOrWhiteSpace(generated)) { - telemetry.SaveTitle(activeSessionId, generated); try { - var payload = JsonSerializer.Serialize(new { type = "session_title", id = activeSessionId, title = generated }); - await ctx.Response.WriteAsync($"data: {payload}\n\n"); + var p = JsonSerializer.Serialize(new { type = "session_title", id = activeSessionId, title = generated }); + await ctx.Response.WriteAsync($"data: {p}\n\n"); await ctx.Response.Body.FlushAsync(); } catch { /* client may have disconnected — title is still saved */ } } } - catch (OperationCanceledException) { /* timeout or client abort */ } } } @@ -318,6 +467,14 @@ async Task SafeEmit(string sseData) new KeyValuePair("model", copilotFactory.Deployment), new KeyValuePair("user", userLogin)); chatActivity?.SetTag("ai.duration_ms", chatSw.Elapsed.TotalMilliseconds); + try + { + var donePayload = JsonSerializer.Serialize(new { type = "timing", phase = "chat.total", ms = Math.Round(chatSw.Elapsed.TotalMilliseconds, 1) }); + await ctx.Response.WriteAsync($"data: {donePayload}\n\n"); + await ctx.Response.Body.FlushAsync(); + } + catch (OperationCanceledException) { } + catch (ObjectDisposedException) { } } catch (Exception ex) { @@ -334,6 +491,14 @@ async Task SafeEmit(string sseData) await ctx.Response.WriteAsync("data: [DONE]\n\n"); await ctx.Response.Body.FlushAsync(); } + finally + { + // Release this turn's reporter only — never sweep by userId + // prefix, since a concurrent turn from the same user (two tabs, + // sidebar score racing chat) holds its own key in the dict. + if (turnKey is not null) + Infrastructure.HttpHelper.RetryReporters.TryRemove(turnKey, out _); + } }); app.MapPost("/api/chat/reset", async (HttpContext ctx) => diff --git a/src/Dashboard/AI/CopilotSessionFactory.cs b/src/Dashboard/AI/CopilotSessionFactory.cs index d69ba38..0a4ce1d 100644 --- a/src/Dashboard/AI/CopilotSessionFactory.cs +++ b/src/Dashboard/AI/CopilotSessionFactory.cs @@ -586,10 +586,13 @@ private async Task DisposeLiveAsync(string sessionId) private async Task CreateSessionConfigAsync(long userId, string? entraOid) { var bearerToken = await GetAzureOpenAIBearerTokenAsync(); + var effort = IsReasoningModel(_deployment) ? "xhigh" : null; + _logger.LogInformation("SessionConfig(create) model={Model} reasoningEffort={Effort} isReasoning={IsReasoning}", + _deployment, effort ?? "", IsReasoningModel(_deployment)); return new SessionConfig { Model = _deployment, - ReasoningEffort = IsReasoningModel(_deployment) ? "xhigh" : null, + ReasoningEffort = effort, Streaming = true, Tools = GetOrCreateUserTools(userId), WorkingDirectory = GetWorkingDirectory(userId, entraOid), @@ -611,10 +614,13 @@ private async Task CreateSessionConfigAsync(long userId, string? private async Task CreateResumeConfigAsync(long userId, string? entraOid) { var bearerToken = await GetAzureOpenAIBearerTokenAsync(); + var effort = IsReasoningModel(_deployment) ? "xhigh" : null; + _logger.LogInformation("SessionConfig(resume) model={Model} reasoningEffort={Effort} isReasoning={IsReasoning} — NOTE: CLI may retain original-session effort", + _deployment, effort ?? "", IsReasoningModel(_deployment)); return new ResumeSessionConfig { Model = _deployment, - ReasoningEffort = IsReasoningModel(_deployment) ? "xhigh" : null, + ReasoningEffort = effort, Streaming = true, Tools = GetOrCreateUserTools(userId), WorkingDirectory = GetWorkingDirectory(userId, entraOid), diff --git a/src/Dashboard/AI/Tools/AnomalyTools.cs b/src/Dashboard/AI/Tools/AnomalyTools.cs index 953cf87..8eb06aa 100644 --- a/src/Dashboard/AI/Tools/AnomalyTools.cs +++ b/src/Dashboard/AI/Tools/AnomalyTools.cs @@ -96,26 +96,35 @@ private async Task DetectCostAnomalies( var threshold = mean + zThreshold * stddev; var lowThreshold = Math.Max(0, mean - zThreshold * stddev); - var anomalies = new List(); + // Build the list of anomalous days first (cheap, in-memory), then + // fan out the per-day cost-breakdown drilldowns in parallel. Each + // drilldown is an independent Cost Management query; serialising + // them was needlessly multiplying wall time by N anomalies. + var anomalyCandidates = new List<(DateTime Date, double Cost, double Z)>(); foreach (var p in detection) { if (stddev < 0.01) continue; // flat baseline, can't detect var z = (p.Cost - mean) / stddev; - if (Math.Abs(z) >= zThreshold) - { - // Drill down for this specific day - var breakdown = await GetBreakdownForDay(token, subscriptionId, p.Date, groupBy, activity); - anomalies.Add(new - { - date = p.Date.ToString("yyyy-MM-dd"), - cost = Math.Round(p.Cost, 2), - z_score = Math.Round(z, 2), - deviation_pct = mean > 0.01 ? Math.Round((p.Cost - mean) / mean * 100, 1) : 0, - direction = z > 0 ? "spike" : "drop", - top_contributors = breakdown - }); - } + if (Math.Abs(z) >= zThreshold) anomalyCandidates.Add((p.Date, p.Cost, z)); } + var drilldownTasks = anomalyCandidates.Select(async c => + { + // Pass null activity — Activity is not safe for concurrent SetTag + // writers, and these drilldowns run in parallel. Each call still + // gets its own ActivitySource span inside HttpHelper. + try { return (c, Breakdown: (object)await GetBreakdownForDay(token, subscriptionId, c.Date, groupBy, activity: null)); } + catch (Exception ex) { return (c, Breakdown: new { error = ex.Message }); } + }).ToArray(); + var drilldowns = await Task.WhenAll(drilldownTasks); + var anomalies = drilldowns.Select(d => (object)new + { + date = d.c.Date.ToString("yyyy-MM-dd"), + cost = Math.Round(d.c.Cost, 2), + z_score = Math.Round(d.c.Z, 2), + deviation_pct = mean > 0.01 ? Math.Round((d.c.Cost - mean) / mean * 100, 1) : 0, + direction = d.c.Z > 0 ? "spike" : "drop", + top_contributors = d.Breakdown + }).ToList(); var result = new { diff --git a/src/Dashboard/AI/Tools/FaqTools.cs b/src/Dashboard/AI/Tools/FaqTools.cs index f31d110..b05f994 100644 --- a/src/Dashboard/AI/Tools/FaqTools.cs +++ b/src/Dashboard/AI/Tools/FaqTools.cs @@ -152,7 +152,7 @@ private static async Task PingIndexNowAsync(string slug) { try { - using var http = new HttpClient { Timeout = TimeSpan.FromSeconds(10) }; + using var http = new HttpClient(AzureFinOps.Dashboard.Infrastructure.Ipv4HttpHandler.Create(), disposeHandler: true) { Timeout = TimeSpan.FromSeconds(10) }; var body = JsonSerializer.Serialize(new { host = "azure-finops-agent.com", diff --git a/src/Dashboard/AI/Tools/IdleResourceTools.cs b/src/Dashboard/AI/Tools/IdleResourceTools.cs index 246619a..e341344 100644 --- a/src/Dashboard/AI/Tools/IdleResourceTools.cs +++ b/src/Dashboard/AI/Tools/IdleResourceTools.cs @@ -72,11 +72,22 @@ private async Task FindIdleResources( $"ResourceContainers | where type =~ 'microsoft.resources/subscriptions/resourcegroups' | join kind=leftouter (Resources | summarize count() by resourceGroup, subscriptionId) on resourceGroup, subscriptionId | where isnull(count_) or count_ == 0 | project id, name, location, subscriptionId | top {topPerPattern} by name"), }; - var results = new Dictionary(); - foreach (var (label, kql) in patterns) + // Fan out all 8 KQL queries in parallel. Each result is wrapped + // in a per-query try/catch so a single failed pattern (perms, + // throttle, schema drift) doesn't lose the other 7. Cuts wall + // time from 8×latency to max(latency). + var queryTasks = patterns.Select(async p => { - results[label] = await RunResourceGraphQuery(token, kql, subs, activity); - } + // Pass null activity — Activity is not safe for concurrent SetTag + // writers across the 8 parallel queries. HttpHelper still emits + // its own per-call span. + try { return (p.Label, Result: await RunResourceGraphQuery(token, p.Kql, subs, activity: null)); } + catch (HttpRequestException ex) { return (p.Label, Result: new { error = "query exception", detail = ex.Message }); } + catch (TaskCanceledException ex) { return (p.Label, Result: new { error = "query exception", detail = ex.Message }); } + catch (OperationCanceledException ex) { return (p.Label, Result: new { error = "query exception", detail = ex.Message }); } + }).ToArray(); + var completed = await Task.WhenAll(queryTasks); + var results = completed.ToDictionary(x => x.Label, x => x.Result); var summary = new { diff --git a/src/Dashboard/AI/Tools/RetailPricingTools.cs b/src/Dashboard/AI/Tools/RetailPricingTools.cs index 038bf9a..1473712 100644 --- a/src/Dashboard/AI/Tools/RetailPricingTools.cs +++ b/src/Dashboard/AI/Tools/RetailPricingTools.cs @@ -89,10 +89,16 @@ private static async Task GetAzureRetailPricing( ?? Math.Min(Math.Pow(2, attempt + 1) + Random.Shared.NextDouble(), 30); var waitSeconds = Math.Max(1, retryAfter); activity?.SetTag($"pricing.retry_{attempt}", $"{(int)res.StatusCode}, waiting {waitSeconds:F0}s"); - // Surface the cool-down to the chat UI on the same SSE channel HttpHelper uses. - if (HttpHelper.RetryReporter.Value is { } report) + // Surface the cool-down to the chat UI via the same baggage-keyed SSE channel HttpHelper uses. + var turnKey = System.Diagnostics.Activity.Current?.GetBaggageItem("finops.turn.id"); + if (turnKey is not null && HttpHelper.RetryReporters.TryGetValue(turnKey, out var report)) { - try { await report(attempt + 1, waitSeconds); } catch { /* best-effort */ } + try { await report(attempt + 1, waitSeconds, url, "pricing", (int)res.StatusCode); } + catch (Exception emitEx) + { + HttpHelper.Logger?.LogWarning(emitEx, + "SSE cooling_down emit failed for pricing attempt={Attempt}", attempt + 1); + } } await Task.Delay(TimeSpan.FromSeconds(waitSeconds)); } diff --git a/src/Dashboard/Auth/AzureSessionEndpoints.cs b/src/Dashboard/Auth/AzureSessionEndpoints.cs index c16f402..c8ab4a5 100644 --- a/src/Dashboard/Auth/AzureSessionEndpoints.cs +++ b/src/Dashboard/Auth/AzureSessionEndpoints.cs @@ -25,6 +25,55 @@ public static void MapAzureSessionEndpoints( var azureUserJson = ctx.Session.GetString("azure_user"); object? azureUser = azureUserJson is not null ? JsonSerializer.Deserialize(azureUserJson) : null; + // Last-resort fallback: if azure_user wasn't populated by the OAuth + // callback or the persistent-identity middleware, decode the JWT + // access-token claims directly. Guarantees the sidebar always + // shows the signed-in email when a valid token exists. + // SECURITY NOTE: we deliberately do NOT verify the JWT signature here. + // This path runs AFTER the same `token` has already been used + // successfully upstream (the caller fetched it via OAuth refresh and + // is about to call ARM with it). The Bearer call to management.azure.com + // below would fail with 401 if the token were forged, so an attacker + // can't get a spoofed identity past the sidebar. We're just reading + // claims for display. + if (azureUser is null) + { + try + { + var parts = token.Split('.'); + if (parts.Length >= 2) + { + var payload = parts[1].Replace('-', '+').Replace('_', '/'); + switch (payload.Length % 4) { case 2: payload += "=="; break; case 3: payload += "="; break; } + var claims = JsonSerializer.Deserialize(Convert.FromBase64String(payload)); + string? upn = null, name = null, oid = null, tid = null; + if (claims.TryGetProperty("upn", out var u)) upn = u.GetString(); + else if (claims.TryGetProperty("preferred_username", out var pu)) upn = pu.GetString(); + else if (claims.TryGetProperty("unique_name", out var un)) upn = un.GetString(); + if (claims.TryGetProperty("name", out var n)) name = n.GetString(); + if (claims.TryGetProperty("oid", out var o)) oid = o.GetString(); + if (claims.TryGetProperty("tid", out var t)) tid = t.GetString(); + if (upn is not null || name is not null) + { + var derived = new Dictionary + { + ["tenantId"] = tid, + ["objectId"] = oid, + ["name"] = name, + ["email"] = upn, + }; + azureUser = JsonSerializer.Deserialize(JsonSerializer.Serialize(derived)); + // Persist so subsequent /status calls don't re-decode. + ctx.Session.SetString("azure_user", JsonSerializer.Serialize(derived)); + } + } + } + catch (Exception jwtEx) + { + logger.LogWarning(jwtEx, "Failed to decode Azure access-token claims for /status fallback"); + } + } + var http = httpFactory.CreateClient(); var subscriptions = new List(); try diff --git a/src/Dashboard/Auth/MicrosoftAuthEndpoints.cs b/src/Dashboard/Auth/MicrosoftAuthEndpoints.cs index 6672c1c..818a1dd 100644 --- a/src/Dashboard/Auth/MicrosoftAuthEndpoints.cs +++ b/src/Dashboard/Auth/MicrosoftAuthEndpoints.cs @@ -257,7 +257,9 @@ public static void MapMicrosoftAuthEndpoints( ctx.Session.Remove("ms_oauth_state"); - var http = httpFactory.CreateClient(); + // Named client with 30 s overall timeout. IPv4-only transport is + // already applied by ConfigureHttpClientDefaults in Program.cs. + var http = httpFactory.CreateClient("entra-token"); var redirectUri = $"{MicrosoftOAuthOptions.NormalizeCallbackHost(ctx)}/auth/microsoft/callback"; var effectiveTenant = ctx.Session.GetString("auth_tenant") ?? options.TenantId; @@ -400,7 +402,7 @@ public static void MapMicrosoftAuthEndpoints( // GraphTier in sync as the user adds add-on consents incrementally. if (!string.IsNullOrEmpty(refreshToken)) { - persistentIdentity.SaveIdentity(ctx, new IdentityRecord + await persistentIdentity.SaveIdentityAsync(ctx, new IdentityRecord { Oid = oid, TenantId = validated.TenantId ?? "", @@ -416,7 +418,7 @@ public static void MapMicrosoftAuthEndpoints( // Edge case: re-consent without a fresh refresh_token. Update only // the GraphTier so post-restart hydration still reflects the new // add-on without clobbering the existing refresh token. - persistentIdentity.UpdateGraphTier(oid, ctx.Session.GetString("graph_tier")); + await persistentIdentity.UpdateGraphTierAsync(oid, ctx.Session.GetString("graph_tier")); } } } diff --git a/src/Dashboard/Auth/PersistentIdentity.cs b/src/Dashboard/Auth/PersistentIdentity.cs index df3d838..ae2134b 100644 --- a/src/Dashboard/Auth/PersistentIdentity.cs +++ b/src/Dashboard/Auth/PersistentIdentity.cs @@ -74,10 +74,10 @@ public static long DeriveUserId(string oid) /// writes the encrypted identity cookie. Call this from the OAuth callback /// after a successful id_token validation and from any path that mints a /// new refresh_token. - public void SaveIdentity(HttpContext ctx, IdentityRecord record) + public async Task SaveIdentityAsync(HttpContext ctx, IdentityRecord record) { var sem = LockFor(record.Oid); - sem.Wait(); + await sem.WaitAsync(); try { var dir = GetUserDir(record.Oid); @@ -204,25 +204,25 @@ public void Clear(HttpContext ctx, string? oid) /// Updates only the refresh token + recorded scopes on an existing /// identity file. Used by when a refresh /// rotates the token (Entra rotates refresh tokens on use). - public void UpdateRefreshToken(string oid, string newRefreshToken) + public Task UpdateRefreshTokenAsync(string oid, string newRefreshToken) { - UpdateRecord(oid, r => { r.RefreshToken = newRefreshToken; }); + return UpdateRecordAsync(oid, r => { r.RefreshToken = newRefreshToken; }); } /// Persists the comma-separated list of consented Graph tiers so a /// post-restart hydration restores the user's full add-on set, not just the /// base ARM scope. - public void UpdateGraphTier(string oid, string? graphTier) + public Task UpdateGraphTierAsync(string oid, string? graphTier) { - UpdateRecord(oid, r => { r.GraphTier = graphTier; }); + return UpdateRecordAsync(oid, r => { r.GraphTier = graphTier; }); } - private void UpdateRecord(string oid, Action mutate) + private async Task UpdateRecordAsync(string oid, Action mutate) { var path = Path.Combine(GetUserDir(oid), "identity.json"); if (!File.Exists(path)) return; var sem = LockFor(oid); - sem.Wait(); + await sem.WaitAsync(); try { var existing = JsonSerializer.Deserialize(_protector.Unprotect(File.ReadAllText(path))); diff --git a/src/Dashboard/Auth/SessionTokenStore.cs b/src/Dashboard/Auth/SessionTokenStore.cs index ca39a1a..529545c 100644 --- a/src/Dashboard/Auth/SessionTokenStore.cs +++ b/src/Dashboard/Auth/SessionTokenStore.cs @@ -29,28 +29,88 @@ public SessionTokenStore(MicrosoftOAuthOptions options, EntraClientCredentials c HttpClient http, string refreshToken, string scope, string? tenantOverride = null) { var effectiveTenant = tenantOverride ?? _options.TenantId; - using var req = new HttpRequestMessage(HttpMethod.Post, - $"https://login.microsoftonline.com/{Uri.EscapeDataString(effectiveTenant)}/oauth2/v2.0/token"); - var form = new Dictionary + // Up to 2 retries on transient TCP/HTTP timeouts (corporate proxy + // dropping SYN on cold connections is the #1 cause of token-fetch + // hangs). Connect timeout itself is 5 s via the named client's + // SocketsHttpHandler.ConnectTimeout, so worst case here is ~15 s + // instead of the OS default 63 s. + for (var attempt = 0; attempt < 3; attempt++) { - ["client_id"] = _options.ClientId, - ["refresh_token"] = refreshToken, - ["grant_type"] = "refresh_token", - ["scope"] = scope - }; - await _credentials.AddCredentialFieldsAsync(form); - req.Content = new FormUrlEncodedContent(form); - - var res = await http.SendAsync(req); - if (!res.IsSuccessStatusCode) return null; - - var body = await res.Content.ReadAsStringAsync(); - var json = JsonSerializer.Deserialize(body); - if (!json.TryGetProperty("access_token", out var tokenProp)) return null; - - var expiresIn = json.TryGetProperty("expires_in", out var expProp) ? expProp.GetInt32() : 3600; - var rotated = json.TryGetProperty("refresh_token", out var newRt) ? newRt.GetString() : null; - return (tokenProp.GetString()!, DateTimeOffset.UtcNow.AddSeconds(expiresIn - 60), rotated); + using var req = new HttpRequestMessage(HttpMethod.Post, + $"https://login.microsoftonline.com/{Uri.EscapeDataString(effectiveTenant)}/oauth2/v2.0/token"); + var form = new Dictionary + { + ["client_id"] = _options.ClientId, + ["refresh_token"] = refreshToken, + ["grant_type"] = "refresh_token", + ["scope"] = scope + }; + await _credentials.AddCredentialFieldsAsync(form); + req.Content = new FormUrlEncodedContent(form); + + try + { + var res = await http.SendAsync(req); + if (!res.IsSuccessStatusCode) return null; + + var body = await res.Content.ReadAsStringAsync(); + var json = JsonSerializer.Deserialize(body); + if (!json.TryGetProperty("access_token", out var tokenProp)) return null; + + var expiresIn = json.TryGetProperty("expires_in", out var expProp) ? expProp.GetInt32() : 3600; + var rotated = json.TryGetProperty("refresh_token", out var newRt) ? newRt.GetString() : null; + return (tokenProp.GetString()!, DateTimeOffset.UtcNow.AddSeconds(expiresIn - 60), rotated); + } + catch (Exception ex) when (attempt < 2 && IsTransientNetworkError(ex)) + { + var delayMs = 250 * (1 << attempt); // 250 ms, 500 ms + _logger.LogWarning("Entra token POST failed (attempt {Attempt}/3) for scope={Scope}: {Err}. Retrying in {Delay}ms", + attempt + 1, scope, ex.GetType().Name + ": " + ex.Message, delayMs); + await Task.Delay(delayMs); + } + catch (Exception ex) when (attempt == 2 && IsTransientNetworkError(ex)) + { + // Final attempt failed with a transient network error — swallow + // and return null. Throwing here would surface as HTTP 500 from + // /auth/azure/status (observed in production telemetry). Null + // lets the endpoint return { connected = false } and the UI + // shows "sign in again" instead of a generic error page. + _logger.LogWarning("Entra token POST failed after 3 attempts for scope={Scope}: {Err}. Returning null (user must re-auth).", + scope, ex.GetType().Name + ": " + ex.Message); + } + } + return null; + } + + /// True for connect timeouts, socket resets, and DNS failures we + /// expect to clear on a fresh connection (typical of corporate proxies that + /// drop the first SYN). Deliberately excludes SSL/cert/auth failures — + /// those won't recover on retry and burning 3 attempts on them is wasteful. + private static bool IsTransientNetworkError(Exception ex) + { + for (var e = ex; e != null; e = e.InnerException) + { + // Never retry auth/cert/SSL failures — they're deterministic. + if (e is System.Security.Authentication.AuthenticationException) return false; + if (e is System.Security.Cryptography.CryptographicException) return false; + + if (e is TimeoutException) return true; + if (e is System.Net.Sockets.SocketException se) + { + return se.SocketErrorCode is + System.Net.Sockets.SocketError.TimedOut or + System.Net.Sockets.SocketError.ConnectionRefused or + System.Net.Sockets.SocketError.ConnectionReset or + System.Net.Sockets.SocketError.NetworkUnreachable or + System.Net.Sockets.SocketError.HostUnreachable; + } + // HttpRequestException with no SocketException or AuthException inner is + // most often a connection-establishment failure (proxy / DNS / transient + // network) — retry. SSL handshake failures bubble up as AuthException + // inner and are filtered out above. + if (e is System.Net.Http.HttpRequestException) return true; + } + return false; } public async Task GetSessionTokenAsync(HttpContext ctx, IHttpClientFactory httpFactory, @@ -93,7 +153,7 @@ public SessionTokenStore(MicrosoftOAuthOptions options, EntraClientCredentials c ctx.Session.Remove(expiryKey); return null; } - var http = httpFactory.CreateClient(); + var http = httpFactory.CreateClient("entra-token"); var sessionTenant = ctx.Session.GetString("auth_tenant"); var result = await ExchangeRefreshTokenForResource(http, refreshToken, refreshScope, sessionTenant); if (result is null) @@ -113,17 +173,19 @@ public SessionTokenStore(MicrosoftOAuthOptions options, EntraClientCredentials c var azureUserJson = ctx.Session.GetString("azure_user"); if (azureUserJson is not null) { + string? oid = null; try { var au = JsonSerializer.Deserialize(azureUserJson); if (au.TryGetProperty("objectId", out var oidProp)) - { - var oid = oidProp.GetString(); - if (!string.IsNullOrEmpty(oid)) - _identity.UpdateRefreshToken(oid, result.Value.RotatedRefreshToken); - } + oid = oidProp.GetString(); } catch { } + if (!string.IsNullOrEmpty(oid)) + { + try { await _identity.UpdateRefreshTokenAsync(oid, result.Value.RotatedRefreshToken); } + catch (Exception ex) { _logger.LogWarning(ex, "Failed to mirror rotated refresh token for oid={Oid}", oid); } + } } } return result.Value.Token; diff --git a/src/Dashboard/Auth/TenantTokenRefresher.cs b/src/Dashboard/Auth/TenantTokenRefresher.cs index 17ea7e8..082aad6 100644 --- a/src/Dashboard/Auth/TenantTokenRefresher.cs +++ b/src/Dashboard/Auth/TenantTokenRefresher.cs @@ -182,7 +182,7 @@ private static bool NeedsRefresh(DateTimeOffset? expiry) if (!string.IsNullOrEmpty(result.Value.RotatedRefreshToken) && result.Value.RotatedRefreshToken != record.RefreshToken) { - _identity.UpdateRefreshToken(record.Oid, result.Value.RotatedRefreshToken); + await _identity.UpdateRefreshTokenAsync(record.Oid, result.Value.RotatedRefreshToken); record.RefreshToken = result.Value.RotatedRefreshToken; } diff --git a/src/Dashboard/Dashboard.csproj b/src/Dashboard/Dashboard.csproj index c3a367c..b863204 100644 --- a/src/Dashboard/Dashboard.csproj +++ b/src/Dashboard/Dashboard.csproj @@ -11,7 +11,7 @@ - + diff --git a/src/Dashboard/Infrastructure/HttpHelper.cs b/src/Dashboard/Infrastructure/HttpHelper.cs index 1feb053..3cb634e 100644 --- a/src/Dashboard/Infrastructure/HttpHelper.cs +++ b/src/Dashboard/Infrastructure/HttpHelper.cs @@ -11,12 +11,10 @@ namespace AzureFinOps.Dashboard.Infrastructure; /// public static class HttpHelper { - // SocketsHttpHandler with PooledConnectionLifetime so a process running for weeks on App - // Service eventually picks up DNS changes (the classic long-lived static HttpClient footgun). - private static readonly HttpClient Http = new(new SocketsHttpHandler - { - PooledConnectionLifetime = TimeSpan.FromMinutes(5) - }) + // Shared IPv4-only SocketsHttpHandler (see Ipv4HttpHandler.cs). Corp egress + // drops IPv6 SYNs and the OS only gives up after ~21 s; forcing IPv4 here + // matches the factory defaults wired up in Program.cs. + private static readonly HttpClient Http = new(Ipv4HttpHandler.Create()) { Timeout = TimeSpan.FromSeconds(60) }; public static readonly ActivitySource Telemetry = new("AzureFinOps.AI"); @@ -24,13 +22,29 @@ public static class HttpHelper private static readonly Meter Meter = new("AzureFinOps.AI"); private static readonly Counter ThrottleRetries = Meter.CreateCounter("finops.throttle.retries", description: "HTTP retries triggered by 429 or transient 5xx"); + // Time the caller actually spent waiting on backoff (sum of Task.Delay + // across all retry attempts on a single call). Distinct from request + // latency — this is pure throttle penalty. + private static readonly Histogram RetryWaitMs = + Meter.CreateHistogram("finops.http.retry_wait_ms", "ms", "Cumulative backoff wait per HTTP call"); + private static readonly Histogram RequestTotalMs = + Meter.CreateHistogram("finops.http.request_total_ms", "ms", "Total time per HTTP call including retries"); + + /// Optional logger plumbed from Program.cs so retries surface in + /// the console / Application Insights traces instead of being silently + /// counted. Null when running outside the host (tests). + public static ILogger? Logger { get; set; } /// - /// Per-request hook for reporting 429 retries to the SSE stream. Set by ChatEndpoints - /// before session.SendAsync; flows through AsyncLocal into every tool invocation - /// on the same async context. Null in background workers / tests — calls become no-ops. + /// Per-turn hook for reporting 429/5xx retries to the SSE stream. Keyed by + /// userId:sessionId (the "turn id") so it survives the JSON-RPC tool-callback + /// boundary from the Copilot CLI (where AsyncLocal does NOT flow), AND so concurrent + /// turns from the same user (two tabs, sidebar score racing chat) don't clobber each + /// other's reporter. ChatEndpoints stamps the turn id into Activity Baggage as + /// finops.turn.id; tools look it up via the current activity's baggage. + /// Null lookup = no-op. Signature: (attemptNumber, waitSeconds, url, telemetryPrefix, statusCode). /// - public static readonly AsyncLocal?> RetryReporter = new(); + public static readonly System.Collections.Concurrent.ConcurrentDictionary> RetryReporters = new(); // Max retry attempts on HTTP 429. After this many failed attempts the throttled // response is returned to the caller so the LLM (or user) sees the throttle status. @@ -61,6 +75,9 @@ public static async Task SendWithRetryAsync( { method ??= HttpMethod.Get; + var totalSw = Stopwatch.StartNew(); + var totalWaitSec = 0.0; + var retryCount = 0; HttpResponseMessage res = null!; for (var attempt = 0; attempt < MaxThrottleRetries; attempt++) { @@ -86,19 +103,51 @@ public static async Task SendWithRetryAsync( if (attempt == MaxThrottleRetries - 1) break; // last attempt — return as-is to caller var waitSeconds = ResolveRetryAfterSeconds(res, attempt); + totalWaitSec += waitSeconds; + retryCount++; var reason = isThrottle ? "429" : status.ToString(); activity?.SetTag($"{telemetryPrefix}.retry_{attempt}", $"{reason}, waiting {waitSeconds:F0}s"); ThrottleRetries.Add(1, new KeyValuePair("status", reason), new KeyValuePair("tool", telemetryPrefix)); - if (RetryReporter.Value is { } report) + // Loud log so silent throttling can never hide a 60-second wait + // again. Prefix matches the tool/scope tag in App Insights. + Logger?.LogWarning("HTTP retry {Tool} attempt={Attempt} status={Status} waitSec={Wait:F1} url={Url}", + telemetryPrefix, attempt + 1, reason, waitSeconds, url); + // Look up the SSE reporter via Activity Baggage — baggage + // propagates across W3C tracecontext boundaries (including the + // Copilot CLI subprocess JSON-RPC tool callback) where RootId + // does not. ChatEndpoints stamps "finops.turn.id" (userId:sessionId) + // on the chat activity before SendAsync. + var turnKey = Activity.Current?.GetBaggageItem("finops.turn.id"); + if (turnKey is not null && RetryReporters.TryGetValue(turnKey, out var report)) + { + try { await report(attempt + 1, waitSeconds, url, telemetryPrefix, status); } + catch (Exception emitEx) { Logger?.LogWarning(emitEx, "SSE cooling_down emit failed for {Tool}", telemetryPrefix); } + } + else { - try { await report(attempt + 1, waitSeconds); } catch { /* UI hook is best-effort */ } + Logger?.LogWarning("SSE cooling_down skipped — no reporter for turn={Turn} (tool={Tool})", + turnKey ?? "", telemetryPrefix); } await Task.Delay(TimeSpan.FromSeconds(waitSeconds)); } var responseBody = await res.Content.ReadAsStringAsync(); + totalSw.Stop(); + + RequestTotalMs.Record(totalSw.Elapsed.TotalMilliseconds, + new KeyValuePair("tool", telemetryPrefix), + new KeyValuePair("status", (int)res.StatusCode)); + if (retryCount > 0) + { + RetryWaitMs.Record(totalWaitSec * 1000.0, + new KeyValuePair("tool", telemetryPrefix)); + Logger?.LogWarning("HTTP retried {Tool} retries={Retries} totalWaitSec={Wait:F1} totalMs={Total:F0} url={Url}", + telemetryPrefix, retryCount, totalWaitSec, totalSw.Elapsed.TotalMilliseconds, url); + activity?.SetTag($"{telemetryPrefix}.total_retries", retryCount); + activity?.SetTag($"{telemetryPrefix}.total_wait_sec", totalWaitSec); + } activity?.SetTag($"{telemetryPrefix}.status_code", (int)res.StatusCode); activity?.SetTag($"{telemetryPrefix}.response_length", responseBody.Length); diff --git a/src/Dashboard/Infrastructure/Ipv4HttpHandler.cs b/src/Dashboard/Infrastructure/Ipv4HttpHandler.cs new file mode 100644 index 0000000..8d1d32f --- /dev/null +++ b/src/Dashboard/Infrastructure/Ipv4HttpHandler.cs @@ -0,0 +1,86 @@ +using System.Net; +using System.Net.Sockets; + +namespace AzureFinOps.Dashboard.Infrastructure; + +/// +/// Single source of truth for outbound HTTP transport in this app. +/// +/// Corporate egress here drops IPv6 SYNs and the OS only gives up after +/// 3 TCP retries (~21 s), wedging every outbound call. .NET's default +/// connect path tries each DNS-returned address sequentially, so a single +/// AAAA record kills the request even though A records are reachable. +/// +/// Fix: resolve DNS ourselves, filter to IPv4 (AddressFamily.InterNetwork), +/// and connect on an explicit IPv4 socket with a hard 5-s cap. Never even +/// open an IPv6 socket. Use this for every HttpClient — factory default, +/// named clients, and the static helper in HttpHelper. +/// +public static class Ipv4HttpHandler +{ + public static SocketsHttpHandler Create() => new() + { + ConnectTimeout = TimeSpan.FromSeconds(5), + PooledConnectionLifetime = TimeSpan.FromMinutes(5), + PooledConnectionIdleTimeout = TimeSpan.FromMinutes(2), + EnableMultipleHttp2Connections = true, + ConnectCallback = Ipv4ConnectAsync, + }; + + public static async ValueTask Ipv4ConnectAsync(SocketsHttpConnectionContext ctx, CancellationToken ct) + { + var host = ctx.DnsEndPoint.Host; + var port = ctx.DnsEndPoint.Port; + var sw = System.Diagnostics.Stopwatch.StartNew(); + + IPAddress[] addresses; + // Skip DNS for literal IPs (avoids AddressFamily mismatch on IPv6 literals). + if (IPAddress.TryParse(host, out var literal)) + { + if (literal.AddressFamily != AddressFamily.InterNetwork) + throw new InvalidOperationException($"IPv6 literal {host} blocked by Ipv4HttpHandler"); + addresses = [literal]; + } + else + { + addresses = await Dns.GetHostAddressesAsync(host, AddressFamily.InterNetwork, ct); + if (addresses.Length == 0) + throw new InvalidOperationException($"No IPv4 address resolved for {host}"); + } + var dnsMs = sw.ElapsedMilliseconds; + + using var attemptCts = CancellationTokenSource.CreateLinkedTokenSource(ct); + attemptCts.CancelAfter(TimeSpan.FromSeconds(5)); + + Exception? lastError = null; + for (var i = 0; i < addresses.Length; i++) + { + var addr = addresses[i]; + var connectStart = sw.ElapsedMilliseconds; + var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp) { NoDelay = true }; + try + { + await socket.ConnectAsync(addr, port, attemptCts.Token); + var totalMs = sw.ElapsedMilliseconds; + HttpHelper.Logger?.LogInformation( + "IPv4 connect OK {Host}:{Port} via {Addr} (attempt {N}/{Total}) — dns={DnsMs}ms connect={ConnectMs}ms total={TotalMs}ms", + host, port, addr, i + 1, addresses.Length, dnsMs, totalMs - connectStart, totalMs); + return new NetworkStream(socket, ownsSocket: true); + } + catch (Exception ex) + { + lastError = ex; + socket.Dispose(); + HttpHelper.Logger?.LogWarning(ex, + "IPv4 connect FAIL {Host}:{Port} via {Addr} (attempt {N}/{Total}) after {Ms}ms", + host, port, addr, i + 1, addresses.Length, sw.ElapsedMilliseconds - connectStart); + } + } + + HttpHelper.Logger?.LogError(lastError, + "IPv4 connect EXHAUSTED {Host}:{Port} — all {N} addresses failed after {Ms}ms", + host, port, addresses.Length, sw.ElapsedMilliseconds); + throw new HttpRequestException( + $"All {addresses.Length} IPv4 connects to {host}:{port} failed", lastError); + } +} diff --git a/src/Dashboard/Program.cs b/src/Dashboard/Program.cs index 25ddd13..83869ab 100644 --- a/src/Dashboard/Program.cs +++ b/src/Dashboard/Program.cs @@ -50,6 +50,21 @@ options.Cookie.SecurePolicy = CookieSecurePolicy.Always; }); builder.Services.AddHttpClient(); +// Force IPv4 + 5-s connect cap on EVERY factory-created HttpClient (default +// and all named clients). Corporate egress drops IPv6 SYNs and the OS retries +// for ~21 s before falling back, wedging every outbound call. See +// Infrastructure/Ipv4HttpHandler.cs for the rationale. +builder.Services.ConfigureHttpClientDefaults(b => + b.ConfigurePrimaryHttpMessageHandler(AzureFinOps.Dashboard.Infrastructure.Ipv4HttpHandler.Create)); +// Dedicated client for Microsoft Entra token endpoints. Inherits the IPv4-only +// handler from ConfigureHttpClientDefaults above; only overrides Timeout and +// HTTP version for token-endpoint specifics. +builder.Services.AddHttpClient("entra-token", c => +{ + // Budget enough headroom for retries above the 5-s per-attempt connect cap. + c.Timeout = TimeSpan.FromSeconds(30); + c.DefaultRequestVersion = System.Net.HttpVersion.Version20; +}); if (!builder.Environment.IsDevelopment()) builder.Services.AddHttpsRedirection(options => options.HttpsPort = 443); @@ -87,6 +102,10 @@ var loggerFactory = app.Services.GetRequiredService(); var logger = loggerFactory.CreateLogger("AzureFinOps.AI"); logger.LogInformation("Application starting. AppInsights configured: {Configured}", !string.IsNullOrEmpty(appInsightsCs)); +// Plumb a logger into HttpHelper so 429/5xx retries surface in stdout + +// Application Insights instead of being only a silent counter. +AzureFinOps.Dashboard.Infrastructure.HttpHelper.Logger = + loggerFactory.CreateLogger("AzureFinOps.AI.HttpHelper"); await using var copilotFactory = await CopilotSessionFactory.CreateAsync( telemetry, oauthOptions, azureOpenAIEndpoint, azureOpenAIDeployment, loggerFactory); @@ -213,14 +232,6 @@ // CSRF defense — for state-changing requests, require Origin/Referer to match this host. // Combined with SameSite=Lax cookies this defeats the standard CSRF surface. -var allowedOriginHosts = new HashSet(StringComparer.OrdinalIgnoreCase) -{ - "azure-finops-agent.com", - "www.azure-finops-agent.com", - "finops-agent-container.azurewebsites.net", - "localhost:5000", - "localhost:5173", -}; app.Use(async (ctx, next) => { var method = ctx.Request.Method; @@ -234,7 +245,8 @@ if (Uri.TryCreate(origin, UriKind.Absolute, out var oUri)) sourceHost = oUri.Authority; else if (Uri.TryCreate(referer, UriKind.Absolute, out var rUri)) sourceHost = rUri.Authority; - if (string.IsNullOrEmpty(sourceHost) || !allowedOriginHosts.Contains(sourceHost)) + var ownHost = ctx.Request.Host.Value ?? ""; + if (string.IsNullOrEmpty(sourceHost) || !string.Equals(sourceHost, ownHost, StringComparison.OrdinalIgnoreCase)) { ctx.Response.StatusCode = 403; await ctx.Response.WriteAsync("Forbidden: cross-origin write blocked"); @@ -250,33 +262,45 @@ var persistentIdentity = app.Services.GetRequiredService(); app.Use(async (ctx, next) => { - if (ctx.Session.GetString("user") is null) + var hasUser = ctx.Session.GetString("user") is not null; + var hasAzureUser = ctx.Session.GetString("azure_user") is not null; + if (!hasUser || !hasAzureUser) { var record = persistentIdentity.Load(ctx); if (record is not null && !string.IsNullOrEmpty(record.Oid)) { // Returning Entra user: rebuild the session blobs deterministically. - ctx.Session.SetString("user", JsonSerializer.Serialize(new + // We rehydrate `azure_user` even when `user` is already set so the + // sidebar always shows the signed-in email after a backend restart + // (the in-memory session middleware loses azure_user across + // process boundaries; the persistent identity cookie survives). + if (!hasUser) { - id = record.UserId, - login = $"user-{record.UserId & 0xFFFF:X4}", - name = record.Name, - avatar = (string?)null, - email = record.Email, - })); - ctx.Session.SetString("azure_user", JsonSerializer.Serialize(new Dictionary + ctx.Session.SetString("user", JsonSerializer.Serialize(new + { + id = record.UserId, + login = $"user-{record.UserId & 0xFFFF:X4}", + name = record.Name, + avatar = null, + email = record.Email, + })); + } + if (!hasAzureUser) { - ["tenantId"] = record.TenantId, - ["objectId"] = record.Oid, - ["name"] = record.Name, - ["email"] = record.Email, - })); - if (!string.IsNullOrEmpty(record.RefreshToken)) + ctx.Session.SetString("azure_user", JsonSerializer.Serialize(new Dictionary + { + ["tenantId"] = record.TenantId, + ["objectId"] = record.Oid, + ["name"] = record.Name, + ["email"] = record.Email, + })); + } + if (!string.IsNullOrEmpty(record.RefreshToken) && ctx.Session.GetString("azure_refresh_token") is null) ctx.Session.SetString("azure_refresh_token", record.RefreshToken); - if (!string.IsNullOrEmpty(record.GraphTier)) + if (!string.IsNullOrEmpty(record.GraphTier) && ctx.Session.GetString("graph_tier") is null) ctx.Session.SetString("graph_tier", record.GraphTier); } - else + else if (!hasUser) { // Brand-new visitor: crypto-random anonymous id keyed only in this session. var sessionUserId = (long)(RandomNumberGenerator.GetInt32(1_000_000, int.MaxValue)) << 24 diff --git a/src/Dashboard/frontend/src/components/ChatView.vue b/src/Dashboard/frontend/src/components/ChatView.vue index 1181258..40c723a 100644 --- a/src/Dashboard/frontend/src/components/ChatView.vue +++ b/src/Dashboard/frontend/src/components/ChatView.vue @@ -1625,17 +1625,40 @@ {{ allToolCalls.length }}
-
+
- +
@@ -1864,12 +1887,19 @@ const activeTools = ref([]); // whichever session the user is currently viewing. const perSessionToolCalls = reactive(new Map()); const perSessionCharts = reactive(new Map()); +// Per-session ephemeral "cooling down" ghost rows. NOT persisted into +// messages/IndexedDB — they vanish at stream end. Each entry: +// { _uid, _isCooler:true, tool, url, attempt, wait, status, ts, expanded, _timer } +const perSessionCoolers = reactive(new Map()); const streamToolCalls = computed( () => perSessionToolCalls.get(currentSessionId.value || "__pending__") || [], ); const streamCharts = computed( () => perSessionCharts.get(currentSessionId.value || "__pending__") || [], ); +const streamCoolers = computed( + () => perSessionCoolers.get(currentSessionId.value || "__pending__") || [], +); const streamFollowUp = ref(null); const streamIntent = ref(""); const htmlReady = ref(null); @@ -2674,6 +2704,39 @@ watch( onMounted(async () => { document.addEventListener("click", dismissPopover); + // Dev helper: window.__simulateCool(waitSec?) injects a synthetic ghost + // "cooling down" row in the current session's tool sidebar so you can + // verify the throttle UI without forcing a real 429. Removes itself + // after the wait elapses, just like a real cooler. Dev-build only — + // gated on Vite's import.meta.env.DEV so production bundles don't ship it. + if (import.meta.env.DEV) { + window.__simulateCool = (wait = 15, status = 429) => { + const sid = currentSessionId.value || "__pending__"; + const list = perSessionCoolers.get(sid) || []; + const cooler = { + _uid: `c-sim-${Date.now()}`, + _key: `sim|${Date.now()}`, + _isCooler: true, + tool: "azure", + url: "https://management.azure.com/subscriptions/00000000-0000-0000-0000-000000000000/providers/Microsoft.CostManagement/query?api-version=2025-03-01", + attempt: 1, + wait, + status, + ts: Date.now(), + expanded: false, + done: false, + }; + cooler._timer = setTimeout(() => { + const arr = perSessionCoolers.get(sid) || []; + perSessionCoolers.set(sid, arr.filter((x) => x._uid !== cooler._uid)); + }, wait * 1000 + 500); + list.push(cooler); + perSessionCoolers.set(sid, [...list]); + // eslint-disable-next-line no-console + console.log("[simulateCool] injected ghost row for", wait, "s in session", sid); + return cooler._uid; + }; + } // Load saved conversations once we know whether the user is Azure-connected. // (The auth check fires on mount too; loadSessions is a no-op until then.) setTimeout(() => { @@ -4127,6 +4190,14 @@ const allToolCalls = computed(() => { for (const tc of msg.toolCalls) add(tc, "msg"); } for (const tc of streamToolCalls.value) add(tc, "stream"); + // Append any live cooling-down ghosts at the end so they pin to the top of + // the reversed sidebar view. They are ephemeral and never get committed. + // IMPORTANT: pass the original reference (no spread) so toggling + // tc.expanded in the template mutates the reactive source. + for (const c of streamCoolers.value) { + c._uid = c._uid || `cool-${crypto.randomUUID()}`; + order.push(c); + } return order; }); @@ -4472,6 +4543,20 @@ async function send() { let charts = perSessionCharts.get(streamingId); let hasDeltas = false; + // === TIMING HOOKS === + // Captures every meaningful moment of the turn so we can build a flat + // timing table afterwards. Records: client send, each SSE event arrival + // (with type/tool), backend phase timings (token.*, session.*, sdk.*, + // chat.total), tool durations from tool_done.durationMs, and 429 retries + // from cooling_down. Dumped to console.table + window.__lastTimings + // when the stream ends. + const t0 = performance.now(); + const timings = []; + const recordTiming = (entry) => { + timings.push({ t_ms: Math.round(performance.now() - t0), ...entry }); + }; + recordTiming({ kind: "client", phase: "send.start", prompt: prompt.slice(0, 60) }); + try { const res = await fetch("/api/chat", { method: "POST", @@ -4508,12 +4593,33 @@ async function send() { continue; } + // Capture every SSE event's arrival timestamp for the timing table. + // Backend-emitted `timing` events are stored verbatim (they carry their + // own ms measurement); all other events get a client-arrival t_ms. + if (data.type === "timing") { + recordTiming({ kind: "server", phase: data.phase, server_ms: data.ms, extra: data.extra }); + } else if (data.type === "tool_done") { + recordTiming({ kind: "event", phase: "tool_done", tool: data.tool, durationMs: data.durationMs, success: data.success }); + } else if (data.type === "tool_start") { + recordTiming({ kind: "event", phase: "tool_start", tool: data.tool }); + } else if (data.type === "cooling_down") { + // eslint-disable-next-line no-console + console.log("[SSE cooling_down arrived]", data); + recordTiming({ kind: "event", phase: "cooling_down", attempt: data.attempt, waitSec: data.waitSeconds }); + } else if (data.type === "delta") { + if (!timings.some((x) => x.phase === "first_delta")) { + recordTiming({ kind: "event", phase: "first_delta" }); + } + } else if (data.type !== "message") { + recordTiming({ kind: "event", phase: data.type }); + } + // Routing/sidebar events are handled regardless of which session the // user is currently viewing. Everything else is gated on isActiveView() // below so a backgrounded stream keeps draining bytes (the server // needs us to keep reading) without polluting the foreground view. const routingEvent = - data.type === "session" || data.type === "session_title"; + data.type === "session" || data.type === "session_title" || data.type === "timing"; // Tool / chart / completion-marker events go into the per-session // map so a backgrounded stream still accumulates state and the user // sees a complete tool list when they switch back. Only delta/text @@ -4722,8 +4828,53 @@ async function send() { break; case "cooling_down": { - // 429 backoff in flight — tag the most recent in-progress tool so - // the sidebar swaps its label to "Cooling down… {wait}s". + // 429/5xx backoff in flight. Insert/refresh a separate ephemeral + // ghost row in perSessionCoolers — NOT a label swap on the real + // tool. The ghost has its own animated background and an + // expand-to-detail panel showing the throttled URL. It auto- + // removes after the wait elapses (or sooner if tool_done arrives). + const list = perSessionCoolers.get(streamingId) || []; + const key = `${data.tool || "http"}|${data.url || ""}`; + const existing = list.find((c) => c._key === key); + if (existing) { + existing.attempt = data.attempt; + existing.wait = data.waitSeconds; + existing.status = data.status; + existing.ts = Date.now(); + if (existing._timer) clearTimeout(existing._timer); + existing._timer = setTimeout(() => { + const arr = perSessionCoolers.get(streamingId) || []; + perSessionCoolers.set( + streamingId, + arr.filter((x) => x._uid !== existing._uid), + ); + }, (data.waitSeconds || 5) * 1000 + 500); + perSessionCoolers.set(streamingId, [...list]); + } else { + const cooler = { + _uid: `c-${Date.now()}-${crypto.randomUUID().slice(0, 4)}`, + _key: key, + _isCooler: true, + tool: data.tool || "http", + url: data.url || "", + attempt: data.attempt, + wait: data.waitSeconds, + status: data.status, + ts: Date.now(), + expanded: false, + done: false, + }; + cooler._timer = setTimeout(() => { + const arr = perSessionCoolers.get(streamingId) || []; + perSessionCoolers.set( + streamingId, + arr.filter((x) => x._uid !== cooler._uid), + ); + }, (data.waitSeconds || 5) * 1000 + 500); + list.push(cooler); + perSessionCoolers.set(streamingId, [...list]); + } + // Legacy label swap on the in-flight tool — kept for accessibility. const inFlight = [...toolCalls].reverse().find((t) => !t.done); if (inFlight) { inFlight.cooling = { @@ -4786,6 +4937,10 @@ async function send() { case "error": streamBuffer.value += `\n**Error:** ${data.message}`; break; + + case "timing": + // Already captured above; no UI action needed. + break; } } } @@ -4849,6 +5004,21 @@ async function send() { clearInterval(intentAnimTimer); intentAnimTimer = null; runningSessions.delete(streamingId); + + // === TIMING DUMP === + // Stash on window for easy copy-paste; print a flat table to the + // devtools console so we can eyeball where the time went. + recordTiming({ kind: "client", phase: "send.end" }); + try { + window.__lastTimings = timings; + window.__lastTimingsPrompt = prompt; + // eslint-disable-next-line no-console + console.groupCollapsed(`[timing] ${prompt.slice(0, 60)} — ${timings.length} events, total ${timings[timings.length - 1]?.t_ms ?? 0}ms`); + // eslint-disable-next-line no-console + console.table(timings); + // eslint-disable-next-line no-console + console.groupEnd(); + } catch {} // Only wipe the live-stream UI refs if this stream's session is still // the foreground view. If the user navigated to a different session, // those refs reflect *that* session's state and must not be touched. @@ -4859,6 +5029,10 @@ async function send() { // viewing a different tab at completion. perSessionToolCalls.delete(streamingId); perSessionCharts.delete(streamingId); + // Discard any lingering cooling-down ghosts — they're stream-scoped only. + const lingering = perSessionCoolers.get(streamingId) || []; + for (const c of lingering) { if (c._timer) clearTimeout(c._timer); } + perSessionCoolers.delete(streamingId); if (isActiveView()) { flushText(); streamBuffer.value = ""; @@ -4871,7 +5045,7 @@ async function send() { nextTick(() => inputEl.value?.focus()); } // Refresh the Conversations sidebar so the new/updated summary shows up. - if (azureConnected.value) loadSessions(); + loadSessions(); if (availableModels.value.length <= 1) { try { const mr = await fetch("/api/models"); @@ -7587,6 +7761,72 @@ async function send() { font-size: 11px; } +/* ── Cooling-down ghost row (ephemeral, 429/5xx) ── */ +/* Same row chrome as .st-row (size, padding, font weight 400, height); + only the background + text colour differ so the user can spot retries. + Gradient mirrors the Azure top bar (#005a9e → #0078d4 → #0098e0) and + sweeps horizontally — never goes to white so dark blues stay readable. */ +.st-row--cooler { + cursor: pointer; + position: relative; + flex-wrap: wrap; + background: linear-gradient( + 90deg, + #005a9e 0%, + #0078d4 25%, + #0098e0 50%, + #0078d4 75%, + #005a9e 100% + ); + background-size: 200% 100%; + animation: cool-sweep 2.4s linear infinite; +} +.st-row--cooler:hover { + filter: brightness(1.05); +} +.st-row--cooler .st-name, +.st-row--cooler .st-time { + color: #fff; +} +.st-icon--cooler { + width: 14px; + height: 14px; + color: #fff; +} +.st-cooler-detail { + flex-basis: 100%; + margin-top: 6px; + padding: 8px 10px; + background: rgba(0, 0, 0, 0.25); + border-radius: 4px; + color: #fff; + font-size: 11px; + line-height: 1.5; + cursor: default; +} +.st-cooler-row { + margin-bottom: 2px; +} +.st-cooler-row strong { + color: #fff; + font-weight: 600; + margin-right: 4px; +} +.st-cooler-url code { + background: rgba(0, 0, 0, 0.3); + padding: 2px 4px; + border-radius: 3px; + font-family: ui-monospace, "Cascadia Code", Consolas, monospace; + font-size: 10px; + word-break: break-all; + display: inline-block; + max-width: 100%; +} +@keyframes cool-sweep { + 0% { background-position: 0% 50%; } + 100% { background-position: 200% 50%; } +} + /* ── Tool popover ── */ .tool-popover { position: fixed;