Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/copilot-instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ The agent acts as a frontend on top of Azure Cost Management, Billing, ARM REST

- **Backend**: .NET 10 minimal API (`src/Dashboard/`)
- **Frontend**: Vue 3 + Vite SPA (`src/Dashboard/frontend/`) with ECharts for data visualization
- **AI**: GitHub Copilot SDK (`GitHub.Copilot.SDK` 1.0.0-beta.3) with BYOK (Bring Your Own Key) using Azure OpenAI via Entra ID bearer tokens. Sessions managed via `CopilotClient` / `CopilotSession`. Reasoning effort set to `xhigh`. The Copilot CLI provides built-in tools (file operations, bash, grep, glob, web fetch, memory) — custom tools handle Azure-specific APIs. Multi-session per user: each user can keep many conversations, listed in the right sidebar; the SDK auto-disconnects idle sessions after 30 min (`SessionIdleTimeoutSeconds = 1800`) while preserving on-disk state for resume.
- **AI**: GitHub Copilot SDK (`GitHub.Copilot.SDK` 1.0.0-beta.4) with BYOK (Bring Your Own Key) using Azure OpenAI via Entra ID bearer tokens. Sessions managed via `CopilotClient` / `CopilotSession`. Reasoning effort set to `xhigh`. The Copilot CLI provides built-in tools (file operations, bash, grep, glob, web fetch, memory) — custom tools handle Azure-specific APIs. Multi-session per user: each user can keep many conversations, listed in the right sidebar; the SDK auto-disconnects idle sessions after 30 min (`SessionIdleTimeoutSeconds = 1800`) while preserving on-disk state for resume.
- **Auth**: Auto-assigned anonymous sessions (no login required for chat); Microsoft Entra ID OAuth (multi-tenant) for Azure ARM, Microsoft Graph, and Log Analytics APIs
- **Data Sources**: Azure Retail Prices API (no auth), Azure Service Health (no auth), Azure Cost Management APIs, Microsoft Graph APIs, Azure Monitor / Log Analytics APIs, ECharts visualization
- **Observability**: OpenTelemetry end-to-end. The .NET app uses `UseAzureMonitor()` (auto-instruments HttpClient, ASP.NET Core, custom `ActivitySource("AzureFinOps.AI")` + `Meter("AzureFinOps.AI")`). The Copilot CLI subprocess emits OTLP via the SDK's built-in `TelemetryConfig` (GenAI + MCP semantic conventions — every tool call, LLM round-trip, prompt, tool args, result, token usage). Both feeds reach Application Insights via an in-container **OpenTelemetry Collector** (`otel/opentelemetry-collector-contrib`) using the `azuremonitor` exporter — config at `src/Dashboard/otel-collector-config.yaml`, launched by `entrypoint.sh` before the .NET app. Trace context (W3C `traceparent`) is auto-propagated SDK→CLI so Application Map shows one continuous transaction. Custom metrics (`finops.chat.requests`, `finops.tool.calls`, `finops.sessions.active`, etc.) keep flowing through the .NET exporter. Frontend telemetry in `frontend/src/main.js` captures page views, failed browser dependencies, uncaught JS errors, unhandled promise rejections, Vue component errors, and CSP violations. Third-party correlation headers are excluded for `cdn.jsdelivr.net` and `js.monitor.azure.com`.
Expand Down
1 change: 1 addition & 0 deletions .github/prompts/check-code-changes.prompt.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Go through all the uncommitted code. Does it make sense? Analyze every single line.
1 change: 1 addition & 0 deletions .github/prompts/time-test.prompt.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Start the app with browser tools and ask the user to log in. Run a timer that checks every second until the user is logged in, then trigger Crawl. After it completes, look into the logs and produce a granular timetable outlining how long each step took and where the bottlenecks are.
215 changes: 190 additions & 25 deletions src/Dashboard/AI/ChatEndpoints.cs

Large diffs are not rendered by default.

10 changes: 8 additions & 2 deletions src/Dashboard/AI/CopilotSessionFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -586,10 +586,13 @@ private async Task DisposeLiveAsync(string sessionId)
private async Task<SessionConfig> CreateSessionConfigAsync(long userId, string? entraOid)
{
var bearerToken = await GetAzureOpenAIBearerTokenAsync();
var effort = IsReasoningModel(_deployment) ? "xhigh" : null;
_logger.LogInformation("SessionConfig(create) model={Model} reasoningEffort={Effort} isReasoning={IsReasoning}",
_deployment, effort ?? "<null>", IsReasoningModel(_deployment));
return new SessionConfig
{
Model = _deployment,
ReasoningEffort = IsReasoningModel(_deployment) ? "xhigh" : null,
ReasoningEffort = effort,
Streaming = true,
Tools = GetOrCreateUserTools(userId),
WorkingDirectory = GetWorkingDirectory(userId, entraOid),
Expand All @@ -611,10 +614,13 @@ private async Task<SessionConfig> CreateSessionConfigAsync(long userId, string?
private async Task<ResumeSessionConfig> CreateResumeConfigAsync(long userId, string? entraOid)
{
var bearerToken = await GetAzureOpenAIBearerTokenAsync();
var effort = IsReasoningModel(_deployment) ? "xhigh" : null;
_logger.LogInformation("SessionConfig(resume) model={Model} reasoningEffort={Effort} isReasoning={IsReasoning} — NOTE: CLI may retain original-session effort",
_deployment, effort ?? "<null>", IsReasoningModel(_deployment));
return new ResumeSessionConfig
{
Model = _deployment,
ReasoningEffort = IsReasoningModel(_deployment) ? "xhigh" : null,
ReasoningEffort = effort,
Streaming = true,
Tools = GetOrCreateUserTools(userId),
WorkingDirectory = GetWorkingDirectory(userId, entraOid),
Expand Down
39 changes: 24 additions & 15 deletions src/Dashboard/AI/Tools/AnomalyTools.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,26 +96,35 @@ private async Task<string> DetectCostAnomalies(
var threshold = mean + zThreshold * stddev;
var lowThreshold = Math.Max(0, mean - zThreshold * stddev);

var anomalies = new List<object>();
// Build the list of anomalous days first (cheap, in-memory), then
// fan out the per-day cost-breakdown drilldowns in parallel. Each
// drilldown is an independent Cost Management query; serialising
// them was needlessly multiplying wall time by N anomalies.
var anomalyCandidates = new List<(DateTime Date, double Cost, double Z)>();
foreach (var p in detection)
{
if (stddev < 0.01) continue; // flat baseline, can't detect
var z = (p.Cost - mean) / stddev;
if (Math.Abs(z) >= zThreshold)
{
// Drill down for this specific day
var breakdown = await GetBreakdownForDay(token, subscriptionId, p.Date, groupBy, activity);
anomalies.Add(new
{
date = p.Date.ToString("yyyy-MM-dd"),
cost = Math.Round(p.Cost, 2),
z_score = Math.Round(z, 2),
deviation_pct = mean > 0.01 ? Math.Round((p.Cost - mean) / mean * 100, 1) : 0,
direction = z > 0 ? "spike" : "drop",
top_contributors = breakdown
});
}
if (Math.Abs(z) >= zThreshold) anomalyCandidates.Add((p.Date, p.Cost, z));
}
var drilldownTasks = anomalyCandidates.Select(async c =>
{
// Pass null activity — Activity is not safe for concurrent SetTag
// writers, and these drilldowns run in parallel. Each call still
// gets its own ActivitySource span inside HttpHelper.
try { return (c, Breakdown: (object)await GetBreakdownForDay(token, subscriptionId, c.Date, groupBy, activity: null)); }
catch (Exception ex) { return (c, Breakdown: new { error = ex.Message }); }
}).ToArray();
var drilldowns = await Task.WhenAll(drilldownTasks);
var anomalies = drilldowns.Select(d => (object)new
{
date = d.c.Date.ToString("yyyy-MM-dd"),
cost = Math.Round(d.c.Cost, 2),
z_score = Math.Round(d.c.Z, 2),
deviation_pct = mean > 0.01 ? Math.Round((d.c.Cost - mean) / mean * 100, 1) : 0,
direction = d.c.Z > 0 ? "spike" : "drop",
top_contributors = d.Breakdown
}).ToList();

var result = new
{
Expand Down
2 changes: 1 addition & 1 deletion src/Dashboard/AI/Tools/FaqTools.cs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ private static async Task PingIndexNowAsync(string slug)
{
try
{
using var http = new HttpClient { Timeout = TimeSpan.FromSeconds(10) };
using var http = new HttpClient(AzureFinOps.Dashboard.Infrastructure.Ipv4HttpHandler.Create(), disposeHandler: true) { Timeout = TimeSpan.FromSeconds(10) };
var body = JsonSerializer.Serialize(new
{
host = "azure-finops-agent.com",
Expand Down
19 changes: 15 additions & 4 deletions src/Dashboard/AI/Tools/IdleResourceTools.cs
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,22 @@ private async Task<string> FindIdleResources(
$"ResourceContainers | where type =~ 'microsoft.resources/subscriptions/resourcegroups' | join kind=leftouter (Resources | summarize count() by resourceGroup, subscriptionId) on resourceGroup, subscriptionId | where isnull(count_) or count_ == 0 | project id, name, location, subscriptionId | top {topPerPattern} by name"),
};

var results = new Dictionary<string, object>();
foreach (var (label, kql) in patterns)
// Fan out all 8 KQL queries in parallel. Each result is wrapped
// in a per-query try/catch so a single failed pattern (perms,
// throttle, schema drift) doesn't lose the other 7. Cuts wall
// time from 8×latency to max(latency).
var queryTasks = patterns.Select(async p =>
{
results[label] = await RunResourceGraphQuery(token, kql, subs, activity);
}
// Pass null activity — Activity is not safe for concurrent SetTag
// writers across the 8 parallel queries. HttpHelper still emits
// its own per-call span.
try { return (p.Label, Result: await RunResourceGraphQuery(token, p.Kql, subs, activity: null)); }
catch (HttpRequestException ex) { return (p.Label, Result: new { error = "query exception", detail = ex.Message }); }
catch (TaskCanceledException ex) { return (p.Label, Result: new { error = "query exception", detail = ex.Message }); }
catch (OperationCanceledException ex) { return (p.Label, Result: new { error = "query exception", detail = ex.Message }); }
}).ToArray();
var completed = await Task.WhenAll(queryTasks);
var results = completed.ToDictionary(x => x.Label, x => x.Result);

var summary = new
{
Expand Down
12 changes: 9 additions & 3 deletions src/Dashboard/AI/Tools/RetailPricingTools.cs
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,16 @@ private static async Task<string> GetAzureRetailPricing(
?? Math.Min(Math.Pow(2, attempt + 1) + Random.Shared.NextDouble(), 30);
var waitSeconds = Math.Max(1, retryAfter);
activity?.SetTag($"pricing.retry_{attempt}", $"{(int)res.StatusCode}, waiting {waitSeconds:F0}s");
// Surface the cool-down to the chat UI on the same SSE channel HttpHelper uses.
if (HttpHelper.RetryReporter.Value is { } report)
// Surface the cool-down to the chat UI via the same baggage-keyed SSE channel HttpHelper uses.
var turnKey = System.Diagnostics.Activity.Current?.GetBaggageItem("finops.turn.id");
if (turnKey is not null && HttpHelper.RetryReporters.TryGetValue(turnKey, out var report))
{
try { await report(attempt + 1, waitSeconds); } catch { /* best-effort */ }
try { await report(attempt + 1, waitSeconds, url, "pricing", (int)res.StatusCode); }
catch (Exception emitEx)
{
HttpHelper.Logger?.LogWarning(emitEx,
"SSE cooling_down emit failed for pricing attempt={Attempt}", attempt + 1);
}
Comment on lines +97 to +101
}
await Task.Delay(TimeSpan.FromSeconds(waitSeconds));
}
Expand Down
49 changes: 49 additions & 0 deletions src/Dashboard/Auth/AzureSessionEndpoints.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,55 @@ public static void MapAzureSessionEndpoints(
var azureUserJson = ctx.Session.GetString("azure_user");
object? azureUser = azureUserJson is not null ? JsonSerializer.Deserialize<JsonElement>(azureUserJson) : null;

// Last-resort fallback: if azure_user wasn't populated by the OAuth
// callback or the persistent-identity middleware, decode the JWT
// access-token claims directly. Guarantees the sidebar always
// shows the signed-in email when a valid token exists.
// SECURITY NOTE: we deliberately do NOT verify the JWT signature here.
// This path runs AFTER the same `token` has already been used
// successfully upstream (the caller fetched it via OAuth refresh and
// is about to call ARM with it). The Bearer call to management.azure.com
// below would fail with 401 if the token were forged, so an attacker
// can't get a spoofed identity past the sidebar. We're just reading
// claims for display.
if (azureUser is null)
{
try
{
var parts = token.Split('.');
if (parts.Length >= 2)
{
var payload = parts[1].Replace('-', '+').Replace('_', '/');
switch (payload.Length % 4) { case 2: payload += "=="; break; case 3: payload += "="; break; }
var claims = JsonSerializer.Deserialize<JsonElement>(Convert.FromBase64String(payload));
string? upn = null, name = null, oid = null, tid = null;
if (claims.TryGetProperty("upn", out var u)) upn = u.GetString();
else if (claims.TryGetProperty("preferred_username", out var pu)) upn = pu.GetString();
else if (claims.TryGetProperty("unique_name", out var un)) upn = un.GetString();
if (claims.TryGetProperty("name", out var n)) name = n.GetString();
if (claims.TryGetProperty("oid", out var o)) oid = o.GetString();
if (claims.TryGetProperty("tid", out var t)) tid = t.GetString();
if (upn is not null || name is not null)
{
var derived = new Dictionary<string, string?>
{
["tenantId"] = tid,
["objectId"] = oid,
["name"] = name,
["email"] = upn,
};
azureUser = JsonSerializer.Deserialize<JsonElement>(JsonSerializer.Serialize(derived));
// Persist so subsequent /status calls don't re-decode.
ctx.Session.SetString("azure_user", JsonSerializer.Serialize(derived));
}
}
}
catch (Exception jwtEx)
{
logger.LogWarning(jwtEx, "Failed to decode Azure access-token claims for /status fallback");
}
Comment on lines +71 to +74
}

var http = httpFactory.CreateClient();
var subscriptions = new List<object>();
try
Expand Down
8 changes: 5 additions & 3 deletions src/Dashboard/Auth/MicrosoftAuthEndpoints.cs
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,9 @@ public static void MapMicrosoftAuthEndpoints(

ctx.Session.Remove("ms_oauth_state");

var http = httpFactory.CreateClient();
// Named client with 30 s overall timeout. IPv4-only transport is
// already applied by ConfigureHttpClientDefaults in Program.cs.
var http = httpFactory.CreateClient("entra-token");
var redirectUri = $"{MicrosoftOAuthOptions.NormalizeCallbackHost(ctx)}/auth/microsoft/callback";
var effectiveTenant = ctx.Session.GetString("auth_tenant") ?? options.TenantId;

Expand Down Expand Up @@ -400,7 +402,7 @@ public static void MapMicrosoftAuthEndpoints(
// GraphTier in sync as the user adds add-on consents incrementally.
if (!string.IsNullOrEmpty(refreshToken))
{
persistentIdentity.SaveIdentity(ctx, new IdentityRecord
await persistentIdentity.SaveIdentityAsync(ctx, new IdentityRecord
{
Oid = oid,
TenantId = validated.TenantId ?? "",
Expand All @@ -416,7 +418,7 @@ public static void MapMicrosoftAuthEndpoints(
// Edge case: re-consent without a fresh refresh_token. Update only
// the GraphTier so post-restart hydration still reflects the new
// add-on without clobbering the existing refresh token.
persistentIdentity.UpdateGraphTier(oid, ctx.Session.GetString("graph_tier"));
await persistentIdentity.UpdateGraphTierAsync(oid, ctx.Session.GetString("graph_tier"));
}
}
}
Expand Down
16 changes: 8 additions & 8 deletions src/Dashboard/Auth/PersistentIdentity.cs
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,10 @@ public static long DeriveUserId(string oid)
/// writes the encrypted identity cookie. Call this from the OAuth callback
/// after a successful id_token validation and from any path that mints a
/// new refresh_token.</summary>
public void SaveIdentity(HttpContext ctx, IdentityRecord record)
public async Task SaveIdentityAsync(HttpContext ctx, IdentityRecord record)
{
var sem = LockFor(record.Oid);
sem.Wait();
await sem.WaitAsync();
try
{
var dir = GetUserDir(record.Oid);
Expand Down Expand Up @@ -204,25 +204,25 @@ public void Clear(HttpContext ctx, string? oid)
/// <summary>Updates only the refresh token on an existing
/// identity file. Used by <see cref="SessionTokenStore"/> when a refresh
/// rotates the token (Entra rotates refresh tokens on use).</summary>
public void UpdateRefreshToken(string oid, string newRefreshToken)
public Task UpdateRefreshTokenAsync(string oid, string newRefreshToken)
{
UpdateRecord(oid, r => { r.RefreshToken = newRefreshToken; });
return UpdateRecordAsync(oid, r => { r.RefreshToken = newRefreshToken; });
}

/// <summary>Persists the comma-separated list of consented Graph tiers so a
/// post-restart hydration restores the user's full add-on set, not just the
/// base ARM scope.</summary>
public void UpdateGraphTier(string oid, string? graphTier)
public Task UpdateGraphTierAsync(string oid, string? graphTier)
{
UpdateRecord(oid, r => { r.GraphTier = graphTier; });
return UpdateRecordAsync(oid, r => { r.GraphTier = graphTier; });
}

private void UpdateRecord(string oid, Action<IdentityRecord> mutate)
private async Task UpdateRecordAsync(string oid, Action<IdentityRecord> mutate)
{
var path = Path.Combine(GetUserDir(oid), "identity.json");
if (!File.Exists(path)) return;
var sem = LockFor(oid);
sem.Wait();
await sem.WaitAsync();
try
{
var existing = JsonSerializer.Deserialize<IdentityRecord>(_protector.Unprotect(File.ReadAllText(path)));
Expand Down
Loading
Loading