Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions apps/api/src/observability/trace-builders.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ export function createRagSuccessTrace(input: {
groundedness: response.devMode.evals?.groundedness,
confidenceScore: response.devMode.evals?.confidence?.confidenceScore,
confidenceLevel: response.devMode.evals?.confidence?.confidenceLevel,
calibratedConfidence: response.devMode.evals?.confidence?.overallConfidence,
confidenceLabel: response.devMode.evals?.confidence?.label,
confidenceBreakdown: response.devMode.evals?.confidence?.breakdown,
confidenceUncertaintyReasons: response.devMode.evals?.confidence?.uncertaintyReasons,
confidenceRecommendedAction: response.devMode.evals?.confidence?.recommendedAction,
confidenceReasoning: response.devMode.evals?.confidence?.confidenceReasoning,
confidenceEvidenceSignals: response.devMode.evals?.confidence?.evidenceSignals,
failureCategory: response.devMode.evals?.taxonomy?.category,
Expand Down
14 changes: 14 additions & 0 deletions apps/api/src/rag-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3193,7 +3193,21 @@ async function buildReliabilityAugmentation(input: {
evals: input.evals,
});
const confidence = calibrateConfidence({
query: input.query,
diagnostics: retrievalDiagnostics,
chunks: input.devMode.results.map((item) => ({
chunkId: item.chunkId,
documentId: item.documentId,
sectionId: item.sectionId,
score: item.score,
text: item.text,
})),
rerankTrace: (input.rerankingCandidates ?? []).map((candidate) => ({
chunkId: candidate.chunkId,
beforeRank: candidate.beforeRank,
afterRank: candidate.afterRank,
finalScore: candidate.finalScore,
})),
evals: input.evals,
});
const retrievalAnalysis = new RetrievalDiagnosticsEngine().analyze({
Expand Down
134 changes: 134 additions & 0 deletions apps/api/src/rag/confidence.controller.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import { Body, Controller, Get, Post } from "@nestjs/common";
import {
ConfidenceCalibrationEngine,
ConfidencePolicyEngine,
buildRetrievalDiagnostics,
calibrateConfidence,
type CalibrationProfile,
type RetrievalEvidenceChunk,
} from "../retrieval-reliability";

/** Score wrapper used for each entry under `evals.scorerResults`. */
type ConfidenceScorerResult = { score: number };

/** Rank of one chunk before and after reranking, with an optional final score. */
type ConfidenceRerankEntry = {
  chunkId: string;
  beforeRank: number;
  afterRank: number;
  finalScore?: number;
};

/** Retrieval metadata; the controller forwards these fields when building diagnostics. */
type ConfidenceRetrievalTrace = {
  mode?: string;
  provider?: string;
  model?: string;
  queryIntent?: string;
  candidateCount?: number;
};

/** Evaluation signals supplied by the caller. */
type ConfidenceEvalsInput = {
  groundedness?: number;
  answerOverlap?: number;
  retrievalAccuracy?: number;
  scorerResults?: {
    faithfulness?: ConfidenceScorerResult;
    relevance?: ConfidenceScorerResult;
    recall?: ConfidenceScorerResult;
  };
};

/**
 * Request payload accepted by the POST endpoints of the confidence controller.
 *
 * NOTE(review): `answer`, `includeContradictionCheck`, `includeCoverageCheck`,
 * `includeAgreementCheck` and `devMode` are accepted here but are not read by
 * the controller code in this file — confirm whether they are still needed.
 */
type ConfidenceInputBody = {
  query?: string;
  answer?: string;
  /** Retrieved evidence; passed to both the diagnostics builder and the calibrator. */
  retrievedChunks: RetrievalEvidenceChunk[];
  citations?: { chunkId: string }[];
  retrievalTrace?: ConfidenceRetrievalTrace;
  /** A non-empty list makes the controller report `rerankingApplied: true`. */
  rerankTrace?: ConfidenceRerankEntry[];
  confidenceProfile?: CalibrationProfile;
  includeContradictionCheck?: boolean;
  includeCoverageCheck?: boolean;
  includeAgreementCheck?: boolean;
  devMode?: boolean;
  evals?: ConfidenceEvalsInput;
};

/**
 * HTTP endpoints under the `confidence` route prefix.
 *
 * Every POST handler derives retrieval diagnostics from the request body and
 * feeds them, together with the evidence chunks, rerank trace, profile and
 * evals, into the confidence calibration code from `../retrieval-reliability`.
 */
@Controller("confidence")
export class ConfidenceController {
  private readonly policyEngine = new ConfidencePolicyEngine();
  private readonly calibrationEngine = new ConfidenceCalibrationEngine();

  /** Returns the result of the `calibrateConfidence` helper for the request body. */
  @Post("score")
  score(@Body() body: ConfidenceInputBody) {
    return calibrateConfidence(this.buildCalibrationInput(body));
  }

  /** Returns the full result of `ConfidenceCalibrationEngine.calibrate` for the request body. */
  @Post("calibrate")
  calibrate(@Body() body: ConfidenceInputBody) {
    return this.calibrationEngine.calibrate(this.buildCalibrationInput(body));
  }

  /**
   * Compares the raw top retrieval score (rounded to three decimals) with the
   * calibrated overall confidence and reports the difference between them.
   */
  @Post("evaluate")
  evaluate(@Body() body: ConfidenceInputBody) {
    const input = this.buildCalibrationInput(body);
    const simpleScore = Number(input.diagnostics.topScore.toFixed(3));
    const calibrated = this.calibrationEngine.calibrate(input);
    return {
      simpleScore,
      calibratedScore: calibrated.overallConfidence,
      delta: Number((calibrated.overallConfidence - simpleScore).toFixed(3)),
      calibratedLabel: calibrated.label,
      recommendedAction: calibrated.recommendedAction,
    };
  }

  /** Returns only the `trace` portion of the calibration result. */
  @Post("explain")
  explain(@Body() body: ConfidenceInputBody) {
    return this.calibrationEngine.calibrate(this.buildCalibrationInput(body)).trace;
  }

  /** Returns whatever `ConfidencePolicyEngine.getPolicy()` reports. */
  @Get("policies")
  policies() {
    return this.policyEngine.getPolicy();
  }

  /** Always returns an empty run list. */
  @Get("runs")
  runs() {
    return {
      runs: [],
    };
  }

  /**
   * Evidence chunks from the body, or an empty list when the field is absent.
   * The declared type marks the field as required, but the payload comes from
   * an HTTP request, so it is resolved once here for every consumer.
   */
  private resolveChunks(body: ConfidenceInputBody) {
    return body.retrievedChunks ?? [];
  }

  /**
   * Assembles the argument shared by all calibration calls, so the four POST
   * handlers cannot drift apart and the calibrator sees the same chunk list
   * that the diagnostics were built from.
   */
  private buildCalibrationInput(body: ConfidenceInputBody) {
    return {
      query: body.query,
      diagnostics: this.buildDiagnostics(body),
      chunks: this.resolveChunks(body),
      rerankTrace: body.rerankTrace,
      profile: body.confidenceProfile,
      evals: body.evals,
    };
  }

  /**
   * Builds retrieval diagnostics from the request body. Only `groundedness`
   * and `answerOverlap` are forwarded from `evals`; reranking counts as
   * applied when the body carries a non-empty `rerankTrace`.
   */
  private buildDiagnostics(body: ConfidenceInputBody) {
    return buildRetrievalDiagnostics({
      results: this.resolveChunks(body),
      candidateCount: body.retrievalTrace?.candidateCount,
      citations: body.citations,
      evals: {
        groundedness: body.evals?.groundedness,
        answerOverlap: body.evals?.answerOverlap,
      },
      retrievalMode: body.retrievalTrace?.mode,
      rerankingApplied: Boolean(body.rerankTrace?.length),
      provider: body.retrievalTrace?.provider,
      model: body.retrievalTrace?.model,
      queryIntent: body.retrievalTrace?.queryIntent,
    });
  }
}
2 changes: 2 additions & 0 deletions apps/api/src/rag/rag.module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { RagMemoryService } from "./rag-memory/rag-memory.service";
import { RagMetricsController } from "./rag-metrics/rag-metrics.controller";
import { RagMetricsService } from "./rag-metrics/rag-metrics.service";
import { RetrievalDiagnosticsController } from "./retrieval-diagnostics.controller";
import { ConfidenceController } from "./confidence.controller";

@Module({
controllers: [
Expand All @@ -18,6 +19,7 @@ import { RetrievalDiagnosticsController } from "./retrieval-diagnostics.controll
RagMetricsController,
RagMemoryController,
RetrievalDiagnosticsController,
ConfidenceController,
MemoryController,
],
providers: [RagService, RagIndexService, RagMetricsService, RagMemoryService, MemoryService],
Expand Down
86 changes: 86 additions & 0 deletions apps/api/src/retrieval-reliability.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import {
createCorpusDriftReport,
createPromptPolicyDiffReport,
RetrievalDiagnosticsEngine,
ConfidenceCalibrationEngine,
ConfidencePolicyEngine,
} from "./retrieval-reliability";

describe("retrieval reliability", () => {
Expand Down Expand Up @@ -292,6 +294,90 @@ describe("retrieval reliability", () => {
expect(confidence.confidenceReasoning.join(" ")).toContain("FLAG missing-citations");
});

it("builds calibrated confidence breakdown with policy action", () => {
  const phaseSixChunkId = "doc:section-1:chunk-1";
  const phaseFiveChunkId = "doc:section-2:chunk-1";

  // Factory rather than a shared array: each consumer gets its own fresh
  // objects, exactly as with the two separate literals this replaces.
  const makeChunks = () => [
    {
      chunkId: phaseSixChunkId,
      documentId: "doc",
      sectionId: "section-1",
      score: 0.83,
      text: "Phase 6 auth uses JWT with signed tokens and expiry metadata.",
    },
    {
      chunkId: phaseFiveChunkId,
      documentId: "doc",
      sectionId: "section-2",
      score: 0.77,
      text: "Phase 5 auth used session tokens with server-side state.",
    },
  ];

  // Eval values common to the diagnostics builder and the calibrator.
  const groundedness = 0.88;
  const answerOverlap = 0.82;

  const diagnostics = buildRetrievalDiagnostics({
    results: makeChunks(),
    citations: [{ chunkId: phaseSixChunkId }, { chunkId: phaseFiveChunkId }],
    evals: { groundedness, answerOverlap },
    retrievalMode: "hybrid",
    rerankingApplied: true,
  });

  const engine = new ConfidenceCalibrationEngine();
  const calibrated = engine.calibrate({
    query: "Compare auth changes between phase 5 and phase 6",
    diagnostics,
    chunks: makeChunks(),
    // Reranking swapped the two chunks.
    rerankTrace: [
      { chunkId: phaseSixChunkId, beforeRank: 2, afterRank: 1 },
      { chunkId: phaseFiveChunkId, beforeRank: 1, afterRank: 2 },
    ],
    evals: {
      groundedness,
      answerOverlap,
      scorerResults: {
        faithfulness: { score: 0.86 },
        relevance: { score: 0.82 },
      },
    },
  });

  const { breakdown, recommendedAction } = calibrated;
  expect(breakdown.retrievalConfidence).toBeGreaterThan(0.6);
  expect(breakdown.evidenceCoverage).toBeGreaterThanOrEqual(0.5);
  // NOTE(review): the pattern is unanchored and omits "run_contradiction_check",
  // which the policy engine can return elsewhere in this suite — confirm intended.
  expect(recommendedAction).toMatch(
    /answer_normally|answer_with_uncertainty|run_additional_retrieval|request_clarification/
  );
});

it("selects contradiction check when contradiction risk is high", () => {
  // Every query-risk flag is off, so they cannot be what drives the action.
  const queryRisk = {
    ambiguous: false,
    critical: false,
    multiHop: false,
    normative: false,
    precisionRequired: false,
  };
  const policyEngine = new ConfidencePolicyEngine();

  const action = policyEngine.selectAction({
    score: 0.58,
    contradictionRisk: 0.8,
    evidenceCoverage: 0.7,
    sourceDiversity: 0.6,
    queryRisk,
  });

  expect(action).toBe("run_contradiction_check");
});

it("builds formal retrieval failure analysis for semantic drift", () => {
const diagnostics = buildRetrievalDiagnostics({
results: [
Expand Down
Loading
Loading