Skip to content

Commit 772cd3a

Browse files
matteo8p and Matthew Wang authored
Polish evals UI (#602)
Co-authored-by: Matthew Wang <[email protected]>
1 parent 70fe208 commit 772cd3a

File tree

9 files changed

+872
-222
lines changed

9 files changed

+872
-222
lines changed

client/src/components/EvalsTab.tsx

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -116,12 +116,6 @@ export function EvalsTab() {
116116
<div className="p-6 space-y-6">
117117
<div className="flex items-center justify-between">
118118
<h1 className="text-2xl font-bold">Evals</h1>
119-
{overviewData?.metadata && (
120-
<div className="text-sm text-muted-foreground">
121-
{overviewData.metadata.iterationsPassed} passed ·{" "}
122-
{overviewData.metadata.iterationsFailed} failed
123-
</div>
124-
)}
125119
</div>
126120

127121
<Tabs defaultValue="results" className="w-full">

client/src/components/evals/SuiteRow.tsx

Lines changed: 81 additions & 65 deletions
Original file line number | Diff line number | Diff line change
@@ -1,56 +1,34 @@
11
import { useMemo } from "react";
22
import { useAuth } from "@workos-inc/authkit-react";
33
import { useConvexAuth, useQuery } from "convex/react";
4-
import { formatTime, aggregateSuite } from "./helpers";
4+
import { aggregateSuite } from "./helpers";
55
import type { EvalSuite, EvalCase, EvalIteration } from "./types";
66

77
interface SuiteRowProps {
88
suite: EvalSuite;
99
onSelectSuite: (id: string) => void;
1010
}
1111

12-
interface SuiteStatusBadgesProps {
13-
passed: number;
14-
failed: number;
15-
cancelled: number;
16-
pending: number;
17-
}
12+
function formatCompactStatus(
13+
passed: number,
14+
failed: number,
15+
cancelled: number,
16+
pending: number,
17+
): string {
18+
const parts: string[] = [];
1819

19-
function SuiteStatusBadges({
20-
passed,
21-
failed,
22-
cancelled,
23-
pending,
24-
}: SuiteStatusBadgesProps) {
25-
return (
26-
<div className="flex items-center gap-2 text-xs">
27-
{passed > 0 && (
28-
<span className="inline-flex items-center rounded-full bg-green-100 px-2 py-0.5 text-green-700">
29-
{passed} passed
30-
</span>
31-
)}
32-
{failed > 0 && (
33-
<span className="inline-flex items-center rounded-full bg-red-100 px-2 py-0.5 text-red-700">
34-
{failed} failed
35-
</span>
36-
)}
37-
{cancelled > 0 && (
38-
<span className="inline-flex items-center rounded-full bg-gray-100 px-2 py-0.5 text-gray-700">
39-
{cancelled} cancelled
40-
</span>
41-
)}
42-
{pending > 0 && (
43-
<span className="inline-flex items-center rounded-full bg-yellow-100 px-2 py-0.5 text-yellow-700">
44-
{pending} pending
45-
</span>
46-
)}
47-
</div>
48-
);
20+
if (passed > 0) parts.push(`${passed} passed`);
21+
if (failed > 0) parts.push(`${failed} failed`);
22+
if (cancelled > 0) parts.push(`${cancelled} cancelled`);
23+
if (pending > 0) parts.push(`${pending} pending`);
24+
25+
return parts.join(" · ") || "No results";
4926
}
5027

5128
export function SuiteRow({ suite, onSelectSuite }: SuiteRowProps) {
5229
const { isAuthenticated } = useConvexAuth();
5330
const { user } = useAuth();
31+
const servers = suite.config?.environment.servers;
5432

5533
const enableQuery = isAuthenticated && !!user;
5634
const suiteDetails = useQuery(
@@ -73,40 +51,78 @@ export function SuiteRow({ suite, onSelectSuite }: SuiteRowProps) {
7351
? suite.config.tests.length
7452
: 0;
7553

54+
const serverTags = useMemo(() => {
55+
if (!Array.isArray(servers)) return [] as string[];
56+
57+
const sanitized = servers
58+
.filter((server): server is string => typeof server === "string")
59+
.map((server) => server.trim())
60+
.filter(Boolean);
61+
62+
if (sanitized.length <= 2) {
63+
return sanitized;
64+
}
65+
66+
const remaining = sanitized.length - 2;
67+
return [...sanitized.slice(0, 2), `+${remaining} more`];
68+
}, [servers]);
69+
70+
const totalIterations = aggregate?.filteredIterations.length ?? 0;
71+
72+
const getBorderColor = () => {
73+
if (!aggregate) return "bg-zinc-300/50";
74+
75+
const { passed, failed, cancelled, pending } = aggregate.totals;
76+
const total = passed + failed + cancelled + pending;
77+
78+
if (total === 0) return "bg-zinc-300/50";
79+
80+
const completedTotal = passed + failed;
81+
if (completedTotal === 0) return "bg-zinc-300/50";
82+
83+
const failureRate = (failed / completedTotal) * 100;
84+
85+
if (failureRate === 0) return "bg-emerald-500/50";
86+
if (failureRate <= 30) return "bg-amber-500/50";
87+
return "bg-red-500/50";
88+
};
89+
7690
return (
7791
<button
7892
onClick={() => onSelectSuite(suite._id)}
79-
className="grid w-full grid-cols-[minmax(0,1fr)_minmax(0,1.5fr)_minmax(0,0.8fr)] items-center gap-3 px-4 py-3 text-left transition-colors hover:bg-muted focus:outline-none focus-visible:ring-2 focus-visible:ring-primary/60"
93+
className="group relative flex w-full items-center gap-4 py-3 pl-4 pr-4 text-left transition-colors hover:bg-muted/50 focus:outline-none focus-visible:ring-2 focus-visible:ring-primary/60 cursor-pointer"
8094
>
81-
<div>
82-
<div className="font-medium">
83-
{new Date(suite._creationTime || 0).toLocaleDateString("en-US", {
84-
month: "short",
85-
day: "numeric",
86-
year: "numeric",
87-
hour: "numeric",
88-
minute: "2-digit",
89-
hour12: true,
90-
})}
95+
<div className={`absolute left-0 top-0 h-full w-1 ${getBorderColor()}`} />
96+
<div className="grid min-w-0 flex-1 grid-cols-[minmax(0,1fr)_minmax(0,1fr)_minmax(0,1fr)] items-center gap-4">
97+
<div className="min-w-0">
98+
<div className="text-sm font-medium text-foreground">
99+
{new Date(suite._creationTime || 0).toLocaleDateString("en-US", {
100+
month: "short",
101+
day: "numeric",
102+
year: "numeric",
103+
hour: "numeric",
104+
minute: "2-digit",
105+
hour12: true,
106+
})}
107+
</div>
108+
<div className="text-xs text-muted-foreground">
109+
{serverTags.length > 0 ? serverTags.join(", ") : "No servers"}
110+
</div>
91111
</div>
92-
<div className="text-xs text-muted-foreground">
93-
{testCount} test{testCount !== 1 ? "s" : ""}
112+
<div className="text-sm text-muted-foreground">
113+
{testCount} test{testCount !== 1 ? "s" : ""} · {totalIterations}{" "}
114+
iteration{totalIterations !== 1 ? "s" : ""}
115+
</div>
116+
<div className="text-sm text-muted-foreground">
117+
{aggregate
118+
? formatCompactStatus(
119+
aggregate.totals.passed,
120+
aggregate.totals.failed,
121+
aggregate.totals.cancelled,
122+
aggregate.totals.pending,
123+
)
124+
: "Loading..."}
94125
</div>
95-
</div>
96-
<div>
97-
{aggregate ? (
98-
<SuiteStatusBadges
99-
passed={aggregate.totals.passed}
100-
failed={aggregate.totals.failed}
101-
cancelled={aggregate.totals.cancelled}
102-
pending={aggregate.totals.pending}
103-
/>
104-
) : (
105-
<span className="text-xs text-muted-foreground">Loading...</span>
106-
)}
107-
</div>
108-
<div className="text-sm text-muted-foreground">
109-
{formatTime(suite._creationTime)}
110126
</div>
111127
</button>
112128
);

client/src/components/evals/iteration-details.tsx

Lines changed: 23 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -49,23 +49,10 @@ export function IterationDetails({
4949
}, [iteration.blob, getBlob]);
5050

5151
return (
52-
<div className="space-y-3 rounded-lg border border-border bg-background p-4 shadow-sm">
53-
<div className="flex flex-wrap items-center gap-2 text-sm">
54-
<span className="font-semibold">Status</span>
55-
<Badge className="capitalize">{iteration.status}</Badge>
56-
<span className="mx-1 text-muted-foreground">·</span>
57-
<span className="font-semibold">Result</span>
58-
<Badge className="capitalize">{iteration.result}</Badge>
59-
</div>
60-
<div className="grid gap-2 text-sm text-muted-foreground sm:grid-cols-2">
61-
<div>Started {formatTime(iteration.startedAt)}</div>
62-
<div>Updated {formatTime(iteration.updatedAt)}</div>
63-
<div>Tokens {Number(iteration.tokensUsed || 0).toLocaleString()}</div>
64-
<div>Tool calls {iteration.actualToolCalls.length}</div>
65-
</div>
52+
<div className="space-y-3 py-2">
6653
{(testCase?.expectedToolCalls.length || 0) > 0 && (
67-
<div className="space-y-1">
68-
<div className="text-sm font-semibold">Expected tools:</div>
54+
<div className="space-y-1.5">
55+
<div className="text-xs font-semibold">Expected tools</div>
6956
<div className="flex flex-wrap gap-1.5">
7057
{testCase?.expectedToolCalls.map((tool, idx) => (
7158
<Badge key={idx} variant="outline" className="font-mono text-xs">
@@ -75,17 +62,16 @@ export function IterationDetails({
7562
</div>
7663
</div>
7764
)}
65+
7866
{iteration.actualToolCalls.length > 0 && (
79-
<div className="space-y-1">
80-
<div className="text-sm font-semibold">Actual tools called:</div>
67+
<div className="space-y-1.5">
68+
<div className="text-xs font-semibold">Actual tools called</div>
8169
<div className="flex flex-wrap gap-1.5">
8270
{iteration.actualToolCalls.map((tool, idx) => {
83-
const isExpected = testCase?.expectedToolCalls.includes(tool);
84-
const isMissing = false; // actual tools can't be missing
8571
return (
8672
<Badge
8773
key={idx}
88-
variant={isExpected ? "default" : "destructive"}
74+
variant="outline"
8975
className="font-mono text-xs"
9076
>
9177
{tool}
@@ -95,21 +81,23 @@ export function IterationDetails({
9581
</div>
9682
</div>
9783
)}
98-
<div className="rounded-md border bg-muted/40 p-3">
99-
{loading ? (
100-
<div className="text-sm text-muted-foreground">Loading blob…</div>
101-
) : error ? (
102-
<div className="text-sm text-red-600">{error}</div>
103-
) : iteration.blob ? (
104-
<pre className="max-h-[360px] overflow-auto whitespace-pre-wrap break-words text-xs">
105-
{JSON.stringify(blob, null, 2)}
106-
</pre>
107-
) : (
108-
<div className="text-sm text-muted-foreground">
109-
No blob attached to this iteration.
84+
85+
{iteration.blob && (
86+
<div className="space-y-1.5">
87+
<div className="text-xs font-semibold">Trace</div>
88+
<div className="rounded-md bg-muted/20 p-3">
89+
{loading ? (
90+
<div className="text-xs text-muted-foreground">Loading trace</div>
91+
) : error ? (
92+
<div className="text-xs text-red-600">{error}</div>
93+
) : (
94+
<pre className="max-h-[360px] overflow-auto whitespace-pre-wrap break-words text-xs">
95+
{JSON.stringify(blob, null, 2)}
96+
</pre>
97+
)}
11098
</div>
111-
)}
112-
</div>
99+
</div>
100+
)}
113101
</div>
114102
);
115103
}

0 commit comments

Comments
 (0)