Skip to content

Commit b8c8bb8

Browse files
committed
vllm_queue_status
Signed-off-by: Huamin Li <[email protected]>
1 parent a287d59 commit b8c8bb8

File tree

5 files changed

+583
-0
lines changed

5 files changed

+583
-0
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"params": {
3+
"startTime": "DateTime64(3)",
4+
"stopTime": "DateTime64(3)"
5+
},
6+
"tests": [
7+
{
8+
"startTime": "2025-10-17T00:00:00.000",
9+
"stopTime": "2025-10-18T00:00:00.000"
10+
}
11+
]
12+
}
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
/* Windowed per-build table (UTC), incl. PR & main builds, with per-queue run hours,
   P90 queue wait, fixed-rate cost, and is_main_branch. Dialect: ClickHouse.

   Parameters (bound as strings, parsed with parseDateTime64BestEffort):
     startTime  window start, inclusive
     stopTime   window end, exclusive

   WAIT: only attempts with started_at IS NOT NULL contribute wait (runnable -> started).
   RUN:  clip to [w_start, w_end]; 1-day zombie guard for open 'running' attempts.
   COST: 1.3232 * gpu_1_queue_run_hours + 4.602 * gpu_4_queue_run_hours (fixed).
*/

WITH
    parseDateTime64BestEffort({startTime:String}, 3) AS w_start,   -- inclusive (UTC)
    parseDateTime64BestEffort({stopTime:String}, 3)  AS w_end,     -- exclusive (UTC)
    (w_end - INTERVAL 1 DAY)                         AS zombie_cutoff,
    toDateTime64('2100-01-01 00:00:00', 3)           AS FAR_FUTURE,
    ['gpu_1_queue','gpu_4_queue']                    AS QUEUES

/* 1) All builds created within the window (+ branch/PR context).
      Build-level attributes are taken from the most recently created job row (argMax by job.created_at).
      NOTE(review): the window filter is applied in HAVING, i.e. after grouping every build in the table;
      a WHERE prefilter would be cheaper but is only equivalent if build.created_at never varies
      across a build's rows - confirm before changing. */
, builds_window AS (
    SELECT
        tupleElement(build,'id') AS build_id,

        argMax(tupleElement(build,'number'),  tupleElement(job,'created_at')) AS build_number,
        argMax(tupleElement(build,'web_url'), tupleElement(job,'created_at')) AS build_url,
        concat(argMax(tupleElement(build,'web_url'), tupleElement(job,'created_at')), '/steps/table') AS steps_table_url,
        argMax(tupleElement(build,'commit'),  tupleElement(job,'created_at')) AS commit_sha,

        /* robust start/finish (fallback to job min/max if build-level fields are NULL) */
        coalesce(argMax(tupleElement(build,'started_at'), tupleElement(job,'created_at')),
                 min(tupleElement(job,'started_at'))) AS robust_start,
        coalesce(argMax(tupleElement(build,'finished_at'), tupleElement(job,'created_at')),
                 max(tupleElement(job,'finished_at'))) AS robust_finish,

        countDistinct(tupleElement(job,'id')) AS steps_count,
        argMax(tupleElement(build,'state'), tupleElement(job,'created_at')) AS latest_build_state,

        /* repo + PR mapping (repo_slug may come from pipeline or PR repo) */
        coalesce(
            nullIf(extract(argMax(tupleElement(pipeline,'repository'), tupleElement(job,'created_at')), 'github\\.com[:/]+([^/]+/[^/.]+)'), ''),
            nullIf(extract(argMax(tupleElement(build,'pull_request').repository, tupleElement(job,'created_at')), 'github\\.com[:/]+([^/]+/[^/.]+)'), ''),
            nullIf(extract(argMax(tupleElement(build,'pull_request').repository, tupleElement(job,'created_at')), '([^/]+/[^/.]+)'), '')
        ) AS repo_slug,
        coalesce(
            toInt64OrNull(argMax(tupleElement(build,'pull_request').id, tupleElement(job,'created_at'))),
            toInt64OrNull(extract(argMax(tupleElement(build,'branch'), tupleElement(job,'created_at')), 'pull/([0-9]+)'))
        ) AS pr_number,

        argMax(tupleElement(build,'created_at'), tupleElement(job,'created_at')) AS build_created_at_utc,
        argMax(tupleElement(build,'branch'),     tupleElement(job,'created_at')) AS branch_name
    FROM vllm.vllm_buildkite_jobs
    GROUP BY tupleElement(build,'id')
    HAVING build_created_at_utc >= w_start AND build_created_at_utc < w_end
)

/* 2) Agent-run attempts for those builds that can overlap the window.
      The three time conditions are OR-ed, so this is only a loose prefilter;
      exact clipping to the window happens in runs_scored / waits_scored. */
, base_agent AS (
    SELECT
        tupleElement(build,'id')        AS build_id,
        tupleElement(job,'id')          AS job_id,
        tupleElement(job,'created_at')  AS created_at,
        tupleElement(job,'state')       AS state,
        tupleElement(job,'runnable_at') AS runnable_at,
        tupleElement(job,'started_at')  AS started_at,
        tupleElement(job,'finished_at') AS finished_at,
        /* queue name = first agent query rule of the form 'queue=<name>', prefix stripped */
        replaceOne(arrayFirst(x -> startsWith(x,'queue='),
                              tupleElement(job,'agent_query_rules')), 'queue=', '') AS queue_key
    FROM vllm.vllm_buildkite_jobs
    INNER JOIN builds_window b ON tupleElement(build,'id') = b.build_id
    WHERE tupleElement(job,'type') IN ('script','command')
      AND (
            tupleElement(job,'runnable_at') < w_end OR
            tupleElement(job,'started_at')  < w_end OR
            ifNull(tupleElement(job,'finished_at'), FAR_FUTURE) >= w_start
          )
)

/* 3) Collapse to (build_id, job_id) and collect attempts keyed by queue */
, jobs_by_build AS (
    SELECT
        build_id,
        job_id,
        argMax(state, created_at) AS latest_state,
        max(created_at)           AS last_event_at,

        /* RUN attempts: (queue, start, finish) - keep only attempts that have a start */
        arrayDistinct(arrayFilter(t -> t.2 IS NOT NULL,
            groupArray((queue_key, started_at, finished_at))
        )) AS run_triplets,

        /* WAIT attempts: (queue, runnable, start) - ONLY attempts that actually started */
        arrayDistinct(arrayFilter(t -> t.2 IS NOT NULL AND t.3 IS NOT NULL,
            groupArray((queue_key, runnable_at, started_at))
        )) AS wait_triplets
    FROM base_agent
    GROUP BY build_id, job_id
)

/* 4) RUN attempts -> per build x queue (clip to window; zombie guard for open runs).
      An attempt without finished_at ends at w_end, unless the job is still 'running' and its
      last row is older than zombie_cutoff, in which case it ends 1 minute after that row. */
, runs_scored AS (
    SELECT
        build_id,
        tupleElement(rt, 1) AS queue_key,
        greatest(tupleElement(rt, 2), w_start) AS s_clip,
        least(
            ifNull(
                tupleElement(rt, 3),
                if(latest_state = 'running' AND last_event_at < zombie_cutoff,
                   least(last_event_at + INTERVAL 1 MINUTE, w_end),
                   w_end)
            ),
            w_end
        ) AS e_clip
    FROM jobs_by_build
    ARRAY JOIN run_triplets AS rt
    WHERE tupleElement(rt, 1) IN QUEUES
)
, run_by_build AS (
    SELECT
        build_id, queue_key,
        /* attempts clipped to an empty or negative interval contribute nothing */
        sumIf(dateDiff('second', s_clip, e_clip), e_clip > s_clip) AS total_run_s
    FROM runs_scored
    GROUP BY build_id, queue_key
)

/* 5) WAIT attempts (runnable -> started) -> per build x queue (clip to window) */
, waits_scored AS (
    SELECT
        build_id,
        tupleElement(wt, 1) AS queue_key,
        greatest(tupleElement(wt, 2), w_start) AS ra_clip,
        least(tupleElement(wt, 3), w_end)      AS st_clip,
        greatest(0, dateDiff('second', greatest(tupleElement(wt, 2), w_start), least(tupleElement(wt, 3), w_end))) AS wait_s
    FROM jobs_by_build
    ARRAY JOIN wait_triplets AS wt
    WHERE tupleElement(wt, 1) IN QUEUES
)
, waits_p90_pivot AS (
    SELECT
        build_id,
        /* P90 per queue (approximate quantile). quantileIf yields NaN when a build has no
           positive wait on that queue; the final SELECT maps NaN to 0. */
        quantileIf(0.9)(toFloat64(wait_s), queue_key = 'gpu_1_queue') AS gpu1_p90_s,
        quantileIf(0.9)(toFloat64(wait_s), queue_key = 'gpu_4_queue') AS gpu4_p90_s,
        /* Combined P90 across both queues */
        quantile(0.9)(toFloat64(wait_s)) AS p90_combined_s
    FROM waits_scored
    WHERE wait_s > 0
    GROUP BY build_id
)

/* 6) Pivot per-build run totals (seconds) to one hour column per queue */
, run_totals_by_build AS (
    SELECT
        build_id,
        round(ifNull(sumIf(total_run_s, queue_key = 'gpu_1_queue'), 0) / 3600.0, 2) AS gpu_1_queue_run_hours,
        round(ifNull(sumIf(total_run_s, queue_key = 'gpu_4_queue'), 0) / 3600.0, 2) AS gpu_4_queue_run_hours
    FROM run_by_build
    GROUP BY build_id
)

/* 7) Final table (UTC) - includes both PR and main builds */
SELECT
    /* PR URL (NULL for non-PR builds) */
    if((pr_number IS NULL) OR (repo_slug IS NULL),
       NULL,
       concat('https://github.com/', repo_slug, '/pull/', toString(pr_number))
    ) AS pr_url,

    build_number,
    build_id,
    build_url,
    steps_table_url,
    commit_sha,

    robust_start  AS build_started_at,
    robust_finish AS build_finished_at,

    /* duration (hours) = finish - start (UTC) */
    multiIf(
        robust_start IS NULL OR robust_finish IS NULL,
        NULL,
        round(dateDiff('second', robust_start, robust_finish) / 3600.0, 2)
    ) AS duration_hours,

    steps_count,
    latest_build_state,

    /* Keep run hours for cost */
    ifNull(rt.gpu_1_queue_run_hours, 0) AS gpu_1_queue_run_hours,
    ifNull(rt.gpu_4_queue_run_hours, 0) AS gpu_4_queue_run_hours,

    /* P90 wait hours (by queue + combined).
       Per-queue values can be NaN (no matching waits) or NULL/absent (no row in wp); both become 0. */
    round(ifNull(if(isNaN(wp.gpu1_p90_s), 0, wp.gpu1_p90_s), 0) / 3600.0, 2) AS gpu_1_queue_wait_p90_hours,
    round(ifNull(if(isNaN(wp.gpu4_p90_s), 0, wp.gpu4_p90_s), 0) / 3600.0, 2) AS gpu_4_queue_wait_p90_hours,
    round(ifNull(wp.p90_combined_s, 0) / 3600.0, 2)                          AS wait_p90_hours,

    /* Fixed-rate cost: per-hour rates for gpu_1_queue and gpu_4_queue */
    round(
        1.3232 * ifNull(rt.gpu_1_queue_run_hours, 0) +
        4.602  * ifNull(rt.gpu_4_queue_run_hours, 0),
        2
    ) AS cost,

    /* Mark if the build branch is literally 'main' */
    toUInt8(branch_name = 'main') AS is_main_branch

FROM builds_window AS b
LEFT JOIN run_totals_by_build AS rt ON rt.build_id = b.build_id
LEFT JOIN waits_p90_pivot     AS wp ON wp.build_id = b.build_id
ORDER BY build_created_at_utc ASC;
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
import { Box, Stack, Switch, Tooltip, Typography } from "@mui/material";
2+
import ReactECharts from "echarts-for-react";
3+
import { useCallback, useMemo, useState } from "react";
4+
5+
/**
 * Extracts the pipeline slug from a Buildkite build URL.
 *
 * For "https://buildkite.com/vllm/ci/builds/35333" the non-empty path segments are
 * ['vllm', 'ci', 'builds', '35333'] and the second one ("ci") is returned.
 * If the string cannot be parsed by `URL`, a regex looks for the segment that
 * follows "buildkite.com/<org>/" instead.
 *
 * @param url Build URL, or null.
 * @returns The pipeline slug, or "unknown" when it cannot be determined.
 */
function pipelineFromUrl(url: string | null): string {
  if (!url) {
    return "unknown";
  }

  let segments: string[];
  try {
    segments = new URL(url).pathname
      .split("/")
      .filter((segment) => segment.length > 0);
  } catch {
    // Not an absolute URL: fall back to pattern matching on the raw string.
    const fallback = /buildkite\.com\/[^/]+\/([^/]+)/i.exec(url);
    return fallback?.[1] ?? "unknown";
  }

  return segments[1] || "unknown";
}
18+
19+
/**
 * One row of the windowed per-build queue query result (one row per build).
 * Timestamps are UTC strings; durations and waits are in hours.
 */
type Row = {
  pr_url: string | null; // null for non-PR builds
  build_number: number;
  build_id: string;
  build_url: string;
  steps_table_url: string;
  commit_sha: string;
  build_started_at: string | null; // UTC
  build_finished_at: string | null; // UTC
  duration_hours: number | null; // null when start or finish is unknown
  steps_count: number;
  latest_build_state: string;
  // The windowed per-build query's final SELECT does not emit the two
  // *_wait_hours totals (only the P90 columns below), so they are optional.
  gpu_1_queue_wait_hours?: number;
  gpu_1_queue_run_hours: number;
  gpu_4_queue_wait_hours?: number;
  gpu_4_queue_run_hours: number;
  wait_p90_hours: number; // P90 across both queues
  gpu_1_queue_wait_p90_hours: number;
  gpu_4_queue_wait_p90_hours: number;
  cost: number; // dollars
  is_main_branch: number; // 0/1
};
41+
42+
/**
 * Scatter chart of P90 queue wait (hours) per build over build start time,
 * with one series per Buildkite pipeline and a "Main only" toggle.
 *
 * - Rows without `build_started_at` are dropped (the time axis needs an x value).
 * - Clicking a point opens the build page in a new tab.
 * - The tooltip shows build/PR links and per-queue P90 waits.
 *
 * @param data Query rows, or undefined while loading (treated as empty).
 */
export default function QueueWaitPerBuildPanel({
  data,
}: {
  data: Row[] | undefined;
}) {
  const [mainOnly, setMainOnly] = useState(true);

  // Filter & sort; drop rows without a start time (time axis needs x).
  // `filter` returns a fresh array, so the in-place sort never mutates `data`.
  const rows = useMemo(() => {
    const started = (data ?? []).filter(
      (x) => (mainOnly ? x.is_main_branch === 1 : true) && !!x.build_started_at
    );
    // build_started_at is guaranteed non-empty here; ties (or unparsable
    // dates, which yield NaN) fall back to build number order.
    return started.sort(
      (a, b) =>
        new Date(a.build_started_at as string).getTime() -
          new Date(b.build_started_at as string).getTime() ||
        a.build_number - b.build_number
    );
  }, [data, mainOnly]);

  // Group rows by pipeline to build one series per pipeline
  const grouped = useMemo(() => {
    const g = new Map<string, Row[]>();
    for (const r of rows) {
      const p = pipelineFromUrl(r.build_url);
      if (!g.has(p)) g.set(p, []);
      g.get(p)!.push(r);
    }
    return g;
  }, [rows]);

  // Click → open Buildkite build page
  const onPointClick = useCallback((e: { dataIndex?: number; data?: any }) => {
    // We attach build_url to each data item; prefer that if present
    const url = e?.data?.build_url;
    if (url) window.open(url, "_blank", "noopener,noreferrer");
  }, []);

  const option = useMemo(() => {
    const series: any[] = [];
    for (const [pipeline, arr] of grouped.entries()) {
      series.push({
        name: pipeline,
        type: "scatter",
        symbolSize: 6,
        cursor: "pointer",
        data: arr.map((r) => ({
          // NOTE(review): build_started_at is passed through as the raw string;
          // if it carries no timezone suffix it may be parsed as local time
          // while the axis is labelled UTC — confirm the wire format.
          value: [r.build_started_at, r.wait_p90_hours ?? 0],
          build_url: r.build_url,
          row: r,
        })),
      });
    }

    return {
      tooltip: {
        trigger: "item",
        confine: true,
        // The tooltip contains <a> links; without `enterable` the pointer
        // cannot move into the tooltip, so the links could never be clicked.
        enterable: true,
        formatter: (p: any) => {
          const r = (p?.data?.row ?? {}) as Partial<Row>;
          const buildLink = r.build_url
            ? `<a href="${r.build_url}" target="_blank" rel="noreferrer">#${r.build_number}</a>`
            : `#${r.build_number}`;
          // PR link text is shortened to "<owner>/<repo>#<number>"
          const prLine = r.pr_url
            ? `<div>PR: <a href="${
                r.pr_url
              }" target="_blank" rel="noreferrer">${r.pr_url
                .replace("https://github.com/", "")
                .replace("/pull/", "#")}</a></div>`
            : "";
          const w1 = Number(r?.gpu_1_queue_wait_p90_hours ?? 0);
          const w4 = Number(r?.gpu_4_queue_wait_p90_hours ?? 0);
          // Fall back to the larger per-queue value if the combined P90 is missing
          const waitP90 = Number(r?.wait_p90_hours ?? Math.max(w1, w4));
          return `
            <div>
              <div><b>${r?.build_started_at ?? ""}</b></div>
              <div>Pipeline: <b>${pipelineFromUrl(
                r?.build_url ?? null
              )}</b></div>
              <div>Build: ${buildLink}</div>
              ${prLine}
              <div>P90 wait GPU1: ${w1.toFixed(2)} h</div>
              <div>P90 wait GPU4: ${w4.toFixed(2)} h</div>
              <div>P90 wait (combined): ${waitP90.toFixed(2)} h</div>
              <div>Branch: ${r?.is_main_branch ? "main" : "PR/other"}</div>
            </div>
          `;
        },
      },
      legend: { top: 0 },
      grid: { left: 40, right: 50, bottom: 40, top: 40 },
      xAxis: { type: "time", name: "Build start (UTC)" },
      yAxis: [{ type: "value", name: "Wait (h)" }],
      series,
    };
  }, [grouped]);

  return (
    <Box sx={{ height: "100%", display: "flex", flexDirection: "column" }}>
      <Stack
        direction="row"
        spacing={2}
        alignItems="center"
        sx={{ px: 2, pt: 1 }}
      >
        <Typography variant="h6" sx={{ fontWeight: "bold" }}>
          Queue Wait (per build)
        </Typography>
        <Tooltip title="Show only builds on branch 'main'">
          <Stack direction="row" spacing={1} alignItems="center">
            <Typography variant="body2">Main only</Typography>
            <Switch
              size="small"
              checked={mainOnly}
              onChange={() => setMainOnly((s) => !s)}
            />
          </Stack>
        </Tooltip>
      </Stack>
      <Box sx={{ flex: 1, minHeight: 240 }}>
        <ReactECharts
          style={{ height: "100%" }}
          option={option}
          notMerge
          onEvents={{ click: onPointClick }}
        />
      </Box>
    </Box>
  );
}

0 commit comments

Comments
 (0)