Skip to content

Commit b9e525b

Browse files
committed
vllm_queue_status
Signed-off-by: Huamin Li <[email protected]>
1 parent a287d59 commit b9e525b

File tree

5 files changed

+575
-0
lines changed

5 files changed

+575
-0
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"params": {
3+
"startTime": "DateTime64(3)",
4+
"stopTime": "DateTime64(3)"
5+
},
6+
"tests": [
7+
{
8+
"startTime": "2025-10-17T00:00:00.000",
9+
"stopTime": "2025-10-18T00:00:00.000"
10+
}
11+
]
12+
}
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
/* Windowed per-build table (UTC), incl. PR & main builds, with queue totals, cost, and is_main_branch.
   One output row per build whose build.created_at falls in [w_start, w_end).

   WAIT: only attempts with started_at IS NOT NULL contribute wait (runnable → started).
   RUN:  clip to [w_start, w_end]; 1-day zombie guard for open 'running' attempts.
   COST: 1.3232 * gpu_1_queue_run_hours + 4.602 * gpu_4_queue_run_hours (fixed),
         computed from the already-rounded (2 decimals) hour columns.

   Parameters:
     startTime  window start, inclusive; parsed with parseDateTime64BestEffort
     stopTime   window end, exclusive; parsed with parseDateTime64BestEffort
*/

WITH
    parseDateTime64BestEffort({startTime:String}, 3) AS w_start,        -- inclusive (UTC)
    parseDateTime64BestEffort({stopTime:String}, 3)  AS w_end,          -- exclusive (UTC)
    (w_end - INTERVAL 1 DAY)                         AS zombie_cutoff,  -- open runs last seen before this are treated as stale
    toDateTime64('2100-01-01 00:00:00', 3)           AS FAR_FUTURE,     -- stand-in for a NULL finished_at
    ['gpu_1_queue','gpu_4_queue']                    AS QUEUES          -- only these queues are scored

/* 1) All builds created within the window (+ branch/PR context).
      Build-level fields are taken from the row of the most recently created job (argMax by job.created_at). */
, builds_window AS (
    SELECT
        tupleElement(build,'id') AS build_id,

        argMax(tupleElement(build,'number'), tupleElement(job,'created_at')) AS build_number,
        argMax(tupleElement(build,'web_url'), tupleElement(job,'created_at')) AS build_url,
        concat(argMax(tupleElement(build,'web_url'), tupleElement(job,'created_at')), '/steps/table') AS steps_table_url,
        argMax(tupleElement(build,'commit'), tupleElement(job,'created_at')) AS commit_sha,

        /* robust start/finish (fallback to job min/max if build-level fields are NULL) */
        coalesce(argMax(tupleElement(build,'started_at'), tupleElement(job,'created_at')),
                 min(tupleElement(job,'started_at'))) AS robust_start,
        coalesce(argMax(tupleElement(build,'finished_at'), tupleElement(job,'created_at')),
                 max(tupleElement(job,'finished_at'))) AS robust_finish,

        countDistinct(tupleElement(job,'id')) AS steps_count,
        argMax(tupleElement(build,'state'), tupleElement(job,'created_at')) AS latest_build_state,

        /* repo + PR mapping (repo_slug may come from pipeline or PR repo) */
        coalesce(
            nullIf(extract(argMax(tupleElement(pipeline,'repository'), tupleElement(job,'created_at')), 'github\\.com[:/]+([^/]+/[^/.]+)'), ''),
            nullIf(extract(argMax(tupleElement(build,'pull_request').repository, tupleElement(job,'created_at')), 'github\\.com[:/]+([^/]+/[^/.]+)'), ''),
            nullIf(extract(argMax(tupleElement(build,'pull_request').repository, tupleElement(job,'created_at')), '([^/]+/[^/.]+)'), '')
        ) AS repo_slug,
        coalesce(
            toInt64OrNull(argMax(tupleElement(build,'pull_request').id, tupleElement(job,'created_at'))),
            toInt64OrNull(extract(argMax(tupleElement(build,'branch'), tupleElement(job,'created_at')), 'pull/([0-9]+)'))
        ) AS pr_number,

        argMax(tupleElement(build,'created_at'), tupleElement(job,'created_at')) AS build_created_at_utc,
        argMax(tupleElement(build,'branch'), tupleElement(job,'created_at')) AS branch_name
    FROM vllm.vllm_buildkite_jobs
    GROUP BY tupleElement(build,'id')
    /* The filter is on an aggregate, so it has to live in HAVING rather than WHERE. */
    HAVING build_created_at_utc >= w_start AND build_created_at_utc < w_end
)

/* 2) Agent-run attempts for those builds that can overlap the window.
      queue_key is the first 'queue=...' entry of agent_query_rules with the prefix stripped. */
, base_agent AS (
    SELECT
        tupleElement(build,'id')        AS build_id,
        tupleElement(job,'id')          AS job_id,
        tupleElement(job,'created_at')  AS created_at,
        tupleElement(job,'state')       AS state,
        tupleElement(job,'runnable_at') AS runnable_at,
        tupleElement(job,'started_at')  AS started_at,
        tupleElement(job,'finished_at') AS finished_at,
        replaceOne(arrayFirst(x -> startsWith(x,'queue='),
                              tupleElement(job,'agent_query_rules')), 'queue=', '') AS queue_key
    FROM vllm.vllm_buildkite_jobs
    INNER JOIN builds_window b ON tupleElement(build,'id') = b.build_id
    WHERE tupleElement(job,'type') IN ('script','command')
      /* NOTE(review): the three conditions are OR-ed, so a row passes when any single one holds;
         this is a loose pre-filter, the exact clipping happens in steps 4 and 5. */
      AND (
          tupleElement(job,'runnable_at') < w_end OR
          tupleElement(job,'started_at')  < w_end OR
          ifNull(tupleElement(job,'finished_at'), FAR_FUTURE) >= w_start
      )
)

/* 3) Collapse to (build_id, job_id) and collect attempts keyed by queue.
      NOTE(review): arrayDistinct only removes identical tuples. If the table can hold several
      snapshots of one job (e.g. finished_at NULL, then set), both tuples survive and the
      run would be counted twice. Confirm against the ingestion/dedup behaviour of the table. */
, jobs_by_build AS (
    SELECT
        build_id,
        job_id,
        argMax(state, created_at) AS latest_state,
        max(created_at)           AS last_event_at,

        /* RUN attempts: (queue, start, finish); attempts that never started are dropped */
        arrayDistinct(arrayFilter(t -> t.2 IS NOT NULL,
            groupArray((queue_key, started_at, finished_at))
        )) AS run_triplets,

        /* WAIT attempts: (queue, runnable, start) — ONLY attempts that actually started */
        arrayDistinct(arrayFilter(t -> t.2 IS NOT NULL AND t.3 IS NOT NULL,
            groupArray((queue_key, runnable_at, started_at))
        )) AS wait_triplets
    FROM base_agent
    GROUP BY build_id, job_id
)

/* 4) RUN attempts → per build × queue (clip to window; zombie guard for open runs) */
, runs_scored AS (
    SELECT
        build_id,
        tupleElement(rt, 1) AS queue_key,
        greatest(tupleElement(rt, 2), w_start) AS s_clip,
        /* End of the run, never later than w_end:
             - finished_at when present;
             - otherwise, for a job still 'running' whose last event is older than zombie_cutoff,
               last_event_at + 1 minute;
             - otherwise w_end. */
        least(
            ifNull(
                tupleElement(rt, 3),
                if(latest_state = 'running' AND last_event_at < zombie_cutoff,
                   least(last_event_at + INTERVAL 1 MINUTE, w_end),
                   w_end)
            ),
            w_end
        ) AS e_clip
    FROM jobs_by_build
    ARRAY JOIN run_triplets AS rt
    WHERE tupleElement(rt, 1) IN QUEUES
)
, run_by_build AS (
    SELECT
        build_id,
        queue_key,
        /* intervals that end at or before their clipped start contribute nothing */
        sumIf(dateDiff('second', s_clip, e_clip), e_clip > s_clip) AS total_run_s
    FROM runs_scored
    GROUP BY build_id, queue_key
)

/* 5) WAIT attempts (runnable → started) → per build × queue (clip to window) */
, waits_scored AS (
    SELECT
        build_id,
        tupleElement(wt, 1) AS queue_key,
        greatest(tupleElement(wt, 2), w_start) AS ra_clip,
        least(tupleElement(wt, 3), w_end)      AS st_clip
    FROM jobs_by_build
    ARRAY JOIN wait_triplets AS wt
    WHERE tupleElement(wt, 1) IN QUEUES
)
, wait_by_build AS (
    SELECT
        build_id,
        queue_key,
        sumIf(dateDiff('second', ra_clip, st_clip), st_clip > ra_clip) AS total_wait_s
    FROM waits_scored
    GROUP BY build_id, queue_key
)

/* 6) Pivot per-build totals to hour columns (seconds / 3600, rounded to 2 decimals) */
, totals_by_build AS (
    SELECT
        build_id,
        round(ifNull(sumIf(total_wait_s, queue_key='gpu_1_queue'), 0) / 3600.0, 2) AS gpu_1_queue_wait_hours,
        round(ifNull(sumIf(total_run_s,  queue_key='gpu_1_queue'), 0) / 3600.0, 2) AS gpu_1_queue_run_hours,
        round(ifNull(sumIf(total_wait_s, queue_key='gpu_4_queue'), 0) / 3600.0, 2) AS gpu_4_queue_wait_hours,
        round(ifNull(sumIf(total_run_s,  queue_key='gpu_4_queue'), 0) / 3600.0, 2) AS gpu_4_queue_run_hours
    FROM (
        SELECT build_id, queue_key, total_run_s, toInt64(0) AS total_wait_s FROM run_by_build
        UNION ALL
        SELECT build_id, queue_key, toInt64(0) AS total_run_s, total_wait_s FROM wait_by_build
    )
    GROUP BY build_id
)

/* 7) Final table (UTC) — includes both PR and main builds */
SELECT
    /* PR URL (NULL for non-PR builds) */
    if((pr_number IS NULL) OR (repo_slug IS NULL),
       NULL,
       concat('https://github.com/', repo_slug, '/pull/', toString(pr_number))
    ) AS pr_url,

    build_number,
    /* build_id exists on both join sides; qualify it so the result does not depend on
       how the engine resolves an ambiguous name. The alias keeps the output column name. */
    b.build_id AS build_id,
    build_url,
    steps_table_url,
    commit_sha,

    robust_start  AS build_started_at,
    robust_finish AS build_finished_at,

    /* duration (hours) = finish − start (UTC); NULL when either end is unknown */
    if(
        robust_start IS NULL OR robust_finish IS NULL,
        NULL,
        round(dateDiff('second', robust_start, robust_finish) / 3600.0, 2)
    ) AS duration_hours,

    steps_count,
    latest_build_state,

    /* builds without any scored attempt have no row in totals_by_build → report 0 */
    ifNull(t.gpu_1_queue_wait_hours, 0) AS gpu_1_queue_wait_hours,
    ifNull(t.gpu_1_queue_run_hours, 0)  AS gpu_1_queue_run_hours,
    ifNull(t.gpu_4_queue_wait_hours, 0) AS gpu_4_queue_wait_hours,
    ifNull(t.gpu_4_queue_run_hours, 0)  AS gpu_4_queue_run_hours,

    /* Fixed-rate cost */
    round(
        1.3232 * ifNull(t.gpu_1_queue_run_hours, 0) +
        4.602  * ifNull(t.gpu_4_queue_run_hours, 0),
        2
    ) AS cost,

    /* Mark if the build branch is literally 'main' */
    toUInt8(branch_name = 'main') AS is_main_branch

FROM builds_window AS b
LEFT JOIN totals_by_build AS t ON t.build_id = b.build_id
ORDER BY b.build_created_at_utc ASC;
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
import { Box, Stack, Switch, Tooltip, Typography } from "@mui/material";
2+
import ReactECharts from "echarts-for-react";
3+
import { useCallback, useMemo, useState } from "react";
4+
5+
/**
 * Extracts the pipeline slug from a Buildkite build URL.
 *
 * The slug is the second non-empty path segment, e.g.
 * `https://buildkite.com/vllm/ci/builds/35333` -> `"ci"`.
 *
 * @param url Build URL; may be null or empty.
 * @returns The pipeline slug, or `"unknown"` when the URL is missing or has no
 *          second path segment. If the string cannot be parsed as a URL, a
 *          `buildkite.com/<org>/<pipeline>` pattern match is tried instead.
 */
function pipelineFromUrl(url: string | null): string {
  const FALLBACK = "unknown";

  if (!url) {
    return FALLBACK;
  }

  let segments: string[];
  try {
    // "/vllm/ci/builds/35333" -> ["vllm", "ci", "builds", "35333"]
    segments = new URL(url).pathname.split("/").filter(Boolean);
  } catch {
    // Not an absolute URL (e.g. scheme missing): fall back to a pattern match.
    const match = url.match(/buildkite\.com\/[^/]+\/([^/]+)/i);
    return match?.[1] ?? FALLBACK;
  }

  return segments[1] || FALLBACK;
}
18+
19+
/**
 * One record per build as consumed by the panel. Field names match the column
 * aliases produced by the per-build queue query's final SELECT.
 */
type Row = {
  // --- identity & links ---
  pr_url: string | null; // null for builds without a PR mapping
  build_number: number;
  build_id: string;
  build_url: string;
  steps_table_url: string;
  commit_sha: string;

  // --- timing ---
  build_started_at: string | null; // UTC
  build_finished_at: string | null; // UTC
  duration_hours: number | null; // null when start or finish is unknown

  // --- build summary ---
  steps_count: number;
  latest_build_state: string;

  // --- per-queue totals, in hours ---
  gpu_1_queue_wait_hours: number;
  gpu_1_queue_run_hours: number;
  gpu_4_queue_wait_hours: number;
  gpu_4_queue_run_hours: number;

  cost: number; // dollars
  is_main_branch: number; // 0/1
};
38+
39+
/**
 * Scatter chart of total GPU queue wait (gpu_1_queue + gpu_4_queue, hours) per
 * build, plotted against build start time, with one series per Buildkite pipeline.
 *
 * - A "Main only" switch (on by default) restricts the chart to rows with
 *   `is_main_branch === 1`.
 * - Rows without `build_started_at` are dropped because the time axis needs an x value.
 * - Clicking a point opens the build page in a new tab.
 *
 * @param data Rows from the per-build queue query; `undefined` renders an empty chart.
 */
export default function QueueWaitPerBuildPanel({
  data,
}: {
  data: Row[] | undefined;
}) {
  const [mainOnly, setMainOnly] = useState(true);

  // Filter & sort; drop rows without a start time (time axis needs x)
  const rows = useMemo(() => {
    const r = (data ?? [])
      .filter((x) => (mainOnly ? x.is_main_branch === 1 : true))
      .filter((x) => !!x.build_started_at);
    // `r` is a fresh array produced by filter(), so sorting in place does not touch `data`.
    return r.sort((a, b) => {
      const ta = a.build_started_at
        ? new Date(a.build_started_at).getTime()
        : 0;
      const tb = b.build_started_at
        ? new Date(b.build_started_at).getTime()
        : 0;
      // Ties on start time are broken by build number.
      return ta - tb || a.build_number - b.build_number;
    });
  }, [data, mainOnly]);

  // Group rows by pipeline to build one series per pipeline
  const grouped = useMemo(() => {
    const g = new Map<string, Row[]>();
    for (const r of rows) {
      const p = pipelineFromUrl(r.build_url);
      if (!g.has(p)) g.set(p, []);
      g.get(p)!.push(r);
    }
    return g;
  }, [rows]);

  // Click → open Buildkite build page
  const onPointClick = useCallback(
    // The event type must declare `data`, which is the field the handler reads.
    (e: { dataIndex?: number; data?: { build_url?: string | null } }) => {
      // We attach build_url to each data item; prefer that if present
      const url = e?.data?.build_url;
      if (url) window.open(url, "_blank", "noopener,noreferrer");
    },
    []
  );

  const option = useMemo(() => {
    const series: any[] = [];
    for (const [pipeline, arr] of grouped.entries()) {
      series.push({
        name: pipeline,
        type: "scatter",
        symbolSize: 6,
        cursor: "pointer",
        data: arr.map((r) => ({
          value: [
            r.build_started_at,
            // Coerce with Number() exactly as the tooltip does, so a value that
            // arrives as a string is added numerically instead of concatenated.
            Number(r.gpu_1_queue_wait_hours ?? 0) +
              Number(r.gpu_4_queue_wait_hours ?? 0),
          ],
          // Extra fields carried on the data item for the click handler and tooltip.
          build_url: r.build_url,
          row: r,
        })),
      });
    }

    return {
      tooltip: {
        trigger: "item",
        confine: true,
        formatter: (p: any) => {
          const r: Row = p?.data?.row ?? {};
          const buildLink = r.build_url
            ? `<a href="${r.build_url}" target="_blank" rel="noreferrer">#${r.build_number}</a>`
            : `#${r.build_number}`;
          // PR link text is shortened to "<owner>/<repo>#<number>".
          const prLine = r.pr_url
            ? `<div>PR: <a href="${
                r.pr_url
              }" target="_blank" rel="noreferrer">${r.pr_url
                .replace("https://github.com/", "")
                .replace("/pull/", "#")}</a></div>`
            : "";
          const w1 = Number(r?.gpu_1_queue_wait_hours ?? 0);
          const w4 = Number(r?.gpu_4_queue_wait_hours ?? 0);
          const waitTotal = w1 + w4;
          return `
            <div>
              <div><b>${r?.build_started_at ?? ""}</b></div>
              <div>Pipeline: <b>${pipelineFromUrl(r?.build_url ?? null)}</b></div>
              <div>Build: ${buildLink}</div>
              ${prLine}
              <div>Wait GPU1: ${w1.toFixed(2)} h</div>
              <div>Wait GPU4: ${w4.toFixed(2)} h</div>
              <div>Wait (total): ${waitTotal.toFixed(2)} h</div>
              <div>Branch: ${r?.is_main_branch ? "main" : "PR/other"}</div>
            </div>
          `;
        },
      },
      legend: { top: 0 },
      grid: { left: 40, right: 50, bottom: 40, top: 40 },
      xAxis: { type: "time", name: "Build start (UTC)" },
      yAxis: [{ type: "value", name: "Wait (h)" }],
      series,
    };
  }, [grouped]);

  return (
    <Box sx={{ height: "100%", display: "flex", flexDirection: "column" }}>
      <Stack
        direction="row"
        spacing={2}
        alignItems="center"
        sx={{ px: 2, pt: 1 }}
      >
        <Typography variant="h6" sx={{ fontWeight: "bold" }}>
          Queue Wait (per build)
        </Typography>
        <Tooltip title="Show only builds on branch 'main'">
          <Stack direction="row" spacing={1} alignItems="center">
            <Typography variant="body2">Main only</Typography>
            <Switch
              size="small"
              checked={mainOnly}
              onChange={() => setMainOnly((s) => !s)}
            />
          </Stack>
        </Tooltip>
      </Stack>
      <Box sx={{ flex: 1, minHeight: 240 }}>
        <ReactECharts
          style={{ height: "100%" }}
          option={option}
          notMerge
          onEvents={{ click: onPointClick }}
        />
      </Box>
    </Box>
  );
}

0 commit comments

Comments
 (0)