-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.html
More file actions
223 lines (213 loc) · 12 KB
/
index.html
File metadata and controls
223 lines (213 loc) · 12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="description" content="CocoaBench: Evaluating Unified Digital Agents in the Wild">
<link rel="canonical" href="https://cocoabench.github.io/">
<title>CocoaBench | Introduction</title>
<!-- Open Graph -->
<meta property="og:title" content="CocoaBench | Compositional Cognitive Agents Benchmark">
<meta property="og:description" content="CocoaBench: Evaluating Unified Digital Agents in the Wild">
<meta property="og:url" content="https://cocoabench.github.io/">
<meta property="og:site_name" content="CocoaBench">
<meta property="og:type" content="website">
<meta property="og:image" content="https://cocoabench.github.io/assets/figures/architecture.jpg">
<!-- Twitter -->
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:title" content="CocoaBench | Compositional Cognitive Agents Benchmark">
<meta name="twitter:description" content="CocoaBench: Evaluating Unified Digital Agents in the Wild">
<meta name="twitter:image" content="https://cocoabench.github.io/assets/figures/architecture.jpg">
<meta name="twitter:site" content="@cocoabench">
<link rel="stylesheet" href="./assets/css/style.css">
<!-- KaTeX for math rendering -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/katex.min.css">
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/katex.min.js"></script>
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/contrib/auto-render.min.js" onload="renderMathInElement(document.body, {delimiters: [{left: '$$', right: '$$', display: true}, {left: '$', right: '$', display: false}]});"></script>
<!-- Prism.js for syntax highlighting -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/prismjs@1.29.0/themes/prism.min.css">
<script defer src="https://cdn.jsdelivr.net/npm/prismjs@1.29.0/prism.min.js"></script>
<script defer src="https://cdn.jsdelivr.net/npm/prismjs@1.29.0/components/prism-python.min.js"></script>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@graph": [
{
"@type": "Organization",
"@id": "https://cocoabench.github.io/#org",
"name": "CocoaBench",
"url": "https://cocoabench.github.io/",
"sameAs": [
"https://github.com/cocoabench",
"https://discord.gg/cbUx7pEN"
]
},
{
"@type": "WebSite",
"@id": "https://cocoabench.github.io/#website",
"url": "https://cocoabench.github.io/",
"name": "CocoaBench",
"description": "CocoaBench: Evaluating Unified Digital Agents in the Wild",
"publisher": {
"@id": "https://cocoabench.github.io/#org"
}
}
]
}
</script>
</head>
<body>
<header>
<a href="index.html" class="logo"></a>
<nav>
<a href="index.html" class="active">Introduction</a>
<a href="blog.html">Blog</a>
</nav>
</header>
<div class="container">
<h1>CocoaBench</h1>
<p class="subtitle">Evaluating Unified Digital Agents in the Wild</p>
<div class="resource-links">
<a href="blog.html" class="resource-link">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"></path>
<polyline points="14 2 14 8 20 8"></polyline>
<line x1="16" y1="13" x2="8" y2="13"></line>
<line x1="16" y1="17" x2="8" y2="17"></line>
</svg>
Blog
</a>
<a href="https://github.com/cocoabench/cocoa-agent" class="resource-link">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"></path>
</svg>
Code
</a>
<a href="https://github.com/cocoabench/cocoa-agent/tree/main/cocoabench-v1.0" class="resource-link">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<ellipse cx="12" cy="5" rx="9" ry="3"></ellipse>
<path d="M21 12c0 1.66-4 3-9 3s-9-1.34-9-3"></path>
<path d="M3 5v14c0 1.66 4 3 9 3s9-1.34 9-3V5"></path>
</svg>
Data
</a>
<a href="https://arxiv.org/abs/2604.11201" class="resource-link">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<path d="M12 20h9"></path>
<path d="M16.5 3.5a2.121 2.121 0 0 1 3 3L7 19l-4 1 1-4L16.5 3.5z"></path>
</svg>
Paper
</a>
</div>
<h2>Overview</h2>
<p>
CocoaBench is a benchmark for unified digital agents built from 153 human-designed, long-horizon tasks that require flexible composition of <strong>vision</strong>, <strong>search</strong>, and <strong>coding</strong>.
</p>
<ul>
<li><strong>Compositional by design.</strong> Most tasks require the combination of vision (GUI interaction, visual understanding), search (information synthesis), and coding (terminal use, algorithms).</li>
<li><strong>Minimal specification, automatic evaluation.</strong> Each task is defined only by an instruction and an evaluation script over the response. No fixed infrastructure is assumed. No LLM as judge.</li>
<li><strong>Cocoa-Agent scaffold.</strong> We build <a href="https://github.com/cocoabench/cocoa-agent">Cocoa-Agent</a>, a lightweight framework integrated with <a href="https://github.com/agent-infra/sandbox">Sandbox-AIO</a>, enabling controlled comparison across backbones and reproducible evaluation.</li>
</ul>
<p>
Experiments show that current agents remain far from reliable on CocoaBench. The best evaluated system achieves only <strong>45.1% success rate</strong>, with substantial room for improvement in reasoning & planning, tool use & execution, and visual grounding.
</p>
<!-- <figure class="figure-block">
<img src="https://placehold.co/720x360/f4f4f4/888?text=CocoaBench+Architecture&font=roboto" alt="CocoaBench Architecture Diagram" class="figure-image">
<figcaption>Figure 1: CocoaBench Architecture Diagram — showing the interaction between agents, environments, and evaluation metrics.</figcaption>
</figure> -->
<h2 id="examples">Examples</h2>
<p>
Here are some example tasks (excluded from CocoaBench to avoid contamination), showcasing the diverse reasoning challenges our benchmark presents.
</p>
<div id="example-showcase" class="example-showcase">
<!-- Populated by examples.js -->
</div>
<h2>Evaluation</h2>
<p>
We evaluate representative agent systems on CocoaBench v1.0 (153 tasks).
</p>
<h3>Existing Agents</h3>
<div id="chart-existing" class="perf-figure-wrap">
<!-- Populated by chart.js -->
</div>
<h3>Cocoa-Agent</h3>
<div id="chart-cocoa" class="perf-figure-wrap">
<!-- Populated by chart.js -->
</div>
<h3>Case Studies</h3>
<p>We present the model solutions for the 4 example tasks shown above. Explore how different models approached each example task. Click on a result block to view the analysis and the raw response.</p>
<div id="solution-gallery" class="solution-gallery">
<!-- Populated by gallery.js -->
</div>
<h2 style="display:none;">Citation</h2>
<div class="citation-block" style="display:none;">
<button class="copy-btn" onclick="copyCitation()" title="Copy to clipboard">
<svg class="copy-icon" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
<rect x="9" y="9" width="13" height="13" rx="2" ry="2"></rect>
<path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path>
</svg>
<svg class="check-icon" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" style="display:none;">
<polyline points="20 6 9 17 4 12"></polyline>
</svg>
</button>
<pre id="citation-text">@misc{cocoabench2025,
title={CocoaBench: An Evaluation Framework for General Agents with Compositional Cognitive Abilities},
author={Shibo Hao and Zhining Zhang and Zhiqi Liang and Tianyang Liu and Zilong Wang and others},
howpublished={Blog post},
month={December},
year={2025},
url={https://cocoabench.github.io/}
}</pre>
</div>
<script>
/**
 * Copy the code sample associated with a "Copy" button to the clipboard.
 *
 * Finds the closest `.code-block-wrapper` ancestor of `btn`, reads the text of
 * the first `<code>` element inside it, and copies it through a temporary
 * invisible `<textarea>` plus `document.execCommand('copy')`. When the copy
 * command reports success, the button gets the `copied` class and a
 * "Copied!" label for two seconds, then is restored to its "Copy" label.
 *
 * Does nothing if the button has no matching wrapper or the wrapper holds no
 * `<code>` element, and shows no success feedback if the copy command
 * reports failure.
 *
 * @param {Element} btn - The button that triggered the copy.
 * @returns {void}
 */
function copyCode(btn) {
  const wrapper = btn ? btn.closest('.code-block-wrapper') : null;
  const codeEl = wrapper ? wrapper.querySelector('code') : null;
  if (!codeEl) {
    // Nothing to copy: avoid a TypeError on unexpected markup.
    return;
  }

  const textarea = document.createElement('textarea');
  textarea.value = codeEl.textContent;
  // Keep the helper element invisible and out of the page flow.
  textarea.style.position = 'fixed';
  textarea.style.opacity = '0';
  document.body.appendChild(textarea);

  let copied = false;
  try {
    textarea.select();
    // execCommand returns false when the command is unsupported or disabled.
    copied = document.execCommand('copy');
  } finally {
    // Always remove the helper element, even if selecting or copying throws.
    document.body.removeChild(textarea);
  }
  if (!copied) {
    // Do not claim success when nothing reached the clipboard.
    return;
  }

  btn.classList.add('copied');
  btn.innerHTML = '<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><polyline points="20 6 9 17 4 12"></polyline></svg> Copied!';
  setTimeout(() => {
    btn.classList.remove('copied');
    btn.innerHTML = '<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><rect x="9" y="9" width="13" height="13" rx="2" ry="2"></rect><path d="M5 15H4a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h9a2 2 0 0 1 2 2v1"></path></svg> Copy';
  }, 2000);
}
/**
 * Copy the BibTeX entry in `#citation-text` to the clipboard.
 *
 * Uses a temporary invisible `<textarea>` plus `document.execCommand('copy')`
 * rather than the async Clipboard API, so that it also works for local files
 * and non-HTTPS pages. When the copy command reports success, the citation
 * button's copy icon is swapped for the check icon for two seconds.
 *
 * Does nothing if `#citation-text` is absent, and shows no success feedback if
 * the copy command reports failure or the button/icons cannot be found.
 *
 * @returns {void}
 */
function copyCitation() {
  const source = document.getElementById('citation-text');
  if (!source) {
    return;
  }

  // Fallback for local files / non-HTTPS
  const textarea = document.createElement('textarea');
  textarea.value = source.textContent;
  textarea.style.position = 'fixed';
  textarea.style.opacity = '0';
  document.body.appendChild(textarea);

  let copied = false;
  try {
    textarea.select();
    // execCommand returns false when the command is unsupported or disabled.
    copied = document.execCommand('copy');
  } finally {
    // Always remove the helper element, even if selecting or copying throws.
    document.body.removeChild(textarea);
  }
  if (!copied) {
    return;
  }

  // Prefer the button inside the citation block: a bare '.copy-btn' lookup
  // returns the first match in the document, which may be another button if
  // script-generated content reuses that class (NOTE(review): confirm).
  // The original document-wide lookup is kept as a fallback.
  const btn = document.querySelector('.citation-block .copy-btn') ||
    document.querySelector('.copy-btn');
  if (!btn) {
    return;
  }
  const copyIcon = btn.querySelector('.copy-icon');
  const checkIcon = btn.querySelector('.check-icon');
  if (!copyIcon || !checkIcon) {
    return;
  }

  // Show success feedback
  copyIcon.style.display = 'none';
  checkIcon.style.display = 'block';
  setTimeout(() => {
    copyIcon.style.display = 'block';
    checkIcon.style.display = 'none';
  }, 2000);
}
</script>
</div>
<footer class="site-footer">
<p>Built by the CocoaBench Team</p>
<p class="footer-credits">Design inspired by <a href="https://thinkingmachines.ai/" target="_blank" rel="noopener">Thinking Machines</a> and <a href="https://openai.com/" target="_blank" rel="noopener">OpenAI</a></p>
</footer>
<script src="./assets/js/logo.js"></script>
<script src="./assets/js/examples.js"></script>
<script src="./assets/js/chart.js"></script>
<script src="./assets/js/gallery.js"></script>
<script src="./assets/js/toc.js"></script>
</body>
</html>