Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions css/components.css
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,40 @@ button:focus,
white-space: nowrap;
}

/* Row-selection checkbox column: pinned to a slim 36px, contents centred on one line. */
.data-table.has-select-col :is(th, td).select-col {
  min-width: 36px;
  width: 36px;
  max-width: 36px;
  white-space: nowrap;
  text-align: center;
}

/*
 * When the selection column is present, let the name-sorted header cell and
 * the cell that directly follows each row's first <td> size automatically
 * and wrap their text.
 * NOTE(review): both selectors below are more specific than the
 * `th:nth-child(2)` / `td:nth-child(2)` selectors used elsewhere in this
 * file, so wherever both match the same cell this `width: auto` overrides
 * that rule's `width: 40%` -- confirm which width is actually intended.
 */
.data-table.has-select-col th.sortable[data-sort="name"],
.data-table.has-select-col td:first-of-type + td {
width: auto;
white-space: normal;
}

/* Second column (the model name once the select column is inserted):
   flexible width within bounds, left-aligned, long names allowed to wrap. */
.data-table.has-select-col th:nth-child(2),
.data-table.has-select-col td:nth-child(2) {
  min-width: 180px;
  width: 40%;
  max-width: 350px;
  text-align: left; /* undo the right alignment this column position gets by default */
  white-space: normal;
  overflow-wrap: break-word;
}

/* With the select column present, "% Resolved" shifts to the 3rd column; keep it right-aligned. */
.data-table.has-select-col :is(th, td):nth-child(3) {
  text-align: right;
}

/* Cards */
.card {
background-color: var(--color-background);
Expand Down Expand Up @@ -616,6 +650,19 @@ button:focus,
}
}

/* Modal basic styles */
/* Full-viewport overlay container; hidden until the `show` class is added. */
.modal { display: none; position: fixed; inset: 0; z-index: var(--z-modal); }
.modal.show { display: block; }
.modal-backdrop { position: absolute; inset: 0; background: rgba(0,0,0,0.45); }
/* User-resizable dialog. A min-* value always wins over the matching max-* value,
   so the minimums are clamped with min() to the same 90vw / 90vh limits; a fixed
   400px / 300px minimum would push the dialog off-screen on narrow or short viewports. */
.modal-dialog { position: relative; background: var(--color-background); color: var(--color-text); width: min(720px, calc(100vw - 2rem)); margin: 5vh auto; border-radius: var(--radius-lg); box-shadow: var(--shadow-xl); border: 1.5px solid var(--color-border); resize: both; overflow: auto; min-width: min(400px, 90vw); min-height: min(300px, 90vh); max-width: 90vw; max-height: 90vh; display: flex; flex-direction: column; }
/* Compact, non-resizable variant; its minimum width is clamped to the viewport for the same reason. */
.modal-dialog-small { width: min(480px, calc(100vw - 2rem)); min-width: min(320px, 90vw); min-height: auto; resize: none; }
.modal-dialog-large { width: min(1200px, 90vw); height: min(800px, 90vh); }
.modal-header { display: flex; align-items: center; justify-content: space-between; padding: 0.75rem 1rem; border-bottom: 1.5px solid var(--color-border); }
/* Body takes the remaining dialog height and scrolls on its own. */
.modal-body { padding: 1rem; overflow: auto; flex: 1; display: flex; flex-direction: column; }
.modal-close { background: transparent; border: none; cursor: pointer; color: var(--color-text-secondary); }
/* min-height: 0 lets the flex child shrink below its content height so the canvas can resize with the dialog. */
.chart-container { flex: 1; display: flex; flex-direction: column; min-height: 0; position: relative; }
.chart-container canvas { flex: 1; min-height: 260px; }

@media (max-width: 992px) {
/* On mobile and tablets */
.table-responsive {
Expand Down Expand Up @@ -919,3 +966,77 @@ button:focus,
text-decoration-thickness: 2px;
text-decoration-color: var(--color-text-muted);
}

/* "New feature" badge: a small pill centred just below its containing block
   that plays newBadgeAnimation once and then stays on its final keyframe. */
.new-badge {
  /* Placement: bottom edge of the containing block, horizontally centred. */
  position: absolute;
  top: 100%;
  left: 50%;
  margin-top: 2px;
  transform: translateX(-50%);
  z-index: 100;
  pointer-events: none; /* purely decorative; never intercepts clicks */

  /* Pill appearance. */
  padding: 0.25rem 0.5rem;
  border-radius: var(--radius-full);
  background: linear-gradient(135deg, var(--color-accent), var(--color-accent-dark));
  box-shadow: 0 2px 8px rgba(59, 130, 246, 0.4);

  /* Label text. */
  color: white;
  font-size: 0.7rem;
  font-weight: var(--weight-medium);
  white-space: nowrap;

  /* One 6s run; `forwards` keeps the last keyframe applied afterwards. */
  animation: newBadgeAnimation 6s ease-in-out forwards;
}

/* Dark theme: blue-scale gradient and a stronger glow for the badge. */
.dark-mode .new-badge {
  box-shadow: 0 2px 8px rgba(59, 130, 246, 0.6);
  background: linear-gradient(135deg, var(--blue-400), var(--blue-600));
}

/* Offset variant anchored by its right edge instead of `left`
   (presumably combined with .new-badge on buttons -- confirm against the markup). */
.new-badge-button {
  left: auto;
  right: -45px;
  top: -12px;
}

/* Badge life cycle: fade and scale in, one stronger pulse, three gentle pulses,
   hold, then fade and scale out. Every frame keeps translateX(-50%) so the
   horizontal centring offset is preserved while scaling. */
@keyframes newBadgeAnimation {
  /* Hidden and slightly shrunk at both ends. */
  0%, 100% {
    opacity: 0;
    transform: translateX(-50%) scale(0.8);
  }
  /* Fully visible at rest size: end of fade-in and start of fade-out. */
  10%, 85% {
    opacity: 1;
    transform: translateX(-50%) scale(1);
  }
  /* Initial, stronger pulse. */
  15% {
    transform: translateX(-50%) scale(1.1);
  }
  /* Rest size between pulses. */
  20%, 35%, 50%, 65% {
    transform: translateX(-50%) scale(1);
  }
  /* Three gentle pulses. */
  30%, 45%, 60% {
    transform: translateX(-50%) scale(1.05);
  }
}
155 changes: 155 additions & 0 deletions data/combine_per_instance_details.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Script to combine per-instance details from info_for_leaderboard.json
into the leaderboards.json file for all model entries.
"""

import json
import sys
from pathlib import Path


# Key used in info_for_leaderboard.json -> display name of the matching
# entry in the leaderboard results.
MODEL_MAPPING = {
    "gpt-5": "GPT-5 (2025-08-07) (medium reasoning)",
    "gpt-5-mini": "GPT-5 mini (2025-08-07) (medium reasoning)",
    "sonnet-4": "Claude 4 Sonnet (20250514)",
    "sonnet-4-5": "Claude 4.5 Sonnet (20250929)",
}


def _index_by_name(entries, name):
    """Return the index of the first dict in *entries* whose 'name' equals *name*, else None."""
    return next(
        (idx for idx, entry in enumerate(entries) if entry.get('name') == name),
        None,
    )


def main():
    """Merge per-instance details into the 'bash-only' leaderboard entries.

    Reads ``info_for_leaderboard.json`` and ``leaderboards.json`` from the
    directory containing this script. For every pair in ``MODEL_MAPPING``
    whose key exists in the info file and whose name exists in the
    'bash-only' leaderboard, the info payload is stored under the entry's
    ``per_instance_details`` key. The user must type ``yes`` to proceed, and
    a backup of the leaderboards file is written before it is overwritten.

    Returns:
        1 if an input file is missing, the 'bash-only' leaderboard is not
        found, or no model can be updated; 0 if the user aborts or the
        update succeeds.
    """
    from itertools import islice

    script_dir = Path(__file__).parent
    info_file = script_dir / "info_for_leaderboard.json"
    leaderboards_file = script_dir / "leaderboards.json"
    backup_file = script_dir / "leaderboards.json.backup"

    # Both inputs are required; report the first one that is missing.
    for required in (info_file, leaderboards_file):
        if not required.exists():
            print(f"Error: {required} not found")
            return 1

    # Explicit UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    print(f"Loading {info_file}...")
    with open(info_file, 'r', encoding='utf-8') as f:
        info_data = json.load(f)

    print(f"Found {len(info_data)} model entries in info file")
    print(f"Available models: {list(info_data.keys())}")

    print(f"\nLoading {leaderboards_file}...")
    with open(leaderboards_file, 'r', encoding='utf-8') as f:
        leaderboards_data = json.load(f)

    # Locate the leaderboard that receives the details.
    leaderboards = leaderboards_data['leaderboards']
    bash_only_idx = _index_by_name(leaderboards, 'bash-only')
    if bash_only_idx is None:
        print("Error: 'bash-only' leaderboard not found")
        return 1

    # `results` is the live list inside leaderboards_data, so assigning into
    # its entries later updates the structure that gets written back.
    results = leaderboards[bash_only_idx]['results']
    print(f"Found 'bash-only' leaderboard with {len(results)} entries")

    # Plan the updates first so the user can review them before any write.
    models_to_update = []
    for info_key, leaderboard_name in MODEL_MAPPING.items():
        if info_key not in info_data:
            print(f"\nWarning: '{info_key}' not found in info file, skipping...")
            continue

        entry_idx = _index_by_name(results, leaderboard_name)
        if entry_idx is None:
            print(f"\nWarning: '{leaderboard_name}' not found in leaderboard, skipping...")
            continue

        has_details = 'per_instance_details' in results[entry_idx]
        num_instances = len(info_data[info_key])

        models_to_update.append({
            'info_key': info_key,
            'leaderboard_name': leaderboard_name,
            'entry_idx': entry_idx,
            'num_instances': num_instances,
            'has_details': has_details,
        })

        status = "(will overwrite)" if has_details else "(new)"
        print(f"\n  - {leaderboard_name} {status}")
        print(f"    {num_instances} instances from '{info_key}'")

    if not models_to_update:
        print("\nError: No models to update")
        return 1

    # Ask for confirmation
    print(f"\n{'='*60}")
    print(f"Will update {len(models_to_update)} model(s)")

    overwrite_count = sum(1 for m in models_to_update if m['has_details'])
    if overwrite_count > 0:
        print(f"Warning: {overwrite_count} model(s) already have per_instance_details")

    response = input("\nContinue? (yes/no): ").strip().lower()
    if response != 'yes':
        print("Aborted.")
        return 0

    # Back up the original bytes rather than a re-serialization, so the
    # backup is an exact copy of the file as it was before this run.
    print(f"\nCreating backup at {backup_file}...")
    backup_file.write_bytes(leaderboards_file.read_bytes())

    print("\nUpdating models...")
    for model in models_to_update:
        per_instance_details = info_data[model['info_key']]
        results[model['entry_idx']]['per_instance_details'] = per_instance_details
        print(f"  ✓ {model['leaderboard_name']}: {len(per_instance_details)} instances")

    print(f"\nWriting updated data to {leaderboards_file}...")
    with open(leaderboards_file, 'w', encoding='utf-8') as f:
        json.dump(leaderboards_data, f, indent=2)

    print("\n" + "="*60)
    print("✓ Success! All models updated")
    print(f"  - Backup saved to: {backup_file}")
    print(f"  - Models updated: {len(models_to_update)}")

    # Show up to three sample instances for the first updated model.
    # models_to_update is non-empty here (the empty case returned above).
    # NOTE(review): assumes each model's info payload is a dict of dicts --
    # confirm against info_for_leaderboard.json.
    first_model = models_to_update[0]
    print(f"\nSample instances from {first_model['leaderboard_name']}:")
    sample_data = info_data[first_model['info_key']]
    for key, value in islice(sample_data.items(), 3):
        print(f"  - {key}: resolved={value.get('resolved')}, cost={value.get('cost')}")

    return 0


# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
1 change: 1 addition & 0 deletions data/info_for_leaderboard.json

Large diffs are not rendered by default.

Loading