-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluate_hellaswag_real.py
More file actions
147 lines (113 loc) · 4.68 KB
/
evaluate_hellaswag_real.py
File metadata and controls
147 lines (113 loc) · 4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
"""
REAL HellaSwag Evaluation with Full Dataset
==========================================
Evaluates on the actual HellaSwag benchmark using real models.
NO MORE HARDCODED SAMPLES!
Author: Principal Neuro-AI Engineer
Date: January 8, 2026
"""
import sys
import time
import json
import argparse
from pathlib import Path
from typing import Dict, List, Any
import logging
from real_dataset_loader import RealDatasetLoader, install_datasets_if_needed
from real_model_inference import RealSpecialistSystem, install_required_packages
# Module-level logging: configure the root logger once at import time and
# grab a module-scoped logger used for dataset/sample error reporting below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def run_real_hellaswag_evaluation(max_samples: "int | None" = None,
                                  use_real_models: bool = True) -> "Dict[str, Any] | None":
    """Run the REAL HellaSwag evaluation over the full dataset.

    Parameters
    ----------
    max_samples : int | None
        Cap on the number of dataset samples to evaluate; ``None`` means all.
    use_real_models : bool
        When True, use real model inference (installing required packages on
        demand); when False, the specialist system runs in simulation mode.

    Returns
    -------
    dict | None
        Summary metrics plus the first 50 detailed per-question results, or
        ``None`` when dependency installation, dataset loading, or every
        single sample fails.
    """
    print("[THINK] REAL HellaSwag Evaluation - Full Dataset")
    print("=" * 60)
    print(f"[CHART] Max samples: {max_samples or 'ALL'}")
    print(f"? Real models: {'YES' if use_real_models else 'NO (simulation)'}")

    # Install model-inference dependencies only when real models are requested;
    # the probe import just checks whether transformers is already available.
    if use_real_models:
        try:
            from transformers import AutoTokenizer  # noqa: F401 - availability probe
        except ImportError:
            if not install_required_packages():
                return None
    # The datasets package is needed regardless (HellaSwag itself is loaded from it).
    if not install_datasets_if_needed():
        return None

    # Load the real HellaSwag dataset - no hardcoded samples.
    print("\n[CYCLE] Loading REAL HellaSwag dataset...")
    loader = RealDatasetLoader()
    try:
        hellaswag_samples = loader.load_hellaswag_full(max_samples=max_samples)
        if not hellaswag_samples:
            print("[FAIL] No HellaSwag samples loaded")
            return None
        print(f"[OK] Loaded {len(hellaswag_samples)} REAL HellaSwag samples")
    except Exception as e:
        logger.error(f"Failed to load HellaSwag dataset: {e}")
        return None

    # Initialize the specialist system in the requested mode.
    print(f"\n? Initializing {'REAL' if use_real_models else 'SIMULATION'} specialist system...")
    specialist_system = RealSpecialistSystem(use_real_models=use_real_models)

    # Evaluate every sample; individual failures are logged and skipped so one
    # bad sample cannot abort the whole run.
    print(f"\n[ROCKET] Starting evaluation on {len(hellaswag_samples)} questions...")
    results = []
    start_time = time.time()
    for i, sample in enumerate(hellaswag_samples):
        print(f"\r? Processing {i+1}/{len(hellaswag_samples)}...", end='', flush=True)
        try:
            result = specialist_system.evaluate_question(
                question=sample['question'],
                choices=sample['choices'],
                correct_answer=sample['answer'],
                subject=sample['subject']
            )
            results.append(result)
        except Exception as e:
            logger.error(f"Error processing HellaSwag sample {i}: {e}")
            continue
    print()  # terminate the \r progress line

    # Aggregate metrics over the samples that actually completed.
    total_time = time.time() - start_time
    total_correct = sum(1 for r in results if r['is_correct'])
    total_questions = len(results)
    if total_questions == 0:
        print("[FAIL] No questions processed")
        return None
    accuracy = total_correct / total_questions
    # Same denominator as accuracy (was len(results) - identical value, now consistent).
    avg_confidence = sum(r['confidence'] for r in results) / total_questions

    # Display results (no f-prefix: these strings have no placeholders).
    print("\n? REAL HellaSwag Results")
    print("=" * 45)
    print(f" Accuracy: {accuracy:.1%} ({total_correct}/{total_questions})")
    print(f" Avg Confidence: {avg_confidence:.3f}")
    print(f" Runtime: {total_time:.1f}s")

    # Persist a summary plus the first 50 detailed results to a timestamped file.
    timestamp = int(time.time())
    results_data = {
        "evaluation_type": "real_hellaswag",
        "accuracy": accuracy,
        "total_correct": total_correct,
        "total_questions": total_questions,
        "avg_confidence": avg_confidence,
        "use_real_models": use_real_models,
        "timestamp": timestamp,
        "results": results[:50]  # First 50 detailed results
    }
    results_file = f"real_hellaswag_results_{timestamp}.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(results_data, f, indent=2)
    print(f"? Results saved to: {results_file}")
    return results_data
def main() -> int:
    """CLI entry point.

    Returns a process exit code: 0 when the evaluation produced results,
    1 when it returned None (dependency, dataset, or total-sample failure).
    """
    parser = argparse.ArgumentParser(description='Real HellaSwag Evaluation')
    # Help text added so `--help` actually documents the flags; defaults unchanged.
    parser.add_argument('--max-samples', type=int, default=None,
                        help='Maximum number of samples to evaluate (default: all)')
    parser.add_argument('--simulation', action='store_true',
                        help='Run in simulation mode instead of real model inference')
    parser.add_argument('--quick-test', action='store_true',
                        help='Evaluate only 50 samples (overrides --max-samples)')
    args = parser.parse_args()

    # --quick-test takes precedence over any explicit --max-samples value.
    max_samples = 50 if args.quick_test else args.max_samples
    use_real_models = not args.simulation
    results = run_real_hellaswag_evaluation(max_samples, use_real_models)
    return 0 if results else 1
if __name__ == "__main__":
    # sys.exit instead of the site-module exit(): the bare exit builtin is not
    # guaranteed to exist (e.g. python -S or embedded interpreters); sys is
    # already imported at the top of this file.
    sys.exit(main())