-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluate_hellaswag_real.py
More file actions
147 lines (113 loc) · 4.68 KB
/
evaluate_hellaswag_real.py
File metadata and controls
147 lines (113 loc) · 4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
"""
REAL HellaSwag Evaluation with Full Dataset
==========================================
Evaluates on the actual HellaSwag benchmark using real models.
NO MORE HARDCODED SAMPLES!
Author: Principal Neuro-AI Engineer
Date: January 8, 2026
"""
import sys
import time
import json
import argparse
from pathlib import Path
from typing import Dict, List, Any
import logging
from real_dataset_loader import RealDatasetLoader, install_datasets_if_needed
from real_model_inference import RealSpecialistSystem, install_required_packages
# Module-level logging: configure the root logger once at import time and
# grab a module-scoped logger used for dataset/sample error reporting below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def run_real_hellaswag_evaluation(max_samples: "int | None" = None,
                                  use_real_models: bool = True) -> "Dict[str, Any] | None":
    """Run the REAL HellaSwag evaluation over the full dataset.

    Parameters
    ----------
    max_samples : int | None
        Cap on the number of dataset samples to evaluate; ``None`` means all.
    use_real_models : bool
        When True, use real model inference (installing required packages on
        demand); when False, the specialist system runs in simulation mode.

    Returns
    -------
    dict | None
        Summary metrics plus the first 50 detailed per-question results, or
        ``None`` when dependency installation, dataset loading, or every
        single sample fails.
    """
    print("[THINK] REAL HellaSwag Evaluation - Full Dataset")
    print("=" * 60)
    print(f"[CHART] Max samples: {max_samples or 'ALL'}")
    print(f"? Real models: {'YES' if use_real_models else 'NO (simulation)'}")

    # Install model-inference dependencies only when real models are requested;
    # the probe import just checks whether transformers is already available.
    if use_real_models:
        try:
            from transformers import AutoTokenizer  # noqa: F401 - availability probe
        except ImportError:
            if not install_required_packages():
                return None
    # The datasets package is needed regardless (HellaSwag itself is loaded from it).
    if not install_datasets_if_needed():
        return None

    # Load the real HellaSwag dataset - no hardcoded samples.
    print("\n[CYCLE] Loading REAL HellaSwag dataset...")
    loader = RealDatasetLoader()
    try:
        hellaswag_samples = loader.load_hellaswag_full(max_samples=max_samples)
        if not hellaswag_samples:
            print("[FAIL] No HellaSwag samples loaded")
            return None
        print(f"[OK] Loaded {len(hellaswag_samples)} REAL HellaSwag samples")
    except Exception as e:
        logger.error(f"Failed to load HellaSwag dataset: {e}")
        return None

    # Initialize the specialist system in the requested mode.
    print(f"\n? Initializing {'REAL' if use_real_models else 'SIMULATION'} specialist system...")
    specialist_system = RealSpecialistSystem(use_real_models=use_real_models)

    # Evaluate every sample; individual failures are logged and skipped so one
    # bad sample cannot abort the whole run.
    print(f"\n[ROCKET] Starting evaluation on {len(hellaswag_samples)} questions...")
    results = []
    start_time = time.time()
    for i, sample in enumerate(hellaswag_samples):
        print(f"\r? Processing {i+1}/{len(hellaswag_samples)}...", end='', flush=True)
        try:
            result = specialist_system.evaluate_question(
                question=sample['question'],
                choices=sample['choices'],
                correct_answer=sample['answer'],
                subject=sample['subject']
            )
            results.append(result)
        except Exception as e:
            logger.error(f"Error processing HellaSwag sample {i}: {e}")
            continue
    print()  # terminate the \r progress line

    # Aggregate metrics over the samples that actually completed.
    total_time = time.time() - start_time
    total_correct = sum(1 for r in results if r['is_correct'])
    total_questions = len(results)
    if total_questions == 0:
        print("[FAIL] No questions processed")
        return None
    accuracy = total_correct / total_questions
    # Same denominator as accuracy (was len(results) - identical value, now consistent).
    avg_confidence = sum(r['confidence'] for r in results) / total_questions

    # Display results (no f-prefix: these strings have no placeholders).
    print("\n? REAL HellaSwag Results")
    print("=" * 45)
    print(f" Accuracy: {accuracy:.1%} ({total_correct}/{total_questions})")
    print(f" Avg Confidence: {avg_confidence:.3f}")
    print(f" Runtime: {total_time:.1f}s")

    # Persist a summary plus the first 50 detailed results to a timestamped file.
    timestamp = int(time.time())
    results_data = {
        "evaluation_type": "real_hellaswag",
        "accuracy": accuracy,
        "total_correct": total_correct,
        "total_questions": total_questions,
        "avg_confidence": avg_confidence,
        "use_real_models": use_real_models,
        "timestamp": timestamp,
        "results": results[:50]  # First 50 detailed results
    }
    results_file = f"real_hellaswag_results_{timestamp}.json"
    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump(results_data, f, indent=2)
    print(f"? Results saved to: {results_file}")
    return results_data
def main() -> int:
    """CLI entry point.

    Returns a process exit code: 0 when the evaluation produced results,
    1 when it returned None (dependency, dataset, or total-sample failure).
    """
    parser = argparse.ArgumentParser(description='Real HellaSwag Evaluation')
    # Help text added so `--help` actually documents the flags; defaults unchanged.
    parser.add_argument('--max-samples', type=int, default=None,
                        help='Maximum number of samples to evaluate (default: all)')
    parser.add_argument('--simulation', action='store_true',
                        help='Run in simulation mode instead of real model inference')
    parser.add_argument('--quick-test', action='store_true',
                        help='Evaluate only 50 samples (overrides --max-samples)')
    args = parser.parse_args()

    # --quick-test takes precedence over any explicit --max-samples value.
    max_samples = 50 if args.quick_test else args.max_samples
    use_real_models = not args.simulation
    results = run_real_hellaswag_evaluation(max_samples, use_real_models)
    return 0 if results else 1
if __name__ == "__main__":
    # sys.exit instead of the site-module exit(): the bare exit builtin is not
    # guaranteed to exist (e.g. python -S or embedded interpreters); sys is
    # already imported at the top of this file.
    sys.exit(main())