-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathevaluation_bin.py
More file actions
143 lines (119 loc) · 4.95 KB
/
evaluation_bin.py
File metadata and controls
143 lines (119 loc) · 4.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# evaluation_bin.py
import json
import os
from datetime import datetime
from pathlib import Path
from absl import app, flags
import evaluation_lib as EL
from results_manager import ResultsManager
# Command-line flags. Three mutually compatible ways to point the script
# at data: (1) the three explicit path flags below, (2) --run_dir naming an
# organized run directory under results/, or (3) --provider plus --model,
# which locates the latest matching run via ResultsManager.
_INPUT_DATA = flags.DEFINE_string(
    "input_data", None, required=False, help="path to inputs.jsonl (or use --run-dir)"
)
_INPUT_RESPONSE_DATA = flags.DEFINE_string(
    "input_response_data", None, required=False, help="path to responses.jsonl (or use --run-dir)"
)
_OUTPUT_DIR = flags.DEFINE_string(
    "output_dir", None, required=False, help="output dir (or use --run-dir)"
)
_RUN_DIR = flags.DEFINE_string("run_dir", None, help="Use organized run directory from results/")
_PROVIDER = flags.DEFINE_string("provider", None, help="Provider name (for finding latest run)")
_MODEL = flags.DEFINE_string("model", None, help="Model short name (for finding latest run)")
def main(argv):
    """Evaluate model responses against instruction-following checks.

    Resolves input/response/output locations from the flag combination the
    user supplied, runs the strict and loose evaluation passes, writes
    per-prompt results, and — when an organized run directory is in use —
    persists aggregate metrics and refreshes the run's summary.json.

    Args:
        argv: Positional command-line arguments from absl; none expected.

    Raises:
        app.UsageError: on extra positional args or an unusable flag combo.
        ValueError: when the requested run directory cannot be resolved.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many args")

    input_data_path, response_data_path, output_dir, run_dir = _resolve_paths()

    # Read prompts and responses once; both evaluation passes reuse them.
    inputs = EL.read_prompt_list(input_data_path)
    p2r = EL.read_prompt_to_response_dict(response_data_path)
    os.makedirs(output_dir, exist_ok=True)

    all_metrics = _run_evaluations(inputs, p2r, output_dir, run_dir)

    if run_dir:
        _save_run_metrics(run_dir, all_metrics)
        print(f"\nEvaluation complete. Results saved to: {run_dir}")


def _resolve_paths():
    """Resolve data paths from --run_dir, --provider/--model, or explicit flags.

    Explicit path flags always take precedence; a run directory only fills
    in whichever of the three paths were not given.

    Returns:
        Tuple ``(input_data_path, response_data_path, output_dir, run_dir)``
        where ``run_dir`` is a ``Path`` or ``None`` for explicit-path runs.

    Raises:
        ValueError: if the run directory is missing or no run is found.
        app.UsageError: if the flags leave any required path undetermined.
    """
    input_data_path = _INPUT_DATA.value
    response_data_path = _INPUT_RESPONSE_DATA.value
    output_dir = _OUTPUT_DIR.value
    run_dir = None

    if _RUN_DIR.value:
        run_dir = Path(_RUN_DIR.value)
        if not run_dir.exists():
            raise ValueError(f"Run directory does not exist: {run_dir}")
    elif _PROVIDER.value and _MODEL.value:
        # No explicit run dir: fall back to the most recent run recorded
        # for this provider/model pair.
        results_mgr = ResultsManager()
        run_dir = results_mgr.find_latest_run(_PROVIDER.value, _MODEL.value)
        if not run_dir:
            raise ValueError(f"No runs found for {_PROVIDER.value}/{_MODEL.value}")
        print(f"Using latest run: {run_dir}")

    if run_dir:
        run_dir = Path(run_dir)
        results_mgr = ResultsManager()
        paths = results_mgr.get_run_paths(run_dir)
        if not input_data_path:
            input_data_path = str(paths["inputs"])
        if not response_data_path:
            response_data_path = str(paths["responses"])
        if not output_dir:
            output_dir = str(paths["eval_strict"].parent)  # evaluations/ directory

    if not input_data_path or not response_data_path or not output_dir:
        raise app.UsageError(
            "Must specify either --input_data, --input_response_data, --output_dir "
            "OR --run_dir OR --provider and --model"
        )
    return input_data_path, response_data_path, output_dir, run_dir


def _run_evaluations(inputs, p2r, output_dir, run_dir):
    """Run the strict and loose passes, writing per-prompt results to disk.

    Args:
        inputs: Prompt inputs from EL.read_prompt_list.
        p2r: Prompt-to-response mapping from EL.read_prompt_to_response_dict.
        output_dir: Directory receiving the per-prompt .jsonl result files.
        run_dir: Run directory Path, or None; controls output filenames.

    Returns:
        Dict mapping "strict"/"loose" to the metrics from EL.print_report.
    """
    all_metrics = {}
    for fn, name in [
        (EL.test_instruction_following_strict, "eval_results_strict"),
        (EL.test_instruction_following_loose, "eval_results_loose"),
    ]:
        outs = [fn(inp, p2r) for inp in inputs]
        metric_key = "strict" if "strict" in name else "loose"
        # Organized run dirs use the short standard names (strict.jsonl /
        # loose.jsonl); ad-hoc runs keep the descriptive eval_results_* names.
        out_name = f"{metric_key}.jsonl" if run_dir else f"{name}.jsonl"
        out_path = os.path.join(output_dir, out_name)
        EL.write_outputs(out_path, outs)
        print("=" * 64)
        print(f"{out_path} Accuracy Scores:")
        all_metrics[metric_key] = EL.print_report(outs)
    return all_metrics


def _save_run_metrics(run_dir, all_metrics):
    """Persist metrics.json and refresh summary.json inside ``run_dir``.

    Args:
        run_dir: Path of the organized run directory.
        all_metrics: Dict with "strict"/"loose" metric dicts, as produced
            by _run_evaluations; missing keys are treated as empty.
    """
    metrics_path = run_dir / "evaluations" / "metrics.json"
    # evaluations/ may not exist yet when an explicit --output_dir pointed
    # the per-prompt results elsewhere — create it before writing.
    metrics_path.parent.mkdir(parents=True, exist_ok=True)
    with open(metrics_path, "w") as f:
        json.dump(
            {
                "strict": all_metrics.get("strict", {}),
                "loose": all_metrics.get("loose", {}),
                "evaluated_at": datetime.now().isoformat(),
            },
            f,
            indent=2,
        )

    # Update the run summary only if one was previously written.
    summary_path = run_dir / "summary.json"
    if summary_path.exists():
        with open(summary_path) as f:
            summary = json.load(f)
        # Guard against summaries written without a "metrics" section.
        metrics = summary.setdefault("metrics", {})
        metrics["strict_accuracy"] = all_metrics.get("strict", {}).get("prompt-level", 0.0)
        metrics["loose_accuracy"] = all_metrics.get("loose", {}).get("prompt-level", 0.0)
        metrics["instruction_level_strict"] = all_metrics.get("strict", {}).get(
            "instruction-level", 0.0
        )
        metrics["instruction_level_loose"] = all_metrics.get("loose", {}).get(
            "instruction-level", 0.0
        )
        with open(summary_path, "w") as f:
            json.dump(summary, f, indent=2)
# Script entry point: absl parses the flags defined above, then calls main.
if __name__ == "__main__":
    app.run(main)