-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathevaluation_bin.py
More file actions
143 lines (119 loc) · 4.95 KB
/
evaluation_bin.py
File metadata and controls
143 lines (119 loc) · 4.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# evaluation_bin.py
import json
import os
from datetime import datetime
from pathlib import Path
from absl import app, flags
import evaluation_lib as EL
from results_manager import ResultsManager
# Command-line flags. Three mutually compatible ways to point the script
# at data: (1) the three explicit path flags below, (2) --run_dir naming an
# organized run directory under results/, or (3) --provider plus --model,
# which locates the latest matching run via ResultsManager.
_INPUT_DATA = flags.DEFINE_string(
    "input_data", None, required=False, help="path to inputs.jsonl (or use --run-dir)"
)
_INPUT_RESPONSE_DATA = flags.DEFINE_string(
    "input_response_data", None, required=False, help="path to responses.jsonl (or use --run-dir)"
)
_OUTPUT_DIR = flags.DEFINE_string(
    "output_dir", None, required=False, help="output dir (or use --run-dir)"
)
_RUN_DIR = flags.DEFINE_string("run_dir", None, help="Use organized run directory from results/")
_PROVIDER = flags.DEFINE_string("provider", None, help="Provider name (for finding latest run)")
_MODEL = flags.DEFINE_string("model", None, help="Model short name (for finding latest run)")
def main(argv):
    """Evaluate model responses against instruction-following checks.

    Resolves input/response/output locations from the flag combination the
    user supplied, runs the strict and loose evaluation passes, writes
    per-prompt results, and — when an organized run directory is in use —
    persists aggregate metrics and refreshes the run's summary.json.

    Args:
        argv: Positional command-line arguments from absl; none expected.

    Raises:
        app.UsageError: on extra positional args or an unusable flag combo.
        ValueError: when the requested run directory cannot be resolved.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many args")

    input_data_path, response_data_path, output_dir, run_dir = _resolve_paths()

    # Read prompts and responses once; both evaluation passes reuse them.
    inputs = EL.read_prompt_list(input_data_path)
    p2r = EL.read_prompt_to_response_dict(response_data_path)
    os.makedirs(output_dir, exist_ok=True)

    all_metrics = _run_evaluations(inputs, p2r, output_dir, run_dir)

    if run_dir:
        _save_run_metrics(run_dir, all_metrics)
        print(f"\nEvaluation complete. Results saved to: {run_dir}")


def _resolve_paths():
    """Resolve data paths from --run_dir, --provider/--model, or explicit flags.

    Explicit path flags always take precedence; a run directory only fills
    in whichever of the three paths were not given.

    Returns:
        Tuple ``(input_data_path, response_data_path, output_dir, run_dir)``
        where ``run_dir`` is a ``Path`` or ``None`` for explicit-path runs.

    Raises:
        ValueError: if the run directory is missing or no run is found.
        app.UsageError: if the flags leave any required path undetermined.
    """
    input_data_path = _INPUT_DATA.value
    response_data_path = _INPUT_RESPONSE_DATA.value
    output_dir = _OUTPUT_DIR.value
    run_dir = None

    if _RUN_DIR.value:
        run_dir = Path(_RUN_DIR.value)
        if not run_dir.exists():
            raise ValueError(f"Run directory does not exist: {run_dir}")
    elif _PROVIDER.value and _MODEL.value:
        # No explicit run dir: fall back to the most recent run recorded
        # for this provider/model pair.
        results_mgr = ResultsManager()
        run_dir = results_mgr.find_latest_run(_PROVIDER.value, _MODEL.value)
        if not run_dir:
            raise ValueError(f"No runs found for {_PROVIDER.value}/{_MODEL.value}")
        print(f"Using latest run: {run_dir}")

    if run_dir:
        run_dir = Path(run_dir)
        results_mgr = ResultsManager()
        paths = results_mgr.get_run_paths(run_dir)
        if not input_data_path:
            input_data_path = str(paths["inputs"])
        if not response_data_path:
            response_data_path = str(paths["responses"])
        if not output_dir:
            output_dir = str(paths["eval_strict"].parent)  # evaluations/ directory

    if not input_data_path or not response_data_path or not output_dir:
        raise app.UsageError(
            "Must specify either --input_data, --input_response_data, --output_dir "
            "OR --run_dir OR --provider and --model"
        )
    return input_data_path, response_data_path, output_dir, run_dir


def _run_evaluations(inputs, p2r, output_dir, run_dir):
    """Run the strict and loose passes, writing per-prompt results to disk.

    Args:
        inputs: Prompt inputs from EL.read_prompt_list.
        p2r: Prompt-to-response mapping from EL.read_prompt_to_response_dict.
        output_dir: Directory receiving the per-prompt .jsonl result files.
        run_dir: Run directory Path, or None; controls output filenames.

    Returns:
        Dict mapping "strict"/"loose" to the metrics from EL.print_report.
    """
    all_metrics = {}
    for fn, name in [
        (EL.test_instruction_following_strict, "eval_results_strict"),
        (EL.test_instruction_following_loose, "eval_results_loose"),
    ]:
        outs = [fn(inp, p2r) for inp in inputs]
        metric_key = "strict" if "strict" in name else "loose"
        # Organized run dirs use the short standard names (strict.jsonl /
        # loose.jsonl); ad-hoc runs keep the descriptive eval_results_* names.
        out_name = f"{metric_key}.jsonl" if run_dir else f"{name}.jsonl"
        out_path = os.path.join(output_dir, out_name)
        EL.write_outputs(out_path, outs)
        print("=" * 64)
        print(f"{out_path} Accuracy Scores:")
        all_metrics[metric_key] = EL.print_report(outs)
    return all_metrics


def _save_run_metrics(run_dir, all_metrics):
    """Persist metrics.json and refresh summary.json inside ``run_dir``.

    Args:
        run_dir: Path of the organized run directory.
        all_metrics: Dict with "strict"/"loose" metric dicts, as produced
            by _run_evaluations; missing keys are treated as empty.
    """
    metrics_path = run_dir / "evaluations" / "metrics.json"
    # evaluations/ may not exist yet when an explicit --output_dir pointed
    # the per-prompt results elsewhere — create it before writing.
    metrics_path.parent.mkdir(parents=True, exist_ok=True)
    with open(metrics_path, "w") as f:
        json.dump(
            {
                "strict": all_metrics.get("strict", {}),
                "loose": all_metrics.get("loose", {}),
                "evaluated_at": datetime.now().isoformat(),
            },
            f,
            indent=2,
        )

    # Update the run summary only if one was previously written.
    summary_path = run_dir / "summary.json"
    if summary_path.exists():
        with open(summary_path) as f:
            summary = json.load(f)
        # Guard against summaries written without a "metrics" section.
        metrics = summary.setdefault("metrics", {})
        metrics["strict_accuracy"] = all_metrics.get("strict", {}).get("prompt-level", 0.0)
        metrics["loose_accuracy"] = all_metrics.get("loose", {}).get("prompt-level", 0.0)
        metrics["instruction_level_strict"] = all_metrics.get("strict", {}).get(
            "instruction-level", 0.0
        )
        metrics["instruction_level_loose"] = all_metrics.get("loose", {}).get(
            "instruction-level", 0.0
        )
        with open(summary_path, "w") as f:
            json.dump(summary, f, indent=2)
# Script entry point: absl parses the flags defined above, then calls main.
if __name__ == "__main__":
    app.run(main)