-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathalgorithms.py
More file actions
174 lines (151 loc) · 6.96 KB
/
algorithms.py
File metadata and controls
174 lines (151 loc) · 6.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import hist
from hist.intervals import ratio_uncertainty
import numpy as np
# NOTE: if scaling is applied, the cuts are computed on the offline pT scale;
# inverse scaling will be applied in the regressor to map back to online pT.
def iterative_bin_cutter(ref, obj, glob):
    """Derive per-pT-bin score cuts so that *obj*'s rate matches *ref*'s rate.

    Works from the highest pT bin down to the lowest. In each bin it selects
    the per-event maximum score, finds the score quantile that keeps the
    fraction of events needed to reproduce the reference (cumulative) rate,
    then removes the events that already fired a higher-bin cut before
    processing the next (lower) bin.

    Parameters
    ----------
    ref : object with ``makeRate``, ``rate``, ``name``, ``TotEvents``
        Reference trigger whose rate profile is the target.
    obj : object with ``rdf`` (ROOT RDataFrame), ``score_branch``,
        ``pt_branch``, ``name``, ``TotEvents``
        Candidate trigger object whose score is being cut.
    glob : object with ``pt_bins`` (ascending bin edges) and ``maxRate``
        Global configuration.

    Returns
    -------
    (cuts, cuts_err, new_rate) : tuple of np.ndarray
        Score thresholds, their symmetrized uncertainties, and the achieved
        cumulative rates, ordered from the lowest to the highest pT bin.
        A cut of ``-inf`` means "no cut applied" in that bin.
    """
    ref_h = ref.makeRate(glob.pt_bins, glob.maxRate)
    cuts = []
    cuts_err = []
    new_rate = []

    # ---- Last (overflow) pT bin: pT >= pt_bins[-1] -------------------------
    print(f"Processing pt {len(glob.pt_bins)-1}: >= {glob.pt_bins[-1]} GeV (Obj: {obj.name}, Ref: {ref.name})")
    # Per-event max score among objects in the overflow bin.
    # Computed unconditionally (fix: the original skipped this when the
    # reference rate was zero, leaving `new_rate` empty and `scores`
    # undefined, which crashed in the loop below).
    scores = (
        obj.rdf.Define(
            "scores", f"{obj.score_branch}[{obj.pt_branch} >= {glob.pt_bins[-1]}]"
        )
        .Filter("scores.size() > 0")
        .Define("max_score", "Max(scores)")
    ).AsNumpy(["max_score"])["max_score"]
    # Uncut rate of this bin (events surviving * rate per event).
    rate_bin = len(scores) * (glob.maxRate / obj.TotEvents)
    nEvents_ref = ref_h[hist.loc(glob.pt_bins[-1])].value
    if ref.rate[-1] == 0.0:
        print(
            f"Warning: target rate in {len(glob.pt_bins)-1} ({glob.pt_bins[-1]} GeV) is zero (probably due to low stat). No cut will be applied."
        )
        cuts.append(-np.inf)
        cuts_err.append(0.0)
    elif len(scores) == 0:
        # Fix: this emptiness check must precede the division below —
        # with no events rate_bin is 0 and `f` would divide by zero.
        print(
            f"Warning: no events in the last pt bin {glob.pt_bins[-1]} GeV. No cut will be applied."
        )
        cuts.append(-np.inf)
        cuts_err.append(0.0)
    elif ref.rate[-1] / rate_bin > 1.0:
        # Target rate already above the uncut rate: nothing to cut away.
        print(
            f"Warning: target rate in bin {len(glob.pt_bins)-1} ({glob.pt_bins[-1]} GeV) is higher than current rate. No cut will be applied."
        )
        cuts.append(-np.inf)
        cuts_err.append(0.0)
    else:
        # Fraction of events to keep in the last pt bin.
        f = ref.rate[-1] / rate_bin
        # Poisson uncertainty on f (ref counts / obj counts, rescaled to
        # the ratio of total-event normalizations).
        f_err = ratio_uncertainty(np.array([nEvents_ref]), np.array([len(scores)]), uncertainty_type="poisson-ratio") * (obj.TotEvents / ref.TotEvents)
        f_err_up = f_err[1][0]
        f_err_down = f_err[0][0]
        # Cut = score quantile keeping the top fraction f of events.
        q = np.quantile(scores, 1 - f)
        # NOTE(review): the error band is evaluated at quantiles
        # 1 - f_err_up / 1 - f_err_down rather than 1 - (f ± err);
        # confirm this is the intended definition.
        q_down = np.quantile(scores, 1 - min(f_err_up, 1.0))
        q_up = np.quantile(scores, 1 - max(f_err_down, 0.0))
        q_err = (q_up - q_down) / 2.0
        cuts.append(q)
        cuts_err.append(float(q_err))
    # Achieved rate after the cut; for a no-cut (-inf) bin nan_to_num maps the
    # threshold to -9999 so every event counts and the full bin rate is kept.
    new_rate.append(len(scores[scores >= np.nan_to_num(cuts[-1], neginf=-9999)]) * (glob.maxRate / obj.TotEvents))
    print(f"\tCut found : {cuts[-1]} +- {cuts_err[-1]}\n", flush=True)

    # Apply the last-bin cut to the dataframe used for the lower bins.
    if cuts[-1] != -np.inf:
        cut_mask = f"({obj.pt_branch} >= {glob.pt_bins[-1]} && {obj.score_branch} >= {cuts[-1]}) || ({obj.pt_branch} < {glob.pt_bins[-1]})"
        rdf = (
            obj.rdf.Define("cut_mask", cut_mask)
            .Redefine(
                obj.score_branch,
                f"{obj.score_branch}[cut_mask]",
            )
            .Redefine(
                obj.pt_branch,
                f"{obj.pt_branch}[cut_mask]",
            )
            .Filter(f"{obj.pt_branch}.size() > 0")
        )
    else:
        rdf = obj.rdf.Define(
            "cut_mask", f"ROOT::VecOps::RVec<bool>({obj.pt_branch}.size(), true)"
        )  # keep all events

    # ---- Remaining bins, from highest to lowest ----------------------------
    for i in range(len(glob.pt_bins) - 2, -1, -1):
        print(
            f"Processing pt bin {i}: {glob.pt_bins[i]} - {glob.pt_bins[i + 1]} GeV (Obj: {obj.name}, Ref: {ref.name})"
        )
        # Per-event max score in the current bin, discarding events that
        # already triggered one of the previously derived (higher-bin) cuts.
        scores = (
            # Discard events which already triggered previous cuts
            rdf.Filter(f"Max({obj.pt_branch}) < {glob.pt_bins[i + 1]}")
            # select objs in current pt bin
            .Redefine(
                obj.score_branch,
                f"{obj.score_branch}[{obj.pt_branch} >= {glob.pt_bins[i]} && {obj.pt_branch} < {glob.pt_bins[i + 1]}]",
            )
            .Redefine(
                obj.pt_branch,
                f"{obj.pt_branch}[{obj.pt_branch} >= {glob.pt_bins[i]} && {obj.pt_branch} < {glob.pt_bins[i + 1]}]",
            )
            .Filter(f"{obj.pt_branch}.size() > 0")
            # Select max score
            .Define("max_score", f"Max({obj.score_branch})")
            .AsNumpy(["max_score"])["max_score"]
        )
        # Cumulative uncut rate: this bin's events plus the rate already
        # accumulated from the higher bins.
        rate_bin = len(scores) * (glob.maxRate / obj.TotEvents) + new_rate[-1]
        nEvents_ref = ref_h[hist.loc(glob.pt_bins[i])].value
        if len(scores) == 0:
            # Fix: checked before computing f — (rate_bin - new_rate[-1])
            # is zero here and the original divided by it first.
            print(
                f"Warning: no events in the current pt bin {i} ({glob.pt_bins[i]} GeV). No cut will be applied."
            )
            cuts.append(-np.inf)
            cuts_err.append(0.0)
        else:
            # Fraction of this bin's events needed to match the increment of
            # the reference cumulative rate.
            f = (ref.rate[i] - ref.rate[i + 1]) / (rate_bin - new_rate[-1])
            if f > 1.0:
                print(
                    f"Warning: target rate in bin {i} ({glob.pt_bins[i]} GeV) is higher than current rate. No cut will be applied."
                )
                cuts.append(-np.inf)
                cuts_err.append(0.0)
            elif f < 0.0:
                # Reference cumulative rate decreases here (low stats).
                print(
                    f"Warning: target rate in bin {i} ({glob.pt_bins[i]} GeV) is lower than previous rate. No cut will be applied."
                )
                cuts.append(-np.inf)
                cuts_err.append(0.0)
            else:
                # find cut to achieve target rate in the current pt bin
                f_err = ratio_uncertainty(np.array([nEvents_ref]), np.array([len(scores)]), uncertainty_type="poisson-ratio") * (obj.TotEvents / ref.TotEvents)
                f_err_up = f_err[1][0]
                f_err_down = f_err[0][0]
                q = np.quantile(scores, 1 - f)
                # NOTE(review): same quantile-band definition as above —
                # confirm 1 - f_err is intended rather than 1 - (f ± err).
                q_down = np.quantile(scores, 1 - min(f_err_up, 1.0))
                q_up = np.quantile(scores, 1 - max(f_err_down, 0.0))
                q_err = (q_up - q_down) / 2.0
                cuts.append(q)
                cuts_err.append(float(q_err))
        # Cumulative achieved rate after this bin's cut (all events for -inf).
        new_rate.append(len(scores[scores >= np.nan_to_num(cuts[-1], neginf=-9999)]) * (glob.maxRate / obj.TotEvents) + new_rate[-1])
        if (
            cuts[-1] != -np.inf and i > 0
        ):  # it's useless to apply cut in the first bin (last iteration)
            cut_mask = f"({obj.pt_branch} >= {glob.pt_bins[i]} && {obj.pt_branch} < {glob.pt_bins[i + 1]} && {obj.score_branch} >= {cuts[-1]}) || ({obj.pt_branch} < {glob.pt_bins[i]})"
            rdf = (
                rdf.Redefine("cut_mask", cut_mask)
                .Redefine(
                    obj.score_branch,
                    f"{obj.score_branch}[cut_mask]",
                )
                .Redefine(
                    obj.pt_branch,
                    f"{obj.pt_branch}[cut_mask]",
                )
                .Filter(f"{obj.pt_branch}.size() > 0")
            )
        print(f"\tCut found : {cuts[-1]} +- {cuts_err[-1]}\n", flush=True)

    # Results were accumulated high-bin-first; reverse to ascending pT order.
    cuts = np.array(cuts[::-1])
    cuts_err = np.array(cuts_err[::-1])
    new_rate = np.array(new_rate[::-1])
    return cuts, cuts_err, new_rate