1 change: 1 addition & 0 deletions docs/changes/39.feature.md
@@ -0,0 +1 @@
Add CTAO-specific support for telescope indexing/sorting and geomagnetic angle calculation by introducing an observatory configuration, new geomagnetic field presets, and updated sorting behavior (mirror area first, then size).
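The changelog's geomagnetic piece boils down to one public entry point. A minimal sketch of calling it with one of the new presets, as the function appears in this diff (the pointing angles are illustrative values, not from the PR):

```python
import numpy as np

from eventdisplay_ml.geomag import calculate_geomagnetic_angles

# Sample pointing angles in degrees (illustrative only)
azimuth = np.array([0.0, 90.0, 180.0], dtype=np.float32)
elevation = np.array([70.0, 70.0, 70.0], dtype=np.float32)

# Preset names are matched case-insensitively ("ctao-north" also works);
# unknown names raise a KeyError.
theta_b = calculate_geomagnetic_angles(azimuth, elevation, observatory="CTAO-NORTH")
```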
15 changes: 15 additions & 0 deletions src/eventdisplay_ml/config.py
@@ -79,13 +79,21 @@ def configure_training(analysis_type):
help="Maximum number of CPU cores to use for training.",
default=1,
)
parser.add_argument(
"--observatory",
type=str,
help="Observatory/site name for geomagnetic field (default: VERITAS).",
default="VERITAS",
)

model_configs = vars(parser.parse_args())

_logger.info(f"--- XGBoost {analysis_type} training ---")
_logger.info(f"Observatory: {model_configs.get('observatory')}")
_logger.info(f"Telescope multiplicity: {model_configs.get('n_tel')}")
_logger.info(f"Model output prefix: {model_configs.get('model_prefix')}")
_logger.info(f"Train vs test fraction: {model_configs['train_test_fraction']}")
_logger.info(f"Random state: {model_configs['random_state']}")
_logger.info(f"Max events: {model_configs['max_events']}")
_logger.info(f"Max CPU cores: {model_configs['max_cores']}")

@@ -171,10 +179,17 @@ def configure_apply(analysis_type):
help="Maximum number of CPU cores to use for processing.",
default=1,
)
parser.add_argument(
"--observatory",
type=str,
help="Observatory/site name for geomagnetic field (default: VERITAS).",
default="VERITAS",
)

model_configs = vars(parser.parse_args())

_logger.info(f"--- XGBoost {analysis_type} evaluation ---")
_logger.info(f"Observatory: {model_configs.get('observatory')}")
_logger.info(f"Input file: {model_configs.get('input_file')}")
_logger.info(f"Model prefix: {model_configs.get('model_prefix')}")
_logger.info(f"Output file: {model_configs.get('output_file')}")
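Both `configure_training` and `configure_apply` register the flag identically. A self-contained sketch of how the parsed value reaches the field lookup, matching the normalization in `geomag.py` below (the example value is arbitrary):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--observatory",
    type=str,
    help="Observatory/site name for geomagnetic field (default: VERITAS).",
    default="VERITAS",
)
model_configs = vars(parser.parse_args(["--observatory", "ctao-south"]))

# Downstream callers fall back to "veritas" and geomag.py upper-cases the key,
# so "ctao-south", "CTAO-South", and "CTAO-SOUTH" resolve to the same preset.
observatory = model_configs.get("observatory", "veritas").upper()
assert observatory == "CTAO-SOUTH"
```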
61 changes: 41 additions & 20 deletions src/eventdisplay_ml/data_processing.py
@@ -361,7 +361,7 @@ def _clip_size_array(size_array):


def flatten_telescope_data_vectorized(
df, n_tel, features, analysis_type, training=True, tel_config=None
df, n_tel, features, analysis_type, training=True, tel_config=None, observatory="veritas"
):
"""
Vectorized flattening of telescope array columns.
@@ -394,10 +394,9 @@ def flatten_telescope_data_vectorized(
n_evt = len(df)
max_tel_id = tel_config["max_tel_id"] if tel_config else (n_tel - 1)

# Detect indexing mode and determine which index list to use for remapping
has_r_core = _has_field(df, "R_core")
# Index remapping mode for CTAO-style variable-length indexing
index_list_for_remapping = None # None = telescope-ID-indexed (VERITAS R_core mode)
if has_r_core:
if observatory.lower() == "veritas":
_logger.info("Detected VERITAS R_core fixed telescope-ID indexing mode.")
else:
_logger.info("Detected CTAO ImgSel_list variable-length indexing mode.")
@@ -474,7 +473,8 @@ def flatten_telescope_data_vectorized(
index = _get_index(df, n_evt)
df_flat = flatten_telescope_variables(n_tel, flat_features, index, tel_config)
return pd.concat(
[df_flat, extra_columns(df, analysis_type, training, index, tel_config)], axis=1
[df_flat, extra_columns(df, analysis_type, training, index, tel_config, observatory)],
axis=1,
)


@@ -515,8 +515,9 @@ def _to_dense_array(col):

def _get_core_arrays(df):
"""Extract core position arrays from DataFrame."""
core_x = _to_numpy_1d(df["Xcore"], np.float32)
core_y = _to_numpy_1d(df["Ycore"], np.float32)
# Make copies to ensure arrays are writable (some sources return read-only views)
core_x = _to_numpy_1d(df["Xcore"], np.float32).copy()
core_y = _to_numpy_1d(df["Ycore"], np.float32).copy()
# Filter out sentinel values and apply physical bounds
# shower cores beyond +-10 km are cut
core_x[(core_x <= -90000) | (np.abs(core_x) > 10000)] = np.nan
@@ -543,20 +544,35 @@ def _compute_size_area_sort_indices(size_data, active_mask, tel_config, max_tel_
if tid <= max_tel_id:
mirror_lookup[int(tid)] = float(area)

# Build dense size matrix aligned by telescope ID
sizes = np.full((n_evt, max_tel_id + 1), np.nan, dtype=np.float32)
col_cap = min(size_data.shape[1], max_tel_id + 1)
sizes[:, :col_cap] = size_data[:, :col_cap]
# size_data is already in tel_id space (shape: n_evt x (max_tel_id + 1))
sizes = size_data

# Mask telescopes that are not active in this event
sizes = np.where(active_mask, sizes, np.nan)
# Sort per-event: primary = mirror area (desc), secondary = size (desc)
# Area has highest priority ALWAYS, even if size is NaN.
# NaN mirror areas go to the very end; within equal area groups,
# valid sizes come before NaN sizes, and then by size descending.
sort_indices = np.zeros((n_evt, max_tel_id + 1), dtype=int)

# Prepare sort keys: primary = mirror area (desc), secondary = size (desc)
area_key = np.where(np.isnan(mirror_lookup), np.inf, -mirror_lookup)
primary_matrix = np.broadcast_to(area_key, (n_evt, max_tel_id + 1))
size_key = np.where(np.isnan(sizes), np.inf, -sizes)
for evt_idx in range(n_evt):
tel_entries = []
for tel_idx in range(max_tel_id + 1):
area = mirror_lookup[tel_idx]
size_val = sizes[evt_idx, tel_idx]
# Build sort key:
# 1) valid area first (0), NaN area last (1)
# 2) area descending via negative value
# 3) within same area: valid size first (0), NaN last (1)
# 4) size descending via negative value
area_valid = 0 if not np.isnan(area) else 1
size_valid = 0 if not np.isnan(size_val) else 1
area_key = -area if area_valid == 0 else 0.0
size_key = -size_val if size_valid == 0 else 0.0
tel_entries.append((tel_idx, area_valid, area_key, size_valid, size_key))

tel_entries.sort(key=lambda x: (x[1], x[2], x[3], x[4]))
sort_indices[evt_idx] = np.array([t[0] for t in tel_entries])

return np.lexsort((size_key, primary_matrix), axis=1)
return sort_indices


def _to_numpy_1d(x, dtype=np.float32):
@@ -590,7 +606,9 @@ def _get_index(df_like, n):
return pd.RangeIndex(n)


def flatten_feature_data(group_df, ntel, analysis_type, training, tel_config=None):
def flatten_feature_data(
group_df, ntel, analysis_type, training, tel_config=None, observatory="veritas"
):
"""
Get flattened features for events.

@@ -616,6 +634,7 @@ def flatten_feature_data(group_df, ntel, analysis_type, training, tel_config=Non
analysis_type=analysis_type,
training=training,
tel_config=tel_config,
observatory=observatory,
)
max_tel_id = tel_config["max_tel_id"] if tel_config else ntel - 1
excluded_columns = set(features_module.target_features(analysis_type)) | set(
@@ -719,6 +738,7 @@ def load_training_data(model_configs, file_list, analysis_type):
analysis_type,
training=True,
tel_config=tel_config,
observatory=model_configs.get("observatory", "veritas"),
)
if analysis_type == "stereo_analysis":
df_flat["MCxoff"] = _to_numpy_1d(df["MCxoff"], np.float32)
@@ -936,7 +956,7 @@ def _calculate_array_footprint(tel_config, elev_rad, azim_rad, tel_list_matrix):
return footprints


def extra_columns(df, analysis_type, training, index, tel_config=None):
def extra_columns(df, analysis_type, training, index, tel_config=None, observatory="veritas"):
"""Add extra columns required for analysis type."""
if analysis_type == "stereo_analysis":
n = len(index)
@@ -969,6 +989,7 @@ def extra_columns(df, analysis_type, training, index, tel_config=None):
"Geomagnetic_Angle": calculate_geomagnetic_angles(
_to_numpy_1d(df["ArrayPointing_Azimuth"], np.float32),
_to_numpy_1d(df["ArrayPointing_Elevation"], np.float32),
observatory=observatory,
),
}
# Add array footprint if telescope configuration is available
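The per-event loop in `_compute_size_area_sort_indices` replaces the previous `np.lexsort` call. For reference only (not part of this PR), a vectorized sketch that produces the same four-level ordering, since both Python's `sort()` and `np.lexsort` are stable:

```python
import numpy as np

def sort_indices_vectorized(mirror_lookup, sizes):
    """Vectorized equivalent of the per-event sort loop (a sketch).

    mirror_lookup : (n_tel,) mirror areas indexed by telescope ID, NaN if unknown.
    sizes : (n_evt, n_tel) per-event sizes, NaN for inactive telescopes.
    """
    shape = sizes.shape
    area_nan = np.broadcast_to(np.isnan(mirror_lookup), shape)  # NaN areas sort last
    area_key = np.broadcast_to(
        np.where(np.isnan(mirror_lookup), 0.0, -mirror_lookup), shape
    )  # area descending
    size_nan = np.isnan(sizes)  # NaN sizes last within an area group
    size_key = np.where(size_nan, 0.0, -sizes)  # size descending
    # np.lexsort treats the LAST key as primary, so pass keys in reverse priority
    return np.lexsort((size_key, size_nan, area_key, area_nan), axis=1)
```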
2 changes: 2 additions & 0 deletions src/eventdisplay_ml/features.py
@@ -115,6 +115,7 @@ def _regression_features(training):
*telescope_features("stereo_analysis"),
"DispNImages",
"DispTelList_T",
"ImgSel_list",
"Xoff",
"Yoff",
"Xoff_intersect",
@@ -138,6 +139,7 @@ def _classification_features():
var_array = [
"DispNImages",
"DispTelList_T",
"ImgSel_list",
"EChi2S",
"EmissionHeight",
"EmissionHeightChi2",
30 changes: 21 additions & 9 deletions src/eventdisplay_ml/geomag.py
@@ -10,11 +10,21 @@
"BX": 25.2e-6, # Tesla
"BY": 0.0, # Tesla
"BZ": 40.88e-6, # Tesla
}
},
"CTAO-NORTH": {
"BX": 30.909e-6, # Tesla
"BY": 0.0, # Tesla
"BZ": 23.409e-6, # Tesla
},
"CTAO-SOUTH": {
"BX": 20.552e-6, # Tesla
"BY": 0.0, # Tesla
"BZ": -9.367 - 6, # Tesla
},
}


def calculate_geomagnetic_angles(azimuth, elevation, site="VERITAS"):
def calculate_geomagnetic_angles(azimuth, elevation, observatory="VERITAS"):
"""
Calculate the angle between the shower direction and the geomagnetic field.

@@ -24,21 +34,23 @@ def calculate_geomagnetic_angles(azimuth, elevation, site="VERITAS"):
Azimuth angles of the showers in degrees.
elevation : array-like
Elevation angles of the showers in degrees.
site : str
Site identifier to get geomagnetic field components.
observatory : str
Observatory identifier to get geomagnetic field components.

Returns
-------
theta_B : array-like
Angle between shower direction and geomagnetic field in degrees.
"""
observatory = observatory.upper()
try:
bx = FIELD_COMPONENTS[site]["BX"]
by = FIELD_COMPONENTS[site]["BY"]
bz = FIELD_COMPONENTS[site]["BZ"]
bx = FIELD_COMPONENTS[observatory]["BX"]
by = FIELD_COMPONENTS[observatory]["BY"]
bz = FIELD_COMPONENTS[observatory]["BZ"]
except KeyError as exc:
raise KeyError(f"Geomagnetic field components for site '{site}' are not defined.") from exc

raise KeyError(
f"Geomagnetic field components for observatory '{observatory}' are not defined."
) from exc
# Shower direction unit vector
sx = np.cos(np.radians(elevation)) * np.cos(np.radians(azimuth)) # North
sy = np.cos(np.radians(elevation)) * np.sin(np.radians(azimuth)) # East
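The diff truncates before the angle itself is computed. A hedged completion consistent with the commented North/East axes and the sign of the `BZ` presets, assuming a north-east-down convention (the hidden lines may differ):

```python
import numpy as np

def _theta_b_sketch(azimuth, elevation, bx, by, bz):
    # Shower direction unit vector (degrees in, NED axes assumed)
    sx = np.cos(np.radians(elevation)) * np.cos(np.radians(azimuth))  # North
    sy = np.cos(np.radians(elevation)) * np.sin(np.radians(azimuth))  # East
    sz = -np.sin(np.radians(elevation))  # Down; sign convention is an assumption
    # Angle between the unit shower direction and the field vector
    cos_theta = (sx * bx + sy * by + sz * bz) / np.sqrt(bx**2 + by**2 + bz**2)
    return np.degrees(np.arccos(np.clip(cos_theta, -1.0, 1.0)))
```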
4 changes: 2 additions & 2 deletions src/eventdisplay_ml/hyper_parameters.py
@@ -88,8 +88,8 @@ def _load_hyper_parameters_from_file(config_file):


def pre_cuts_regression(n_tel):
"""Get pre-cuts for regression analysis (no multiplicity filter)."""
event_cut = ""
"""Get pre-cuts for regression analysis."""
event_cut = "DispNImages >=2"
if PRE_CUTS_REGRESSION:
event_cut = " & ".join(f"({c})" for c in PRE_CUTS_REGRESSION)
_logger.info(f"Pre-cuts (regression): {event_cut if event_cut else 'None'}")
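Note how the new default interacts with the module-level cut list: when `PRE_CUTS_REGRESSION` is non-empty, the joined cuts replace the `DispNImages >=2` default rather than extend it. A sketch with hypothetical cut entries:

```python
# Hypothetical entries; the real PRE_CUTS_REGRESSION lives in hyper_parameters.py
PRE_CUTS_REGRESSION = ["size_1 > 0", "EmissionHeight > 0"]

event_cut = "DispNImages >=2"  # new default from this PR
if PRE_CUTS_REGRESSION:
    # Joined cuts overwrite the default entirely
    event_cut = " & ".join(f"({c})" for c in PRE_CUTS_REGRESSION)

print(event_cut)  # (size_1 > 0) & (EmissionHeight > 0)
```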
14 changes: 12 additions & 2 deletions src/eventdisplay_ml/models.py
@@ -246,7 +246,12 @@ def apply_regression_models(df, model_configs):
n_tel = tel_config["max_tel_id"] + 1 if tel_config else 4

flatten_data = flatten_feature_data(
df, n_tel, analysis_type="stereo_analysis", training=False, tel_config=tel_config
df,
n_tel,
analysis_type="stereo_analysis",
training=False,
tel_config=tel_config,
observatory=model_configs.get("observatory", "veritas"),
)

models = model_configs["models"]
@@ -302,7 +307,12 @@ def apply_classification_models(df, model_configs, threshold_keys):
_logger.info(f"Processing {len(group_df)} events with bin={e_bin}")

flatten_data = flatten_feature_data(
group_df, n_tel, analysis_type="classification", training=False, tel_config=tel_config
group_df,
n_tel,
analysis_type="classification",
training=False,
tel_config=tel_config,
observatory=model_configs.get("observatory", "veritas"),
)
model = models[e_bin]["model"]
flatten_data = flatten_data.reindex(columns=models[e_bin]["features"])
7 changes: 4 additions & 3 deletions tests/test_imgsel_debug.py
@@ -65,11 +65,12 @@ def test_imgsel_sorting_and_alignment():
analysis_type="stereo_analysis",
training=True,
tel_config=tel_config,
observatory="ctao-north",
)

# Expected order by size desc (mirror areas equal): tel 1 then tel 3
expected_size_log = [np.log10(3725.51), np.log10(2640.01)]
# Disp_T should align with sorted order
# Expected order by size desc (mirror areas equal): tel 1 (3725.51) then tel 3 (2640.01)
expected_size_log = [np.log10(3725.51), np.log10(2640.01)] # Descending order
# Disp_T aligned to sorted order (tel 1 first, then tel 3)
expected_disp = [1.56872, 1.50883]

# Assert mirror_area columns exist and equal 100 for sorted positions
53 changes: 53 additions & 0 deletions tests/test_sort_logic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Unit tests for telescope sorting logic (mirror area first, then size)."""

import numpy as np


def test_sort_logic_equal_area_by_size():
"""Test sorting with equal mirror areas: secondary sort by size descending."""
max_tel_id = 3
mirror_lookup = np.array([np.nan, 100.0, np.nan, 100.0], dtype=np.float32)
sizes_row = np.array([np.nan, 3725.51, np.nan, 2640.01], dtype=np.float32)

# Apply the new sort logic: area priority, then size within equal area
tel_entries = []
for tel_idx in range(max_tel_id + 1):
area = mirror_lookup[tel_idx]
size_val = sizes_row[tel_idx]
area_valid = 0 if not np.isnan(area) else 1
size_valid = 0 if not np.isnan(size_val) else 1
area_key = -area if area_valid == 0 else 0.0
size_key = -size_val if size_valid == 0 else 0.0
tel_entries.append((tel_idx, area_valid, area_key, size_valid, size_key))

tel_entries.sort(key=lambda x: (x[1], x[2], x[3], x[4]))
sort_indices = np.array([t[0] for t in tel_entries])

# Expect: tel 1 (area 100, size 3725.51), tel 3 (area 100, size 2640.01), then NaN areas
assert np.array_equal(sort_indices, np.array([1, 3, 0, 2]))


def test_sort_logic_mixed_areas_and_nan_sizes():
"""Test sorting with mixed mirror areas and some NaN sizes: area priority always."""
max_tel_id = 4
# Areas: tel 0=200, tel 1=100 (NaN size), tel 2=200, tel 3=100, tel 4=NaN
mirror_lookup = np.array([200.0, 100.0, 200.0, 100.0, np.nan], dtype=np.float32)
# Sizes: tel 0=1000, tel 1=NaN, tel 2=500, tel 3=2000, tel 4=3000
sizes_row = np.array([1000.0, np.nan, 500.0, 2000.0, 3000.0], dtype=np.float32)

# Apply sorting
tel_entries = []
for tel_idx in range(max_tel_id + 1):
area = mirror_lookup[tel_idx]
size_val = sizes_row[tel_idx]
area_valid = 0 if not np.isnan(area) else 1
size_valid = 0 if not np.isnan(size_val) else 1
area_key = -area if area_valid == 0 else 0.0
size_key = -size_val if size_valid == 0 else 0.0
tel_entries.append((tel_idx, area_valid, area_key, size_valid, size_key))

tel_entries.sort(key=lambda x: (x[1], x[2], x[3], x[4]))
sort_indices = np.array([t[0] for t in tel_entries])

# Expect: area 200 desc (tel 0 size 1000, tel 2 size 500), then area 100 desc (tel 3 size 2000, tel 1 size NaN), then NaN area (tel 4)
assert np.array_equal(sort_indices, np.array([0, 2, 3, 1, 4]))