diff --git a/docs/changes/39.feature.md b/docs/changes/39.feature.md
new file mode 100644
index 0000000..e2ce4c2
--- /dev/null
+++ b/docs/changes/39.feature.md
@@ -0,0 +1 @@
+Add CTAO-specific support for telescope indexing/sorting and geomagnetic angle calculation by introducing an observatory configuration, new geomagnetic field presets, and updated sorting behavior (mirror area first, then size).
diff --git a/src/eventdisplay_ml/config.py b/src/eventdisplay_ml/config.py
index fbce449..f3c2c2b 100644
--- a/src/eventdisplay_ml/config.py
+++ b/src/eventdisplay_ml/config.py
@@ -79,13 +79,21 @@ def configure_training(analysis_type):
         help="Maximum number of CPU cores to use for training.",
         default=1,
     )
+    parser.add_argument(
+        "--observatory",
+        type=str,
+        help="Observatory/site name for geomagnetic field (default: VERITAS).",
+        default="VERITAS",
+    )
 
     model_configs = vars(parser.parse_args())
 
     _logger.info(f"--- XGBoost {analysis_type} training ---")
+    _logger.info(f"Observatory: {model_configs.get('observatory')}")
     _logger.info(f"Telescope multiplicity: {model_configs.get('n_tel')}")
     _logger.info(f"Model output prefix: {model_configs.get('model_prefix')}")
     _logger.info(f"Train vs test fraction: {model_configs['train_test_fraction']}")
+    _logger.info(f"Random state: {model_configs['random_state']}")
     _logger.info(f"Max events: {model_configs['max_events']}")
     _logger.info(f"Max CPU cores: {model_configs['max_cores']}")
@@ -171,10 +179,17 @@ def configure_apply(analysis_type):
         help="Maximum number of CPU cores to use for processing.",
         default=1,
     )
+    parser.add_argument(
+        "--observatory",
+        type=str,
+        help="Observatory/site name for geomagnetic field (default: VERITAS).",
+        default="VERITAS",
+    )
 
     model_configs = vars(parser.parse_args())
 
     _logger.info(f"--- XGBoost {analysis_type} evaluation ---")
+    _logger.info(f"Observatory: {model_configs.get('observatory')}")
     _logger.info(f"Input file: {model_configs.get('input_file')}")
     _logger.info(f"Model prefix: {model_configs.get('model_prefix')}")
     _logger.info(f"Output file: {model_configs.get('output_file')}")
diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py
index 7ef484f..60a2103 100644
--- a/src/eventdisplay_ml/data_processing.py
+++ b/src/eventdisplay_ml/data_processing.py
@@ -361,7 +361,7 @@ def _clip_size_array(size_array):
 
 
 def flatten_telescope_data_vectorized(
-    df, n_tel, features, analysis_type, training=True, tel_config=None
+    df, n_tel, features, analysis_type, training=True, tel_config=None, observatory="veritas"
 ):
     """
     Vectorized flattening of telescope array columns.
@@ -394,10 +394,9 @@ def flatten_telescope_data_vectorized(
     n_evt = len(df)
     max_tel_id = tel_config["max_tel_id"] if tel_config else (n_tel - 1)
 
-    # Detect indexing mode and determine which index list to use for remapping
-    has_r_core = _has_field(df, "R_core")
+    # Select the index remapping mode from the configured observatory
     index_list_for_remapping = None  # None = telescope-ID-indexed (VERITAS R_core mode)
-    if has_r_core:
+    if observatory.lower() == "veritas":
         _logger.info("Detected VERITAS R_core fixed telescope-ID indexing mode.")
     else:
         _logger.info("Detected CTAO ImgSel_list variable-length indexing mode.")
@@ -474,7 +473,8 @@ def flatten_telescope_data_vectorized(
     index = _get_index(df, n_evt)
     df_flat = flatten_telescope_variables(n_tel, flat_features, index, tel_config)
     return pd.concat(
-        [df_flat, extra_columns(df, analysis_type, training, index, tel_config)], axis=1
+        [df_flat, extra_columns(df, analysis_type, training, index, tel_config, observatory)],
+        axis=1,
     )
@@ -515,8 +515,9 @@ def _to_dense_array(col):
 
 def _get_core_arrays(df):
     """Extract core position arrays from DataFrame."""
-    core_x = _to_numpy_1d(df["Xcore"], np.float32)
-    core_y = _to_numpy_1d(df["Ycore"], np.float32)
+    # Make copies to ensure arrays are writable (some sources return read-only views)
+    core_x = _to_numpy_1d(df["Xcore"], np.float32).copy()
+    core_y = _to_numpy_1d(df["Ycore"], np.float32).copy()
     # Filter out sentinel values and apply physical bounds
     # shower cores beyond +-10 km are cut
     core_x[(core_x <= -90000) | (np.abs(core_x) > 10000)] = np.nan
@@ -543,20 +544,35 @@ def _compute_size_area_sort_indices(size_data, active_mask, tel_config, max_tel_
         if tid <= max_tel_id:
             mirror_lookup[int(tid)] = float(area)
 
-    # Build dense size matrix aligned by telescope ID
-    sizes = np.full((n_evt, max_tel_id + 1), np.nan, dtype=np.float32)
-    col_cap = min(size_data.shape[1], max_tel_id + 1)
-    sizes[:, :col_cap] = size_data[:, :col_cap]
+    # size_data is already in tel_id space (shape: n_evt x (max_tel_id + 1))
+    sizes = size_data
 
-    # Mask telescopes that are not active in this event
-    sizes = np.where(active_mask, sizes, np.nan)
+    # Sort per-event: primary = mirror area (desc), secondary = size (desc).
+    # Area always has the highest priority, even if size is NaN.
+    # NaN mirror areas go to the very end; within equal-area groups,
+    # valid sizes come before NaN sizes, ordered by size descending.
+    sort_indices = np.zeros((n_evt, max_tel_id + 1), dtype=int)
 
-    # Prepare sort keys: primary = mirror area (desc), secondary = size (desc)
-    area_key = np.where(np.isnan(mirror_lookup), np.inf, -mirror_lookup)
-    primary_matrix = np.broadcast_to(area_key, (n_evt, max_tel_id + 1))
-    size_key = np.where(np.isnan(sizes), np.inf, -sizes)
+    for evt_idx in range(n_evt):
+        tel_entries = []
+        for tel_idx in range(max_tel_id + 1):
+            area = mirror_lookup[tel_idx]
+            size_val = sizes[evt_idx, tel_idx]
+            # Build sort key:
+            # 1) valid area first (0), NaN area last (1)
+            # 2) area descending via negative value
+            # 3) within same area: valid size first (0), NaN last (1)
+            # 4) size descending via negative value
+            area_valid = 0 if not np.isnan(area) else 1
+            size_valid = 0 if not np.isnan(size_val) else 1
+            area_key = -area if area_valid == 0 else 0.0
+            size_key = -size_val if size_valid == 0 else 0.0
+            tel_entries.append((tel_idx, area_valid, area_key, size_valid, size_key))
+
+        tel_entries.sort(key=lambda x: (x[1], x[2], x[3], x[4]))
+        sort_indices[evt_idx] = np.array([t[0] for t in tel_entries])
 
-    return np.lexsort((size_key, primary_matrix), axis=1)
+    return sort_indices
 
 
 def _to_numpy_1d(x, dtype=np.float32):
@@ -590,7 +606,9 @@ def _get_index(df_like, n):
     return pd.RangeIndex(n)
 
 
-def flatten_feature_data(group_df, ntel, analysis_type, training, tel_config=None):
+def flatten_feature_data(
+    group_df, ntel, analysis_type, training, tel_config=None, observatory="veritas"
+):
     """
     Get flattened features for events.
@@ -616,6 +634,7 @@ def flatten_feature_data(group_df, ntel, analysis_type, training, tel_config=Non
         analysis_type=analysis_type,
         training=training,
         tel_config=tel_config,
+        observatory=observatory,
     )
     max_tel_id = tel_config["max_tel_id"] if tel_config else ntel - 1
     excluded_columns = set(features_module.target_features(analysis_type)) | set(
@@ -719,6 +738,7 @@ def load_training_data(model_configs, file_list, analysis_type):
         analysis_type,
         training=True,
         tel_config=tel_config,
+        observatory=model_configs.get("observatory", "veritas"),
     )
     if analysis_type == "stereo_analysis":
         df_flat["MCxoff"] = _to_numpy_1d(df["MCxoff"], np.float32)
@@ -936,7 +956,7 @@ def _calculate_array_footprint(tel_config, elev_rad, azim_rad, tel_list_matrix):
     return footprints
 
 
-def extra_columns(df, analysis_type, training, index, tel_config=None):
+def extra_columns(df, analysis_type, training, index, tel_config=None, observatory="veritas"):
     """Add extra columns required for analysis type."""
     if analysis_type == "stereo_analysis":
         n = len(index)
@@ -969,6 +989,7 @@ def extra_columns(df, analysis_type, training, index, tel_config=None):
         "Geomagnetic_Angle": calculate_geomagnetic_angles(
             _to_numpy_1d(df["ArrayPointing_Azimuth"], np.float32),
             _to_numpy_1d(df["ArrayPointing_Elevation"], np.float32),
+            observatory=observatory,
         ),
     }
     # Add array footprint if telescope configuration is available
diff --git a/src/eventdisplay_ml/features.py b/src/eventdisplay_ml/features.py
index e41cb20..6cbe161 100644
--- a/src/eventdisplay_ml/features.py
+++ b/src/eventdisplay_ml/features.py
@@ -115,6 +115,7 @@ def _regression_features(training):
         *telescope_features("stereo_analysis"),
         "DispNImages",
         "DispTelList_T",
+        "ImgSel_list",
         "Xoff",
         "Yoff",
         "Xoff_intersect",
@@ -138,6 +139,7 @@ def _classification_features():
     var_array = [
         "DispNImages",
         "DispTelList_T",
+        "ImgSel_list",
         "EChi2S",
         "EmissionHeight",
         "EmissionHeightChi2",
diff --git a/src/eventdisplay_ml/geomag.py b/src/eventdisplay_ml/geomag.py
index 2d186dd..f5e9034 100644
--- a/src/eventdisplay_ml/geomag.py
+++ b/src/eventdisplay_ml/geomag.py
@@ -10,11 +10,21 @@
         "BX": 25.2e-6,  # Tesla
         "BY": 0.0,  # Tesla
         "BZ": 40.88e-6,  # Tesla
-    }
+    },
+    "CTAO-NORTH": {
+        "BX": 30.909e-6,  # Tesla
+        "BY": 0.0,  # Tesla
+        "BZ": 23.409e-6,  # Tesla
+    },
+    "CTAO-SOUTH": {
+        "BX": 20.552e-6,  # Tesla
+        "BY": 0.0,  # Tesla
+        "BZ": -9.367e-6,  # Tesla
+    },
 }
 
 
-def calculate_geomagnetic_angles(azimuth, elevation, site="VERITAS"):
+def calculate_geomagnetic_angles(azimuth, elevation, observatory="VERITAS"):
     """
     Calculate the angle between the shower direction and the geomagnetic field.
@@ -24,21 +34,23 @@
         Azimuth angles of the showers in degrees.
     elevation : array-like
         Elevation angles of the showers in degrees.
-    site : str
-        Site identifier to get geomagnetic field components.
+    observatory : str
+        Observatory identifier to get geomagnetic field components.
 
     Returns
     -------
     theta_B : array-like
         Angle between shower direction and geomagnetic field in degrees.
     """
+    observatory = observatory.upper()
     try:
-        bx = FIELD_COMPONENTS[site]["BX"]
-        by = FIELD_COMPONENTS[site]["BY"]
-        bz = FIELD_COMPONENTS[site]["BZ"]
+        bx = FIELD_COMPONENTS[observatory]["BX"]
+        by = FIELD_COMPONENTS[observatory]["BY"]
+        bz = FIELD_COMPONENTS[observatory]["BZ"]
     except KeyError as exc:
-        raise KeyError(f"Geomagnetic field components for site '{site}' are not defined.") from exc
-
+        raise KeyError(
+            f"Geomagnetic field components for observatory '{observatory}' are not defined."
+        ) from exc
     # Shower direction unit vector
     sx = np.cos(np.radians(elevation)) * np.cos(np.radians(azimuth))  # North
     sy = np.cos(np.radians(elevation)) * np.sin(np.radians(azimuth))  # East
diff --git a/src/eventdisplay_ml/hyper_parameters.py b/src/eventdisplay_ml/hyper_parameters.py
index 2f2fc91..f8568fb 100644
--- a/src/eventdisplay_ml/hyper_parameters.py
+++ b/src/eventdisplay_ml/hyper_parameters.py
@@ -88,8 +88,8 @@ def _load_hyper_parameters_from_file(config_file):
 
 
 def pre_cuts_regression(n_tel):
-    """Get pre-cuts for regression analysis (no multiplicity filter)."""
-    event_cut = ""
+    """Get pre-cuts for regression analysis."""
+    event_cut = "DispNImages >= 2"
     if PRE_CUTS_REGRESSION:
         event_cut = " & ".join(f"({c})" for c in PRE_CUTS_REGRESSION)
     _logger.info(f"Pre-cuts (regression): {event_cut if event_cut else 'None'}")
diff --git a/src/eventdisplay_ml/models.py b/src/eventdisplay_ml/models.py
index 56fda0e..c01de4c 100644
--- a/src/eventdisplay_ml/models.py
+++ b/src/eventdisplay_ml/models.py
@@ -246,7 +246,12 @@ def apply_regression_models(df, model_configs):
     n_tel = tel_config["max_tel_id"] + 1 if tel_config else 4
 
     flatten_data = flatten_feature_data(
-        df, n_tel, analysis_type="stereo_analysis", training=False, tel_config=tel_config
+        df,
+        n_tel,
+        analysis_type="stereo_analysis",
+        training=False,
+        tel_config=tel_config,
+        observatory=model_configs.get("observatory", "veritas"),
     )
 
     models = model_configs["models"]
@@ -302,7 +307,12 @@ def apply_classification_models(df, model_configs, threshold_keys):
         _logger.info(f"Processing {len(group_df)} events with bin={e_bin}")
 
         flatten_data = flatten_feature_data(
-            group_df, n_tel, analysis_type="classification", training=False, tel_config=tel_config
+            group_df,
+            n_tel,
+            analysis_type="classification",
+            training=False,
+            tel_config=tel_config,
+            observatory=model_configs.get("observatory", "veritas"),
         )
         model = models[e_bin]["model"]
         flatten_data = flatten_data.reindex(columns=models[e_bin]["features"])
diff --git a/tests/test_imgsel_debug.py b/tests/test_imgsel_debug.py
index 6e9735d..59e4b36 100644
--- a/tests/test_imgsel_debug.py
+++ b/tests/test_imgsel_debug.py
@@ -65,11 +65,12 @@ def test_imgsel_sorting_and_alignment():
         analysis_type="stereo_analysis",
         training=True,
         tel_config=tel_config,
+        observatory="ctao-north",
    )
 
-    # Expected order by size desc (mirror areas equal): tel 1 then tel 3
-    expected_size_log = [np.log10(3725.51), np.log10(2640.01)]
-    # Disp_T should align with sorted order
+    # Expected order by size desc (mirror areas equal): tel 1 (3725.51) then tel 3 (2640.01)
+    expected_size_log = [np.log10(3725.51), np.log10(2640.01)]  # Descending order
+    # Disp_T aligned to sorted order (tel 1 first, then tel 3)
     expected_disp = [1.56872, 1.50883]
 
     # Assert mirror_area columns exist and equal 100 for sorted positions
diff --git a/tests/test_sort_logic.py b/tests/test_sort_logic.py
new file mode 100644
index 0000000..023d608
--- /dev/null
+++ b/tests/test_sort_logic.py
@@ -0,0 +1,53 @@
+"""Unit tests for telescope sorting logic (mirror area first, then size)."""
+
+import numpy as np
+
+
+def test_sort_logic_equal_area_by_size():
+    """Test sorting with equal mirror areas: secondary sort by size descending."""
+    max_tel_id = 3
+    mirror_lookup = np.array([np.nan, 100.0, np.nan, 100.0], dtype=np.float32)
+    sizes_row = np.array([np.nan, 3725.51, np.nan, 2640.01], dtype=np.float32)
+
+    # Apply the new sort logic: area priority, then size within equal area
+    tel_entries = []
+    for tel_idx in range(max_tel_id + 1):
+        area = mirror_lookup[tel_idx]
+        size_val = sizes_row[tel_idx]
+        area_valid = 0 if not np.isnan(area) else 1
+        size_valid = 0 if not np.isnan(size_val) else 1
+        area_key = -area if area_valid == 0 else 0.0
+        size_key = -size_val if size_valid == 0 else 0.0
+        tel_entries.append((tel_idx, area_valid, area_key, size_valid, size_key))
+
+    tel_entries.sort(key=lambda x: (x[1], x[2], x[3], x[4]))
+    sort_indices = np.array([t[0] for t in tel_entries])
+
+    # Expect: tel 1 (area 100, size 3725.51), tel 3 (area 100, size 2640.01), then NaN areas
+    assert np.array_equal(sort_indices, np.array([1, 3, 0, 2]))
+
+
+def test_sort_logic_mixed_areas_and_nan_sizes():
+    """Test sorting with mixed mirror areas and some NaN sizes: area priority always."""
+    max_tel_id = 4
+    # Areas: tel 0=200, tel 1=100 (NaN size), tel 2=200, tel 3=100, tel 4=NaN
+    mirror_lookup = np.array([200.0, 100.0, 200.0, 100.0, np.nan], dtype=np.float32)
+    # Sizes: tel 0=1000, tel 1=NaN, tel 2=500, tel 3=2000, tel 4=3000
+    sizes_row = np.array([1000.0, np.nan, 500.0, 2000.0, 3000.0], dtype=np.float32)
+
+    # Apply sorting
+    tel_entries = []
+    for tel_idx in range(max_tel_id + 1):
+        area = mirror_lookup[tel_idx]
+        size_val = sizes_row[tel_idx]
+        area_valid = 0 if not np.isnan(area) else 1
+        size_valid = 0 if not np.isnan(size_val) else 1
+        area_key = -area if area_valid == 0 else 0.0
+        size_key = -size_val if size_valid == 0 else 0.0
+        tel_entries.append((tel_idx, area_valid, area_key, size_valid, size_key))
+
+    tel_entries.sort(key=lambda x: (x[1], x[2], x[3], x[4]))
+    sort_indices = np.array([t[0] for t in tel_entries])
+
+    # Expect: area 200 desc (tel 0 size 1000, tel 2 size 500), then area 100 desc (tel 3 size 2000, tel 1 size NaN), then NaN area (tel 4)
+    assert np.array_equal(sort_indices, np.array([0, 2, 3, 1, 4]))
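
Usage note: calculate_geomagnetic_angles() upper-cases the observatory name before the lookup, so lower-case names resolve to the new CTAO-NORTH/CTAO-SOUTH presets. The manual cross-check below is a minimal sketch, assuming the standard up-component convention (sz = sin(elevation)) and an arccos of the normalized dot product; the diff only shows the sx/sy lines, so treat those two steps as assumptions about the rest of geomag.py.

    import numpy as np

    from eventdisplay_ml.geomag import calculate_geomagnetic_angles

    az = np.array([0.0, 90.0, 180.0])  # shower azimuth, degrees
    el = np.array([70.0, 70.0, 70.0])  # shower elevation, degrees

    # "ctao-north" is upper-cased internally and resolves to the CTAO-NORTH preset
    theta_b = calculate_geomagnetic_angles(az, el, observatory="ctao-north")

    # Manual cross-check (sz and the arccos step are assumptions, see above)
    bx, by, bz = 30.909e-6, 0.0, 23.409e-6  # CTAO-NORTH components, Tesla
    sx = np.cos(np.radians(el)) * np.cos(np.radians(az))  # North
    sy = np.cos(np.radians(el)) * np.sin(np.radians(az))  # East
    sz = np.sin(np.radians(el))  # Up (assumed)
    cos_theta = (sx * bx + sy * by + sz * bz) / np.sqrt(bx**2 + by**2 + bz**2)
    theta_b_manual = np.degrees(np.arccos(np.clip(cos_theta, -1.0, 1.0)))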
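
Design note on _compute_size_area_sort_indices: the per-event Python loop makes the four-part key explicit (valid area first, area descending, valid size first, size descending) at the cost of pure-Python iteration over all events. Should that become a bottleneck, the same ordering can be recovered vectorized with np.lexsort, which is stable just like Python's tuple sort. A sketch under the same NaN conventions, with hypothetical standalone inputs rather than the function's actual arguments:

    import numpy as np

    def sort_indices_vectorized(mirror_lookup, sizes):
        """mirror_lookup: (n_tel,) mirror areas; sizes: (n_evt, n_tel) per-event sizes."""
        area_valid = np.isnan(mirror_lookup).astype(np.int8)  # 0 = valid, 1 = NaN -> last
        area_key = np.where(np.isnan(mirror_lookup), 0.0, -mirror_lookup)
        size_valid = np.isnan(sizes).astype(np.int8)
        size_key = np.where(np.isnan(sizes), 0.0, -sizes)
        # np.lexsort treats the LAST key as primary, so keys are listed in reverse priority
        return np.lexsort(
            (
                size_key,
                size_valid,
                np.broadcast_to(area_key, sizes.shape),
                np.broadcast_to(area_valid, sizes.shape),
            ),
            axis=1,
        )

As in the unit tests above, ties (e.g. two telescopes with NaN area and NaN size) keep their telescope-ID order, because both np.lexsort and Python's sort are stable.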
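
Finally, the new error path can be pinned down in the same style as the tests above; this is a hypothetical extra test, not part of the diff. An unknown observatory raises a KeyError naming the (upper-cased) observatory before any array math runs:

    import numpy as np
    import pytest

    from eventdisplay_ml.geomag import calculate_geomagnetic_angles

    def test_unknown_observatory_raises():
        # "hess" is upper-cased to "HESS", which has no FIELD_COMPONENTS entry
        with pytest.raises(KeyError, match="HESS"):
            calculate_geomagnetic_angles(np.array([0.0]), np.array([70.0]), observatory="hess")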