1111from pathlib import Path
1212from time import perf_counter
1313from typing import Any , Optional
14+ from urllib .parse import urlparse
1415
1516import xarray as xr
1617
@@ -175,6 +176,62 @@ def _summarize_dask_metrics(
175176 return base
176177
177178
def _derive_item_id_from_input_path(path: str) -> str:
    """Derive a best-effort item id from a local path or URL.

    The id is the last path component with trailing slashes and a trailing
    ``.zarr`` suffix removed. For ``http://``, ``https://``, ``s3://`` and
    ``gs://`` inputs only the URL path is considered, so query strings and
    fragments do not leak into the id.

    Args:
        path: Local filesystem path or URL of the input store. ``os.PathLike``
            objects are accepted as well as plain strings.

    Returns:
        The derived id, or ``"dataset"`` when no usable name can be extracted
        (empty path, bucket-only URL, unsupported input type, ...).
    """
    from os import fspath

    try:
        # Normalise once so Path-like inputs follow the same route as strings;
        # fspath() rejects None/ints, which then hit the fallback below.
        text = fspath(path)
        if text.startswith(("http://", "https://", "s3://", "gs://")):
            text = urlparse(text).path
        name = Path(text.rstrip("/")).name
        if name.endswith(".zarr"):
            name = name[: -len(".zarr")]
        return name or "dataset"
    except Exception:
        # Best-effort helper: never let id derivation abort the conversion.
        return "dataset"
192+
193+
def _resolve_output_prefix(prefix: str, item_id: str) -> str:
    """Expand an output prefix into a concrete GeoZarr store location.

    The store is named ``<item_id>_geozarr.zarr`` and placed directly under
    ``prefix``.

    Args:
        prefix: Output directory or URL prefix. Prefixes starting with
            ``http://``, ``https://``, ``s3://`` or ``gs://`` are joined as
            plain strings; anything else is joined as a local path.
        item_id: Identifier used to build the store name.

    Returns:
        The full store path or URL as a string.
    """
    # No whitespace in the store name: it ends up in object keys and paths.
    target_name = f"{item_id}_geozarr.zarr"
    if prefix.startswith(("http://", "https://", "s3://", "gs://")):
        # Join by hand: pathlib would collapse the "//" after the scheme.
        if not prefix.endswith("/"):
            prefix = prefix + "/"
        return prefix + target_name
    return str(Path(prefix) / target_name)
201+
202+
def _has_group(tree: xr.DataTree, path: str) -> bool:
    """Report whether ``path`` names an existing group below ``tree``.

    The path is split on ``/``; leading, trailing and repeated slashes are
    ignored. Each remaining segment must be a key of the current node's
    ``children`` mapping. An empty path refers to ``tree`` itself and yields
    ``True``.

    Args:
        tree: Root node to start the walk from.
        path: Slash-separated group path.

    Returns:
        ``True`` if every segment resolves, ``False`` otherwise (including
        when any error occurs during the walk).
    """
    try:
        current = tree
        for segment in filter(None, str(path).strip("/").split("/")):
            # Nodes without a (non-empty) ``children`` mapping have no subgroups.
            kids = getattr(current, "children", {}) or {}
            if segment in kids:
                current = kids[segment]
            else:
                return False
        return True
    except Exception:
        return False
215+
216+
def _local_path_exists(p: str) -> bool:
    """Return ``True`` if ``p`` exists on the local filesystem.

    Any error raised while building or probing the path is treated as
    "does not exist" and yields ``False``.
    """
    try:
        found = Path(p).exists()
    except Exception:
        return False
    return found
222+
223+
def _remove_local_tree(p: str) -> bool:
    """Remove an existing local output, whatever kind of entry it is.

    Directories are deleted recursively. Regular files and symlinks
    (including dangling ones) are unlinked, because ``shutil.rmtree`` refuses
    both and would otherwise make a "replace" of a stale file or link fail.
    A symlink is removed itself; its target is left untouched.

    Args:
        p: Local filesystem path to remove.

    Returns:
        ``True`` if the path is gone afterwards (or never existed), ``False``
        if removal failed for any reason.
    """
    try:
        import shutil

        target = Path(p)
        # Test is_symlink() first: exists() follows links and reports False
        # for a dangling one, which would leave the link behind.
        if target.is_symlink() or target.is_file():
            target.unlink()
        elif target.exists():
            shutil.rmtree(target)
        return True
    except Exception:
        # Callers act on the boolean result; never propagate from here.
        return False
233+
234+
178235def convert_command (args : argparse .Namespace ) -> None :
179236 """Convert EOPF dataset to GeoZarr compliant format.
180237
@@ -213,8 +270,27 @@ def convert_command(args: argparse.Namespace) -> None:
213270 sys .exit (1 )
214271 input_path = str (input_path )
215272
273+ # Fast path: just list groups and exit
274+ if getattr (args , "list_groups" , False ):
275+ storage_options = get_storage_options (input_path )
276+ dt = xr .open_datatree (
277+ str (input_path ),
278+ engine = "zarr" ,
279+ chunks = "auto" ,
280+ storage_options = storage_options ,
281+ )
282+ print ("Available groups:" )
283+ for group_name in dt .children :
284+ print (f" - { group_name } " )
285+ return
286+
216287 # Handle output path validation
217288 output_path_str = args .output_path
289+ # Expand trailing-slash prefix to a concrete store
290+ if output_path_str .endswith ("/" ):
291+ item_id = _derive_item_id_from_input_path (str (input_path ))
292+ output_path_str = _resolve_output_prefix (output_path_str , item_id )
293+ print (f"Resolved output store: { output_path_str } " )
218294 if is_s3_path (output_path_str ):
219295 # S3 path - validate S3 access
220296 print ("🔍 Validating S3 access..." )
@@ -247,14 +323,17 @@ def convert_command(args: argparse.Namespace) -> None:
247323 else :
248324 # Local path - create directory if it doesn't exist
249325 output_path = Path (output_path_str )
250- output_path .parent .mkdir (parents = True , exist_ok = True )
326+ # In dry-run mode, don't create anything
327+ if not getattr (args , "dry_run" , False ):
328+ output_path .parent .mkdir (parents = True , exist_ok = True )
251329 output_path = str (output_path )
252- # Prepare debug dir for metrics
253- debug_dir = Path (output_path ) / "debug"
254- try :
255- debug_dir .mkdir (parents = True , exist_ok = True )
256- except Exception :
257- debug_dir = None
330+ # Prepare debug dir for metrics (skip in dry-run)
331+ if not getattr (args , "dry_run" , False ):
332+ debug_dir = Path (output_path ) / "debug"
333+ try :
334+ debug_dir .mkdir (parents = True , exist_ok = True )
335+ except Exception :
336+ debug_dir = None
258337
259338 if args .verbose :
260339 print (f"Loading EOPF dataset from: { input_path } " )
@@ -268,14 +347,48 @@ def convert_command(args: argparse.Namespace) -> None:
268347 # Load the EOPF DataTree with appropriate storage options
269348 print ("Loading EOPF dataset..." )
270349 storage_options = get_storage_options (input_path )
271- # Metrics setup
350+ # Metrics setup (environment first; set_input after group validation)
272351 if getattr (args , "metrics" , True ):
273352 metrics = MetricsRecorder ()
274353 metrics .set_environment ()
354+
355+ with metrics .time_step ("open_input" ) if metrics else nullcontext ():
356+ dt = xr .open_datatree (
357+ str (input_path ),
358+ engine = "zarr" ,
359+ chunks = "auto" ,
360+ storage_options = storage_options ,
361+ )
362+
363+ # Validate/prune groups if requested
364+ groups_effective = list (getattr (args , "groups" , []) or [])
365+ validate_mode = getattr (args , "validate_groups" , None )
366+ missing : list [str ] = []
367+ if validate_mode in {"warn" , "error" } and groups_effective :
368+ existing : list [str ] = []
369+ for g in groups_effective :
370+ if _has_group (dt , g ):
371+ existing .append (g )
372+ else :
373+ missing .append (g )
374+ if missing :
375+ msg = f"Groups not found: { ', ' .join (missing )} "
376+ if validate_mode == "error" :
377+ print (f"❌ { msg } " )
378+ sys .exit (3 )
379+ else :
380+ print (f"⚠️ { msg } ; proceeding with remaining groups" )
381+ groups_effective = existing
382+ if not groups_effective :
383+ print ("❌ No valid groups to convert after validation" )
384+ sys .exit (3 )
385+
386+ # Now that groups are finalized, set input metadata for metrics
387+ if metrics is not None :
275388 metrics .set_input (
276389 source_uri = str (input_path ),
277390 profile = None ,
278- groups = args . groups ,
391+ groups = groups_effective ,
279392 dask = {
280393 "enabled" : bool (dask_client is not None ),
281394 "mode" : getattr (args , "dask_mode" , None ),
@@ -286,19 +399,60 @@ def convert_command(args: argparse.Namespace) -> None:
286399 },
287400 )
288401
289- with metrics .time_step ("open_input" ) if metrics else nullcontext ():
290- dt = xr .open_datatree (
291- str (input_path ),
292- engine = "zarr" ,
293- chunks = "auto" ,
294- storage_options = storage_options ,
295- )
402+ # Overwrite policy handling (local only)
403+ overwrite = getattr (args , "overwrite" , "fail" )
404+ is_remote = is_s3_path (output_path )
405+ if not is_remote :
406+ exists = _local_path_exists (output_path )
407+ if exists :
408+ if overwrite == "fail" :
409+ print (
410+ f"❌ Output already exists and overwrite policy is 'fail': { output_path } "
411+ )
412+ sys .exit (2 )
413+ if overwrite == "skip" :
414+ print (f"⏭️ Output exists; skipping as per policy: { output_path } " )
415+ return
416+ if overwrite == "replace" :
417+ if getattr (args , "dry_run" , False ):
418+ print (
419+ f"🧪 Dry-run: would remove existing output: { output_path } "
420+ )
421+ else :
422+ ok = _remove_local_tree (output_path )
423+ if not ok :
424+ print (f"❌ Failed to remove existing output: { output_path } " )
425+ sys .exit (2 )
426+ # merge: do nothing
427+ else :
428+ if overwrite == "fail" :
429+ print (
430+ "ℹ️ Remote output existence not checked; 'fail' policy may not prevent overwrite."
431+ )
432+ elif overwrite == "replace" :
433+ print (
434+ "⚠️ 'replace' is not implemented for remote outputs; proceeding may overwrite keys."
435+ )
436+
437+ # Print dry-run plan and exit early
438+ if getattr (args , "dry_run" , False ):
439+ print ("\n Dry-run plan:" )
440+ print ("============" )
441+ print (f"Input: { input_path } " )
442+ print (f"Output: { output_path } " )
443+ print (f"Groups: { groups_effective } " )
444+ print (f"Overwrite policy: { overwrite } " )
445+ print (f"Dask: { 'on' if dask_client is not None else 'off' } " )
446+ print ("No data will be written." )
447+ return
296448
297449 if args .verbose :
298450 print (f"Loaded DataTree with { len (dt .children )} groups" )
299451 print ("Available groups:" )
300452 for group_name in dt .children :
301453 print (f" - { group_name } " )
454+ if missing :
455+ print (f"After validation, converting groups: { groups_effective } " )
302456
303457 # Convert to GeoZarr compliant format
304458 print ("Converting to GeoZarr compliant format..." )
@@ -330,7 +484,7 @@ def convert_command(args: argparse.Namespace) -> None:
330484 with metrics .time_step ("convert" ) if metrics else nullcontext ():
331485 dt_geozarr = create_geozarr_dataset (
332486 dt_input = dt ,
333- groups = args . groups ,
487+ groups = groups_effective ,
334488 output_path = output_path ,
335489 spatial_chunk = args .spatial_chunk ,
336490 min_dimension = args .min_dimension ,
@@ -344,7 +498,7 @@ def convert_command(args: argparse.Namespace) -> None:
344498 with metrics .time_step ("convert" ) if metrics else nullcontext ():
345499 dt_geozarr = create_geozarr_dataset (
346500 dt_input = dt ,
347- groups = args . groups ,
501+ groups = groups_effective ,
348502 output_path = output_path ,
349503 spatial_chunk = args .spatial_chunk ,
350504 min_dimension = args .min_dimension ,
@@ -381,7 +535,7 @@ def convert_command(args: argparse.Namespace) -> None:
381535 ),
382536 "perf_report" : getattr (args , "dask_perf_html" , None ),
383537 "wall_clock_s" : wall_clock if dask_client is not None else None ,
384- "groups" : args . groups ,
538+ "groups" : groups_effective ,
385539 "spatial_chunk" : args .spatial_chunk ,
386540 "min_dimension" : args .min_dimension ,
387541 "tile_width" : args .tile_width ,
@@ -399,6 +553,17 @@ def convert_command(args: argparse.Namespace) -> None:
399553 MetricsRecorder .write_json (run_summary , payload )
400554 if args .verbose :
401555 print (f"🧾 Wrote metrics: { run_summary } " )
556+ # Optional external run-metadata path
557+ if getattr (args , "run_metadata" , None ):
558+ try :
559+ outp = Path (args .run_metadata )
560+ outp .parent .mkdir (parents = True , exist_ok = True )
561+ MetricsRecorder .write_json (outp , payload )
562+ if args .verbose :
563+ print (f"🧾 Wrote run metadata: { outp } " )
564+ except Exception as _e :
565+ if args .verbose :
566+ print (f"(debug) could not write run-metadata: { _e } " )
402567 except Exception as _exc :
403568 if args .verbose :
404569 print (f"(debug) could not write run summary: { _exc } " )
@@ -424,10 +589,22 @@ def convert_command(args: argparse.Namespace) -> None:
424589 try :
425590 if metrics and debug_dir is not None and getattr (args , "metrics" , True ):
426591 payload = metrics .finalize (status = "error" , exception = error_msg )
427- run_summary = Path (args .output_path ) / "debug" / "run_summary.json"
428- MetricsRecorder .write_json (run_summary , payload )
429- if args .verbose :
430- print (f"🧾 Wrote failure metrics: { run_summary } " )
592+ run_summary = Path (output_path ) / "debug" / "run_summary.json"
593+ try :
594+ MetricsRecorder .write_json (run_summary , payload )
595+ if args .verbose :
596+ print (f"🧾 Wrote failure metrics: { run_summary } " )
597+ except Exception :
598+ pass
599+ if getattr (args , "run_metadata" , None ):
600+ try :
601+ outp = Path (args .run_metadata )
602+ outp .parent .mkdir (parents = True , exist_ok = True )
603+ MetricsRecorder .write_json (outp , payload )
604+ if args .verbose :
605+ print (f"🧾 Wrote run metadata (error): { outp } " )
606+ except Exception :
607+ pass
431608 except Exception :
432609 pass
433610 sys .exit (1 )
@@ -1414,6 +1591,34 @@ def create_parser() -> argparse.ArgumentParser:
14141591 action = "store_false" ,
14151592 help = "Disable metrics emission." ,
14161593 )
1594+ convert_parser .add_argument (
1595+ "--list-groups" ,
1596+ action = "store_true" ,
1597+ help = "List available groups in the input and exit." ,
1598+ )
1599+ convert_parser .add_argument (
1600+ "--dry-run" ,
1601+ action = "store_true" ,
1602+ help = "Validate I/O, resolve groups and output, and print the plan without writing." ,
1603+ )
1604+ convert_parser .add_argument (
1605+ "--overwrite" ,
1606+ type = str ,
1607+ choices = ["fail" , "skip" , "replace" , "merge" ],
1608+ default = "fail" ,
1609+ help = "Behavior when output exists (local): fail, skip, replace, or merge (default: fail)." ,
1610+ )
1611+ convert_parser .add_argument (
1612+ "--validate-groups" ,
1613+ type = str ,
1614+ choices = ["warn" , "error" ],
1615+ help = "Validate requested groups against input; warn to prune, error to abort." ,
1616+ )
1617+ convert_parser .add_argument (
1618+ "--run-metadata" ,
1619+ type = str ,
1620+ help = "Also write finalized metrics payload to this JSON path." ,
1621+ )
14171622 convert_parser .set_defaults (func = convert_command )
14181623
14191624 # Info command
0 commit comments