77import logging
88import os
99import sys
10+ from typing import Any
1011from urllib .parse import urlparse
1112
1213import fsspec
@@ -46,6 +47,10 @@ def run_conversion(
4647 collection : str ,
4748 s3_output_bucket : str ,
4849 s3_output_prefix : str ,
50+ groups : str | None = None ,
51+ spatial_chunk : int | None = None ,
52+ tile_width : int | None = None ,
53+ enable_sharding : bool | None = None ,
4954) -> str :
5055 """Run GeoZarr conversion workflow.
5156
@@ -54,6 +59,10 @@ def run_conversion(
5459 collection: Collection ID for parameter lookup
5560 s3_output_bucket: S3 bucket for output
5661 s3_output_prefix: S3 prefix for output
62+ groups: Override groups parameter (comma-separated if multiple)
63+ spatial_chunk: Override spatial chunk size
64+ tile_width: Override tile width
65+ enable_sharding: Override sharding enable flag
5766
5867 Returns:
5968 Output Zarr URL (s3://...)
@@ -74,15 +83,17 @@ def run_conversion(
7483 zarr_url = source_url
7584 logger .info (f"Direct Zarr URL: { zarr_url } " )
7685
77- # Get conversion parameters from collection config
78- logger .debug (f"Getting conversion parameters for { collection } ..." )
86+ # Get conversion parameters (with optional overrides)
7987 params = get_conversion_params (collection )
80- logger .debug (f" Groups: { params ['groups' ]} " )
81- logger .debug (f" Chunk: { params ['spatial_chunk' ]} " )
82- logger .debug (f" Tile width: { params ['tile_width' ]} " )
83- logger .debug (f" Extra flags: { params ['extra_flags' ]} " )
84-
85- # Construct output path
88+ overrides = {
89+ "groups" : groups .split ("," ) if groups and "," in groups else groups or None ,
90+ "spatial_chunk" : spatial_chunk ,
91+ "tile_width" : tile_width ,
92+ "enable_sharding" : enable_sharding ,
93+ }
94+ params .update ({k : v for k , v in overrides .items () if v is not None })
95+
96+ logger .info (f"Conversion params: { params } " ) # Construct output path
8697 output_url = f"s3://{ s3_output_bucket } /{ s3_output_prefix } /{ collection } /{ item_id } .zarr"
8798
8899 # Clean up existing output to avoid base array artifacts
@@ -98,46 +109,60 @@ def run_conversion(
98109 logger .info (f" Source: { zarr_url } " )
99110 logger .info (f" Destination: { output_url } " )
100111
101- # Set up Dask cluster for parallel processing
102- from dask .distributed import Client
112+ # Optional: Set up Dask cluster if enabled via environment variable
113+ # Note: eopf-geozarr handles its own Dask setup when using create_geozarr_dataset
114+ # This is here only for future compatibility if we need external cluster management
115+ use_dask = os .getenv ("ENABLE_DASK_CLUSTER" , "" ).lower () in ("true" , "1" , "yes" )
116+ if use_dask :
117+ logger .info ("🚀 Dask cluster enabled via ENABLE_DASK_CLUSTER env var" )
118+ # Future: Could connect to external cluster here if needed
119+ # from dask.distributed import Client
120+ # dask_address = os.getenv("DASK_SCHEDULER_ADDRESS")
121+ # client = Client(dask_address) if dask_address else Client()
122+
123+ # Load source dataset
124+ logger .info ("Loading source dataset..." )
125+ storage_options = get_storage_options (zarr_url )
126+ dt = xr .open_datatree (
127+ zarr_url ,
128+ engine = "zarr" ,
129+ chunks = "auto" ,
130+ storage_options = storage_options ,
131+ )
132+ logger .info (f"Loaded DataTree with { len (dt .children )} groups" )
133+
134+ # Convert to GeoZarr
135+ logger .info ("Converting to GeoZarr format..." )
136+
137+ # Parse extra flags for optional parameters
138+ kwargs : dict [str , Any ] = {}
139+ if params ["extra_flags" ] and "--crs-groups" in params ["extra_flags" ]:
140+ crs_groups_str = params ["extra_flags" ].split ("--crs-groups" )[1 ].strip ().split ()[0 ]
141+ kwargs ["crs_groups" ] = [crs_groups_str ]
142+
143+ # Add sharding if enabled
144+ if params .get ("enable_sharding" , False ):
145+ kwargs ["enable_sharding" ] = True
146+
147+ # groups parameter must be a list
148+ groups_param = params ["groups" ]
149+ if isinstance (groups_param , str ):
150+ groups_list : list [str ] = [groups_param ]
151+ else :
152+ # groups_param is list[str] in mission configs
153+ groups_list = list (groups_param ) if groups_param else []
103154
104- with Client () as client :
105- logger .info (f"🚀 Dask cluster started: { client .dashboard_link } " )
155+ create_geozarr_dataset (
156+ dt_input = dt ,
157+ groups = groups_list ,
158+ output_path = output_url ,
159+ spatial_chunk = params ["spatial_chunk" ],
160+ tile_width = params ["tile_width" ],
161+ ** kwargs ,
162+ )
106163
107- # Load source dataset
108- logger .info ("Loading source dataset..." )
109- storage_options = get_storage_options (zarr_url )
110- dt = xr .open_datatree (
111- zarr_url ,
112- engine = "zarr" ,
113- chunks = "auto" ,
114- storage_options = storage_options ,
115- )
116- logger .info (f"Loaded DataTree with { len (dt .children )} groups" )
117-
118- # Convert to GeoZarr
119- logger .info ("Converting to GeoZarr format..." )
120-
121- # Parse extra flags for optional parameters
122- kwargs = {}
123- if params ["extra_flags" ] and "--crs-groups" in params ["extra_flags" ]:
124- crs_groups_str = params ["extra_flags" ].split ("--crs-groups" )[1 ].strip ().split ()[0 ]
125- kwargs ["crs_groups" ] = [crs_groups_str ]
126-
127- # groups parameter must be a list
128- groups_list = [params ["groups" ]] if isinstance (params ["groups" ], str ) else params ["groups" ]
129-
130- create_geozarr_dataset (
131- dt_input = dt ,
132- groups = groups_list ,
133- output_path = output_url ,
134- spatial_chunk = params ["spatial_chunk" ],
135- tile_width = params ["tile_width" ],
136- ** kwargs ,
137- )
138-
139- logger .info ("✅ Conversion completed successfully!" )
140- logger .info (f"Output: { output_url } " )
164+ logger .info ("✅ Conversion completed successfully!" )
165+ logger .info (f"Output: { output_url } " )
141166
142167 return output_url
143168
@@ -150,18 +175,32 @@ def main(argv: list[str] | None = None) -> int:
150175 parser .add_argument ("--s3-output-bucket" , required = True , help = "S3 output bucket" )
151176 parser .add_argument ("--s3-output-prefix" , required = True , help = "S3 output prefix" )
152177 parser .add_argument ("--verbose" , action = "store_true" , help = "Enable verbose logging" )
178+ # Optional parameter overrides
179+ parser .add_argument ("--groups" , help = "Override groups (comma-separated)" )
180+ parser .add_argument ("--spatial-chunk" , help = "Override spatial chunk size" )
181+ parser .add_argument ("--tile-width" , help = "Override tile width" )
182+ parser .add_argument ("--enable-sharding" , help = "Override sharding (true/false)" )
153183
154184 args = parser .parse_args (argv )
155-
156185 if args .verbose :
157186 logging .getLogger ().setLevel (logging .DEBUG )
158187
188+ # Parse override args (empty string → None)
189+ groups = args .groups or None
190+ spatial_chunk = int (args .spatial_chunk ) if args .spatial_chunk else None
191+ tile_width = int (args .tile_width ) if args .tile_width else None
192+ enable_sharding = args .enable_sharding .lower () == "true" if args .enable_sharding else None
193+
159194 try :
160195 output_url = run_conversion (
161196 args .source_url ,
162197 args .collection ,
163198 args .s3_output_bucket ,
164199 args .s3_output_prefix ,
200+ groups ,
201+ spatial_chunk ,
202+ tile_width ,
203+ enable_sharding ,
165204 )
166205 logger .info (f"Success: { output_url } " )
167206 return 0