From 957a6e594c681c794547279eb19f9b41ca3943a1 Mon Sep 17 00:00:00 2001 From: Dhiren Date: Wed, 22 Apr 2026 17:08:00 -0700 Subject: [PATCH 1/5] refactoring input params for search_studies from kwargs to respective API names for IDE auto complete --- pyleotups/core/NOAADataset.py | 40 +++++++++++++++++++++++++++++++- pyleotups/core/PangaeaDataset.py | 28 +++++++++++++++------- 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/pyleotups/core/NOAADataset.py b/pyleotups/core/NOAADataset.py index 6343276e..c84c2b69 100644 --- a/pyleotups/core/NOAADataset.py +++ b/pyleotups/core/NOAADataset.py @@ -127,7 +127,41 @@ def __iadd__(self, other): return self - def search_studies(self, **kwargs): + def search_studies( + self, + xml_id: int | str | None = None, + noaa_id: int | str | None = None, + search_text: str | None = None, + investigators: str | list[str] | None = None, + investigators_and_or: str = "or", + locations: str | list[str] | None = None, + locations_and_or: str = "or", + keywords: str | list[str] | None = None, + keywords_and_or: str = "or", + species: str | list[str] | None = None, + species_and_or: str = "or", + variable_name: str | list[str] | None = None, + variable_name_and_or: str = "or", + cv_materials: str | list[str] | None = None, + cv_materials_and_or: str = "or", + cv_seasonalities: str | list[str] | None = None, + cv_seasonalities_and_or: str = "or", + min_lat: int | None = None, + max_lat: int | None = None, + min_lon: int | None = None, + max_lon: int | None = None, + min_elevation: int | None = None, + max_elevation: int | None = None, + earliest_year: int | None = None, + latest_year: int | None = None, + time_format: str | None = None, + time_method: str | None = None, + reconstruction: bool | None = None, + recent: bool = False, + limit: int = 100, + skip: int | None = None, + data_publisher: str = "NOAA", + ): r""" Search for NOAA studies using the specified parameters. 
@@ -405,6 +439,10 @@ def search_studies(self, **kwargs): df_skip.head() """ + kwargs = locals().copy() + + kwargs.pop("self") + if "headers_only" in kwargs: log.warning("Keyword Argument Pair : 'headers_only' is not supported and will be ignored while making requests.") kwargs.pop("headers_only", None) diff --git a/pyleotups/core/PangaeaDataset.py b/pyleotups/core/PangaeaDataset.py index 767f987e..55f7c33b 100644 --- a/pyleotups/core/PangaeaDataset.py +++ b/pyleotups/core/PangaeaDataset.py @@ -139,14 +139,23 @@ def _resolve_and_register_ids(self, study_ids): # ------------------------- # search_studies: q, bbox, keywords -> registers studies and returns same style as Dataset.search_studies (DataFrame) # ------------------------- - def search_studies(self, - # q: Optional[str] = None, - # study_ids: Optional[Union[int, str, List]] = None, - # bbox: Optional[Tuple[float, float, float, float]] = None, - # limit: int = 10, - # offset: int = 0, - # display: bool = False - **kwargs) -> Optional[pd.DataFrame]: + def search_studies( + self, + study_ids: int | str | list[int | str] | None = None, + topic: str | list[str] | None = None, + topic_and_or: str = "or", + search_text: str | None = None, + investigators: str | list[str] | None = None, + investigators_and_or: str = "and", + variable_name: str | list[str] | None = None, + variable_name_and_or: str = "and", + min_lat: float | None = None, + max_lat: float | None = None, + min_lon: float | None = None, + max_lon: float | None = None, + limit: int = 100, + skip: int = 0, + ) -> Optional[pd.DataFrame]: """ Search PANGAEA and register results in self.studies. 
@@ -328,6 +337,9 @@ def search_studies(self, ) df.head() """ + + kwargs = locals().copy() + kwargs.pop("self") study_ids = kwargs.get("study_ids") # ------------------------------------------- From 6957e3e8003ab4833db1ec5cdbe8f8f2db3a04b7 Mon Sep 17 00:00:00 2001 From: Dhiren Date: Mon, 4 May 2026 12:57:28 -0700 Subject: [PATCH 2/5] Updating PangaeaDataset with Object addition. Balancing behavior of PangaeaDataset to clearing studies when redoing search --- pyleotups/core/PangaeaDataset.py | 48 ++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/pyleotups/core/PangaeaDataset.py b/pyleotups/core/PangaeaDataset.py index 55f7c33b..dc741917 100644 --- a/pyleotups/core/PangaeaDataset.py +++ b/pyleotups/core/PangaeaDataset.py @@ -62,6 +62,53 @@ def __init__(self, cache_dir: Optional[str] = None, auth_token: Optional[str] = # keys: StudyID (DOI/URI) -> {'panobj': PanDataSet or None, 'summary': normalized_dict} self.studies: Dict[str, PangaeaStudy] = {} + def __add__(self, other): + if not isinstance(other, PangaeaDataset): + return NotImplemented + + merged = PangaeaDataset(cache_dir=self.cache_dir, auth_token=self.auth_token) + + # Start with a shallow copy of left's studies + merged.studies = dict(self.studies) + + # Union by StudyID. If duplicate ID appears, keep left's version + # but sanity-check equality and warn if they differ. + for sid, study in other.studies.items(): + if sid in merged.studies: + try: + check_same = (merged.studies[sid].to_summary_dict() == study.to_summary_dict()) + except Exception: + check_same = False + if not check_same: + logger.warning( + f"PangaeaDataset union: duplicate StudyID {sid} with differing content. " + "Keeping left-hand version." 
+ ) + else: + merged.studies[sid] = study + + return merged + + def __iadd__(self, other): + if not isinstance(other, PangaeaDataset): + return NotImplemented + + for sid, study in other.studies.items(): + if sid in self.studies: + try: + check_same = (self.studies[sid].to_summary_dict() == study.to_summary_dict()) + except Exception: + check_same = False + if not check_same: + logger.warning( + f"PangaeaDataset in-place union: duplicate StudyID {sid} with differing content. " + "Keeping existing version." + ) + else: + self.studies[sid] = study + + return self + @staticmethod def _normalize_id(study_id: str) -> int: """ @@ -340,6 +387,7 @@ def search_studies( kwargs = locals().copy() kwargs.pop("self") + self.studies.clear() study_ids = kwargs.get("study_ids") # ------------------------------------------- From dbd33f0c01a0c5b0b69964abf3b45a3bc620e37c Mon Sep 17 00:00:00 2001 From: Dhiren Date: Mon, 4 May 2026 13:19:58 -0700 Subject: [PATCH 3/5] Fixing the documentation for skip parameters --- pyleotups/core/NOAADataset.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pyleotups/core/NOAADataset.py b/pyleotups/core/NOAADataset.py index c84c2b69..056b6f8b 100644 --- a/pyleotups/core/NOAADataset.py +++ b/pyleotups/core/NOAADataset.py @@ -264,9 +264,6 @@ def search_studies( limit : int, default 100 Number of studies to return (PyleoTUPS default). - - skip : int, - Number of studies to skip (for paging). Paired with `limit`. skip : int, optional Number of studies to skip (for pagination). Use with ``limit`` to page through results. @@ -301,7 +298,7 @@ def search_studies( Time window defaults. If either ``earliest_year`` or ``latest_year`` is provided and neither ``time_format`` nor ``time_method`` is supplied, ``time_format`` defaults to ``'CE'`` (a note is recorded). - Unsupported parameters. ``headersOnly`` and ``skip`` are not supported by PyleoTUPS and are ignored if passed. + Unsupported parameters. 
``headersOnly`` is not supported by PyleoTUPS and is ignored if passed. Boolean normalization. Parameters expected as ``'Y'/'N'`` accept: True/False, or strings like ``"true"|"yes"|"y"|"1"`` → ``'Y'`` and ``"false"|"no"|"n"|"0"`` → ``'N'``. From 94a9d4f1de25637fae9a20d171c52d1da37bec76 Mon Sep 17 00:00:00 2001 From: Dhiren Date: Mon, 4 May 2026 15:10:40 -0700 Subject: [PATCH 4/5] Aligning parameters for NOAADataset.search_studies with documentation --- pyleotups/core/NOAADataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyleotups/core/NOAADataset.py b/pyleotups/core/NOAADataset.py index 056b6f8b..d80e3f90 100644 --- a/pyleotups/core/NOAADataset.py +++ b/pyleotups/core/NOAADataset.py @@ -132,6 +132,7 @@ def search_studies( xml_id: int | str | None = None, noaa_id: int | str | None = None, search_text: str | None = None, + data_type_id: str | None = None, investigators: str | list[str] | None = None, investigators_and_or: str = "or", locations: str | list[str] | None = None, @@ -181,7 +182,7 @@ def search_studies( General text search across study content. Supports wildcards (%) and logical operators (AND, OR). Examples: 'younger dryas', 'loess AND stratigraphy' - data_publisher : by default 'NOAA' + data_publisher : str, default "NOAA" Choose from: 'NOAA', 'NEOTOMA', or 'PANGAEA'. Example: 'NOAA' @@ -356,7 +357,7 @@ def search_studies( .. jupyter-execute:: ### Multiple investigators (AND by default) - df_multinv_and = ds.search_studies(investigators=["Wahl, E.R.", "Vose, R.S."], investigatorsAndOr = "and") + df_multinv_and = ds.search_studies(investigators=["Wahl, E.R.", "Vose, R.S."], investigators_and_or="and") df_multinv_and.head() .. 
jupyter-execute:: From ce090aa705838c1c3ad147a727d902001bc885ed Mon Sep 17 00:00:00 2001 From: Dhiren Date: Tue, 5 May 2026 09:30:25 -0700 Subject: [PATCH 5/5] Adding coverage to Pangaea.get_summary() and min max Lat Lon to Pangaea.get_geo() --- pyleotups/core/PangaeaDataset.py | 3 +- pyleotups/utils/PangaeaStudy.py | 60 ++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/pyleotups/core/PangaeaDataset.py b/pyleotups/core/PangaeaDataset.py index dc741917..5bde6834 100644 --- a/pyleotups/core/PangaeaDataset.py +++ b/pyleotups/core/PangaeaDataset.py @@ -474,7 +474,8 @@ def get_summary(self) -> pd.DataFrame: pandas.DataFrame Return a DataFrame summarizing all loaded/registered PANGAEA datasets. ["StudyID","StudyName","EarliestYearBP","MostRecentYearBP", - "EarliestYearCE","MostRecentYearCE","StudyNotes","ScienceKeywords","Investigators", + "EarliestYearCE","MostRecentYearCE","Coverage [S, N, W, E]", + "StudyNotes","ScienceKeywords","Investigators", "Publications","Sites","Funding"] """ rows = [] diff --git a/pyleotups/utils/PangaeaStudy.py b/pyleotups/utils/PangaeaStudy.py index b3cb4f01..186cfbfd 100644 --- a/pyleotups/utils/PangaeaStudy.py +++ b/pyleotups/utils/PangaeaStudy.py @@ -251,6 +251,48 @@ def _extract_temporal_extent( return earliest_bp, latest_bp, earliest_ce, latest_ce + def _compute_coverage(self) -> Optional[Tuple[float, float, float, float]]: + """ + Compute consolidated geographic coverage for the study events. + + The coverage is based on all event latitude/longitude pairs. + If latitude2/longitude2 are not provided for an event, the single + coordinate is reused for both bounds. + + Returns + ------- + tuple or None + (MinLatitude, MaxLatitude, MinLongitude, MaxLongitude) + or None when no valid coordinates exist. 
+ """ + latitudes = [] + longitudes = [] + + for ev in self._panobj.events: + lat1 = ev.latitude + lat2 = ev.latitude2 if getattr(ev, "latitude2", None) is not None else lat1 + lon1 = ev.longitude + lon2 = ev.longitude2 if getattr(ev, "longitude2", None) is not None else lon1 + + if lat1 is not None: + latitudes.append(lat1) + if lat2 is not None: + latitudes.append(lat2) + if lon1 is not None: + longitudes.append(lon1) + if lon2 is not None: + longitudes.append(lon2) + + if not latitudes or not longitudes: + return None + + return ( + min(latitudes), + max(latitudes), + min(longitudes), + max(longitudes), + ) + def to_summary_dict(self) -> Dict[str, Any]: """ Convert study metadata to NOAA-style summary dictionary. @@ -264,10 +306,8 @@ def to_summary_dict(self) -> Dict[str, Any]: self.earliest_bp, self.latest_bp, self.earliest_ce, self.latest_ce = ( self._extract_temporal_extent() ) - # if collection_founds : - # logger.warning( - # f'The Summary Table Below may contain Dataset marked as collection.' 
- # f'Refer to the "CollectionMembers" column to identify collection datasets and their members.') + self.coverage = self._compute_coverage() + return { "StudyID": self.study_id, "StudyName": ds.title, @@ -275,6 +315,7 @@ def to_summary_dict(self) -> Dict[str, Any]: "MostRecentYearBP": self.latest_bp, "EarliestYearCE": self.earliest_ce, "MostRecentYearCE": self.latest_ce, + "Coverage [S, N, W, E]": self.coverage, "StudyNotes": ds.abstract, "ScienceKeywords": getattr(ds, "keywords", None), "Investigators": ", ".join(a.fullname for a in ds.authors), @@ -306,14 +347,21 @@ def get_geo(self) -> pd.DataFrame: """ rows = [] for ev in self._panobj.events: + lat1 = ev.latitude + lon1 = ev.longitude + lat2 = ev.latitude2 if getattr(ev, "latitude2", None) is not None else lat1 + lon2 = ev.longitude2 if getattr(ev, "longitude2", None) is not None else lon1 + rows.append( { "StudyID": self.study_id, "SiteID": ev.id, "SiteName": ev.label, "LocationName": ev.location, - "Latitude": ev.latitude, - "Longitude": ev.longitude, + "MinLatitude": min(v for v in [lat1, lat2] if v is not None) if lat1 is not None or lat2 is not None else None, + "MaxLatitude": max(v for v in [lat1, lat2] if v is not None) if lat1 is not None or lat2 is not None else None, + "MinLongitude": min(v for v in [lon1, lon2] if v is not None) if lon1 is not None or lon2 is not None else None, + "MaxLongitude": max(v for v in [lon1, lon2] if v is not None) if lon1 is not None or lon2 is not None else None, "Elevation": ev.elevation, } )