Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 43 additions & 7 deletions pyleotups/core/NOAADataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,42 @@ def __iadd__(self, other):
return self


def search_studies(self, **kwargs):
def search_studies(
self,
xml_id: int | str | None = None,
noaa_id: int | str | None = None,
search_text: str | None = None,
data_type_id: str | None = None,
investigators: str | list[str] | None = None,
investigators_and_or: str = "or",
locations: str | list[str] | None = None,
locations_and_or: str = "or",
keywords: str | list[str] | None = None,
keywords_and_or: str = "or",
species: str | list[str] | None = None,
species_and_or: str = "or",
variable_name: str | list[str] | None = None,
variable_name_and_or: str = "or",
cv_materials: str | list[str] | None = None,
cv_materials_and_or: str = "or",
cv_seasonalities: str | list[str] | None = None,
cv_seasonalities_and_or: str = "or",
min_lat: int | None = None,
max_lat: int | None = None,
min_lon: int | None = None,
max_lon: int | None = None,
min_elevation: int | None = None,
max_elevation: int | None = None,
earliest_year: int | None = None,
latest_year: int | None = None,
time_format: str | None = None,
time_method: str | None = None,
reconstruction: bool | None = None,
recent: bool = False,
limit: int = 100,
skip: int | None = None,
data_publisher: str = "NOAA",
):
r"""
Search for NOAA studies using the specified parameters.

Expand All @@ -147,7 +182,7 @@ def search_studies(self, **kwargs):
General text search across study content. Supports wildcards (%) and logical operators (AND, OR).
Examples: 'younger dryas', 'loess AND stratigraphy'

data_publisher : by default 'NOAA'
data_publisher : str, default "NOAA"
Choose from: 'NOAA', 'NEOTOMA', or 'PANGAEA'.
Example: 'NOAA'

Expand Down Expand Up @@ -230,9 +265,6 @@ def search_studies(self, **kwargs):

limit : int, default 100
Number of studies to return (PyleoTUPS default).

skip : int,
Number of studies to skip (for paging). Paired with `limit`.

skip : int, optional
Number of studies to skip (for pagination). Use with ``limit`` to page through results.
Expand Down Expand Up @@ -267,7 +299,7 @@ def search_studies(self, **kwargs):
Time window defaults. If either ``earliest_year`` or ``latest_year`` is provided and neither ``time_format``
nor ``time_method`` is supplied, ``time_format`` defaults to ``'CE'`` (a note is recorded).

Unsupported parameters. ``headersOnly`` and ``skip`` are not supported by PyleoTUPS and are ignored if passed.
Unsupported parameters. ``headersOnly`` is not supported by PyleoTUPS and is ignored if passed.

Boolean normalization. Parameters expected as ``'Y'/'N'`` accept: True/False, or strings like
``"true"|"yes"|"y"|"1"`` → ``'Y'`` and ``"false"|"no"|"n"|"0"`` → ``'N'``.
Expand Down Expand Up @@ -325,7 +357,7 @@ def search_studies(self, **kwargs):
.. jupyter-execute::

### Multiple investigators (combined with AND)
df_multinv_and = ds.search_studies(investigators=["Wahl, E.R.", "Vose, R.S."], investigatorsAndOr = "and")
df_multinv_and = ds.search_studies(investigators=["Wahl, E.R.", "Vose, R.S."], investigators_and_or="and")
df_multinv_and.head()

.. jupyter-execute::
Expand Down Expand Up @@ -405,6 +437,10 @@ def search_studies(self, **kwargs):
df_skip.head()
"""

kwargs = locals().copy()

kwargs.pop("self")

if "headers_only" in kwargs:
log.warning("Keyword Argument Pair : 'headers_only' is not supported and will be ignored while making requests.")
kwargs.pop("headers_only", None)
Expand Down
79 changes: 70 additions & 9 deletions pyleotups/core/PangaeaDataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,53 @@ def __init__(self, cache_dir: Optional[str] = None, auth_token: Optional[str] =
# keys: StudyID (DOI/URI) -> {'panobj': PanDataSet or None, 'summary': normalized_dict}
self.studies: Dict[str, PangaeaStudy] = {}

def __add__(self, other):
    """
    Return a new PangaeaDataset containing the union of both operands' studies.

    Studies are keyed by StudyID. When the same StudyID exists on both
    sides, the left-hand (``self``) entry is kept; a warning is logged if
    the two summary dictionaries differ or cannot be compared.

    Parameters
    ----------
    other : PangaeaDataset
        Right-hand operand.

    Returns
    -------
    PangaeaDataset or NotImplemented
        A new dataset built with this instance's ``cache_dir`` and
        ``auth_token``; ``NotImplemented`` if ``other`` is not a
        PangaeaDataset.
    """
    if not isinstance(other, PangaeaDataset):
        return NotImplemented

    union = PangaeaDataset(cache_dir=self.cache_dir, auth_token=self.auth_token)

    # Seed the result with a shallow copy so neither operand's mapping is mutated.
    union.studies = dict(self.studies)

    for study_id, incoming in other.studies.items():
        if study_id not in union.studies:
            union.studies[study_id] = incoming
            continue

        # Duplicate StudyID: the left-hand entry wins; compare summaries only
        # to decide whether the user should be warned.
        try:
            identical = (
                union.studies[study_id].to_summary_dict() == incoming.to_summary_dict()
            )
        except Exception:
            # A failed comparison is treated the same as differing content.
            identical = False

        if identical:
            continue

        logger.warning(
            f"PangaeaDataset union: duplicate StudyID {study_id} with differing content. "
            "Keeping left-hand version."
        )

    return union

def __iadd__(self, other):
    """
    Merge the studies of ``other`` into this dataset in place.

    Studies whose StudyID is not yet registered are added. For a StudyID
    that already exists, the existing entry is kept; a warning is logged
    if the two summary dictionaries differ or cannot be compared.

    Parameters
    ----------
    other : PangaeaDataset
        Dataset whose studies are merged into ``self``.

    Returns
    -------
    PangaeaDataset or NotImplemented
        ``self`` after the merge; ``NotImplemented`` if ``other`` is not a
        PangaeaDataset.
    """
    if not isinstance(other, PangaeaDataset):
        return NotImplemented

    for study_id, incoming in other.studies.items():
        if study_id not in self.studies:
            self.studies[study_id] = incoming
            continue

        # Duplicate StudyID: keep what is already registered and only check
        # whether the incoming copy disagrees with it.
        try:
            identical = (
                self.studies[study_id].to_summary_dict() == incoming.to_summary_dict()
            )
        except Exception:
            # A failed comparison is treated the same as differing content.
            identical = False

        if identical:
            continue

        logger.warning(
            f"PangaeaDataset in-place union: duplicate StudyID {study_id} with differing content. "
            "Keeping existing version."
        )

    return self

@staticmethod
def _normalize_id(study_id: str) -> int:
"""
Expand Down Expand Up @@ -139,14 +186,23 @@ def _resolve_and_register_ids(self, study_ids):
# -------------------------
# search_studies: q, bbox, keywords -> registers studies and returns same style as Dataset.search_studies (DataFrame)
# -------------------------
def search_studies(self,
# q: Optional[str] = None,
# study_ids: Optional[Union[int, str, List]] = None,
# bbox: Optional[Tuple[float, float, float, float]] = None,
# limit: int = 10,
# offset: int = 0,
# display: bool = False
**kwargs) -> Optional[pd.DataFrame]:
def search_studies(
self,
study_ids: int | str | list[int | str] | None = None,
topic: str | list[str] | None = None,
topic_and_or: str = "or",
search_text: str | None = None,
investigators: str | list[str] | None = None,
investigators_and_or: str = "and",
variable_name: str | list[str] | None = None,
variable_name_and_or: str = "and",
min_lat: float | None = None,
max_lat: float | None = None,
min_lon: float | None = None,
max_lon: float | None = None,
limit: int = 100,
skip: int = 0,
) -> Optional[pd.DataFrame]:
"""
Search PANGAEA and register results in self.studies.

Expand Down Expand Up @@ -328,6 +384,10 @@ def search_studies(self,
)
df.head()
"""

kwargs = locals().copy()
kwargs.pop("self")
self.studies.clear()
study_ids = kwargs.get("study_ids")

# -------------------------------------------
Expand Down Expand Up @@ -414,7 +474,8 @@ def get_summary(self) -> pd.DataFrame:
pandas.DataFrame
Return a DataFrame summarizing all loaded/registered PANGAEA datasets.
["StudyID","StudyName","EarliestYearBP","MostRecentYearBP",
"EarliestYearCE","MostRecentYearCE","StudyNotes","ScienceKeywords","Investigators",
"EarliestYearCE","MostRecentYearCE","Coverage [S, N, W, E]",
"StudyNotes","ScienceKeywords","Investigators",
"Publications","Sites","Funding"]
"""
rows = []
Expand Down
60 changes: 54 additions & 6 deletions pyleotups/utils/PangaeaStudy.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,48 @@ def _extract_temporal_extent(

return earliest_bp, latest_bp, earliest_ce, latest_ce

def _compute_coverage(self) -> Optional[Tuple[float, float, float, float]]:
    """
    Compute consolidated geographic coverage for the study events.

    The coverage is the bounding box of every non-None latitude/longitude
    value found on the events, including the optional second corner
    (``latitude2`` / ``longitude2``) when an event provides one.

    Returns
    -------
    tuple or None
        (MinLatitude, MaxLatitude, MinLongitude, MaxLongitude),
        or None when there are no events or no event supplies at least
        one latitude and one longitude.
    """
    # Tolerate a missing backing object or a missing/None event list rather
    # than raising; such a study simply has no coverage.
    panobj = getattr(self, "_panobj", None)
    events = getattr(panobj, "events", None)
    if events is None:
        return None

    latitudes = []
    longitudes = []

    for ev in events:
        # Read all four coordinates the same defensive way; an absent
        # attribute is treated like an explicit None.
        lat_values = (getattr(ev, "latitude", None), getattr(ev, "latitude2", None))
        lon_values = (getattr(ev, "longitude", None), getattr(ev, "longitude2", None))

        # A missing second corner contributes nothing new to min/max, so it
        # is skipped instead of duplicating the first coordinate.
        latitudes.extend(v for v in lat_values if v is not None)
        longitudes.extend(v for v in lon_values if v is not None)

    if not latitudes or not longitudes:
        return None

    return (
        min(latitudes),
        max(latitudes),
        min(longitudes),
        max(longitudes),
    )

def to_summary_dict(self) -> Dict[str, Any]:
"""
Convert study metadata to NOAA-style summary dictionary.
Expand All @@ -264,17 +306,16 @@ def to_summary_dict(self) -> Dict[str, Any]:
self.earliest_bp, self.latest_bp, self.earliest_ce, self.latest_ce = (
self._extract_temporal_extent()
)
# if collection_founds :
# logger.warning(
# f'The Summary Table Below may contain Dataset marked as collection.'
# f'Refer to the "CollectionMembers" column to identify collection datasets and their members.')
self.coverage = self._compute_coverage()

return {
"StudyID": self.study_id,
"StudyName": ds.title,
"EarliestYearBP": self.earliest_bp,
"MostRecentYearBP": self.latest_bp,
"EarliestYearCE": self.earliest_ce,
"MostRecentYearCE": self.latest_ce,
"Coverage [S, N, W, E]": self.coverage,
"StudyNotes": ds.abstract,
"ScienceKeywords": getattr(ds, "keywords", None),
"Investigators": ", ".join(a.fullname for a in ds.authors),
Expand Down Expand Up @@ -306,14 +347,21 @@ def get_geo(self) -> pd.DataFrame:
"""
rows = []
for ev in self._panobj.events:
lat1 = ev.latitude
lon1 = ev.longitude
lat2 = ev.latitude2 if getattr(ev, "latitude2", None) is not None else lat1
lon2 = ev.longitude2 if getattr(ev, "longitude2", None) is not None else lon1

rows.append(
{
"StudyID": self.study_id,
"SiteID": ev.id,
"SiteName": ev.label,
"LocationName": ev.location,
"Latitude": ev.latitude,
"Longitude": ev.longitude,
"MinLatitude": min(v for v in [lat1, lat2] if v is not None) if lat1 is not None or lat2 is not None else None,
"MaxLatitude": max(v for v in [lat1, lat2] if v is not None) if lat1 is not None or lat2 is not None else None,
"MinLongitude": min(v for v in [lon1, lon2] if v is not None) if lon1 is not None or lon2 is not None else None,
"MaxLongitude": max(v for v in [lon1, lon2] if v is not None) if lon1 is not None or lon2 is not None else None,
"Elevation": ev.elevation,
}
)
Expand Down
Loading