diff --git a/src/harpy/shape/__init__.pyi b/src/harpy/shape/__init__.pyi index 99ffd363..bace2cb3 100644 --- a/src/harpy/shape/__init__.pyi +++ b/src/harpy/shape/__init__.pyi @@ -1,5 +1,11 @@ from ._cell_expansion import create_voronoi_boundaries -from ._shape import add_shapes_layer, filter_shapes_layer, intersect_rectangles, vectorize +from ._filters import ( + filter_shapes_by_shapes, + filter_shapes_categorical, + filter_shapes_numerical, + morphological_features, +) +from ._shape import add_shapes_layer, filter_shapes_layer, intersect_rectangles, prep_region_annotations, vectorize __all__ = [ "add_shapes_layer", @@ -7,4 +13,9 @@ __all__ = [ "filter_shapes_layer", "intersect_rectangles", "vectorize", + "prep_region_annotations", + "morphological_features", + "filter_shapes_numerical", + "filter_shapes_categorical", + "filter_shapes_by_shapes", ] diff --git a/src/harpy/shape/_filters.py b/src/harpy/shape/_filters.py new file mode 100755 index 00000000..027c77ac --- /dev/null +++ b/src/harpy/shape/_filters.py @@ -0,0 +1,561 @@ +from collections.abc import Sequence +from typing import Literal + +import geopandas as gpd +import numpy as np +import pandas as pd +from shapely.geometry import LineString, MultiPolygon, Polygon +from shapely.ops import unary_union +from spatialdata import SpatialData +from spatialdata.transformations import get_transformation + +from harpy.shape._shape import add_shapes_layer +from harpy.utils.pylogger import get_pylogger + +log = get_pylogger(__name__) + + +# Extract exterior and interior boundaries +def _extract_exterior_lines(geom): + if isinstance(geom, Polygon): + return [geom.exterior] + elif isinstance(geom, MultiPolygon): + return [poly.exterior for poly in geom.geoms] + return [] + + +def _extract_interior_lines(geom): + if isinstance(geom, Polygon): + return list(geom.interiors) + elif isinstance(geom, MultiPolygon): + lines = [] + for poly in geom.geoms: + lines.extend(poly.interiors) + return lines + return [] + + +def morphological_features( + sdata: SpatialData, + shapes_layer: str, + output_shapes_layer: str, + shape_names_column: str = "name", + grouped_features: bool = False, + pixel_size_um: float = 1.0, + overwrite: bool = False, +) -> SpatialData: + """ + Compute morphological features for polygons in a shapes layer. Only Polygon and MultiPolygon are supported. + It is recommended to run `hp.sh.prep_region_annotations` first when computing features for region annotations that + may contain multipolygons, point geometries, unnamed annotations, etc. + + - `area`: Area of the polygon (px² or µm²). + - `perimeter`: Perimeter length (px or µm). + - `equivalent_diameter`: Diameter of a circle with the same area (px or µm). + - `convex_area`: Area of convex hull (px² or µm²). + - `convex_perimeter`: Perimeter of convex hull (px or µm). + - `circularity`: 4π * area / perimeter². + → 1 for a perfect circle; lower = irregular shape. + - `compactness`: perimeter² / area. + - `solidity`: area / convex_area. + → Low values mean concave or fragmented shapes. + - `convexity`: convex_perimeter / perimeter. + → 1 for perfectly convex shapes. Lower for rough or spiky boundaries. + - `centroid_x`: X-coordinate of the polygon centroid (px). + - `centroid_y`: Y-coordinate of the polygon centroid (px). + - `centroid_dif`: Distance between polygon and convex hull centroids normalized by the polygon area. + → Captures off-centered concavity or asymmetry. + - `major_axis_length`: Length of the longest side of the minimum rotated bounding box (px or µm). + - `minor_axis_length`: Length of the shortest side of the minimum rotated bounding box (px or µm). + - `major_minor_axis_ratio`: major_axis_length / minor_axis_length. + - `num_vertices`: Number of vertices along exterior boundaries. + - `boundary_complexity`: num_vertices / perimeter. + → Normalized measure of boundary irregularity. + - `num_holes`: Count of internal holes. + - `hole_area`: Area covered by internal holes (px² or µm²) + + Parameters + ---------- + sdata + SpatialData object containing the shapes layer. + shapes_layer + Name of the input shapes layer. + output_shapes_layer + Name of the output shapes layer to store results. + shape_names_column + Column name in shapes layer containing geometry names. Required when using grouped filters. + grouped_features + If True, also computes grouped morphological features by dissolving polygons sharing + the same name in `shape_names_column`. + pixel_size_um + Scale factor to convert geometric measurements from pixel to micron units. + - Applied to: + * "perimeter", "major_axis_length", and "minor_axis_length" → scaled by (pixel_size_um) + * "area" and "convex_area" → scaled by (pixel_size_um)² + - Dimensionless ratios (e.g., "circularity", "compactness", "solidity", "convexity", "centroid_dif", "major_minor_axis_ratio") + are unaffected by scaling but are computed using the scaled geometric quantities. + - XY-coordinates are kept in original units. + Defaults to 1.0 (no scaling, i.e. units remain in pixels). + overwrite + Whether to overwrite existing shapes layer. + + Returns + ------- + SpatialData + Object with updated shapes layer containing morphological feature columns. + """ + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + + # Filter out geometries that are not Polygon or MultiPolygon + supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) + + unssuported = len(gdf) - supported_mask.sum() + if unssuported > 0: + raise ValueError( + f"Found {unssuported} non-polygon geometries in {shapes_layer}. Consider running `hp.sh.prep_region_annotations` first to clean up geometries." + ) + if unssuported == len(gdf): + log.warning( + f"No supported geometries (Polygon, MultiPolygon) found. Skipping addition of shapes layer '{output_shapes_layer}' to sdata." + ) + return sdata + + gdf = gdf[supported_mask].copy() + + # Multipolygon check + n_multipolygons = gdf.geometry.apply(lambda x: isinstance(x, MultiPolygon)).sum() + if n_multipolygons > 0: + log.warning( + f"Detected {n_multipolygons} MultiPolygon geometries in '{shapes_layer}'. " + "Consider running `hp.sh.prep_region_annotations` first to split MultiPolygons before feature extraction." + ) + + # Dissolve polygons by name + grouped_gdf = None + if grouped_features: + if shape_names_column not in gdf.columns: + raise ValueError(f"Grouped computation requires column '{shape_names_column}'.") + grouped_gdf = gdf.dissolve(by=shape_names_column, as_index=False, aggfunc="first").copy() + grouped_gdf["geometry"] = gdf.groupby(shape_names_column).geometry.apply(lambda x: x.unary_union).values + log.info(f"Merged {len(gdf)} polygons into {len(grouped_gdf)} groups by '{shape_names_column}'.") + + # Compute features + def _compute_features(gdf: gpd.GeoDataFrame, pixel_size_um, suffix: str = ""): + # Base area & perimeter + gdf[f"area{suffix}"] = gdf.geometry.area * pixel_size_um**2 + gdf[f"perimeter{suffix}"] = gdf.geometry.length * pixel_size_um + + # Equivalent diameter + gdf[f"equivalent_diameter{suffix}"] = np.sqrt(4 * gdf[f"area{suffix}"] / np.pi) + + # Convex geometry + convex_hulls = gdf.geometry.convex_hull + gdf[f"convex_area{suffix}"] = convex_hulls.area * pixel_size_um**2 + gdf[f"convex_perimeter{suffix}"] = convex_hulls.length * pixel_size_um + + # Derived metrics + gdf[f"circularity{suffix}"] = 4 * np.pi * gdf[f"area{suffix}"] / (gdf[f"perimeter{suffix}"] ** 2) + gdf[f"compactness{suffix}"] = (gdf[f"perimeter{suffix}"] ** 2) / gdf[f"area{suffix}"] + gdf[f"solidity{suffix}"] = gdf[f"area{suffix}"] / gdf[f"convex_area{suffix}"] + gdf[f"convexity{suffix}"] = gdf[f"convex_perimeter{suffix}"] / gdf[f"perimeter{suffix}"] + + # Centroid-based metrics + gdf[f"centroid_x{suffix}"] = gdf.geometry.centroid.x + gdf[f"centroid_y{suffix}"] = gdf.geometry.centroid.y + hull_centroids = convex_hulls.centroid + gdf[f"centroid_dif{suffix}"] = np.sqrt( + (gdf[f"centroid_x{suffix}"] - hull_centroids.x) ** 2 + (gdf[f"centroid_y{suffix}"] - hull_centroids.y) ** 2 + ) / np.sqrt(gdf[f"area{suffix}"]) + + # Major/minor axis (from rotated rectangle) + def _rotated_rect_axes(geom): + try: + rect = geom.minimum_rotated_rectangle + x, y = rect.exterior.coords.xy + edges = np.sqrt(np.diff(x) ** 2 + np.diff(y) ** 2) + edges = np.sort(edges[:-1]) # remove duplicate closing edge + minor, major = edges[0], edges[1] + ratio = major / minor if minor > 0 else np.nan + return major, minor, ratio + except Exception: + return np.nan, np.nan, np.nan + + results = np.array([_rotated_rect_axes(g) for g in gdf.geometry]) + major_axis_length, minor_axis_length, major_minor_axis_ratio = results.T + gdf[f"major_axis_length{suffix}"] = major_axis_length * pixel_size_um + gdf[f"minor_axis_length{suffix}"] = minor_axis_length * pixel_size_um + gdf[f"major_minor_axis_ratio{suffix}"] = major_minor_axis_ratio + + # Vertex metrics + def _num_vertices(geom): + exteriors = _extract_exterior_lines(geom) + return sum(len(line.coords) for line in exteriors) + + gdf[f"num_vertices{suffix}"] = gdf.geometry.apply(_num_vertices) + gdf[f"boundary_complexity{suffix}"] = gdf[f"num_vertices{suffix}"] / gdf[f"perimeter{suffix}"] + + # Hole topology + def _num_holes(geom): + return len(_extract_interior_lines(geom)) + + def _hole_area(g): + interiors = _extract_interior_lines(g) + if not interiors: + return 0.0 + try: + holes = [Polygon(ring) for ring in interiors] + total_hole_area = sum(h.area for h in holes) + return total_hole_area + except Exception: + return np.nan + + gdf[f"num_holes{suffix}"] = gdf.geometry.apply(_num_holes) + gdf[f"hole_area{suffix}"] = gdf.geometry.apply(_hole_area) * pixel_size_um**2 + + return gdf + + gdf = _compute_features(gdf, pixel_size_um, "") + + if grouped_features and not grouped_gdf.empty: + grouped_gdf = _compute_features(grouped_gdf, pixel_size_um, "-grouped") + + grouped_metrics = [col for col in grouped_gdf.columns if col.endswith("-grouped")] + + grouped_lookup = grouped_gdf.set_index(shape_names_column) + for col in grouped_metrics: + gdf[col] = gdf[shape_names_column].map(grouped_lookup[col]) + + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. + assert get_transformation(gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) + + sdata = add_shapes_layer( + sdata, + input=gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata + + +def filter_shapes_numerical( + sdata: SpatialData, + shapes_layer: str, + output_shapes_layer: str, + numerical_column: str, + min_value: float | int | None = None, + max_value: float | int | None = None, + overwrite: bool = False, +) -> SpatialData: + """ + All polygons with a size outside of the min and max size range are removed using the `numerical_column` in `shapes_layer`. + Polygons are kept if `min_value` ≤ `numerical_column` ≤ `max_value`. + + Parameters + ---------- + sdata + SpatialData object containing the shapes layer. + shapes_layer + Name of the input shapes layer. + output_shapes_layer + Name of the output shapes layer to store the filtered polygons. + numerical_column + Name of numerical column in the shapes layer used for filtering. + min_value + minimum value of `numerical_column` (inclusive). If None, lower bound is ignored. + max_value + maximum value of `numerical_column` (inclusive). If None, upper bound is ignored. + overwrite + Whether to overwrite existing output layer. + + Returns + ------- + SpatialData + Object with updated shapes layer containing only polygons meeting criteria. + """ + gdf = sdata.shapes[shapes_layer].copy() + + if numerical_column not in gdf.columns: + raise ValueError(f"Column '{numerical_column}' not found in '{shapes_layer}.obs'. ") + + if not np.issubdtype(gdf[numerical_column].dtype, np.number): + raise ValueError(f"Column '{numerical_column}' must be numeric, but dtype is {gdf[numerical_column].dtype}.") + + # Filter cells based on min and max values + start = len(gdf) + mask = pd.Series(True, index=gdf.index) + + if min_value is not None: + below = (gdf[numerical_column] < min_value).sum() + log.info(f"Removed {below} cells below {min_value}.") + mask &= gdf[numerical_column] >= min_value + if max_value is not None: + above = (gdf[numerical_column] > max_value).sum() + log.info(f"Removed {above} cells above {max_value}.") + mask &= gdf[numerical_column] <= max_value + + filtered_gdf = gdf[mask].copy() + kept = len(filtered_gdf) + removed = start - len(filtered_gdf) + log.info( + f"Removed {removed}/{start} polygons outside {min_value}–{max_value} for '{numerical_column}' (kept {kept})." + ) + + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. + assert get_transformation(filtered_gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) + + # Add to SpatialData + sdata = add_shapes_layer( + sdata, + input=filtered_gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata + + +def filter_shapes_categorical( + sdata: SpatialData, + shapes_layer: str, + output_shapes_layer: str, + categorical_column: str | None = None, + include_values: str | Sequence[str] | None = None, + exclude_values: str | Sequence[str] | None = None, + overwrite: bool = False, +) -> SpatialData: + """ + Filter polygons in a shapes layer based on categorical values or index. + + Parameters + ---------- + sdata + SpatialData object containing the shapes layer. + shapes_layer + Name of the input shapes layer. + output_shapes_layer + Name of the output shapes layer to store filtered polygons. + categorical_column + Name of the categorical column in the shapes layer. If None, filtering is performed based on the index. + include_values + Value(s) to keep. Only polygons whose `categorical_column` matches one of these + values will be kept. Mutually exclusive with `exclude_values`. + exclude_values + Value(s) to remove. polygons whose `categorical_column` matches one of these + values will be removed. Mutually exclusive with `include_values`. + overwrite + If True, overwrites the `output_shapes_layer` if it already exists in `sdata`. + + Returns + ------- + SpatialData + Object with updated shapes layer containing filtered polygons. + """ + if include_values is not None and exclude_values is not None: + raise ValueError("Specify only one of 'include_values' or 'exclude_values'.") + + gdf = sdata.shapes[shapes_layer].copy() + + if categorical_column is not None and categorical_column not in gdf.columns: + raise ValueError(f"Column '{categorical_column}' not found in '{shapes_layer}'.") + + # Ensure include/exclude are lists + if isinstance(include_values, (str, int)): + include_values = [include_values] + if isinstance(exclude_values, (str, int)): + exclude_values = [exclude_values] + + # Filter + start = len(gdf) + mask = pd.Series(True, index=gdf.index) + + if categorical_column is not None: + # Filtering by column + if include_values is not None: + mask &= gdf[categorical_column].isin(include_values) + elif exclude_values is not None: + mask &= ~gdf[categorical_column].isin(exclude_values) + else: + # Filtering by index + if include_values is not None: + mask &= gdf.index.isin(include_values) + elif exclude_values is not None: + mask &= ~gdf.index.isin(exclude_values) + + filtered_gdf = gdf[mask].copy() + kept = len(filtered_gdf) + removed = start - kept + if categorical_column is None: + categorical_column = "index" + log.info(f"Removed {removed}/{start} polygons based on '{categorical_column}' filtering (kept {kept}).") + + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. + assert get_transformation(filtered_gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) + + # Add to SpatialData + sdata = add_shapes_layer( + sdata, + input=filtered_gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata + + +def filter_shapes_by_shapes( + sdata: SpatialData, + target_shapes_layer: str, + mask_shapes_layer: str, + output_shapes_layer: str, + shape_names_column: str | None = None, + shape_names: str | list[str] | None = None, + mode: Literal[ + "intersects", + "within", + "centroid_within", + "touches", + "disjoint", + "overlap_fraction", + "edge", + "outer_edge", + "inner_edge", + ] = "centroid_within", + keep: bool = True, + threshold: float | None = None, + overwrite: bool = False, +): + """ + Filter polygons in a target shapes layer based on geometric relationships to polygons of a mask shapes layer. + A typical use-case would be to filter cell segmentations based on their location within annotated regions. + + Parameters + ---------- + sdata + SpatialData object containing the target and mask shapes layers. + target_shapes_layer + Name of shapes layer whose polygons will be filtered. + mask_shapes_layer + Name of shapes layer used as mask for filtering. + output_shapes_layer + Name of the output shapes layer to store filtered polygons. + shape_names_column + Optional column in mask_shapes_layer to select specific polygons. + shape_names + Name or list of names of polygons in mask_shapes_layer to use for filtering. Ignored if shape_names_column is None. + mode + Geometric relationship to use for filtering. + Supported modes: + - `"intersects"`: Keep/remove polygons that intersect mask polygons (i.e. have partial or full overlap). + - `"within"`: Keep/remove polygons that are fully contained within mask polygons. + - `"centroid_within"`: Keep/remove polygons whose centroids are within mask polygons. + - `"touches"`: Keep/remove polygons that touch the boundary of mask polygons but do not overlap. + - `"disjoint"`: Keep/remove polygons that have no overlap with mask polygons at all. + - `"overlap_fraction"`: Keep/remove polygons whose overlap fraction with mask polygons exceeds a threshold. + - `"edge"`: Keep/remove polygons that partially overlap either the outer or inner edge of the mask polygons. + - `"outer_edge"`: Keep/remove polygons that overlap the outer edge (shell) of the mask polygons. + - `"inner_edge"`: Keep/remove polygons that overlap the inner edge (holes) of the mask polygons. + keep + Whether to keep (True) or remove (False) polygons matching the condition. + threshold + Threshold to use for overlap_fraction. + overwrite + If True, overwrites the output shapes layer if it exists. + + Returns + ------- + SpatialData object with filtered shapes layer. + """ + # Copy target layer + target_gdf = sdata.shapes[target_shapes_layer].copy() + + # Copy mask layer + mask_gdf = sdata.shapes[mask_shapes_layer].copy() + + # Optionally select subset of mask polygons + if shape_names_column is not None and shape_names is not None: + if shape_names_column not in mask_gdf.columns: + raise ValueError(f"Column '{shape_names_column}' not found in mask layer '{mask_shapes_layer}'.") + if isinstance(shape_names, str): + shape_names = [shape_names] + mask_gdf = mask_gdf[mask_gdf[shape_names_column].isin(shape_names)].copy() + if mask_gdf.empty: + raise ValueError(f"No geometries found in '{mask_shapes_layer}' matching {shape_names}.") + + # Build mask union + mask_union = mask_gdf.unary_union + + # Compute condition + if mode == "intersects": + condition = target_gdf.geometry.intersects(mask_union) + elif mode == "within": + condition = target_gdf.geometry.within(mask_union) + elif mode == "centroid_within": + condition = target_gdf.geometry.centroid.within(mask_union) + elif mode == "touches": + condition = target_gdf.geometry.touches(mask_union) + elif mode == "disjoint": + condition = target_gdf.geometry.disjoint(mask_union) + elif mode == "overlap_fraction": + if threshold is None: + raise ValueError("Must specify 'threshold' for overlap_fraction mode.") + intersecting = target_gdf[target_gdf.geometry.intersects(mask_union)] + overlaps = intersecting.geometry.intersection(mask_union) + fraction = overlaps.area / intersecting.geometry.area + condition = pd.Series(False, index=target_gdf.index) + condition.loc[intersecting.index] = fraction > threshold + elif mode == "edge": + condition = target_gdf.geometry.intersects(mask_union) & ~target_gdf.geometry.within(mask_union) + elif mode in ["outer_edge", "inner_edge"]: + outer_boundaries = [] + inner_boundaries = [] + + if isinstance(mask_union, gpd.GeoSeries): + for geom in mask_union: + outer_boundaries.extend(_extract_exterior_lines(geom)) + inner_boundaries.extend(_extract_interior_lines(geom)) + else: + outer_boundaries.extend(_extract_exterior_lines(mask_union)) + inner_boundaries.extend(_extract_interior_lines(mask_union)) + + outer_boundary_union = ( + unary_union([LineString(boundary) for boundary in outer_boundaries]) if outer_boundaries else None + ) + inner_boundary_union = ( + unary_union([LineString(boundary) for boundary in inner_boundaries]) if inner_boundaries else None + ) + + if mode == "outer_edge": + condition = target_gdf.geometry.intersects(outer_boundary_union) + elif mode == "inner_edge": + condition = ( + target_gdf.geometry.intersects(inner_boundary_union) if inner_boundary_union is not None else False + ) + + else: + raise ValueError(f"Unsupported mode '{mode}'.") + + # Apply keep/remove + filtered_gdf = target_gdf[condition if keep else ~condition].copy() + removed = len(target_gdf) - len(filtered_gdf) + + if keep: + log.info(f"Kept {len(filtered_gdf)} / {len(target_gdf)} geometries (removed {removed}).") + else: + log.info(f"Removed {removed} / {len(target_gdf)} geometries (kept {len(filtered_gdf)}).") + + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. + assert get_transformation(filtered_gdf, get_all=True) == get_transformation( + sdata[target_shapes_layer], get_all=True + ) + + # Add to SpatialData + sdata = add_shapes_layer( + sdata, + input=filtered_gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata diff --git a/src/harpy/shape/_shape.py b/src/harpy/shape/_shape.py index c5473db8..52eabd24 100644 --- a/src/harpy/shape/_shape.py +++ b/src/harpy/shape/_shape.py @@ -1,14 +1,18 @@ from __future__ import annotations +import numpy as np from dask.array import Array from geopandas import GeoDataFrame -from shapely.geometry import GeometryCollection, MultiPolygon, Polygon +from shapely.geometry import GeometryCollection, MultiPolygon, Point, Polygon from spatialdata import SpatialData from spatialdata.models._utils import MappingToCoordinateSystem_t from spatialdata.transformations import get_transformation from harpy.image._image import _get_spatial_element from harpy.shape._manager import ShapesLayerManager +from harpy.utils.pylogger import get_pylogger + +log = get_pylogger(__name__) def vectorize( @@ -179,3 +183,125 @@ def intersect_rectangles(rect1: list[int | float], rect2: list[int | float]) -> return [x_min, x_max, y_min, y_max] else: return None + + +def prep_region_annotations( + sdata: SpatialData, + shapes_layer: str, + output_shapes_layer: str, + shape_names_column: str = "name", + unnamed: str = "unnamed", + unique_shape_names_column: str = "name-unique", + erosion: float = 0.5, + overwrite: bool = False, +): + """ + Prepares region annotations in a shapes layer for `hp.sh.morphological_features`, `hp.tb.assign_cells_to_shapes` and `hp.tb.compute_distance_to_shapes`. + Operations performed: + - Ensures a shape name column exists and fills missing names. + - Converts Points with a `radius` column into circular polygons. Points without a `radius` column will be preserved as Points. + - Slightly erodes polygons to avoid shared edges. + - Explodes multipolygons into separate single polygons. + - Generates unique names for shapes with duplicate base names. + - Shift indices by +1 if index 0 is present (index 0 is reserved for background in labels layers). + + Parameters + ---------- + sdata + The SpatialData object containing the input shapes layer. + shapes_layer + The shapes layer in `sdata.shapes` to use as input. + output_shapes_layer + The output shapes layer in `sdata.tables` to which the updated shapes layer will be written. + shape_names_column + Column name in shapes layer containing geometry names. If not present, new names will be generated. + unnamed + Name to be assigned to any unnamed geometries in `shape_names_column`. Defaults to 'unnamed'. + unique_shape_names_column + Column name in which unique names will be created for single polygons by appending a counter to the original name in `shape_names_column` for polygons with the same name. Note + that multipolygons will be split in individual polygons and each will get a unique name based on the original name of the multipolygon. Unique names will be stored in + `{shape_names_column}-unique` in the updated shapes layer. + erosion + Number of pixels to erode polygons by. This can avoid problems with overlapping edges of geometries when calculating distances. Default is 0.5 (i.e. erosion by 0.5 pixels). + overwrite + If True, overwrites the `output_shapes_layer` if it already exists in `sdata`. + + Returns + ------- + Modified `sdata` object with updated updated shapes layer. + """ + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + log.info(f"Found {len(gdf)} geometries in {shapes_layer}.") + + # Ensure shape_names_column exist + if shape_names_column not in gdf.columns: + gdf[shape_names_column] = unnamed + + gdf[shape_names_column] = gdf[shape_names_column].fillna("").astype(str) + unnamed_mask = gdf[shape_names_column] == "" + for _i, idx in enumerate(gdf[unnamed_mask].index): + gdf.at[idx, shape_names_column] = unnamed + + # Convert Points with a radius column to circular polygons + def _point_to_circle(geom, radius=None): + if isinstance(geom, Point) and radius is not None: + return geom.buffer(radius, resolution=16) + return geom + + if "radius" in gdf.columns: + mask_points_with_radius = gdf.geometry.geom_type.eq("Point") & gdf["radius"].notna() + n_converted = mask_points_with_radius.sum() + + if n_converted > 0: + log.info(f"Converting {n_converted} Point geometries with 'radius' to circular polygons.") + gdf.loc[mask_points_with_radius, "geometry"] = gdf.loc[mask_points_with_radius].apply( + lambda row: _point_to_circle(row.geometry, getattr(row, "radius", None)), axis=1 + ) + + # Slightly erode all polygons (this avoids any shared borders between polygons) + gdf["geometry"] = gdf.geometry.apply( + lambda geom: geom.buffer(-erosion, join_style=2, resolution=16) + if geom.geom_type in ["Polygon", "MultiPolygon"] + else geom + ) + polygon_mask = ~gdf.geometry.is_empty + removed = len(gdf) - polygon_mask.sum() + gdf = gdf[polygon_mask] # drop any polygons that collapsed to empty + if removed > 0: + log.info(f"Removed {removed} polygons that collapsed to empty after erosion.") + + # Explode multipolygons into single polygons (this allows us to treat multipolygons as unique polygons) + n_multipolygons = gdf.geometry.apply(lambda g: g.geom_type == "MultiPolygon").sum() + gdf = gdf.explode(index_parts=False).reset_index(drop=True) + n_after = len(gdf) + log.info( + f"Split {n_multipolygons} multipolygons into individual polygons. Total number of geometries after splitting multipolgons is {n_after}." + ) + + # Create unique names + sizes = gdf.groupby(shape_names_column)[shape_names_column].transform("size") + counter = gdf.groupby(shape_names_column).cumcount() + 1 + gdf[unique_shape_names_column] = np.where( + sizes.eq(1), gdf[shape_names_column], gdf[shape_names_column] + counter.astype(str) + ) + + # Avoid index 0 + if 0 in gdf.index: + log.warning( + "Index 0 found in shapes layer — shifting all indices by +1 since index 0 is reserved for background in labels layers." + ) + gdf.index = gdf.index + 1 + + # sanity check. If this sanity check would fail in spatialdata at some point, then pass transformation to transformations parameter of add_shapes_layer. + assert get_transformation(gdf, get_all=True) == get_transformation(sdata[shapes_layer], get_all=True) + + # Add filtered shapes layer + sdata = add_shapes_layer( + sdata, + input=gdf, + output_layer=output_shapes_layer, + overwrite=overwrite, + ) + + return sdata diff --git a/src/harpy/table/__init__.pyi b/src/harpy/table/__init__.pyi index bd9b5837..a254ba3a 100644 --- a/src/harpy/table/__init__.pyi +++ b/src/harpy/table/__init__.pyi @@ -4,8 +4,9 @@ from ._annotation import cluster_cleanliness, score_genes, score_genes_iter from ._clustering import kmeans, leiden from ._enrichment import nhood_enrichment from ._preprocess import preprocess_proteomics, preprocess_transcriptomics +from ._region_annotations import assign_cells_to_shapes, compute_distance_to_shapes from ._regionprops import add_regionprop_features -from ._table import add_table_layer, correct_marker_genes, filter_on_size +from ._table import add_table_layer, correct_marker_genes, filter_categorical, filter_numerical, filter_on_size from .cell_clustering._clustering import flowsom from .cell_clustering._preprocess import cell_clustering_preprocess from .cell_clustering._weighted_channel_expression import weighted_channel_expression @@ -16,6 +17,8 @@ __all__ = [ "add_table_layer", "correct_marker_genes", "filter_on_size", + "filter_numerical", + "filter_categorical", "flowsom", "weighted_channel_expression", "cell_clustering_preprocess", @@ -33,4 +36,6 @@ __all__ = [ "allocate", "bin_counts", "allocate_intensity", + "assign_cells_to_shapes", + "compute_distance_to_shapes", ] diff --git a/src/harpy/table/_region_annotations.py b/src/harpy/table/_region_annotations.py new file mode 100755 index 00000000..78944fff --- /dev/null +++ b/src/harpy/table/_region_annotations.py @@ -0,0 +1,604 @@ +from typing import Literal + +import geopandas as gpd +import pandas as pd +from shapely.geometry import LineString, MultiLineString, MultiPolygon, Point, Polygon +from spatialdata import SpatialData + +from harpy.table._table import add_table_layer +from harpy.utils._keys import _REGION_KEY +from harpy.utils.pylogger import get_pylogger + +log = get_pylogger(__name__) + + +def assign_cells_to_shapes( + sdata: SpatialData, + shapes_layer: str, + table_layer: str, + output_table_layer: str, + shape_names_column: str = "name", + unique_shape_names_column: str = "name-unique", + output_column: str = None, + mode: Literal["original_names", "unique_names", "both"] = "original_names", + create_column_per_shape: bool = False, + overlap_tolerance: float = 0.1, + spatial_key: str = "spatial", + xy_columns: tuple = None, + overwrite: bool = False, +): + """ + Assign cells to polygons in a shapes layer and update the sdata table layer. It is recommended to run `hp.sh.prep_region_annotations` first. + + Parameters + ---------- + sdata + The SpatialData object containing the input table layer and shapes layer. + shapes_layer + The shapes layer in `sdata.shapes` to use as input. + table_layer + The table layer in `sdata.tables` to use as input. + output_table_layer + The output table layer in `sdata.tables` to which the updated table layer will be written. + shape_names_column + Column name in shapes layer containing geometry names. + unique_shape_names_column + Column name in shapes layer containing unique geometry names. + output_column + Name of the output column in `sdata.tables[table_layer].obs` to store the shape name of the geometries a cell was found in (if `create_column_per_shape` is False). + For the `unique_names` mode, the output will be stored in `{output_column}-unique` in `sdata.tables[table_layer].obs` (if `create_column_per_shape` is False). + If create_column_per_shape is True and output_column is not None, then column names will be in the format `{output_colum}-{shape_name}`. + If create_column_per_shape is True and output_column is None, then column names will be in the format `{shape_name}`. + mode + When set to `original_names`, original polygon names from `shape_names_column` will be used. + When set to `unique_names`, unique polygon names from `unique_shape_names_column` will be used. Use `both`, to run both modes at the same time. + create_column_per_shape + If True, create one column (named according to the shape names) per shape indicating whether a cell is located inside it. + overlap_tolerance + Tolerance for detecting overlapping polygons (area units of geometry CRS). + spatial_key + Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. + xy_columns + Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. + If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. + overwrite + If True, overwrites the `output_table_layer` and/or `output_shapes_layer` if it already exists in `sdata`. + + Notes + ----- + - Only `Polygon` and `MultiPolygon` geometries are supported. Non-polygon geometries (e.g., `Point`, `LineString`) are skipped. + + Returns + ------- + Modified `sdata` object with updated table layer. + """ + if not output_column and not create_column_per_shape: + raise ValueError("Specify `output_column` or set `create_column_per_shape=True`.") + + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + + # Create copy of table layer + adata = sdata.tables[table_layer].copy() + + # Filter out geometries that are not Polygon or MultiPolygon + supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))) + skipped = len(gdf) - supported_mask.sum() + if skipped > 0: + log.info(f"Skipped {skipped} non-polygon geometries in {shapes_layer}.") + if skipped == len(gdf): + log.warning( + f"No supported geometries (Polygon, MultiPolygon) found. Skipping addition of table layer '{output_table_layer}' to sdata." + ) + return sdata + + gdf = gdf[supported_mask].copy().reset_index(drop=True) + + # Check for overlapping polygons + total_area = gdf.geometry.area.sum() + union_area = gdf.geometry.unary_union.area + if total_area - union_area > overlap_tolerance and not create_column_per_shape: + raise ValueError( + f"Overlapping polygons detected in {shapes_layer}. Correct polygons or use create_column_per_shape." + ) + elif total_area - union_area > 0 and not create_column_per_shape: + log.warning( + f"Overlaps detected (Δ={total_area - union_area:.3f}), below tolerance threshold {overlap_tolerance}." + ) + + # Get cell coordinates + if xy_columns is not None: + x_col, y_col = xy_columns + coords = adata.obs[[x_col, y_col]].to_numpy() + else: + if spatial_key not in adata.obsm: + raise KeyError(f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") + coords = adata.obsm[spatial_key] + if coords.shape[1] != 2: + raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") + + # Function to assign cell to a polygon + def _assign_region(x, y, gdf, column, sindex): + point = Point(x, y) + + # Quick bounding box search to prefilter the cells + candidate_idx = list(sindex.intersection(point.bounds)) + if not candidate_idx: + return None + + # Slower point‑in‑polygon search + candidate_gdf = gdf.iloc[candidate_idx] + match = candidate_gdf[candidate_gdf.contains(point)] + + if match.empty: + return None + + return match[column].iloc[0] + + # Assign cells to polygons + if create_column_per_shape: + # Collect all relevant name columns based on mode + name_columns = [] + if mode in ("original_names", "both"): + name_columns.append(shape_names_column) + if mode in ("unique_names", "both"): + name_columns.append(unique_shape_names_column) + + # Collect all unique names across both columns + name_to_column = {} + for name_column in name_columns: + for name in gdf[name_column].dropna().astype(str).unique(): + if name not in name_to_column: + name_to_column[name] = name_column + + # Create column per shape name + for shape_name, name_column in name_to_column.items(): + col_name = f"{output_column}-{shape_name}" if output_column else shape_name + + gdf_shape = gdf[gdf[name_column] == shape_name] + if gdf_shape.empty: + continue + + sidx_shape = gdf_shape.sindex # Build spatial index + assigned = [_assign_region(x, y, gdf_shape, name_column, sidx_shape) for x, y in coords] + adata.obs[col_name] = assigned + + else: + # One output column per mode + run_assign_dict = {} + if mode in ("original_names", "both"): + run_assign_dict[output_column] = shape_names_column + if mode in ("unique_names", "both"): + run_assign_dict[f"{output_column}-unique"] = unique_shape_names_column + + for output_column_name, name_column in run_assign_dict.items(): + # Assign one shape name per cell (to be avoided with overlapping regions) + sidx = gdf.sindex # Build spatial index + assigned = [_assign_region(x, y, gdf, name_column, sidx) for x, y in coords] + adata.obs[output_column_name] = assigned + + # Add table layer + sdata = add_table_layer( + sdata, + adata=adata, + output_layer=output_table_layer, + region=adata.obs[_REGION_KEY].cat.categories.to_list(), + overwrite=overwrite, + ) + + return sdata + + +def compute_distance_to_shapes( + sdata: SpatialData, + shapes_layer: str, + table_layer: str, + output_table_layer: str, + shape_names_column: str = "name", + unique_shape_names_column: str = "name-unique", + output_name: str = None, + pixel_size_um: float = 1.0, + modes: list[ + Literal[ + "nearest_edge", + "nearest_edge_grouped", + "all_edges", + "nearest_outer_edge", + "nearest_outer_edge_grouped", + "all_outer_edges", + "nearest_inner_edge", + "nearest_inner_edge_grouped", + "nearest_inner_edge_grouped_unique", + "all_inner_edges", + "nearest_centroid", + "nearest_centroid_grouped", + "all_centroids", + "nearest_point", + "nearest_point_grouped", + "all_points", + ] + ] = None, + spatial_key: str = "spatial", + xy_columns: tuple = None, + overwrite: bool = False, +): + """ + Compute distances from cells (using their centroid coordinates) to polygons in a shapes layer and update the sdata table layer. + It is recommended to run `hp.sh.prep_region_annotations` first. + + Parameters + ---------- + sdata + The SpatialData object containing the input table layer and shapes layer. + shapes_layer + The shapes layer in `sdata.shapes` to use as input. + table_layer + The table layer in `sdata.tables` to use as input. + output_table_layer + The output table layer in `sdata.tables` to which the updated table layer will be written. + shape_names_column + Column name in shapes layer containing geometry names. + unique_shape_names_column + Column name in shapes layer containing unique geometry names. + output_name + Prefix for new distance columns in `.obs`. If None, no prefix is added. + pixel_size_um + Scale factor to convert distances to microns. Defaults to 1 (i.e. distances are in pixels). + modes + Which distance features to calculate. Options include: + + Edge distances (For Polygon and MultiPolygon geometries) + - `"nearest_edge"`: Distance to the nearest edge (outer or inner) of any polygon. + Creates `{output_name}-distance_to_nearest_edge` and `{output_name}-name_of_nearest_edge` columns. + - `"nearest_edge_grouped"`: For each group of shapes with the same name, compute the distance to the edge that is nearest. + Creates `{output_name}-distance_to_nearest_edge_of_` and `{output_name}-name_of_nearest_edge_of_` columns. + - `"all_edges"`: For each individual polygon, compute the distance to its edge. + Creates `{output_name}-distance_to_edge_of_` column. + + Outer edge distances (For Polygon and MultiPolygon geometries) + - `"nearest_outer_edge"`: Distance to the nearest outer edge of any polygon. + Creates `{output_name}-distance_to_nearest_outer_edge` and `{output_name}-name_of_nearest_outer_edge` columns. + - `"nearest_outer_edge_grouped"`: For each group of shapes with the same name, compute the distance to the outer edge that is nearest. + Creates `{output_name}-distance_to_nearest_outer_edge_of_` and `{output_name}-name_of_nearest_outer_edge_of_` columns. + - `"all_outer_edges"`: For each individual polygon, compute the distance to its outer edge. + Creates `{output_name}-distance_to_outer_edge_of_` column. + + Inner edge (hole) distances (For Polygon and MultiPolygon geometries) + - `"nearest_inner_edge"`: Distance to the nearest interior edge (“hole”) of any polygon. + Creates `{output_name}-distance_to_nearest_inner_edge` and `{output_name}-name_of_nearest_inner_edge` columns. + - `"nearest_inner_edge_grouped"`: For each group of shapes with the same name, compute the distance to the nearest inner edge of that group. + Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. + - `"nearest_inner_edge_grouped_unique"`: For each individual polygon, compute the distance to the nearest inner edge of that polygon. + Creates `{output_name}-distance_to_nearest_inner_edge_of_` and `{output_name}-name_of_nearest_inner_edge_of_` columns. + - `"all_inner_edges"`: For all holes, compute the distance to each inner edge. + Creates `{output_name}-distance_to_inner_edge_of_-hole` column. + + Centroid distances (For Polygon and MultiPolygon geometries) + - `"nearest_centroid"`: Distance to the nearest centroid of any polygon. + Creates `{output_name}-distance_to_nearest_centroid` and `{output_name}-name_of_nearest_centroid` columns. + - `"nearest_centroid_grouped"`: For each group of shapes with the same name, compute the distance to the centroid that is nearest. + Creates `{output_name}-distance_to_nearest_centroid_of_` and `{output_name}-name_of_nearest_centroid_of_` columns. + - `"all_centroids"`: For each individual polygon, compute the distance to its centroid. + Creates `{output_name}-distance_to_centroid_of_` column. + + Point distances (For Point geometries) + - `"nearest_point"`: Distance to the nearest point. + Creates `{output_name}-distance_to_nearest_point` and `{output_name}-name_of_nearest_point` columns. + - `"nearest_point_grouped"`: For each group of points with the same name, compute the nearest distance. + Creates `{output_name}-distance_to_nearest_point_of_` and `{output_name}-name_of_nearest_point_of_` columns. + - `"all_points"`: For each individual Point, compute the distance to point coordinates. + Creates `{output_name}-distance_to_point_` column. + + spatial_key + Key in `sdata.tables[table_layer].obsm` containing spatial coordinates. Ignored if `xy_columns` is provided. + xy_columns + Tuple of column names in `sdata.tables[table_layer].obs` containing the x and y coordinates the cells. + If None, defaults to using coordinates from `sdata.tables[table_layer].obsm[spatial_key]`. + overwrite + If True, overwrites the `output_table_layer` if it already exists in `sdata`. + + Notes + ----- + - Only `Polygon`, `MultiPolygon` and `Point` geometries are supported. Other geometries (e.g., `LineString`, `MultiPoint`) are skipped. + + Returns + ------- + Modified `sdata` object with updated table layer. + """ + if modes is None: + modes = ["all_edges"] + if output_name is not None: + output_name = f"{output_name}-" + elif output_name is None: + output_name = "" + + # Create copy of shapes layer + gdf = sdata.shapes[shapes_layer].copy() + + # Create copy of table layer + adata = sdata.tables[table_layer].copy() + + # Filter out geometries that are not Polygon, MultiPolygon or Point + supported_mask = gdf.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon, Point))) + skipped = len(gdf) - supported_mask.sum() + if skipped > 0: + log.info(f"Skipped {skipped} geometries in {shapes_layer} that are not Polygon, MultiPolygon or Point.") + + gdf = gdf[supported_mask].copy().reset_index(drop=True) + + # Separate gdf by type + gdf_polygons = gdf[gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])].copy() + gdf_points = gdf[gdf.geometry.geom_type == "Point"].copy() + + has_polygons = not gdf_polygons.empty + has_points = not gdf_points.empty + + if not has_polygons and not has_points: + log.warning( + f"No supported geometries (Polygon, MultiPolygon, Point) found. Skipping addition of table layer '{output_table_layer}' to sdata." + ) + return sdata + + # Get cell coordinates + if xy_columns is not None: + x_col, y_col = xy_columns + coords = adata.obs[[x_col, y_col]].to_numpy() + else: + if spatial_key not in adata.obsm: + raise KeyError(f"No spatial coordinates found in `obsm['{spatial_key}']` and `xy_columns` not provided.") + coords = adata.obsm[spatial_key] + if coords.shape[1] != 2: + raise ValueError(f"`obsm['{spatial_key}']` must have shape (n_cells, 2).") + + # Build point GeoDataFrame + pts = gpd.GeoSeries([Point(x, y) for x, y in coords], crs=gdf.crs) + + pts_gdf = gpd.GeoDataFrame({"geometry": pts}, crs=gdf.crs).set_index(adata.obs.index) + + # Polygon and MultiPolygon + if has_polygons: + # Extract exterior and interior boundaries + def _extract_exterior_lines(geom): + if isinstance(geom, Polygon): + return [geom.exterior] + elif isinstance(geom, MultiPolygon): + return [poly.exterior for poly in geom.geoms] + return [] + + def _extract_interior_lines(geom): + if isinstance(geom, Polygon): + return list(geom.interiors) + elif isinstance(geom, MultiPolygon): + lines = [] + for poly in geom.geoms: + lines.extend(poly.interiors) + return lines + return [] + + names = gdf_polygons[unique_shape_names_column].tolist() + exteriors = [MultiLineString(_extract_exterior_lines(geom)) for geom in gdf_polygons.geometry] + interiors = [MultiLineString(_extract_interior_lines(geom)) for geom in gdf_polygons.geometry] + + ext_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=exteriors, crs=gdf.crs) + int_gdf = gpd.GeoDataFrame({unique_shape_names_column: names}, geometry=interiors, crs=gdf.crs) + + # Edges (outer and inner) + if "nearest_edge" in modes: + log.info("Calculating 'nearest_edge' distances'") + all_edges_gdf = gpd.GeoDataFrame(pd.concat([ext_gdf, int_gdf], ignore_index=True), crs=gdf.crs) + joined = gpd.sjoin_nearest(pts_gdf, all_edges_gdf, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_edge"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_edge"] = joined[unique_shape_names_column] + + if "nearest_edge_grouped" in modes: + log.info("Calculating 'nearest_edge_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + edges = [] + labels = [] + for _idx, row in group.iterrows(): + edge_lines = _extract_exterior_lines(row.geometry) + _extract_interior_lines(row.geometry) + edges.extend(edge_lines) + labels.extend([row[unique_shape_names_column]] * len(edge_lines)) + + edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) + joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_edge_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_edge_of_{name}"] = joined[unique_shape_names_column] + + if "all_edges" in modes: + log.info("Calculating 'all_edges' distances'") + for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_edge_of_{feat[unique_shape_names_column]}"] = ( + pts.distance(feat.geometry.boundary).to_numpy() * pixel_size_um + ) + + # Outer edges + if "nearest_outer_edge" in modes: + log.info("Calculating 'nearest_outer_edge' distances'") + joined = gpd.sjoin_nearest(pts_gdf, ext_gdf, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_outer_edge"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_outer_edge"] = joined[unique_shape_names_column] + + if "nearest_outer_edge_grouped" in modes: + log.info("Calculating 'nearest_outer_edge_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + edges = [] + labels = [] + for _idx, row in group.iterrows(): + edge_lines = _extract_exterior_lines(row.geometry) + edges.extend(edge_lines) + labels.extend([row[unique_shape_names_column]] * len(edge_lines)) + + edge_gdf = gpd.GeoDataFrame({"geometry": edges, unique_shape_names_column: labels}, crs=gdf.crs) + joined = gpd.sjoin_nearest(pts_gdf, edge_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_outer_edge_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_outer_edge_of_{name}"] = joined[unique_shape_names_column] + + if "all_outer_edges" in modes: + log.info("Calculating 'all_outer_edges' distances'") + for _, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_outer_edge_of_{feat[unique_shape_names_column]}"] = ( + pts.distance(feat.geometry.exterior).to_numpy() * pixel_size_um + ) + + # Inner edges + hole_geoms = [] + hole_names = [] + + for _, row in gdf_polygons.reset_index(drop=True).iterrows(): + base = row[unique_shape_names_column] + geom = row.geometry + + holes = _extract_interior_lines(geom) + + if len(holes) == 0: + continue + + if len(holes) == 1: + hole = holes[0] + hole_geoms.append(LineString(hole.coords)) + hole_names.append(base) + + else: + for i, hole in enumerate(holes, start=1): + hole_geoms.append(LineString(hole.coords)) + hole_names.append(f"{base}-hole{i}") + + holes_gdf = gpd.GeoDataFrame({unique_shape_names_column: hole_names}, geometry=hole_geoms, crs=gdf.crs) + + if "nearest_inner_edge" in modes and not holes_gdf.empty: + log.info("Calculating 'nearest_inner_edge' distances'") + joined = gpd.sjoin_nearest(pts_gdf, holes_gdf, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_inner_edge"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_inner_edge"] = joined[unique_shape_names_column] + + if "nearest_inner_edge_grouped" in modes: + log.info("Calculating 'nearest_inner_edge_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + hole_lines = [] + hole_labels = [] + + for _idx, row in group.iterrows(): + base = row[unique_shape_names_column] + for i, hole in enumerate(_extract_interior_lines(row.geometry), start=1): + hole_lines.append(LineString(hole.coords)) + hole_labels.append(f"{base}-hole{i}") + + if not hole_lines: + continue + + hole_gdf = gpd.GeoDataFrame({"geometry": hole_lines, "hole_name": hole_labels}, crs=gdf.crs) + + joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{name}"] = joined["hole_name"] + + if "nearest_inner_edge_grouped_unique" in modes: + log.info("Calculating 'nearest_inner_edge_grouped_unique' distances'") + for _idx, row in gdf_polygons.iterrows(): + base = row[unique_shape_names_column] + holes = _extract_interior_lines(row.geometry) + + if not holes: + continue + + hole_geoms = [LineString(hole.coords) for hole in holes] + hole_names = [f"{base}-hole{i + 1}" for i in range(len(holes))] + + hole_gdf = gpd.GeoDataFrame({"geometry": hole_geoms, "hole_name": hole_names}, crs=gdf.crs) + + joined = gpd.sjoin_nearest(pts_gdf, hole_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_inner_edge_of_{base}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_inner_edge_of_{base}"] = joined["hole_name"] + + if "all_inner_edges" in modes and not holes_gdf.empty: + log.info("Calculating 'all_inner_edges' distances'") + for _, hole in holes_gdf.iterrows(): + hole_name = hole[unique_shape_names_column] + col = f"{output_name}distance_to_inner_edge_of_{hole_name}" + adata.obs[col] = pts.distance(hole.geometry).to_numpy() * pixel_size_um + + # Centroids + centroids_df = gdf_polygons.reset_index(drop=True)[[unique_shape_names_column]].copy() + centroids_df["geometry"] = gdf_polygons.geometry.centroid.reset_index(drop=True) + + centroids_gdf = gpd.GeoDataFrame(centroids_df, geometry="geometry", crs=gdf.crs) + + if "nearest_centroid" in modes: + log.info("Calculating 'nearest_centroid' distances'") + joined = gpd.sjoin_nearest(pts_gdf, centroids_gdf, how="left", distance_col="dist") + adata.obs.loc[joined.index, f"{output_name}distance_to_nearest_centroid"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs.loc[joined.index, f"{output_name}name_of_nearest_centroid"] = joined[ + unique_shape_names_column + ].to_numpy() + + if "nearest_centroid_grouped" in modes: + log.info("Calculating 'nearest_centroid_grouped' distances'") + for name, group in gdf_polygons.groupby(shape_names_column): + centroids = group.geometry.centroid + labels = group[unique_shape_names_column].tolist() + + centroid_gdf = gpd.GeoDataFrame({"geometry": centroids, unique_shape_names_column: labels}, crs=gdf.crs) + + joined = gpd.sjoin_nearest(pts_gdf, centroid_gdf, how="left", distance_col="dist") + + adata.obs[f"{output_name}distance_to_nearest_centroid_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_centroid_of_{name}"] = joined[unique_shape_names_column] + + if "all_centroids" in modes: + log.info("Calculating 'all_centroids' distances'") + for _idx, feat in gdf_polygons.reset_index(drop=True).iterrows(): + adata.obs[f"{output_name}distance_to_centroid_of_{feat[unique_shape_names_column]}"] = ( + pts.distance(feat.geometry.centroid).to_numpy() * pixel_size_um + ) + + # Point + if has_points: + if "nearest_point" in modes: + log.info("Calculating 'nearest_point' distances'") + joined = gpd.sjoin_nearest(pts_gdf, gdf_points, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_point"] = joined["dist"].to_numpy() * pixel_size_um + adata.obs[f"{output_name}name_of_nearest_point"] = joined[f"{shape_names_column}-unique"] + + if "nearest_point_grouped" in modes: + log.info("Calculating 'nearest_point_grouped' distances'") + for name, group in gdf_points.groupby(shape_names_column): + joined = gpd.sjoin_nearest(pts_gdf, group, how="left", distance_col="dist") + adata.obs[f"{output_name}distance_to_nearest_point_of_{name}"] = ( + joined["dist"].to_numpy() * pixel_size_um + ) + adata.obs[f"{output_name}name_of_nearest_point_of_{name}"] = joined[f"{shape_names_column}-unique"] + + if "all_points" in modes: + log.info("Calculating 'all_points' distances'") + for _, row in gdf_points.iterrows(): + adata.obs[f"{output_name}distance_to_point_{row[unique_shape_names_column]}"] = ( + pts.distance(row.geometry).to_numpy() * pixel_size_um + ) + + # Add table layer + sdata = add_table_layer( + sdata, + adata=adata, + output_layer=output_table_layer, + region=adata.obs[_REGION_KEY].cat.categories.to_list(), + overwrite=overwrite, + ) + + return sdata diff --git a/src/harpy/table/_table.py b/src/harpy/table/_table.py index bc467b02..69f1dbe4 100644 --- a/src/harpy/table/_table.py +++ b/src/harpy/table/_table.py @@ -1,8 +1,10 @@ from __future__ import annotations -from collections.abc import Iterable +from collections.abc import Iterable, Sequence +from typing import Literal import numpy as np +import pandas as pd from anndata import AnnData from spatialdata import SpatialData from spatialdata.models import TableModel @@ -223,15 +225,17 @@ def filter_on_size( labels_layer: list[str], table_layer: str, output_layer: str, - min_size: int = 100, - max_size: int = 100000, - update_shapes_layers: bool = True, cellsize_key=_CELLSIZE_KEY, + min_size: float | int | None = None, + max_size: float | int | None = None, + update_shapes_layers: bool = True, + prefix_filtered_shapes_layer: str = "filtered_size", overwrite: bool = False, ) -> SpatialData: """Returns the updated SpatialData object. All cells with a size outside of the min and max size range are removed using the `cellsize_key` in `.obs`. Run e.g. `harpy.tb.preprocess_transcriptomics` or `harpy.tb.preprocess_proteomics` to obtain cell sizes. + Cells are kept if min_size ≤ cellsize_key ≤ max_size. Parameters ---------- @@ -246,16 +250,90 @@ def filter_on_size( The table layer in `sdata`. output_layer The output table layer in `sdata`. + cellsize_key + Column in `sdata.tables[table_layer].obs` containing cell sizes. min_size - minimum size in pixels. + minimum size in pixels. If None, this value is not used for filtering. max_size - maximum size in pixels. + maximum size in pixels. If None, this value is not used for filtering. update_shapes_layers Whether to filter the shapes layers associated with `labels_layer`. If set to `True`, cells that do not appear in resulting `output_layer` (with `_REGION_KEY` equal to `labels_layer`) will be removed from the shapes layers (via `_INSTANCE_KEY`) in the `sdata` object. - Filtered shapes will be added to `sdata` with prefix 'filtered_size'. - cellsize_key - Column in `sdata.tables[table_layer].obs` containing cell sizes. + Filtered shapes will be added to `sdata` with prefix `prefix_filtered_shapes_layer`. + prefix_filtered_shapes_layer + prefix to use for filtered shapes layer if update_shapes_layers is True. Defaults to 'filtered_size'. + overwrite + If True, overwrites the `output_layer` if it already exists in `sdata`. + + Returns + ------- + The updated SpatialData object. + """ + sdata = filter_numerical( + sdata=sdata, + labels_layer=labels_layer, + table_layer=table_layer, + output_layer=output_layer, + numerical_column=cellsize_key, + min_value=min_size, + max_value=max_size, + target="obs", + update_shapes_layers=update_shapes_layers, + prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, + overwrite=overwrite, + ) + + return sdata + + +def filter_numerical( + sdata: SpatialData, + labels_layer: list[str], + table_layer: str, + output_layer: str, + numerical_column: str, + min_value: float | int | None = None, + max_value: float | int | None = None, + target: Literal["obs", "var"] = "obs", + update_shapes_layers: bool = True, + prefix_filtered_shapes_layer: str = "filtered", + overwrite: bool = False, +) -> SpatialData: + """Returns the updated SpatialData object. + + All observations (obs) or features (var) with values outside of the min and max size range are removed using the `numerical_column` in `.obs` or `.var`. + observations/features are kept if `min_value` ≤ `numerical_column` ≤ `max_value`. + + Parameters + ---------- + sdata + The SpatialData object. + labels_layer + The labels layer(s) of `sdata` used to select the observations via the _REGION_KEY in `sdata.tables[table_layer].obs`. + Note that if `output_layer` is equal to `table_layer` and overwrite is True, + observations in `sdata.tables[table_layer]` linked to other `labels_layer` (via the _REGION_KEY), will be removed from `sdata.tables[table_layer]` + (also from the backing zarr store if it is backed). + table_layer + The table layer in `sdata`. + output_layer + The output table layer in `sdata`. + numerical_column + Name of numerical column to use for filtering. + If `target='obs'`, this refers to `.obs[numerical_column]`. + If `target='var'`, this refers to `.var[numerical_column]`. + target + Whether to filter observations ('obs') or features ('var'). + min_value + minimum value of `numerical_column` (inclusive). If None, lower bound is ignored. + max_value + maximum value of `numerical_column` (inclusive). If None, upper bound is ignored. + update_shapes_layers + Whether to filter the shapes layers associated with `labels_layer`. + If set to `True`, observations that do not appear in resulting `output_layer` (with `_REGION_KEY` equal to `labels_layer`) will be removed from the shapes layers (via `_INSTANCE_KEY`) in the `sdata` object. + Filtered shapes will be added to `sdata` with prefix `prefix_filtered_shapes_layer`. + Ignored if `target='var'`. + prefix_filtered_shapes_layer + prefix to use for filtered shapes layer if update_shapes_layers is True. Defaults to 'filtered'. overwrite If True, overwrites the `output_layer` if it already exists in `sdata`. @@ -263,14 +341,164 @@ def filter_on_size( ------- The updated SpatialData object. """ + if target not in {"obs", "var"}: + raise ValueError("target must be either 'obs' or 'var'") + process_table_instance = ProcessTable(sdata, labels_layer=labels_layer, table_layer=table_layer) - adata = process_table_instance._get_adata() - start = adata.shape[0] + adata = process_table_instance._get_adata().copy() + start = adata.shape[0] if target == "obs" else adata.shape[1] + + df = getattr(adata, target) + label = "observations" if target == "obs" else "features" - # Filter cells based on size and distance - # need to do the copy because we pop the spatialdata_attrs in add_table_layer, otherwise it would not be updated inplace - adata = adata[adata.obs[cellsize_key] < max_size, :].copy() - adata = adata[adata.obs[cellsize_key] > min_size, :].copy() + if numerical_column not in df.columns: + raise ValueError(f"Column '{numerical_column}' not found in '{table_layer}.{target}'. ") + + if not np.issubdtype(df[numerical_column].dtype, np.number): + raise ValueError(f"Column '{numerical_column}' must be numeric, but dtype is {df[numerical_column].dtype}.") + + # Filter observations based on min and max values + mask = pd.Series(True, index=df.index) + + if min_value is not None: + below = (df[numerical_column] < min_value).sum() + log.info(f"Removed {below} {label} below {min_value}.") + mask &= df[numerical_column] >= min_value + if max_value is not None: + above = (df[numerical_column] > max_value).sum() + log.info(f"Removed {above} {label} above {max_value}.") + mask &= df[numerical_column] <= max_value + + if target == "obs": + adata = adata[mask, :].copy() + else: + adata = adata[:, mask].copy() + + sdata = add_table_layer( + sdata, + adata=adata, + output_layer=output_layer, + region=process_table_instance.labels_layer, + overwrite=overwrite, + ) + + if target == "obs" and update_shapes_layers: + for _labels_layer in process_table_instance.labels_layer: + sdata = filter_shapes_layer( + sdata, + table_layer=output_layer, + labels_layer=_labels_layer, + prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, + ) + + removed = start - (adata.shape[0] if target == "obs" else adata.shape[1]) + kept = start - removed + log.info( + f"Removed {removed}/{start} {label} based on '{numerical_column}' (min={min_value}, max={max_value}) and kept {kept}." + ) + + return sdata + + +def filter_categorical( + sdata: SpatialData, + labels_layer: list[str], + table_layer: str, + output_layer: str, + categorical_column: str | None = None, + include_values: str | Sequence[str] | None = None, + exclude_values: str | Sequence[str] | None = None, + target: Literal["obs", "var"] = "obs", + update_shapes_layers: bool = True, + prefix_filtered_shapes_layer: str = "filtered", + overwrite: bool = False, +) -> SpatialData: + """ + Removes or keeps observations (obs) or features (var) based on specific values in a categorical column of + `.obs` or `.var`. or based on their index. + + Parameters + ---------- + sdata + The SpatialData object. + labels_layer + The labels layer(s) of `sdata` used to select the observations via the _REGION_KEY in `sdata.tables[table_layer].obs`. + Note that if `output_layer` is equal to `table_layer` and overwrite is True, + observations in `sdata.tables[table_layer]` linked to other `labels_layer` (via the _REGION_KEY), will be removed from `sdata.tables[table_layer]` + (also from the backing zarr store if it is backed). + table_layer + The table layer in `sdata`. + output_layer + The output table layer in `sdata`. + categorical_column + Name of categorical column to use for filtering. + If `target='obs'`, this refers to `.obs[categorical_column]`. + If `target='var'`, this refers to `.var[categorical_column]`. + If None, filtering is performed based on the index. + include_values + Value(s) to keep. Only observations/features whose `categorical_column` matches one of these + values will be kept. Mutually exclusive with `exclude_values`. + exclude_values + Value(s) to remove. Only observations/features whose `categorical_column` matches one of these + values will be removed. Mutually exclusive with `include_values`. + target + Whether to filter observations ('obs') or features ('var'). + update_shapes_layers + Whether to filter the shapes layers associated with `labels_layer`. + If set to `True`, observations that do not appear in resulting `output_layer` (with `_REGION_KEY` equal to `labels_layer`) will be removed from the shapes layers (via `_INSTANCE_KEY`) in the `sdata` object. + Filtered shapes will be added to `sdata` with prefix `prefix_filtered_shapes_layer`. + Ignored if `target='var'`. + prefix_filtered_shapes_layer + prefix to use for filtered shapes layer if update_shapes_layers is True. Defaults to 'filtered'. + overwrite + If True, overwrites the `output_layer` if it already exists in `sdata`. + + Returns + ------- + The updated SpatialData object. + """ + if include_values is not None and exclude_values is not None: + raise ValueError("Specify only one of 'include_values' or 'exclude_values'.") + + if target not in {"obs", "var"}: + raise ValueError("target must be either 'obs' or 'var'") + + process_table_instance = ProcessTable(sdata, labels_layer=labels_layer, table_layer=table_layer) + adata = process_table_instance._get_adata().copy() + start = adata.shape[0] if target == "obs" else adata.shape[1] + + df = getattr(adata, target) + label = "observations" if target == "obs" else "features" + + if categorical_column is not None and categorical_column not in df.columns: + raise ValueError(f"Column '{categorical_column}' not found in '{table_layer}.{target}'.") + + # Ensure include/exclude are lists + if isinstance(include_values, str): + include_values = [include_values] + if isinstance(exclude_values, str): + exclude_values = [exclude_values] + + # Filter + mask = pd.Series(True, index=df.index) + + if categorical_column is not None: + # Filtering by column + if include_values is not None: + mask &= df[categorical_column].isin(include_values) + elif exclude_values is not None: + mask &= ~df[categorical_column].isin(exclude_values) + else: + # Filtering by index + if include_values is not None: + mask &= df.index.isin(include_values) + elif exclude_values is not None: + mask &= ~df.index.isin(exclude_values) + + if target == "obs": + adata = adata[mask, :].copy() + else: + adata = adata[:, mask].copy() sdata = add_table_layer( sdata, @@ -280,17 +508,20 @@ def filter_on_size( overwrite=overwrite, ) - if update_shapes_layers: + if target == "obs" and update_shapes_layers: for _labels_layer in process_table_instance.labels_layer: sdata = filter_shapes_layer( sdata, table_layer=output_layer, labels_layer=_labels_layer, - prefix_filtered_shapes_layer="filtered_size", + prefix_filtered_shapes_layer=prefix_filtered_shapes_layer, ) - filtered = start - adata.shape[0] - log.info(f"{filtered} cells were filtered out based on size.") + kept = adata.shape[0] if target == "obs" else adata.shape[1] + removed = start - kept + if categorical_column is None: + categorical_column = "index" + log.info(f"Removed {removed}/{start} {label} based on '{categorical_column}' filtering (kept {kept}).") return sdata