diff --git a/.gitignore b/.gitignore
index ccd784f..36a793e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -209,3 +209,10 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/
+
+# model files
+*.pt
+*.pth
+*.onnx
+*.engine
+*.ts
diff --git a/README.md b/README.md
index 49392b8..21352a8 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,8 @@ ROS2 perception stack for generalist robotics.
## Packages
- **track_anything** - EdgeTAM tracking + 3D segmentation with RGBD
+- **detect_anything** - YOLO-E detection node and utilities
+- **semantic_mapping** - semantic 3D object mapping with VLM integration (messages, mapper, and mapping node)
- **vector_perception_utils** - Image and point cloud utilities
## Installation
@@ -21,6 +23,7 @@ pip install -r requirements.txt
# Build
source /opt/ros/jazzy/setup.bash
colcon build
+```
diff --git a/detect_anything/CMakeLists.txt b/detect_anything/CMakeLists.txt
new file mode 100644
index 0000000..c8e9188
--- /dev/null
+++ b/detect_anything/CMakeLists.txt
@@ -0,0 +1,47 @@
+cmake_minimum_required(VERSION 3.10)
+project(detect_anything)
+
+find_package(ament_cmake REQUIRED)
+find_package(rclpy REQUIRED)
+find_package(sensor_msgs REQUIRED)
+find_package(std_msgs REQUIRED)
+find_package(vision_msgs REQUIRED)
+find_package(cv_bridge REQUIRED)
+find_package(rosidl_default_generators REQUIRED)
+find_package(Python3 REQUIRED COMPONENTS Interpreter)
+
+rosidl_generate_interfaces(${PROJECT_NAME}
+ "msg/DetectionResult.msg"
+ DEPENDENCIES std_msgs sensor_msgs
+)
+
+set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages")
+
+install(
+ DIRECTORY detect_anything
+ DESTINATION ${PYTHON_INSTALL_DIR}
+)
+
+install(
+ PROGRAMS
+ scripts/detection_node
+ DESTINATION lib/${PROJECT_NAME}
+)
+
+install(
+ FILES resource/detect_anything
+ DESTINATION share/${PROJECT_NAME}/resource
+)
+
+install(
+ DIRECTORY config
+ DESTINATION share/${PROJECT_NAME}
+)
+
+set(ENV_HOOK "${CMAKE_CURRENT_SOURCE_DIR}/env-hooks/venv_pythonpath.sh.in")
+set(ENV_HOOK_OUT "${CMAKE_CURRENT_BINARY_DIR}/ament_cmake_environment_hooks/venv_pythonpath.sh")
+configure_file(${ENV_HOOK} ${ENV_HOOK_OUT} @ONLY)
+ament_environment_hooks(${ENV_HOOK_OUT})
+
+ament_export_dependencies(rosidl_default_runtime)
+ament_package()
diff --git a/detect_anything/README.md b/detect_anything/README.md
new file mode 100644
index 0000000..1e94a06
--- /dev/null
+++ b/detect_anything/README.md
@@ -0,0 +1,28 @@
+# detect_anything
+
+YOLO-E detection node that publishes `DetectionResult` with cropped masks and an annotated overlay topic.
+
+## What’s inside
+- `detect_anything/detection.py`: detection results container and Ultralytics parsing helpers.
+- `detect_anything/yoloe.py`: YOLO-E wrapper with prompt support and basic filtering.
+- `detect_anything/detection_node.py`: ROS2 node wiring the detector to `DetectionResult`.
+- `msg/DetectionResult.msg`: compressed image + cropped mask array.
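+
+The detector can also be used outside ROS. A minimal sketch (the weights path, image file, and prompt list are illustrative):
+
+```python
+import cv2
+
+from detect_anything.yoloe import Yoloe2DDetector, YoloePromptMode
+
+detector = Yoloe2DDetector(
+    model_path="/path/to/yoloe/models",   # hypothetical weights directory
+    model_name="yoloe-11s-seg.pt",
+    prompt_mode=YoloePromptMode.PROMPT,
+    conf=0.5,
+)
+detector.set_prompts(text=["chair", "table", "bottle"])
+
+image = cv2.imread("frame.jpg")           # BGR, as the node provides
+results = detector.process_image(image)   # returns DetectionResults
+cv2.imwrite("overlay.jpg", results.overlay())
+```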
+
+## Quick start
+```bash
+source ~/vector_venv/bin/activate
+source /opt/ros/jazzy/setup.bash
+colcon build --packages-select detect_anything
+source install/setup.bash
+
+ros2 run detect_anything detection_node \
+  --ros-args -p model_path:=/path/to/yoloe/models \
+  -p model_name:=yoloe-11s-seg-pf.pt \
+  -p prompt_mode:=lrpc \
+  -p conf:=0.6 \
+  -p max_area_ratio:=0.3 \
+  -p use_compressed:=false \
+  -p image_topic:=/camera/image
+```
+
+Topics:
+- Publishes `/detection_result` (`detect_anything/DetectionResult`) and `/annotated_image_detection` (`sensor_msgs/Image`).
+- Subscribes to `/camera/image/compressed` by default (`use_compressed` defaults to `true`); pass `use_compressed:=false` to use the raw `/camera/image` topic.
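+
+A downstream node can decode the message back into `DetectionResults` (full-size masks are reconstructed from the bbox-cropped ones). A minimal consumer sketch (node and callback names are illustrative):
+
+```python
+import rclpy
+from rclpy.node import Node
+
+from detect_anything.detection import DetectionResults
+from detect_anything.msg import DetectionResult
+
+
+class DetectionConsumer(Node):
+    def __init__(self) -> None:
+        super().__init__("detection_consumer")
+        self.create_subscription(DetectionResult, "/detection_result", self._on_detections, 10)
+
+    def _on_detections(self, msg: DetectionResult) -> None:
+        results = DetectionResults.from_msg(msg)  # decompresses image and masks
+        for det in results.detections:
+            self.get_logger().info(f"{det.label} ({det.confidence:.2f}) bbox={det.bbox}")
+
+
+def main() -> None:
+    rclpy.init()
+    rclpy.spin(DetectionConsumer())
+
+
+if __name__ == "__main__":
+    main()
+```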
diff --git a/detect_anything/config/objects.yaml b/detect_anything/config/objects.yaml
new file mode 100644
index 0000000..371f186
--- /dev/null
+++ b/detect_anything/config/objects.yaml
@@ -0,0 +1,46 @@
+# Simple list of object names for prompting YOLO-E
+- chair
+- desk
+- tv_monitor
+- sofa
+- unknown
+- printer
+- coffee machine
+- refrigerator
+- trash can
+- shoe
+- sink
+- table
+- oven
+- bed
+- painting
+- bulletin board
+- plant
+- vase
+- cabinet
+- shelf
+- book
+- cup
+- sculpture
+- keyboard
+- mouse
+- clock
+- phone
+- toilet
+- bathtub
+- microwave oven
+- pan
+- suitcase
+- light
+- curtain
+- whiteboard
+- shower knob
+- bottle
+- water dispenser
+- vending machine
+- laptop
+- bag
+- locker
+- picture
+- cardboard
+- extinguisher
diff --git a/detect_anything/detect_anything/__init__.py b/detect_anything/detect_anything/__init__.py
new file mode 100644
index 0000000..9c61aab
--- /dev/null
+++ b/detect_anything/detect_anything/__init__.py
@@ -0,0 +1 @@
+"""detect_anything package."""
diff --git a/detect_anything/detect_anything/detection.py b/detect_anything/detect_anything/detection.py
new file mode 100644
index 0000000..1d64d76
--- /dev/null
+++ b/detect_anything/detect_anything/detection.py
@@ -0,0 +1,256 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List, Optional, Sequence
+
+from cv_bridge import CvBridge
+import cv2
+import numpy as np
+
+from detect_anything.msg import DetectionResult
+
+_BRIDGE = CvBridge()
+
+
+@dataclass
+class SingleDetectionResult:
+ """Simple container for a YOLO/Ultralytics detection with a segmentation mask."""
+
+ bbox: tuple[float, float, float, float]
+ track_id: int
+ class_id: int
+ label: str
+ confidence: float
+ mask: np.ndarray
+
+ def area(self) -> float:
+ x1, y1, x2, y2 = self.bbox
+ return max(0.0, (x2 - x1)) * max(0.0, (y2 - y1))
+
+
+@dataclass
+class DetectionResults:
+ """Container for detections plus the source image."""
+
+ image: np.ndarray
+ detections: List[SingleDetectionResult]
+
+ def overlay(self, alpha: float = 0.4, thickness: int = 2) -> np.ndarray:
+ """Draw bboxes and masks on the image."""
+ output = self.image.copy()
+ for det in self.detections:
+ rng = np.random.default_rng(det.track_id)
+ color = tuple(int(c) for c in rng.integers(50, 255, size=3))
+
+ if det.mask.size:
+ mask_bool = det.mask.astype(bool)
+ if mask_bool.shape[:2] == output.shape[:2]:
+ colored_mask = np.zeros_like(output)
+ colored_mask[mask_bool] = color
+ output[mask_bool] = cv2.addWeighted(
+ output[mask_bool], 1 - alpha, colored_mask[mask_bool], alpha, 0
+ )
+ contours, _ = cv2.findContours(det.mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+ cv2.drawContours(output, contours, -1, color, thickness)
+
+ x1, y1, x2, y2 = map(int, det.bbox)
+ cv2.rectangle(output, (x1, y1), (x2, y2), color, thickness)
+
+ label_text = f"{det.label} {det.confidence:.2f}"
+ label_text = f"{label_text} | id {det.track_id}"
+
+ (text_w, text_h), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+ cv2.rectangle(
+ output,
+ (x1, max(0, y1 - text_h - baseline - 2)),
+ (x1 + text_w, max(0, y1)),
+ color,
+ -1,
+ )
+ cv2.putText(
+ output,
+ label_text,
+ (x1, max(0, y1 - baseline - 2)),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 0.5,
+ (255, 255, 255),
+ 1,
+ cv2.LINE_AA,
+ )
+ return output
+
+ def to_msg(self, header) -> DetectionResult:
+ """
+ Convert detections into a DetectionResult ROS message.
+ """
+ msg = DetectionResult()
+ msg.header = header
+ msg.image = _BRIDGE.cv2_to_compressed_imgmsg(self.image, dst_format="jpeg")
+ msg.image.header = header
+
+ for det in self.detections:
+ msg.track_id.append(int(det.track_id))
+ msg.x1.append(float(det.bbox[0]))
+ msg.y1.append(float(det.bbox[1]))
+ msg.x2.append(float(det.bbox[2]))
+ msg.y2.append(float(det.bbox[3]))
+ msg.label.append(str(det.label))
+ msg.confidence.append(float(det.confidence))
+
+ mask = det.mask
+ # Crop mask to bbox and align origin to bbox top-left for transport.
+ x1, y1, x2, y2 = [int(v) for v in det.bbox]
+ h, w = self.image.shape[:2]
+ x1 = max(0, min(x1, w))
+ x2 = max(0, min(x2, w))
+ y1 = max(0, min(y1, h))
+ y2 = max(0, min(y2, h))
+ crop_h = max(1, y2 - y1)
+ crop_w = max(1, x2 - x1)
+ if mask.size == 0:
+ mask = np.zeros((crop_h, crop_w), dtype=np.uint8)
+ else:
+ mask = mask[y1:y2, x1:x2]
+ if mask.shape != (crop_h, crop_w):
+ mask = cv2.resize(
+ mask.astype(np.uint8),
+ (crop_w, crop_h),
+ interpolation=cv2.INTER_NEAREST,
+ )
+
+ mask_msg = _BRIDGE.cv2_to_compressed_imgmsg(mask, dst_format="png")
+ mask_msg.header = header
+ msg.masks.append(mask_msg)
+
+ return msg
+
+ @classmethod
+ def from_msg(cls, msg: DetectionResult) -> DetectionResults:
+ """
+ Construct DetectionResults from a DetectionResult ROS message.
+ Masks are converted back to full-size overlays at bbox locations.
+ """
+ image = _BRIDGE.compressed_imgmsg_to_cv2(msg.image)
+ detections: List[SingleDetectionResult] = []
+
+ for i in range(len(msg.track_id)):
+ bbox = (
+ float(msg.x1[i]),
+ float(msg.y1[i]),
+ float(msg.x2[i]),
+ float(msg.y2[i]),
+ )
+ mask_img = _BRIDGE.compressed_imgmsg_to_cv2(msg.masks[i], desired_encoding="mono8")
+
+ full_mask = np.zeros(image.shape[:2], dtype=np.uint8)
+ x1, y1, x2, y2 = [int(v) for v in bbox]
+ h, w = image.shape[:2]
+ x1 = max(0, min(x1, w))
+ x2 = max(0, min(x2, w))
+ y1 = max(0, min(y1, h))
+ y2 = max(0, min(y2, h))
+ crop_h = max(1, y2 - y1)
+ crop_w = max(1, x2 - x1)
+ if mask_img.shape[:2] != (crop_h, crop_w):
+ mask_img = cv2.resize(mask_img, (crop_w, crop_h), interpolation=cv2.INTER_NEAREST)
+ full_mask[y1:y2, x1:x2] = mask_img
+
+ detections.append(
+ SingleDetectionResult(
+ bbox=bbox,
+ track_id=int(msg.track_id[i]),
+ class_id=0, # class_id is not transmitted in DetectionResult
+ label=str(msg.label[i]),
+ confidence=float(msg.confidence[i]),
+ mask=full_mask,
+ )
+ )
+
+ return cls(image=image, detections=detections)
+
+
+def _from_ultralytics_result(
+ result,
+ idx: int,
+ image_shape: Optional[tuple[int, int]] = None,
+ names: Optional[Sequence[str]] = None,
+) -> SingleDetectionResult:
+ """Convert a single Ultralytics Results entry into a SingleDetectionResult."""
+ from ultralytics.engine.results import Results # local import
+
+ if not isinstance(result, Results):
+ raise TypeError("result must be an ultralytics Results object")
+ if result.boxes is None:
+ raise ValueError("Result has no boxes")
+
+ bbox_array = result.boxes.xyxy[idx].cpu().numpy()
+ bbox = (
+ float(bbox_array[0]),
+ float(bbox_array[1]),
+ float(bbox_array[2]),
+ float(bbox_array[3]),
+ )
+
+ confidence = float(result.boxes.conf[idx].cpu())
+ class_id = int(result.boxes.cls[idx].cpu())
+
+ resolved_names = names
+ if resolved_names is None and hasattr(result, "names"):
+ resolved_names = result.names # type: ignore[assignment]
+
+ label = f"class_{class_id}"
+ if resolved_names is not None:
+ if isinstance(resolved_names, dict):
+ label = resolved_names.get(class_id, label)
+ elif isinstance(resolved_names, (list, tuple)) and class_id < len(resolved_names):
+ label = resolved_names[class_id]
+
+ track_id = idx
+ if hasattr(result.boxes, "id") and result.boxes.id is not None:
+ track_id = int(result.boxes.id[idx].cpu())
+
+ target_h, target_w = (image_shape or (None, None))
+ if target_h is None or target_w is None:
+ if hasattr(result, "orig_shape") and result.orig_shape:
+ target_h, target_w = result.orig_shape[:2]
+
+ mask = np.zeros((target_h, target_w), dtype=np.uint8) if target_h and target_w else np.zeros((0, 0), dtype=np.uint8)
+ if result.masks is not None and idx < len(result.masks.data):
+ mask_tensor = result.masks.data[idx]
+ mask_np = mask_tensor.cpu().numpy()
+
+ if mask_np.ndim == 3:
+ mask_np = mask_np.squeeze()
+
+ if target_h and target_w and mask_np.shape != (target_h, target_w):
+ mask_np = cv2.resize(mask_np.astype(np.float32), (target_w, target_h), interpolation=cv2.INTER_LINEAR)
+
+ mask = (mask_np > 0.5).astype(np.uint8) * 255 # type: ignore[assignment]
+
+ return SingleDetectionResult(
+ bbox=bbox,
+ track_id=track_id,
+ class_id=class_id,
+ label=label,
+ confidence=confidence,
+ mask=mask,
+ )
+
+
+def from_ultralytics_results(
+ results,
+ image_shape: Optional[tuple[int, int]] = None,
+ names: Optional[Sequence[str]] = None,
+) -> List[SingleDetectionResult]:
+ """Convert a list of Ultralytics Results objects into detections with masks."""
+ detections: List[SingleDetectionResult] = []
+ for result in results:
+ if result.boxes is None:
+ continue
+ num_detections = len(result.boxes.xyxy)
+ for i in range(num_detections):
+ detections.append(_from_ultralytics_result(result, i, image_shape=image_shape, names=names))
+ return detections
+
+
+__all__ = ["DetectionResults", "SingleDetectionResult", "from_ultralytics_results"]
diff --git a/detect_anything/detect_anything/detection_node.py b/detect_anything/detect_anything/detection_node.py
new file mode 100644
index 0000000..aebd0e8
--- /dev/null
+++ b/detect_anything/detect_anything/detection_node.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import List, Optional
+
+import yaml
+import rclpy
+from ament_index_python.packages import get_package_share_directory
+from cv_bridge import CvBridge
+from sensor_msgs.msg import CompressedImage, Image
+from rclpy.node import Node
+from rclpy.qos import DurabilityPolicy, QoSProfile, ReliabilityPolicy
+
+from detect_anything.detection import DetectionResults
+from detect_anything.yoloe import Yoloe2DDetector, YoloePromptMode
+from detect_anything.msg import DetectionResult
+
+
+class DetectionNode(Node):
+ """ROS2 node that runs YOLO-E detection and publishes DetectionResult."""
+
+ def __init__(self) -> None:
+ super().__init__("detect_anything_detection_node")
+ self.bridge = CvBridge()
+
+ # Parameters
+ defaults = self._default_params()
+ self.declare_parameter("image_topic", defaults["image_topic"])
+ self.declare_parameter("compressed_image_topic", defaults["compressed_image_topic"])
+ self.declare_parameter("use_compressed", defaults["use_compressed"])
+ self.declare_parameter("model_path", defaults["model_path"])
+ self.declare_parameter("model_name", defaults["model_name"])
+ self.declare_parameter("device", defaults["device"])
+ self.declare_parameter("prompt_mode", defaults["prompt_mode"])
+ self.declare_parameter("objects_file", defaults["objects_file"])
+ self.declare_parameter("conf", defaults["conf"])
+ self.declare_parameter("max_area_ratio", defaults["max_area_ratio"])
+
+ params = self._default_params()
+ image_topic = str(self.get_parameter("image_topic").value)
+ compressed_image_topic = str(self.get_parameter("compressed_image_topic").value)
+ self.use_compressed = bool(self.get_parameter("use_compressed").value)
+
+ model_path_val = self.get_parameter("model_path").value
+ model_path = str(model_path_val) if model_path_val not in (None, "") else params["model_path"]
+ if str(model_path).lower() == "none":
+ model_path = params["model_path"]
+
+ model_name = str(self.get_parameter("model_name").value or params["model_name"])
+ device = str(self.get_parameter("device").value or params["device"])
+ prompt_mode_str = str(self.get_parameter("prompt_mode").value or params["prompt_mode"]).lower()
+ objects_file = Path(str(self.get_parameter("objects_file").value or params["objects_file"]))
+ conf = float(self.get_parameter("conf").value)
+ max_area_ratio = float(self.get_parameter("max_area_ratio").value)
+
+ prompt_mode = YoloePromptMode(prompt_mode_str)
+
+ # Auto-pick the right default weight if user didn't override
+ if prompt_mode == YoloePromptMode.LRPC and ("seg.pt" in model_name and "seg-pf.pt" not in model_name):
+ model_name = "yoloe-11l-seg-pf.pt"
+ self.get_logger().info(f"Prompt-free mode: switching model_name to {model_name}")
+ if prompt_mode == YoloePromptMode.PROMPT and "seg-pf.pt" in model_name:
+ model_name = "yoloe-11l-seg.pt"
+ self.get_logger().info(f"Prompt mode: switching model_name to {model_name}")
+
+ self.detector = Yoloe2DDetector(
+ model_path=model_path,
+ model_name=model_name,
+ device=device or None,
+ prompt_mode=prompt_mode,
+ max_area_ratio=max_area_ratio,
+ conf=conf,
+ )
+
+ prompts = self._load_prompts(objects_file)
+ if prompts and prompt_mode == YoloePromptMode.PROMPT:
+ self.detector.set_prompts(text=prompts)
+ self.get_logger().info(f"Loaded {len(prompts)} prompts from {objects_file}")
+
+ qos = QoSProfile(
+ depth=10,
+ reliability=ReliabilityPolicy.RELIABLE,
+ durability=DurabilityPolicy.VOLATILE,
+ )
+ if self.use_compressed:
+ self.create_subscription(
+ CompressedImage,
+ compressed_image_topic,
+ self._compressed_image_callback,
+ qos,
+ )
+ else:
+ self.create_subscription(
+ Image,
+ image_topic,
+ self._image_callback,
+ qos,
+ )
+
+ self.detection_pub = self.create_publisher(DetectionResult, "/detection_result", qos)
+ self.overlay_pub = self.create_publisher(Image, "/annotated_image_detection", qos)
+
+ self.get_logger().info("detect_anything detection node started")
+
+    def _default_params(self) -> dict:
+        # config/ is installed to the package share directory, not alongside this module.
+        base_share = Path(get_package_share_directory("detect_anything"))
+ return {
+ "image_topic": "/camera/image",
+ "compressed_image_topic": "/camera/image/compressed",
+ "use_compressed": True,
+ "model_path": str(base_share / "models"),
+ "model_name": "yoloe-11l-seg.pt",
+ "device": "cuda",
+ "prompt_mode": YoloePromptMode.PROMPT.value,
+ "objects_file": str(base_share / "config" / "objects.yaml"),
+ "conf": 0.25,
+ "max_area_ratio": 0.3,
+ }
+
+ def _load_prompts(self, objects_file: Path) -> List[str]:
+ if not objects_file.exists():
+ return []
+ try:
+ with open(objects_file, "r", encoding="utf-8") as f:
+ data = yaml.safe_load(f) or []
+ if isinstance(data, list):
+ return [str(p) for p in data]
+ except Exception as exc:
+ self.get_logger().warn(f"Failed to read prompts from {objects_file}: {exc}")
+ return []
+
+ def _image_callback(self, msg: Image) -> None:
+ cv_image = self.bridge.imgmsg_to_cv2(msg, desired_encoding="bgr8")
+ self._process_image(cv_image, msg.header)
+
+ def _compressed_image_callback(self, msg: CompressedImage) -> None:
+ cv_image = self.bridge.compressed_imgmsg_to_cv2(msg, desired_encoding="bgr8")
+ self._process_image(cv_image, msg.header)
+
+ def _process_image(self, cv_image, header) -> None:
+ detections: DetectionResults = self.detector.process_image(cv_image)
+
+ detection_msg = detections.to_msg(header)
+ self.detection_pub.publish(detection_msg)
+
+ overlay = detections.overlay()
+ overlay_msg = self.bridge.cv2_to_imgmsg(overlay, encoding="bgr8")
+ overlay_msg.header = header
+ self.overlay_pub.publish(overlay_msg)
+
+ def destroy_node(self) -> bool:
+ if hasattr(self, "detector") and self.detector:
+ self.detector.stop()
+ return super().destroy_node()
+
+
+def main(args: Optional[list[str]] = None) -> None:
+ rclpy.init(args=args)
+ node = DetectionNode()
+ try:
+ rclpy.spin(node)
+ except KeyboardInterrupt:
+ pass
+ finally:
+ node.destroy_node()
+ rclpy.shutdown()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/detect_anything/detect_anything/yoloe.py b/detect_anything/detect_anything/yoloe.py
new file mode 100644
index 0000000..303afb9
--- /dev/null
+++ b/detect_anything/detect_anything/yoloe.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+import threading
+from typing import Any, List, Optional, Sequence
+
+import numpy as np
+import torch
+from ultralytics import YOLOE # type: ignore[attr-defined, import-not-found]
+from ultralytics.utils import SETTINGS
+
+from detect_anything.detection import (
+ DetectionResults,
+ SingleDetectionResult,
+ from_ultralytics_results,
+)
+
+
+class YoloePromptMode(Enum):
+ """YOLO-E prompt modes."""
+
+ LRPC = "lrpc"
+ PROMPT = "prompt"
+
+
+class Yoloe2DDetector:
+ """Thin wrapper around YOLO-E for 2D detection + tracking with masks."""
+
+ def __init__(
+ self,
+ model_path: str | None = None,
+ model_name: str | None = None,
+ device: str | None = None,
+ prompt_mode: YoloePromptMode = YoloePromptMode.LRPC,
+ max_area_ratio: Optional[float] = 0.3,
+ conf: float = 0.6,
+ iou: float = 0.6,
+ ) -> None:
+ if model_name is None:
+ model_name = "yoloe-11s-seg-pf.pt" if prompt_mode == YoloePromptMode.LRPC else "yoloe-11s-seg.pt"
+
+ model_dir = Path(model_path) if model_path else Path("models/yoloe")
+ model_dir = model_dir.expanduser().resolve()
+ SETTINGS["weights_dir"] = str(model_dir)
+ self.model = YOLOE(model_dir / model_name)
+ self.prompt_mode = prompt_mode
+ self._visual_prompts: dict[str, np.ndarray] | None = None
+ self.max_area_ratio = max_area_ratio
+ self._lock = threading.Lock()
+ self.conf = conf
+ self.iou = iou
+ self._names_override: Sequence[str] | None = None
+
+ if prompt_mode == YoloePromptMode.PROMPT:
+ self.set_prompts(text=["nothing"])
+
+ if self.max_area_ratio is not None and not (0.0 < self.max_area_ratio <= 1.0):
+ raise ValueError("max_area_ratio must be in the range (0, 1].")
+
+ if device:
+ self.device = device
+ elif torch.cuda.is_available():
+ self.device = "cuda"
+ else:
+ self.device = "cpu"
+
+ def set_prompts(
+ self,
+ text: Optional[List[str]] = None,
+ bboxes: Optional[np.ndarray] = None,
+ ) -> None:
+ """Set prompts for detection. Provide either text or bboxes, not both."""
+ if self.prompt_mode == YoloePromptMode.LRPC:
+ # Prompt-free model cannot accept text/visual prompts.
+ return
+ if text is not None and bboxes is not None:
+ raise ValueError("Provide either text or bboxes, not both.")
+ if text is None and bboxes is None:
+ raise ValueError("Must provide either text or bboxes.")
+
+ with self._lock:
+ self.model.predictor = None
+ if text is not None:
+ self.model.set_classes(text, self.model.get_text_pe(text)) # type: ignore[no-untyped-call]
+ self._visual_prompts = None
+ self._names_override = list(text)
+ else:
+ cls = np.arange(len(bboxes), dtype=np.int16) # type: ignore[arg-type]
+ self._visual_prompts = {"bboxes": bboxes, "cls": cls} # type: ignore[dict-item]
+ self._names_override = None
+
+ def process_image(self, image: np.ndarray) -> DetectionResults:
+ """Process an image and return detection results."""
+ track_kwargs: dict[str, Any] = {
+ "source": image,
+ "device": self.device,
+ "conf": self.conf,
+ "iou": self.iou,
+ "persist": True,
+ "verbose": False,
+ }
+
+ with self._lock:
+ if self._visual_prompts is not None:
+ track_kwargs["visual_prompts"] = self._visual_prompts
+
+ results = self.model.track(**track_kwargs) # type: ignore[arg-type]
+
+ detections = from_ultralytics_results(results, image_shape=image.shape[:2], names=self._names_override)
+ filtered = self._apply_filters(image, detections)
+ return DetectionResults(image=image, detections=filtered)
+
+ def _apply_filters(
+ self,
+ image: np.ndarray,
+ detections: List[SingleDetectionResult],
+ ) -> List[SingleDetectionResult]:
+ if self.max_area_ratio is None:
+ return detections
+
+ image_area = float(image.shape[0] * image.shape[1])
+ if image_area <= 0:
+ return detections
+
+ return [det for det in detections if (det.area() / image_area) <= self.max_area_ratio]
+
+ def stop(self) -> None:
+ """Clean up resources used by the detector."""
+ if hasattr(self.model, "predictor") and self.model.predictor is not None:
+ predictor = self.model.predictor
+ if hasattr(predictor, "trackers") and predictor.trackers:
+ for tracker in predictor.trackers:
+ if hasattr(tracker, "tracker") and hasattr(tracker.tracker, "gmc"):
+ gmc = tracker.tracker.gmc
+ if hasattr(gmc, "executor") and gmc.executor is not None:
+ gmc.executor.shutdown(wait=True)
+ self.model.predictor = None
+
+
+__all__ = ["Yoloe2DDetector", "YoloePromptMode"]
diff --git a/detect_anything/env-hooks/venv_pythonpath.sh.in b/detect_anything/env-hooks/venv_pythonpath.sh.in
new file mode 100644
index 0000000..9212bac
--- /dev/null
+++ b/detect_anything/env-hooks/venv_pythonpath.sh.in
@@ -0,0 +1 @@
+ament_prepend_unique_value PYTHONPATH "@PYTHON_INSTALL_DIR@"
diff --git a/detect_anything/msg/DetectionResult.msg b/detect_anything/msg/DetectionResult.msg
new file mode 100644
index 0000000..766d5f1
--- /dev/null
+++ b/detect_anything/msg/DetectionResult.msg
@@ -0,0 +1,10 @@
+std_msgs/Header header
+int32[] track_id
+float32[] x1
+float32[] y1
+float32[] x2
+float32[] y2
+string[] label
+float32[] confidence
+sensor_msgs/CompressedImage image
+sensor_msgs/CompressedImage[] masks # cropped to bbox; aligned to bbox top-left; encoding: mono8
diff --git a/detect_anything/package.xml b/detect_anything/package.xml
new file mode 100644
index 0000000..a7699ae
--- /dev/null
+++ b/detect_anything/package.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>detect_anything</name>
+  <version>0.0.1</version>
+  <description>YOLO-E detection node and utilities.</description>
+  <maintainer email="">Alex Lin</maintainer>
+  <license>MIT</license>
+
+  <buildtool_depend>ament_cmake</buildtool_depend>
+
+  <buildtool_depend>rosidl_default_generators</buildtool_depend>
+  <exec_depend>rosidl_default_runtime</exec_depend>
+  <member_of_group>rosidl_interface_packages</member_of_group>
+
+  <depend>rclpy</depend>
+  <depend>std_msgs</depend>
+  <depend>sensor_msgs</depend>
+  <depend>vision_msgs</depend>
+  <depend>cv_bridge</depend>
+  <depend>vector_perception_utils</depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/detect_anything/resource/detect_anything b/detect_anything/resource/detect_anything
new file mode 100644
index 0000000..eac574f
--- /dev/null
+++ b/detect_anything/resource/detect_anything
@@ -0,0 +1 @@
+detect_anything
diff --git a/detect_anything/scripts/detection_node b/detect_anything/scripts/detection_node
new file mode 100755
index 0000000..822cacf
--- /dev/null
+++ b/detect_anything/scripts/detection_node
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+from detect_anything.detection_node import main
+
+if __name__ == "__main__":
+ main()
diff --git a/requirements.txt b/requirements.txt
index 7b6c110..67894e1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,13 +4,14 @@ opencv-python
scipy
pillow
tqdm
+pyyaml
+lark
# 3D processing
open3d
-# deep learning
-torch<=2.4.0
-torchvision<=1.19.0
+# vision
+ultralytics
# SAM2 and EdgeTAM dependencies
sam2
diff --git a/rviz/semantic_mapping.rviz b/rviz/semantic_mapping.rviz
new file mode 100644
index 0000000..d56223b
--- /dev/null
+++ b/rviz/semantic_mapping.rviz
@@ -0,0 +1,267 @@
+Panels:
+ - Class: rviz_common/Displays
+ Help Height: 78
+ Name: Displays
+ Property Tree Widget:
+ Expanded:
+ - /Global Options1
+ - /Status1
+ - /Odometry1
+ Splitter Ratio: 0.5
+ Tree Height: 413
+ - Class: rviz_common/Selection
+ Name: Selection
+ - Class: rviz_common/Tool Properties
+ Expanded:
+ - /2D Goal Pose1
+ - /Publish Point1
+ Name: Tool Properties
+ Splitter Ratio: 0.5886790156364441
+ - Class: rviz_common/Views
+ Expanded:
+ - /Current View1
+ Name: Views
+ Splitter Ratio: 0.5
+ - Class: rviz_common/Time
+ Experimental: false
+ Name: Time
+ SyncMode: 0
+ SyncSource: PointCloud2
+Visualization Manager:
+ Class: ""
+ Displays:
+ - Alpha: 0.5
+ Cell Size: 1
+ Class: rviz_default_plugins/Grid
+ Color: 160; 160; 164
+ Enabled: true
+ Line Style:
+ Line Width: 0.029999999329447746
+ Value: Lines
+ Name: Grid
+ Normal Cell Count: 0
+ Offset:
+ X: 0
+ Y: 0
+ Z: 0
+ Plane: XY
+ Plane Cell Count: 10
+ Reference Frame:
+ Value: true
+ - Class: rviz_default_plugins/Image
+ Enabled: true
+ Max Value: 1
+ Median window: 5
+ Min Value: 0
+ Name: Image
+ Normalize Range: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /annotated_image_detection
+ Value: true
+ - Angle Tolerance: 0.10000000149011612
+ Class: rviz_default_plugins/Odometry
+ Covariance:
+ Orientation:
+ Alpha: 0.5
+ Color: 255; 255; 127
+ Color Style: Unique
+ Frame: Local
+ Offset: 1
+ Scale: 1
+ Value: true
+ Position:
+ Alpha: 0.30000001192092896
+ Color: 204; 51; 204
+ Scale: 1
+ Value: true
+ Value: false
+ Enabled: true
+ Keep: 5
+ Name: Odometry
+ Position Tolerance: 0.10000000149011612
+ Shape:
+ Alpha: 1
+ Axes Length: 1
+ Axes Radius: 0.10000000149011612
+ Color: 255; 25; 0
+ Head Length: 0.30000001192092896
+ Head Radius: 0.10000000149011612
+ Shaft Length: 1
+ Shaft Radius: 0.05000000074505806
+ Value: Arrow
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ Filter size: 10
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /state_estimation
+ Value: true
+ - Alpha: 0.10000000149011612
+ Autocompute Intensity Bounds: true
+ Autocompute Value Bounds:
+ Max Value: 10
+ Min Value: -10
+ Value: true
+ Axis: Z
+ Channel Name: intensity
+ Class: rviz_default_plugins/PointCloud2
+ Color: 255; 255; 255
+ Color Transformer: FlatColor
+ Decay Time: 3
+ Enabled: true
+ Invert Rainbow: false
+ Max Color: 255; 255; 255
+ Max Intensity: 165
+ Min Color: 0; 0; 0
+ Min Intensity: 0
+ Name: PointCloud2
+ Position Transformer: XYZ
+ Selectable: true
+ Size (Pixels): 3
+ Size (m): 0.004999999888241291
+ Style: Flat Squares
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /registered_scan
+ Use Fixed Frame: true
+ Use rainbow: true
+ Value: true
+ - Class: rviz_default_plugins/MarkerArray
+ Enabled: true
+ Name: MarkerArray
+ Namespaces:
+ bbox: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /obj_boxes
+ Value: true
+ - Alpha: 1
+ Autocompute Intensity Bounds: true
+ Autocompute Value Bounds:
+ Max Value: 10
+ Min Value: -10
+ Value: true
+ Axis: Z
+ Channel Name: intensity
+ Class: rviz_default_plugins/PointCloud2
+ Color: 255; 255; 255
+ Color Transformer: RGB8
+ Decay Time: 0
+ Enabled: true
+ Invert Rainbow: false
+ Max Color: 255; 255; 255
+ Max Intensity: 4096
+ Min Color: 0; 0; 0
+ Min Intensity: 0
+ Name: PointCloud2
+ Position Transformer: XYZ
+ Selectable: true
+ Size (Pixels): 3
+ Size (m): 0.009999999776482582
+ Style: Flat Squares
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /obj_points
+ Use Fixed Frame: true
+ Use rainbow: true
+ Value: true
+ Enabled: true
+ Global Options:
+ Background Color: 48; 48; 48
+ Fixed Frame: map
+ Frame Rate: 30
+ Name: root
+ Tools:
+ - Class: rviz_default_plugins/Interact
+ Hide Inactive Objects: true
+ - Class: rviz_default_plugins/MoveCamera
+ - Class: rviz_default_plugins/Select
+ - Class: rviz_default_plugins/FocusCamera
+ - Class: rviz_default_plugins/Measure
+ Line color: 128; 128; 0
+ - Class: rviz_default_plugins/SetInitialPose
+ Covariance x: 0.25
+ Covariance y: 0.25
+ Covariance yaw: 0.06853891909122467
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /initialpose
+ - Class: rviz_default_plugins/SetGoal
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /goal_pose
+ - Class: rviz_default_plugins/PublishPoint
+ Single click: true
+ Topic:
+ Depth: 5
+ Durability Policy: Volatile
+ History Policy: Keep Last
+ Reliability Policy: Reliable
+ Value: /clicked_point
+ Transformation:
+ Current:
+ Class: rviz_default_plugins/TF
+ Value: true
+ Views:
+ Current:
+ Class: rviz_default_plugins/Orbit
+ Distance: 21.720943450927734
+ Enable Stereo Rendering:
+ Stereo Eye Separation: 0.05999999865889549
+ Stereo Focal Distance: 1
+ Swap Stereo Eyes: false
+ Value: false
+ Focal Point:
+ X: 3.1760172843933105
+ Y: 11.240063667297363
+ Z: 2.561012029647827
+ Focal Shape Fixed Size: false
+ Focal Shape Size: 0.05000000074505806
+ Invert Z Axis: false
+ Name: Current View
+ Near Clip Distance: 0.009999999776482582
+ Pitch: 1.1497966051101685
+ Target Frame:
+ Value: Orbit (rviz_default_plugins)
+ Yaw: 3.4885518550872803
+ Saved: ~
+Window Geometry:
+ Displays:
+ collapsed: false
+ Height: 996
+ Hide Left Dock: false
+ Hide Right Dock: false
+ Image:
+ collapsed: false
+ QMainWindow State: 000000ff00000000fd00000004000000000000027b00000342fc0200000009fb0000001200530065006c0065006300740069006f006e00000001e10000009b0000005d00fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000a3fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000003f0000022b000000cc00fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261fb0000000a0049006d0061006700650100000270000001110000001700ffffff000000010000010f00000342fc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073010000003f00000342000000a900fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e10000019700000003000006b90000003efc0100000002fb0000000800540069006d00650100000000000006b90000026f00fffffffb0000000800540069006d00650100000000000004500000000000000000000003230000034200000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000
+ Selection:
+ collapsed: false
+ Time:
+ collapsed: false
+ Tool Properties:
+ collapsed: false
+ Views:
+ collapsed: false
+ Width: 1721
+ X: 3248
+ Y: 331
diff --git a/semantic_mapping/CMakeLists.txt b/semantic_mapping/CMakeLists.txt
new file mode 100644
index 0000000..7266468
--- /dev/null
+++ b/semantic_mapping/CMakeLists.txt
@@ -0,0 +1,52 @@
+cmake_minimum_required(VERSION 3.10)
+project(semantic_mapping)
+
+find_package(ament_cmake REQUIRED)
+find_package(rclpy REQUIRED)
+find_package(std_msgs REQUIRED)
+find_package(sensor_msgs REQUIRED)
+find_package(geometry_msgs REQUIRED)
+find_package(nav_msgs REQUIRED)
+find_package(visualization_msgs REQUIRED)
+find_package(cv_bridge REQUIRED)
+find_package(rosidl_default_generators REQUIRED)
+find_package(Python3 REQUIRED COMPONENTS Interpreter)
+
+rosidl_generate_interfaces(${PROJECT_NAME}
+ "msg/ObjectNode.msg"
+ "msg/ObjectType.msg"
+ "msg/TargetObjectInstruction.msg"
+ "msg/ViewpointRep.msg"
+ DEPENDENCIES std_msgs sensor_msgs geometry_msgs
+)
+
+set(PYTHON_INSTALL_DIR "lib/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages")
+
+install(
+ DIRECTORY semantic_mapping
+ DESTINATION ${PYTHON_INSTALL_DIR}
+)
+
+install(
+ PROGRAMS
+ scripts/semantic_mapping_node
+ DESTINATION lib/${PROJECT_NAME}
+)
+
+install(
+ FILES resource/semantic_mapping
+ DESTINATION share/${PROJECT_NAME}/resource
+)
+
+install(
+ DIRECTORY config
+ DESTINATION share/${PROJECT_NAME}
+)
+
+install(
+ DIRECTORY launch
+ DESTINATION share/${PROJECT_NAME}
+)
+
+ament_export_dependencies(rosidl_default_runtime)
+ament_package()
diff --git a/semantic_mapping/README.md b/semantic_mapping/README.md
new file mode 100644
index 0000000..62b78e5
--- /dev/null
+++ b/semantic_mapping/README.md
@@ -0,0 +1,152 @@
+# Semantic Mapping
+
+ROS2 package for semantic 3D object mapping with VLM integration.
+
+## Overview
+
+This package builds a persistent semantic map of objects detected by `detect_anything`, fusing 2D detections with LiDAR point clouds to create 3D object representations. It publishes `ObjectNode` messages compatible with the tare_planner exploration system.
+The pipeline stores voxelized geometry, merges observations, and exports planner‑friendly ROS messages.
+
+## Implementation Status
+
+| Component | File | Status |
+|-----------|------|--------|
+| Messages | `msg/*.msg` | Complete |
+| Data Models | `map_object.py` | Complete |
+| Matching | `utils/matching.py` | Complete |
+| Object Mapper | `mapper.py` | Complete |
+| Cloud Projection | `vector_perception_utils/pointcloud_utils.py` | Complete |
+| Publishers | `utils/map_object_utils.py` | Complete |
+| Image Storage | `utils/storage.py` | Complete |
+| Mapping Node | `mapping_node.py` | Complete |
+| Launch Files | `launch/` | Complete |
+
+## Messages
+
+### ObjectNode.msg
+Main output message for planner integration:
+- `object_id[]`: Track IDs (may contain multiple merged IDs)
+- `label`: Dominant class label
+- `position`: Object centroid in world frame
+- `bbox3d[8]`: 3D bounding box corners
+- `cloud`: Object point cloud
+- `status`: Lifecycle status ("new", "updated", "unchanged")
+- `img_path`: Path to best saved image crop
+- `is_asked_vlm`: VLM query flag
+- `viewpoint_id`: Associated viewpoint
+
+### ObjectType.msg
+VLM query/answer for object relabeling.
+
+### TargetObjectInstruction.msg
+User instruction for target object search.
+
+### ViewpointRep.msg
+Viewpoint trigger for image saving.
+
+Note: `detect_anything/DetectionResult` masks are cropped to the bbox and aligned to
+the bbox top-left (as documented in `detect_anything/msg/DetectionResult.msg`).
+The mapping node reconstructs full-image masks before projection.
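+
+A minimal sketch of that reconstruction (equivalent to what `DetectionResults.from_msg` does in `detect_anything`; the helper name is illustrative):
+
+```python
+import numpy as np
+
+def rebuild_full_mask(crop: np.ndarray, bbox, image_shape) -> np.ndarray:
+    """Paste a bbox-cropped mono8 mask back at its bbox location in a full-size frame."""
+    x1, y1, x2, y2 = [int(v) for v in bbox]
+    full = np.zeros(image_shape[:2], dtype=np.uint8)
+    full[y1:y2, x1:x2] = crop  # crop is (y2 - y1, x2 - x1), as documented above
+    return full
+```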
+
+## Architecture
+
+```
+detect_anything/DetectionResult
+ │
+ ▼
+┌───────────────────────────────────────────────┐
+│ semantic_mapping_node │
+│ │
+│ ┌────────────────────┐ ┌──────────────┐ │
+│ │ pointcloud_utils │ │ ObjectMapper │ │
+│ │ (projection + mask)├──▶│ (pending/ │ │
+│ │ │ │ permanent) │ │
+│ └────────────────────┘ └──────────────┘ │
+│ │ │ │
+│ ▼ ▼ │
+│ ┌────────────────┐ ┌──────────────┐ │
+│ │ MapObject │ │ map_object_ │ │
+│ │ (voxels/votes) ├─▶│ utils │ │
+│ └────────────────┘ └──────────────┘ │
+│ │ │ │
+│ ▼ ▼ │
+│ utils/storage.py mapping_node.py │
+└───────────────────────────────────────────────┘
+ │
+ ▼
+/object_nodes, /obj_points, /obj_boxes, /obj_labels
+```
+
+Key components:
+- `map_object.py`: Object state, voxel voting, pose de‑duplication, cached geometry.
+- `utils/matching.py`: Track‑id and spatial matching with IoU/ratio heuristics.
+- `mapper.py`: Pending/permanent tiers, merge rules, and pruning.
+- `vector_perception_utils/pointcloud_utils.py`: Projection + mask segmentation utilities.
+- `utils/map_object_utils.py`: ROS message construction helpers.
+- `utils/storage.py`: Async image save queue.
+- `mapping_node.py`: ROS2 orchestration (subscriptions, sync, publish).
+
+## Dependencies
+
+- `detect_anything`: Detection results with masks
+- `vector_perception_utils`: Point cloud and image utilities
+- ROS2 messages: `sensor_msgs`, `geometry_msgs`, `nav_msgs`, `visualization_msgs`
+
+## Usage
+
+```bash
+# Build
+colcon build --packages-select semantic_mapping
+
+# Run
+ros2 launch semantic_mapping semantic_mapping.launch.py
+```
+
+## Configuration
+
+- `config/params.yaml`: Default node parameters for `semantic_mapping_node`.
+- `config/sensor_to_camera_tf.yaml`: Static sensor→camera transform for projection.
+- `config/objects.yaml`: Object taxonomy for detection labels.
+
+Key runtime params:
+- `regularize_voxel_size`: Connectivity voxel size used in shape regularization.
+- `ground_filter`, `ground_radius`, `ground_z_clearance`: Simple ground clamp around robot.
+- `visualization_max_objects`: Limits RViz markers/pointcloud to most recent objects.
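+
+The static sensor→camera transform can be loaded into a 4x4 extrinsic for projection. A sketch assuming an extrinsic xyz roll-pitch-yaw convention (field names follow `config/sensor_to_camera_tf.yaml`; the helper name is illustrative):
+
+```python
+import numpy as np
+import yaml
+from scipy.spatial.transform import Rotation
+
+def load_sensor_to_camera(path: str) -> np.ndarray:
+    """Build a 4x4 sensor->camera transform from the YAML config."""
+    with open(path, "r", encoding="utf-8") as f:
+        tf = yaml.safe_load(f)["sensor_to_camera"]
+    rpy, t = tf["rotation_rpy"], tf["translation"]
+    T = np.eye(4)
+    # "xyz" (extrinsic roll, pitch, yaw) is an assumption about the stored convention.
+    T[:3, :3] = Rotation.from_euler("xyz", [rpy["roll"], rpy["pitch"], rpy["yaw"]]).as_matrix()
+    T[:3, 3] = [t["x"], t["y"], t["z"]]
+    return T
+```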
+
+## Topics
+
+### Subscriptions
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/detection_result` | `detect_anything/DetectionResult` | 2D detections with masks |
+| `/registered_scan` | `sensor_msgs/PointCloud2` | LiDAR point cloud |
+| `/state_estimation` | `nav_msgs/Odometry` | Robot odometry |
+| `/object_type_answer` | `ObjectType` | VLM relabeling response |
+| `/target_object_instruction` | `TargetObjectInstruction` | Target object search |
+| `/viewpoint_rep_header` | `ViewpointRep` | Viewpoint trigger |
+
+### Publications
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/object_nodes` | `ObjectNode` | Semantic object map (for planner) |
+| `/obj_points` | `sensor_msgs/PointCloud2` | Colored object point cloud |
+| `/obj_boxes` | `visualization_msgs/MarkerArray` | 3D bounding box wireframes |
+| `/obj_labels` | `visualization_msgs/MarkerArray` | Object label text |
+| `/object_type_query` | `ObjectType` | VLM query for relabeling |
+
+## References
+
+- Full migration plan: [`docs/SEMANTIC_MAPPING_MIGRATION.md`](docs/SEMANTIC_MAPPING_MIGRATION.md)
+- Original implementation: `VLM_ROS/src/semantic_mapping/` (main branch)
+ - SAM2 for segmentation
+ - External Gemini API for VLM reasoning
+- Detection: `detect_anything` package (decoupled)
+- dimos patterns: `dimos/perception/detection/objectDB.py`
+
+## Handoff Notes
+
+- Regularization uses vote-weighted filtering + voxel connected components; adjust
+ `regularize_voxel_size` and the regularization percentile (in `MapObject`) if
+ thin structures are dropped.
+- Mapping node logs summary stats and a single mapping loop time per cycle.
+- Permanent objects persist; only pending objects time out via `pending_ttl_s`.
diff --git a/semantic_mapping/config/params.yaml b/semantic_mapping/config/params.yaml
new file mode 100644
index 0000000..0749d09
--- /dev/null
+++ b/semantic_mapping/config/params.yaml
@@ -0,0 +1,48 @@
+semantic_mapping_node:
+ ros__parameters:
+ detection_topic: /detection_result
+ cloud_topic: /registered_scan
+ odom_topic: /state_estimation
+ viewpoint_topic: /viewpoint_rep_header
+ object_type_answer_topic: /object_type_answer
+ object_type_query_topic: /object_type_query
+ target_object_instruction_topic: /target_object_instruction
+ object_nodes_topic: /object_nodes
+ obj_points_topic: /obj_points
+ obj_boxes_topic: /obj_boxes
+ obj_labels_topic: /obj_labels
+ map_frame: map
+
+ confidence_threshold: 0.35
+ voxel_size: 0.05
+ regularize_voxel_size: 0.2
+ distance_threshold: 0.3
+ iou_threshold: 0.3
+ min_detections_for_permanent: 4
+ pending_ttl_s: 5.0
+ num_angle_bins: 15
+
+ detection_linear_state_time_bias: 0.0
+ detection_angular_state_time_bias: 0.0
+ cloud_window_before: 0.5
+ cloud_window_after: 0.1
+
+ projection: equirect
+ axis_mode: xz
+ image_width: 1920
+ image_height: 640
+ depth_filter: true
+ depth_filter_margin: 0.15
+ depth_jump_threshold: 0.3
+ sensor_to_camera_tf: config/sensor_to_camera_tf.yaml
+ mask_erode_kernel: 5
+ mask_erode_iterations: 2
+ max_cloud_distance: 8.0
+ visualization_max_objects: 50
+ ground_filter: true
+ ground_radius: 2.5
+ ground_z_clearance: 0.0
+
+ target_object: refrigerator
+    save_png: true
+ output_dir: output/object_images
diff --git a/semantic_mapping/config/sensor_to_camera_tf.yaml b/semantic_mapping/config/sensor_to_camera_tf.yaml
new file mode 100644
index 0000000..e48c06f
--- /dev/null
+++ b/semantic_mapping/config/sensor_to_camera_tf.yaml
@@ -0,0 +1,11 @@
+sensor_to_camera:
+ parent_frame: lidar
+ child_frame: camera
+ translation:
+ x: -0.12
+ y: -0.075
+ z: 0.265
+ rotation_rpy:
+ roll: -1.5707963
+ pitch: 0.0
+ yaw: -1.5707963
diff --git a/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md
new file mode 100644
index 0000000..3f22372
--- /dev/null
+++ b/semantic_mapping/docs/SEMANTIC_MAPPING_MIGRATION.md
@@ -0,0 +1,487 @@
+# Semantic Mapping Migration Plan
+
+## Overview
+
+This document outlines the comprehensive plan for porting the `semantic_mapping` package from `VLM_ROS` into `vector_perception_ros/semantic_mapping`, following the coding standards of `vector_perception_ros` while preserving functional compatibility with downstream consumers (particularly the tare_planner).
+
+> **Note**: Detection is decoupled into `detect_anything` package. VLM reasoning uses external Gemini API.
+
+## Design Principles
+
+1. **Logic cleanup, not restructure** - Preserve the core algorithms (voxel voting, observation angles, shape regularization, IoU/distance matching) while cleaning up dead code and improving organization.
+2. **Same I/O contract** - Maintain the same ROS topic names, message types, and semantics so existing planner integration continues to work.
+3. **vector_perception_ros coding standards** - Full type hints, Google-style docstrings, dataclass containers, proper ROS2 patterns.
+4. **ObjectDB-style patterns** - Adopt the two-tier (pending/permanent) architecture and dual matching (track_id + distance) from dimos where it improves clarity.
+
+---
+
+## Implementation Progress
+
+| Step | Component | Status |
+|------|-----------|--------|
+| 1 | Package Setup & Messages | **COMPLETED** |
+| 2 | Data Models (`map_object.py`) | **COMPLETED** |
+| 2.1 | Refactor to use shared utilities | **COMPLETED** |
+| 3 | Matching Utilities (`utils/matching.py`) | **COMPLETED** |
+| 4 | Object Mapper (`mapper.py`) | **COMPLETED** |
+| 5 | Cloud-Image Projection (`pointcloud_utils.py`) | **COMPLETED** |
+| 6 | Message Publishers (`utils/map_object_utils.py`) | **COMPLETED** |
+| 7 | Image Storage (`utils/storage.py`) | **COMPLETED** |
+| 8 | Mapping Node (`mapping_node.py`) | **COMPLETED** |
+| 9 | Launch Files & Config | **COMPLETED** |
+| 10 | Integration Testing | **COMPLETED** |
+
+### Step 2.1: Refactored map_object.py to use shared utilities
+
+Added to `vector_perception_utils/transform_utils.py`:
+- `normalize_angle()` - now supports both scalar and array input
+- `discretize_angles()` - discretize angles into bin indices
+
+Added to `vector_perception_utils/pointcloud_utils.py`:
+- `get_oriented_bbox()` - returns (center, corners) tuple from minimal OBB
+
+Refactored in `map_object.py`:
+- Removed duplicate `_normalize_angles_to_pi` and `_discretize_angles` functions
+- Simplified `_compute_geometry()` from 43 lines to 15 using `get_oriented_bbox()`
+- Use `create_pointcloud()` in `regularize_shape()` instead of manual o3d creation
+- Use `compute_centroid()` instead of `np.mean(..., axis=0)`
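+
+A sketch of what the angle helpers above compute (signatures and binning convention are assumptions; the actual implementations live in `vector_perception_utils/transform_utils.py`):
+
+```python
+import numpy as np
+
+def normalize_angle(angle):
+    """Wrap angles (scalar or array) into [-pi, pi)."""
+    return (np.asarray(angle) + np.pi) % (2.0 * np.pi) - np.pi
+
+def discretize_angles(angles, num_bins: int) -> np.ndarray:
+    """Map angles to integer bin indices in [0, num_bins)."""
+    frac = (normalize_angle(angles) + np.pi) / (2.0 * np.pi)   # [0, 1)
+    return np.minimum((frac * num_bins).astype(int), num_bins - 1)
+```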
+
+---
+
+## VLM_ROS Reference (Source Implementation)
+
+Key files in `VLM_ROS/src/semantic_mapping/`:
+- `semantic_map_new.py`: `ObjMapper` manages the persistent map
+- `single_object_new.py`: `SingleObject` holds per-object state
+- `semantic_mapping_node.py`: `MappingNode` orchestrates SAM2 masks, LiDAR projection, odom transforms
+
+### ObjMapper Core Flow
+- Input: dict with `bboxes`, `labels`, `confidences`, `ids`, `masks` (from SAM2), plus LiDAR cloud and odom
+- Memory fix: Uses `.copy()` on detection data extraction to prevent memory leaks
+- VLM query threshold: Objects need `best_image_score > 500` before querying VLM
+- `update_map()` steps:
+ 1. Transform incoming cloud into body/world frame using odom
+ 2. Filter detections by confidence threshold
+ 3. For each detection: reconstruct full-image masks (bbox-cropped in `detect_anything/msg/DetectionResult.msg`), project masked cloud, build per-object voxels, then either merge or create a `SingleObject`
+ 4. Merge heuristics: match by tracker `obj_id`, 3D IoU, distance thresholds, and optional primitive groups (e.g., chair/sofa)
+ 5. Maintain delete lists, save queues (best image/mask per object)
+ 6. Publish ROS messages: `ObjectNode`, bbox markers, label markers, colored point cloud
+
+### SingleObject Highlights
+- Keeps class_id→counts/confidence, track_id list, voxel grid (`VoxelFeatureManager`), robot poses, masks, best image saving
+- Computes 3D bbox (axis-aligned + oriented), centroids, clustering flags, and validity caches
+- Manages persistence (`life`, `inactive_frame`, `publish_status`) and VLM interaction state
+
+### dimos Patterns to Adopt
+- `ObjMapper` ⇄ `ObjectDB`: explicit pending/permanent tiers, track_id + spatial matching, promotion thresholds
+- `SingleObject` ⇄ `Object`: cleaner update patterns, separation of core geometry vs VLM metadata
+
+---
+
+## Phase 1: Package Structure & Messages (COMPLETED)
+
+### 1.1 Package Structure
+
+```
+semantic_mapping/
+├── CMakeLists.txt # Hybrid CMake for message generation + Python install
+├── package.xml # Dependencies: detect_anything, vector_perception_utils, ROS msgs
+├── msg/
+│ ├── ObjectNode.msg
+│ ├── ObjectType.msg
+│ ├── TargetObjectInstruction.msg
+│ └── ViewpointRep.msg
+├── config/
+│ ├── objects.yaml # Object taxonomy (to be copied from VLM_ROS)
+│ └── params.yaml # Default node parameters
+├── launch/
+│ └── semantic_mapping.launch.py
+├── semantic_mapping/
+│ ├── __init__.py
+│ ├── mapping_node.py # ROS2 node (orchestration)
+│ ├── mapper.py # ObjectMapper class (core logic)
+│ ├── map_object.py # Data classes: MapObject, Observation, VoxelManager
+│ ├── utils/
+│ │ ├── matching.py # Object matching utilities (IoU, distance, track_id)
+│ │ ├── map_object_utils.py # ROS message conversion helpers
+│ │ └── storage.py # Async image saving queue
+│ ├── (moved to vector_perception_utils) # Cloud-image projection helpers
+├── docs/
+│ └── SEMANTIC_MAPPING_MIGRATION.md
+└── resource/
+ └── semantic_mapping # ament resource marker
+```
+
+### 1.2 Message Definitions
+
+**ObjectNode.msg** (preserves exact schema for planner compatibility):
+```
+std_msgs/Header header
+int32[] object_id
+string label
+geometry_msgs/Point position
+geometry_msgs/Point[8] bbox3d # 8 corner points in world frame
+sensor_msgs/PointCloud2 cloud
+string status # "new", "updated", "unchanged"
+string img_path
+bool is_asked_vlm
+int32 viewpoint_id
+```
+
+**ObjectType.msg**: VLM query/answer for object relabeling
+**TargetObjectInstruction.msg**: User-set target/anchor objects
+**ViewpointRep.msg**: Viewpoint trigger message
+
+---
+
+## Phase 2: Data Models (COMPLETED)
+
+### 2.1 ObjectStatus Enum
+
+```python
+class ObjectStatus(Enum):
+ """Object lifecycle status (ObjectDB-style tiering)."""
+ PENDING = "pending" # < min_detections threshold
+ PERMANENT = "permanent" # >= threshold, confirmed object
+ DELETED = "deleted" # Marked for removal
+```
+
+### 2.2 Observation
+
+```python
+@dataclass
+class Observation:
+ """A single detection observation to update a MapObject."""
+ track_id: int
+ label: str
+ confidence: float
+ points: np.ndarray # (N, 3) world-frame points
+ robot_pose: dict # {R: (3,3), t: (3,)}
+ mask: np.ndarray | None = None # 2D segmentation mask
+ image: np.ndarray | None = None # RGB crop for best-image selection
+ timestamp: float = 0.0
+```
+
+### 2.3 VoxelFeatureManager
+
+```python
+@dataclass
+class VoxelFeatureManager:
+ """Manages voxelized point cloud with observation angle voting."""
+ voxels: np.ndarray # (N, 3) world coords
+ votes: np.ndarray # (N,) detection count per voxel
+ observation_angles: np.ndarray # (N, num_bins) angle histogram
+ regularized_mask: np.ndarray | None # (N,) valid voxel mask after clustering
+ voxel_size: float = 0.03
+ num_angle_bins: int = 20
+
+ def update(self, new_points: np.ndarray, robot_position: np.ndarray) -> None:
+ """Add new points with KDTree-based merge and angle voting."""
+
+ def get_valid_voxels(self, min_votes: int = 1) -> np.ndarray:
+ """Return voxels passing vote threshold and regularization mask."""
+```
+
+### 2.4 MapObject
+
+```python
+@dataclass
+class MapObject:
+ """Represents a tracked object in the semantic map."""
+ # Identity
+ object_id: str # UUID for this object
+ track_ids: list[int] # Associated tracker IDs (can merge)
+
+ # Classification (voted)
+ class_votes: dict[str, int] # label -> observation count
+ confidence_scores: dict[str, float] # label -> best confidence
+
+ # Geometry
+ voxel_manager: VoxelFeatureManager
+ robot_poses: list[dict] # [{R, t}, ...] observation poses
+
+ # Lifecycle (ObjectDB-style)
+ detection_count: int = 0
+ inactive_frames: int = 0
+ status: ObjectStatus = ObjectStatus.PENDING
+
+ # VLM interaction
+ is_asked_vlm: bool = False
+ updated_by_vlm: bool = False
+ best_image_path: str | None = None
+ best_image_score: float = 0.0
+
+ @property
+ def label(self) -> str: ...
+ @property
+ def centroid(self) -> np.ndarray: ...
+ @property
+ def bbox3d(self) -> np.ndarray: ...
+ def update(self, observation: Observation) -> None: ...
+ def merge(self, other: MapObject) -> None: ...
+ def regularize_shape(self, percentile: float = 0.85) -> None: ...
+```
+
+---
+
+## Phase 3: Matching Utilities (utils/matching.py)
+
+```python
+def compute_iou_3d(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
+ """Compute 3D IoU between two axis-aligned bounding boxes."""
+
+def compute_center_distance(obj1: MapObject, obj2: MapObject) -> float:
+ """Compute Euclidean distance between object centroids."""
+
+def are_merge_compatible(label1: str, label2: str, merge_groups: list[set[str]]) -> bool:
+ """Check if two labels can be merged (same class or in same merge group)."""
+
+def find_best_match(
+ obs: Observation,
+ candidates: list[MapObject],
+ distance_threshold: float,
+ iou_threshold: float,
+) -> MapObject | None:
+ """Find best matching object by distance and IoU criteria."""
+```
+
+---
+
+## Phase 4: Object Mapper (mapper.py)
+
+```python
+class ObjectMapper:
+ """
+ Manages the semantic object map with two-tier state management.
+ Pending objects are promoted to permanent after detection_count >= threshold.
+ """
+
+ def __init__(
+ self,
+ voxel_size: float = 0.03,
+ confidence_threshold: float = 0.30,
+ distance_threshold: float = 0.5,
+ iou_threshold: float = 0.2,
+ min_detections_for_permanent: int = 3,
+ pending_ttl_s: float = 10.0,
+ max_inactive_frames: int = 20,
+ ) -> None:
+ self._pending_objects: dict[str, MapObject] = {}
+ self._permanent_objects: dict[str, MapObject] = {}
+ self._track_id_map: dict[int, str] = {}
+ self._lock = threading.RLock()
+ self._merge_groups: list[set[str]] = [
+ {"chair", "sofa", "couch", "armchair"},
+ {"table", "desk"},
+ ]
+
+ def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats: ...
+ def _match_observation(self, obs: Observation) -> MapObject | None: ...
+ def _create_object(self, obs: Observation) -> MapObject: ...
+ def _check_promotion(self, obj: MapObject) -> None: ...
+ def _try_merge_objects(self) -> None: ...
+ def _prune_inactive(self, target_object: str | None) -> list[MapObject]: ...
+ def get_objects(self, include_pending: bool = False) -> list[MapObject]: ...
+```
+
+---
+
+## Phase 5: Mapping Node (mapping_node.py)
+
+### 5.1 Node Structure
+
+```python
+class SemanticMappingNode(Node):
+ """ROS2 node for semantic 3D object mapping."""
+
+ def __init__(self) -> None:
+ super().__init__("semantic_mapping_node")
+ self._declare_parameters()
+ self.mapper = ObjectMapper(...)
+ self.projection = pointcloud_utils
+ self.image_saver = ImageSaveQueue(...)
+
+ # Data buffers (with locks)
+ self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=50)
+ self._odom_buffer: deque[tuple[float, Pose]] = deque(maxlen=100)
+ self._lock = threading.Lock()
+
+ self._create_subscriptions()
+ self._create_publishers()
+ self.create_timer(0.5, self._mapping_callback)
+```
+
+### 5.2 Subscriptions
+
+| Topic | Type | QoS |
+|-------|------|-----|
+| `/detection_result` | `detect_anything/DetectionResult` | Reliable |
+| `/registered_scan` | `PointCloud2` | Best Effort |
+| `/state_estimation` | `Odometry` | Best Effort |
+| `/object_type_answer` | `ObjectType` | Reliable |
+| `/target_object_instruction` | `TargetObjectInstruction` | Reliable |
+| `/viewpoint_rep_header` | `ViewpointRep` | Reliable |
+
+### 5.3 Publishers
+
+| Topic | Type | Description |
+|-------|------|-------------|
+| `/object_nodes` | `ObjectNode` | Main output for planner |
+| `/obj_points` | `PointCloud2` | Colored object cloud |
+| `/obj_boxes` | `MarkerArray` | Bbox wireframes |
+| `/obj_labels` | `MarkerArray` | Text labels |
+| `/annotated_image` | `Image` | Debug visualization |
+| `/object_type_query` | `ObjectType` | VLM query |
+
+---
+
+## Phase 6: Cloud-Image Projection (pointcloud_utils.py)
+
+```python
+def project_pointcloud_to_image(...): ...
+def segment_pointcloud_by_mask(...): ...
+def segment_pointclouds_from_projection(...): ...
+def transform_segmented_clouds_to_world(...): ...
+```
+
+---
+
+## Phase 7: Message Publishers (utils/map_object_utils.py)
+
+```python
+def map_object_to_object_node(obj: MapObject, header: Header, status: str, target_object: str | None = None) -> ObjectNode | None: ...
+def create_bbox_markers(objects: list[MapObject], header: Header) -> MarkerArray: ...
+def create_label_markers(objects: list[MapObject], header: Header) -> MarkerArray: ...
+def create_object_pointcloud(objects: list[MapObject], header: Header) -> PointCloud2 | None: ...
+def create_delete_markers(deleted_ids: list[int], header: Header, namespace: str) -> MarkerArray: ...
+```
+
+Publisher helpers:
+- `map_object_to_object_node` converts a `MapObject` into an `ObjectNode` with centroid, bbox corners, and point cloud.
+- Marker helpers emit bbox wireframes and text labels for RViz.
+- `create_object_pointcloud` builds a colored aggregate cloud from object voxels.
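+
+For illustration, a sketch of how a bbox wireframe can be built as a `LINE_LIST` marker from 8 corner points (corner ordering, namespace, and styling are assumptions; the real helper derives corners from `get_oriented_bbox()`):
+
+```python
+from geometry_msgs.msg import Point
+from visualization_msgs.msg import Marker
+
+# The 12 box edges as index pairs into an 8-corner array (assumed ordering).
+_BOX_EDGES = [(0, 1), (1, 2), (2, 3), (3, 0),
+              (4, 5), (5, 6), (6, 7), (7, 4),
+              (0, 4), (1, 5), (2, 6), (3, 7)]
+
+def bbox_wireframe_marker(corners, header, marker_id: int) -> Marker:
+    """Build a LINE_LIST marker connecting 8 bbox corners."""
+    marker = Marker()
+    marker.header = header
+    marker.ns = "bbox"
+    marker.id = marker_id
+    marker.type = Marker.LINE_LIST
+    marker.action = Marker.ADD
+    marker.scale.x = 0.02  # line width in meters
+    marker.color.r, marker.color.g, marker.color.b, marker.color.a = 0.1, 0.8, 0.2, 1.0
+    for i, j in _BOX_EDGES:
+        marker.points.append(Point(x=float(corners[i][0]), y=float(corners[i][1]), z=float(corners[i][2])))
+        marker.points.append(Point(x=float(corners[j][0]), y=float(corners[j][1]), z=float(corners[j][2])))
+    return marker
+```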
+
+---
+
+## Phase 8: Image Storage (utils/storage.py)
+
+```python
+class ImageSaveQueue:
+ """Background worker for saving object images to disk."""
+
+ def __init__(self, output_dir: Path, max_queue_size: int = 100) -> None: ...
+ def start(self) -> None: ...
+ def stop(self) -> None: ...
+ def queue_save(self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None) -> str: ...
+```
+
+Image storage:
+- Async worker writes `.npy` (and optional `.png`) crops to disk.
+- Queue is bounded to avoid blocking the main mapping loop.
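+
+A minimal sketch of the bounded background-writer pattern (the real `ImageSaveQueue` also handles masks, optional PNG export, and clean shutdown; class and file naming here are illustrative):
+
+```python
+import queue
+import threading
+from pathlib import Path
+
+import numpy as np
+
+class BoundedImageSaver:
+    """Background writer that drops work instead of blocking when the queue is full."""
+
+    def __init__(self, output_dir: Path, max_queue_size: int = 100) -> None:
+        self._dir = Path(output_dir)
+        self._dir.mkdir(parents=True, exist_ok=True)
+        self._queue: queue.Queue = queue.Queue(maxsize=max_queue_size)
+        threading.Thread(target=self._run, daemon=True).start()
+
+    def queue_save(self, object_id: str, image: np.ndarray) -> str:
+        path = self._dir / f"{object_id}.npy"
+        try:
+            self._queue.put_nowait((path, image))
+        except queue.Full:
+            pass  # never block the mapping loop
+        return str(path)
+
+    def _run(self) -> None:
+        while True:
+            path, image = self._queue.get()
+            np.save(path, image)
+```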
+
+---
+
+## Phase 9: Launch Files
+
+```python
+# semantic_mapping.launch.py
+DeclareLaunchArgument('platform', default_value='mecanum_sim')
+DeclareLaunchArgument('detection_topic', default_value='/detection_result')
+DeclareLaunchArgument('cloud_topic', default_value='/registered_scan')
+DeclareLaunchArgument('odom_topic', default_value='/state_estimation')
+DeclareLaunchArgument('voxel_size', default_value='0.03')
+DeclareLaunchArgument('confidence_threshold', default_value='0.30')
+DeclareLaunchArgument('target_object', default_value='')
+```
+
+---
+
+## Implementation Checklist
+
+### Completed
+- [x] Create `CMakeLists.txt` for message generation
+- [x] Port message definitions from tare_planner
+- [x] Set up `package.xml` with dependencies
+- [x] Implement `map_object.py` with VoxelFeatureManager, MapObject, Observation
+- [x] Port lazy computation patterns (centroid, bbox3d, regularization)
+- [x] Refactor `map_object.py` to use shared utilities from `vector_perception_utils`
+- [x] Add `normalize_angle`, `discretize_angles` to `transform_utils.py`
+- [x] Add `get_oriented_bbox` to `pointcloud_utils.py`
+- [x] Simplify `_compute_geometry()` using Open3D oriented bbox
+- [x] Implement `utils/matching.py` with IoU, distance, merge-compatibility functions
+- [x] Implement `mapper.py` with ObjectDB-style tiering
+- [x] Implement cloud-image projection helpers in `vector_perception_utils/pointcloud_utils.py`
+- [x] Implement `utils/map_object_utils.py` message conversion functions
+- [x] Implement `utils/storage.py` with async queue
+- [x] Implement `mapping_node.py` following detect_anything patterns
+
+### Recently Completed
+- [x] Create launch files and params.yaml
+- [x] Integration testing with bag files
+
+---
+
+## Code Cleanup Items
+
+Items from VLM_ROS to preserve:
+- Memory fixes with `.copy()` on detection data extraction
+- VLM query threshold (`best_image_score > 500`)
+
+Items to remove:
+1. Unused publishers: `/cloud_image`
+2. Unused methods: `serialize_objs_to_bag()`, `is_merge_allowed()`, `handle_object_query()`
+3. Review constants: `BACKGROUND_OBJECTS`, `VERTICAL_OBJECTS`, `BIG_OBJECTS`, `SMALL_OBJECTS`
+4. Don't port: `byte_track`, `Grounded-SAM-2`, `mmconfig` directories
+
+---
+
+## Compatibility Checklist
+
+- [x] `/object_nodes` publishes ObjectNode with exact schema
+- [x] `/obj_points` publishes colored PointCloud2
+- [x] `/obj_boxes` publishes MarkerArray wireframes
+- [x] `/obj_labels` publishes MarkerArray text
+- [x] `/object_type_query` publishes ObjectType for VLM
+- [x] Subscribes to `/detection_result` from detect_anything
+- [x] Subscribes to `/registered_scan` PointCloud2
+- [x] Subscribes to `/state_estimation` Odometry
+- [x] Subscribes to `/target_object_instruction`
+- [x] Subscribes to `/viewpoint_rep_header`
+- [x] Subscribes to `/object_type_answer`
+- [x] Object positions, bbox corners, status flags match expected semantics
+- [x] Delete markers properly remove objects from RViz
+
+---
+
+## Recent Updates (Jan 2026)
+
+- Unified launch to `semantic_mapping.launch.py` with mode handling, detect_anything integration, and bagfile defaults.
+- Fixed cloud/image projection transform to match legacy `CloudImageFusion` math.
+- Added mask erosion, depth-jump pruning, distance gating, and light outlier filtering for cleaner object clouds.
+- Added reprojected observation-angle voting for better multi-view aggregation.
+- Fixed bbox ordering and switched `/obj_boxes` to axis-aligned markers (via `get_oriented_bbox(axis_aligned=True)`).
+- Replaced DBSCAN regularization with vote-weighted filtering + voxel connected components.
+- Added `regularize_voxel_size` to grow connectivity for thin structures.
+- Simplified ground filtering to radius-based Z clamp around robot.
+- Added `visualization_max_objects` to keep recent objects visible in RViz.
+- Logging: mapping stats (total/total_deleted/total_permanent) + mapping loop time.
+- Fixed image saving color order and ensured output dir is cleared on start.
+- Set MobileCLIP weights cache to `detect_anything/models` and removed corrupted caches.
+
+## Temporary Handoff Notes
+
+- Regularization is now connected-components; tune `regularize_voxel_size` and keep ratio
+ (`MapObject.regularize_shape` percentile) if legs/sparse parts are dropped.
+- Ground filter uses `ground_filter`, `ground_radius`, `ground_z_clearance`.
+- If bagfile sync issues persist, consider switching mapping inputs to `/sensor_scan` + `/state_estimation_at_scan`.
+- Verify VLM objects.yaml path in VLM_ROS (`vlm_reasoning_node` still expects old path).
+
+---
+
+## Planner Dependencies
+
+The exploration planner (`sensor_coverage_planner_ground.cpp` in tare_planner) is tightly coupled to `ObjectNode`:
+- Subscribes to `/object_nodes` and stores per-object reps
+- Uses object positions, bbox corners, clouds, status, `is_asked_vlm`, and `img_path` for navigation decisions
+
+**Implication**: Keep the `ObjectNode` schema stable. Any new fields from the `MapObject` model can be mapped into extensions, but existing fields and semantics must remain for planner compatibility.
diff --git a/semantic_mapping/launch/semantic_mapping.launch.py b/semantic_mapping/launch/semantic_mapping.launch.py
new file mode 100644
index 0000000..0b7d8ab
--- /dev/null
+++ b/semantic_mapping/launch/semantic_mapping.launch.py
@@ -0,0 +1,137 @@
+"""Launch semantic mapping and detect_anything with a mode selector."""
+
+import os
+
+from ament_index_python.packages import get_package_share_directory
+from launch import LaunchDescription
+from launch.actions import DeclareLaunchArgument, OpaqueFunction
+from launch.substitutions import LaunchConfiguration
+from launch_ros.actions import Node
+
+
+def _parse_bool(value: str, default: bool) -> bool:
+ if value == "":
+ return default
+ return value.lower() in ("true", "1", "yes")
+
+
+def _launch_nodes(context):
+ mode = LaunchConfiguration("mode").perform(context).lower()
+ use_sim_time_arg = LaunchConfiguration("use_sim_time").perform(context)
+ use_compressed_arg = LaunchConfiguration("use_compressed").perform(context)
+ odom_topic_arg = LaunchConfiguration("odom_topic").perform(context)
+
+ use_sim_time = _parse_bool(use_sim_time_arg, default=mode == "bagfile")
+ use_compressed = _parse_bool(use_compressed_arg, default=mode in ("bagfile", "sim"))
+
+ if odom_topic_arg:
+ odom_topic = odom_topic_arg
+ else:
+ odom_topic = "/aft_mapped_to_init_incremental" if mode == "bagfile" else "/state_estimation"
+
+ params_file = LaunchConfiguration("params_file").perform(context)
+ sensor_to_camera_tf = LaunchConfiguration("sensor_to_camera_tf").perform(context)
+ detection_topic = LaunchConfiguration("detection_topic").perform(context)
+ cloud_topic = LaunchConfiguration("cloud_topic").perform(context)
+ viewpoint_topic = LaunchConfiguration("viewpoint_topic").perform(context)
+ object_type_answer_topic = LaunchConfiguration("object_type_answer_topic").perform(context)
+ object_type_query_topic = LaunchConfiguration("object_type_query_topic").perform(context)
+ target_object_instruction_topic = LaunchConfiguration("target_object_instruction_topic").perform(
+ context
+ )
+ object_nodes_topic = LaunchConfiguration("object_nodes_topic").perform(context)
+ obj_points_topic = LaunchConfiguration("obj_points_topic").perform(context)
+ obj_boxes_topic = LaunchConfiguration("obj_boxes_topic").perform(context)
+ obj_labels_topic = LaunchConfiguration("obj_labels_topic").perform(context)
+ map_frame = LaunchConfiguration("map_frame").perform(context)
+ target_object = LaunchConfiguration("target_object").perform(context)
+ save_png = _parse_bool(LaunchConfiguration("save_png").perform(context), default=False)
+ output_dir = LaunchConfiguration("output_dir").perform(context)
+ image_topic = LaunchConfiguration("image_topic").perform(context)
+ compressed_image_topic = LaunchConfiguration("compressed_image_topic").perform(context)
+ conf = float(LaunchConfiguration("conf").perform(context))
+ max_area_ratio = float(LaunchConfiguration("max_area_ratio").perform(context))
+
+ detect_anything_node = Node(
+ package="detect_anything",
+ executable="detection_node",
+ name="detect_anything_detection_node",
+ output="screen",
+ parameters=[
+ {
+ "use_sim_time": use_sim_time,
+ "use_compressed": use_compressed,
+ "image_topic": image_topic,
+ "compressed_image_topic": compressed_image_topic,
+ "conf": conf,
+ "max_area_ratio": max_area_ratio,
+ }
+ ],
+ )
+
+ semantic_mapping_node = Node(
+ package="semantic_mapping",
+ executable="semantic_mapping_node",
+ name="semantic_mapping_node",
+ output="screen",
+ parameters=[
+ params_file,
+ {
+ "use_sim_time": use_sim_time,
+ "sensor_to_camera_tf": sensor_to_camera_tf,
+ "detection_topic": detection_topic,
+ "cloud_topic": cloud_topic,
+ "odom_topic": odom_topic,
+ "viewpoint_topic": viewpoint_topic,
+ "object_type_answer_topic": object_type_answer_topic,
+ "object_type_query_topic": object_type_query_topic,
+ "target_object_instruction_topic": target_object_instruction_topic,
+ "object_nodes_topic": object_nodes_topic,
+ "obj_points_topic": obj_points_topic,
+ "obj_boxes_topic": obj_boxes_topic,
+ "obj_labels_topic": obj_labels_topic,
+ "map_frame": map_frame,
+ "target_object": target_object,
+ "save_png": save_png,
+ "output_dir": output_dir,
+ },
+ ],
+ )
+
+ return [detect_anything_node, semantic_mapping_node]
+
+
+def generate_launch_description() -> LaunchDescription:
+ pkg_share = get_package_share_directory("semantic_mapping")
+ default_params = os.path.join(pkg_share, "config", "params.yaml")
+ default_sensor_tf = os.path.join(pkg_share, "config", "sensor_to_camera_tf.yaml")
+
+ return LaunchDescription(
+ [
+ DeclareLaunchArgument("mode", default_value="sim"),
+ DeclareLaunchArgument("use_sim_time", default_value=""),
+ DeclareLaunchArgument("use_compressed", default_value=""),
+ DeclareLaunchArgument("params_file", default_value=default_params),
+ DeclareLaunchArgument("sensor_to_camera_tf", default_value=default_sensor_tf),
+ DeclareLaunchArgument("detection_topic", default_value="/detection_result"),
+ DeclareLaunchArgument("cloud_topic", default_value="/registered_scan"),
+ DeclareLaunchArgument("odom_topic", default_value=""),
+ DeclareLaunchArgument("viewpoint_topic", default_value="/viewpoint_rep_header"),
+ DeclareLaunchArgument("object_type_answer_topic", default_value="/object_type_answer"),
+ DeclareLaunchArgument("object_type_query_topic", default_value="/object_type_query"),
+ DeclareLaunchArgument("target_object_instruction_topic", default_value="/target_object_instruction"),
+ DeclareLaunchArgument("object_nodes_topic", default_value="/object_nodes"),
+ DeclareLaunchArgument("obj_points_topic", default_value="/obj_points"),
+ DeclareLaunchArgument("obj_boxes_topic", default_value="/obj_boxes"),
+ DeclareLaunchArgument("obj_labels_topic", default_value="/obj_labels"),
+ DeclareLaunchArgument("map_frame", default_value="map"),
+ DeclareLaunchArgument("target_object", default_value="refrigerator"),
+ DeclareLaunchArgument("save_png", default_value="false"),
+ DeclareLaunchArgument("output_dir", default_value="output/object_images"),
+ DeclareLaunchArgument("image_topic", default_value="/camera/image"),
+ DeclareLaunchArgument("compressed_image_topic", default_value="/camera/image/compressed"),
+ DeclareLaunchArgument("conf", default_value="0.35"),
+ DeclareLaunchArgument("max_area_ratio", default_value="0.3"),
+ OpaqueFunction(function=_launch_nodes),
+ ]
+ )
diff --git a/semantic_mapping/msg/ObjectNode.msg b/semantic_mapping/msg/ObjectNode.msg
new file mode 100644
index 0000000..b2d7686
--- /dev/null
+++ b/semantic_mapping/msg/ObjectNode.msg
@@ -0,0 +1,35 @@
+# Semantic object representation for planner integration.
+#
+# Each message represents a single object in the semantic map with its
+# 3D bounding box, point cloud, and metadata for VLM interaction.
+
+std_msgs/Header header
+
+# Object identity - may contain multiple merged track IDs
+int32[] object_id
+
+# Dominant class label (by weighted vote)
+string label
+
+# Object centroid in world frame
+geometry_msgs/Point position
+
+# 3D bounding box corners (8 points in world frame)
+# Order: [min_x,min_y,min_z], [max_x,min_y,min_z], [max_x,max_y,min_z], [min_x,max_y,min_z],
+# [min_x,min_y,max_z], [max_x,min_y,max_z], [max_x,max_y,max_z], [min_x,max_y,max_z]
+geometry_msgs/Point[8] bbox3d
+
+# Object point cloud in world frame
+sensor_msgs/PointCloud2 cloud
+
+# Lifecycle status: "new", "updated", "unchanged"
+string status
+
+# Path to best saved image crop of this object
+string img_path
+
+# VLM interaction flags
+bool is_asked_vlm
+
+# Associated viewpoint ID (for viewpoint-triggered saves)
+int32 viewpoint_id
diff --git a/semantic_mapping/msg/ObjectType.msg b/semantic_mapping/msg/ObjectType.msg
new file mode 100644
index 0000000..ab02ea6
--- /dev/null
+++ b/semantic_mapping/msg/ObjectType.msg
@@ -0,0 +1,17 @@
+# VLM query/answer message for object relabeling.
+#
+# Used bidirectionally:
+# - Query (mapping -> VLM): object_id, img_path, labels[] populated
+# - Answer (VLM -> mapping): final_label populated with VLM decision
+
+# Object ID to query/update
+int32 object_id
+
+# Path to object image for VLM analysis
+string img_path
+
+# VLM-determined final label (populated in answer)
+string final_label
+
+# Candidate labels from detection (populated in query)
+string[] labels
diff --git a/semantic_mapping/msg/TargetObjectInstruction.msg b/semantic_mapping/msg/TargetObjectInstruction.msg
new file mode 100644
index 0000000..1a88005
--- /dev/null
+++ b/semantic_mapping/msg/TargetObjectInstruction.msg
@@ -0,0 +1,22 @@
+# User instruction for target object search.
+#
+# Specifies the target object to find and optional spatial/attribute
+# conditions relative to an anchor object.
+
+# Primary target object label to search for
+string target_object
+
+# Room condition (e.g., "in the kitchen")
+string room_condition
+
+# Spatial condition relative to anchor (e.g., "next to", "on top of")
+string spatial_condition
+
+# Attribute condition for target (e.g., "red", "large")
+string attribute_condition
+
+# Anchor object for spatial reference
+string anchor_object
+
+# Attribute condition for anchor object
+string attribute_condition_anchor
diff --git a/semantic_mapping/msg/ViewpointRep.msg b/semantic_mapping/msg/ViewpointRep.msg
new file mode 100644
index 0000000..5bb1bfe
--- /dev/null
+++ b/semantic_mapping/msg/ViewpointRep.msg
@@ -0,0 +1,8 @@
+# Viewpoint trigger message for image saving.
+#
+# Published by planner to request saving images at specific viewpoints.
+
+std_msgs/Header header
+
+# Unique viewpoint identifier
+int32 viewpoint_id
diff --git a/semantic_mapping/package.xml b/semantic_mapping/package.xml
new file mode 100644
index 0000000..4804d1a
--- /dev/null
+++ b/semantic_mapping/package.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+<?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
+<package format="3">
+  <name>semantic_mapping</name>
+  <version>0.0.1</version>
+  <description>Semantic 3D object mapping node with VLM integration.</description>
+  <maintainer email="">Alex Lin</maintainer>
+  <license>MIT</license>
+
+  <buildtool_depend>ament_cmake</buildtool_depend>
+
+  <build_depend>rosidl_default_generators</build_depend>
+  <exec_depend>rosidl_default_runtime</exec_depend>
+  <member_of_group>rosidl_interface_packages</member_of_group>
+
+  <depend>rclpy</depend>
+  <depend>std_msgs</depend>
+  <depend>sensor_msgs</depend>
+  <depend>geometry_msgs</depend>
+  <depend>nav_msgs</depend>
+  <depend>visualization_msgs</depend>
+  <depend>cv_bridge</depend>
+  <depend>detect_anything</depend>
+  <depend>vector_perception_utils</depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/semantic_mapping/resource/semantic_mapping b/semantic_mapping/resource/semantic_mapping
new file mode 100644
index 0000000..e69de29
diff --git a/semantic_mapping/scripts/semantic_mapping_node b/semantic_mapping/scripts/semantic_mapping_node
new file mode 100755
index 0000000..0312e88
--- /dev/null
+++ b/semantic_mapping/scripts/semantic_mapping_node
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+"""Entry point for semantic_mapping_node."""
+
+from semantic_mapping.mapping_node import main
+
+if __name__ == "__main__":
+ main()
diff --git a/semantic_mapping/semantic_mapping/__init__.py b/semantic_mapping/semantic_mapping/__init__.py
new file mode 100644
index 0000000..8df6f20
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/__init__.py
@@ -0,0 +1 @@
+"""Semantic 3D object mapping for vector_perception_ros."""
diff --git a/semantic_mapping/semantic_mapping/map_object.py b/semantic_mapping/semantic_mapping/map_object.py
new file mode 100644
index 0000000..a9e3633
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/map_object.py
@@ -0,0 +1,705 @@
+"""Data models for semantic mapping.
+
+This module defines the core data structures for the semantic mapping system:
+- VoxelFeatureManager: Manages voxelized point clouds with observation angle voting
+- MapObject: Represents a tracked object in the semantic map
+- Observation: A single detection observation
+- ObjectStatus: Enum for object lifecycle status
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Optional
+
+import numpy as np
+from scipy.spatial import cKDTree
+
+from vector_perception_utils.transform_utils import discretize_angles, normalize_angle
+from vector_perception_utils.pointcloud_utils import (
+ compute_centroid,
+ create_pointcloud,
+)
+
+
+class ObjectStatus(Enum):
+ """Object lifecycle status (ObjectDB-style tiering)."""
+
+ PENDING = "pending" # < min_detections threshold
+ PERMANENT = "permanent" # >= threshold, confirmed object
+ DELETED = "deleted" # Marked for removal
+
+
+@dataclass
+class Observation:
+ """A single detection observation to update a MapObject.
+
+ Attributes:
+ track_id: Tracker ID from detection.
+ label: Class label string.
+ confidence: Detection confidence score.
+ points: World-frame point cloud (N, 3).
+ robot_R: Robot rotation matrix (3, 3) at observation time.
+ robot_t: Robot translation vector (3,) at observation time.
+ mask: Optional 2D segmentation mask.
+ image: Optional RGB image crop for best-image selection.
+ timestamp: Observation timestamp in seconds.
+ """
+
+ track_id: int
+ label: str
+ confidence: float
+ points: np.ndarray
+ robot_R: np.ndarray
+ robot_t: np.ndarray
+ mask: Optional[np.ndarray] = None
+ image: Optional[np.ndarray] = None
+ timestamp: float = 0.0
+
+
+class VoxelFeatureManager:
+ """Manages voxelized point cloud with observation angle voting.
+
+ This class tracks 3D voxels representing an object, along with:
+ - Vote counts per voxel (how many times each voxel was observed)
+ - Observation angles (which directions each voxel was seen from)
+ - Regularization mask (which voxels pass shape filtering)
+
+ The observation angle voting helps distinguish between reliable points
+ (seen from multiple angles) and spurious points (seen from only one angle).
+
+ Attributes:
+ voxels: Point coordinates (N, 3) in world frame.
+ votes: Detection count per voxel (N,).
+ observation_angles: Binary angle histogram per voxel (N, num_bins).
+ regularized_mask: Boolean mask for valid voxels after clustering (N,).
+ voxel_size: Size of voxel grid cells in meters.
+ num_angle_bins: Number of discretized angle bins.
+ """
+
+ def __init__(
+ self,
+ points: np.ndarray,
+ voxel_size: float,
+ robot_R: np.ndarray,
+ robot_t: np.ndarray,
+ num_angle_bins: int = 15,
+ ) -> None:
+ """Initialize VoxelFeatureManager with initial points.
+
+ Args:
+ points: Initial point cloud (N, 3) in world frame.
+ voxel_size: Voxel grid cell size in meters.
+ robot_R: Robot rotation matrix (3, 3) at observation time.
+ robot_t: Robot translation vector (3,) at observation time.
+ num_angle_bins: Number of angle bins for observation tracking.
+ """
+ num_points = points.shape[0]
+
+ # Compute observation angles from robot position
+ vectors = points - robot_t
+ obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0])
+ obs_bins = discretize_angles(obs_angles, num_angle_bins)
+
+ self.voxels = points.copy()
+ self.voxel_size = voxel_size
+ self.num_angle_bins = num_angle_bins
+ self._tree = cKDTree(points)
+
+ # Initialize vote counts
+ self.votes = np.ones(num_points, dtype=np.float32)
+
+ # Initialize observation angle histogram (binary: seen/not seen from each angle)
+ self.observation_angles = np.zeros((num_points, num_angle_bins), dtype=np.uint8)
+ self.observation_angles[np.arange(num_points), obs_bins] = 1
+
+ # Regularization mask (all False until regularize_shape is called)
+ self.regularized_mask = np.zeros(num_points, dtype=bool)
+
+        # Removal votes, reserved for dynamic-object clearing (currently bookkeeping only)
+ self._remove_votes = np.zeros(num_points, dtype=np.int32)
+
+ def update(
+ self,
+ new_points: np.ndarray,
+ robot_R: np.ndarray,
+ robot_t: np.ndarray,
+ ) -> None:
+ """Add new points with KDTree-based merge and angle voting.
+
+ Points within voxel_size of existing points are merged (votes incremented).
+ Points farther away are added as new voxels.
+
+ Args:
+ new_points: New point cloud (M, 3) in world frame.
+ robot_R: Robot rotation matrix (3, 3) at observation time.
+ robot_t: Robot translation vector (3,) at observation time.
+ """
+ if new_points.shape[0] == 0:
+ return
+
+ # Compute observation angles for new points
+ vectors = new_points - robot_t
+ obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0])
+ obs_bins = discretize_angles(obs_angles, self.num_angle_bins)
+
+ # Find nearest existing voxels
+ distances, indices = self._tree.query(new_points)
+
+ # Merge points within voxel_size
+ merge_mask = distances < self.voxel_size
+ merge_indices = indices[merge_mask]
+ merge_bins = obs_bins[merge_mask]
+
+ self.votes[merge_indices] += 1
+ self.observation_angles[merge_indices, merge_bins] = 1
+
+ # Add new points that don't merge
+ new_mask = ~merge_mask
+ if np.any(new_mask):
+ new_voxels = new_points[new_mask]
+ new_bins = obs_bins[new_mask]
+ num_new = new_voxels.shape[0]
+
+ # Extend arrays
+ self.voxels = np.vstack([self.voxels, new_voxels])
+ self.votes = np.concatenate([self.votes, np.ones(num_new, dtype=np.float32)])
+
+ new_obs = np.zeros((num_new, self.num_angle_bins), dtype=np.uint8)
+ new_obs[np.arange(num_new), new_bins] = 1
+ self.observation_angles = np.vstack([self.observation_angles, new_obs])
+
+ self.regularized_mask = np.concatenate(
+ [self.regularized_mask, np.zeros(num_new, dtype=bool)]
+ )
+ self._remove_votes = np.concatenate(
+ [self._remove_votes, np.zeros(num_new, dtype=np.int32)]
+ )
+
+ # Rebuild KDTree
+ self._tree = cKDTree(self.voxels)
+
+ def update_observation_angles(self, voxel_indices: np.ndarray, obs_bins: np.ndarray) -> None:
+ """Update observation angle histogram for selected voxels."""
+ if voxel_indices.size == 0:
+ return
+ self.observation_angles[voxel_indices, obs_bins] = 1
+
+ def merge_from(self, other: VoxelFeatureManager) -> None:
+ """Merge voxels from another VoxelFeatureManager.
+
+ Args:
+ other: Another VoxelFeatureManager to merge from.
+ """
+ if other.voxels.shape[0] == 0:
+ return
+
+ distances, indices = self._tree.query(other.voxels)
+
+ # Merge overlapping voxels
+ merge_mask = distances < self.voxel_size
+ merge_indices = indices[merge_mask]
+
+ self.votes[merge_indices] += other.votes[merge_mask]
+ self.observation_angles[merge_indices] = np.logical_or(
+ self.observation_angles[merge_indices],
+ other.observation_angles[merge_mask],
+ ).astype(np.uint8)
+
+ # Add non-overlapping voxels
+ new_mask = ~merge_mask
+ if np.any(new_mask):
+ self.voxels = np.vstack([self.voxels, other.voxels[new_mask]])
+ self.votes = np.concatenate([self.votes, other.votes[new_mask]])
+ self.observation_angles = np.vstack(
+ [self.observation_angles, other.observation_angles[new_mask]]
+ )
+ self.regularized_mask = np.concatenate(
+ [self.regularized_mask, other.regularized_mask[new_mask]]
+ )
+ self._remove_votes = np.concatenate(
+ [self._remove_votes, np.zeros(np.sum(new_mask), dtype=np.int32)]
+ )
+ self._tree = cKDTree(self.voxels)
+
+ def get_valid_indices(
+ self,
+ diversity_percentile: float = 0.3,
+ use_regularized: bool = True,
+ ) -> np.ndarray:
+ """Get indices of valid voxels based on observation angle diversity.
+
+ Voxels are ranked by their observation angle diversity (number of angles
+        from which they were observed), weighted by vote count. The top-weighted
+        fraction of voxels is returned.
+
+ Args:
+ diversity_percentile: Fraction of total weight to include (0-1).
+ use_regularized: If True, only consider voxels in regularized_mask.
+
+ Returns:
+ Indices of valid voxels.
+ """
+ if use_regularized:
+ mask = self.regularized_mask
+ if not np.any(mask):
+ return np.array([], dtype=int)
+ obs_angles = self.observation_angles[mask]
+ votes = self.votes[mask]
+ else:
+ obs_angles = self.observation_angles
+ votes = self.votes
+
+ if len(obs_angles) == 0:
+ return np.array([], dtype=int)
+
+ # Compute diversity score (number of angles observed)
+ angle_diversity = np.sum(obs_angles, axis=1)
+
+ # Sort by diversity (ascending)
+ sorted_indices = np.argsort(angle_diversity)
+ sorted_diversity = angle_diversity[sorted_indices]
+ sorted_weights = votes[sorted_indices]
+
+ # Weight by diversity * votes
+ weighted = sorted_diversity * sorted_weights
+ total_weight = np.sum(weighted)
+
+ if total_weight == 0:
+ return np.array([], dtype=int)
+
+ # Find cutoff index for percentile
+ target_weight = (1 - diversity_percentile) * total_weight
+ cumsum = np.cumsum(weighted)
+ cutoff_idx = np.searchsorted(cumsum, target_weight)
+
+ # Return indices above cutoff (higher diversity)
+ return sorted_indices[cutoff_idx:]
+
+ def get_valid_voxels(
+ self,
+ diversity_percentile: float = 0.3,
+ use_regularized: bool = True,
+ ) -> np.ndarray:
+ """Get valid voxel coordinates.
+
+ Args:
+ diversity_percentile: Fraction of total weight to include (0-1).
+ use_regularized: If True, only consider voxels in regularized_mask.
+
+ Returns:
+ Valid voxel coordinates (M, 3).
+ """
+ indices = self.get_valid_indices(diversity_percentile, use_regularized)
+ if use_regularized and np.any(self.regularized_mask):
+ voxels = self.voxels[self.regularized_mask]
+ else:
+ voxels = self.voxels
+
+ if len(indices) == 0:
+ return np.empty((0, 3), dtype=np.float32)
+ return voxels[indices]
+
+ @property
+ def num_voxels(self) -> int:
+ """Total number of voxels."""
+ return self.voxels.shape[0]
+
+ @property
+ def num_regularized(self) -> int:
+ """Number of voxels passing regularization."""
+ return int(np.sum(self.regularized_mask))
+
+
+@dataclass
+class MapObject:
+ """Represents a tracked object in the semantic map.
+
+ This class combines patterns from VLM_ROS SingleObject and dimos Object,
+ with ObjectDB-style lifecycle management.
+
+ Attributes:
+ object_id: Unique identifier for this object (UUID string).
+ track_ids: List of associated tracker IDs (can merge multiple).
+ voxel_manager: Manages the object's voxelized point cloud.
+ class_votes: Label -> observation count mapping.
+ confidence_scores: Label -> best confidence score mapping.
+ points_count: Label -> total points count mapping.
+ robot_poses: List of robot poses when object was observed.
+ detection_count: Total number of detections.
+ inactive_frames: Frames since last update.
+ last_update_time: Timestamp of last update.
+ status: Object lifecycle status.
+ is_asked_vlm: Whether VLM has been queried for this object.
+ updated_by_vlm: Whether object was relabeled by VLM.
+ best_image_path: Path to best saved image crop.
+ best_image_score: Score of best saved image.
+ centroid: Cached centroid position (None if needs recompute).
+ bbox3d_corners: Cached 3D bbox corners (None if needs recompute).
+ """
+
+ object_id: str
+ track_ids: list[int]
+ voxel_manager: VoxelFeatureManager
+
+ # Classification (voted)
+ class_votes: dict[str, int] = field(default_factory=dict)
+ confidence_scores: dict[str, float] = field(default_factory=dict)
+ points_count: dict[str, int] = field(default_factory=dict)
+
+ # Robot poses at observation times
+ robot_poses: list[dict] = field(default_factory=list)
+
+ # Lifecycle (ObjectDB-style)
+ detection_count: int = 1
+ inactive_frames: int = 0
+ last_update_time: float = 0.0
+ status: ObjectStatus = ObjectStatus.PENDING
+
+ # VLM interaction
+ is_asked_vlm: bool = False
+ updated_by_vlm: bool = False
+
+ # Image storage
+ best_image_path: Optional[str] = None
+ best_image_score: float = 0.0
+
+ # Cached geometry (None means needs recompute)
+ _centroid: Optional[np.ndarray] = field(default=None, repr=False)
+ _bbox3d_corners: Optional[np.ndarray] = field(default=None, repr=False)
+ _needs_recompute: bool = field(default=True, repr=False)
+
+ # Configuration
+ _diversity_percentile: float = field(default=0.85, repr=False)
+ _angle_threshold: float = field(default=np.deg2rad(5), repr=False)
+ _distance_threshold: float = field(default=0.3, repr=False)
+ _needs_regularization: bool = field(default=True, repr=False)
+ _regularize_voxel_size: float = field(default=0.06, repr=False)
+
+ @classmethod
+ def from_observation(
+ cls,
+ object_id: str,
+ obs: Observation,
+ voxel_size: float = 0.03,
+ regularize_voxel_size: float = 0.06,
+ num_angle_bins: int = 15,
+ ) -> MapObject:
+ """Create a new MapObject from an initial observation.
+
+ Args:
+ object_id: Unique identifier for this object.
+ obs: Initial observation.
+ voxel_size: Voxel grid cell size in meters.
+ regularize_voxel_size: Voxel size for regularization clustering.
+ num_angle_bins: Number of angle bins for observation tracking.
+
+ Returns:
+ New MapObject instance.
+ """
+ voxel_manager = VoxelFeatureManager(
+ points=obs.points,
+ voxel_size=voxel_size,
+ robot_R=obs.robot_R,
+ robot_t=obs.robot_t,
+ num_angle_bins=num_angle_bins,
+ )
+
+ obj = cls(
+ object_id=object_id,
+ track_ids=[obs.track_id],
+ voxel_manager=voxel_manager,
+ class_votes={obs.label: 1},
+ confidence_scores={obs.label: obs.confidence},
+ points_count={obs.label: len(obs.points)},
+ robot_poses=[{"R": obs.robot_R.copy(), "t": obs.robot_t.copy()}],
+ last_update_time=obs.timestamp,
+ _regularize_voxel_size=regularize_voxel_size,
+ )
+ return obj
+
+ def _invalidate_cache(self) -> None:
+ """Mark cached geometry as needing recompute."""
+ self._needs_recompute = True
+ self._centroid = None
+ self._bbox3d_corners = None
+
+ def _is_similar_pose(self, new_R: np.ndarray, new_t: np.ndarray) -> bool:
+ """Check if a new pose is similar to existing poses.
+
+ Used to avoid redundant updates from the same viewpoint.
+ Compares observation angle and distance to object center.
+
+ Args:
+ new_R: New robot rotation matrix (3, 3). Not used but kept for API.
+ new_t: New robot translation vector (3,).
+
+ Returns:
+ True if pose is similar to an existing pose.
+ """
+ _ = new_R # Unused but kept for API consistency
+ obj_center = self.centroid
+ if obj_center is None:
+ obj_center = compute_centroid(self.voxel_manager.voxels)
+
+ for pose in self.robot_poses:
+ old_t = pose["t"] # R not used for similarity check
+
+ old_to_obj = obj_center - old_t
+ new_to_obj = obj_center - new_t
+
+ old_angle = np.arctan2(old_to_obj[1], old_to_obj[0])
+ new_angle = np.arctan2(new_to_obj[1], new_to_obj[0])
+ angle_diff = abs(normalize_angle(np.array([new_angle - old_angle]))[0])
+
+ old_dist = np.linalg.norm(old_to_obj)
+ new_dist = np.linalg.norm(new_to_obj)
+ dist_diff = abs(new_dist - old_dist)
+
+ if angle_diff < self._angle_threshold and dist_diff < self._distance_threshold:
+ return True
+
+ return False
+
+ def update(self, obs: Observation) -> None:
+ """Update object with a new observation.
+
+ Args:
+ obs: New observation to incorporate.
+ """
+ # Update voxel manager
+ self.voxel_manager.update(obs.points, obs.robot_R, obs.robot_t)
+ self.detection_count += 1
+ self.inactive_frames = 0
+ self.last_update_time = obs.timestamp
+ self._needs_regularization = True
+
+ # Check if pose is new (not similar to existing)
+ is_similar = self._is_similar_pose(obs.robot_R, obs.robot_t)
+
+ if not is_similar:
+ # Update class statistics
+ self.class_votes[obs.label] = self.class_votes.get(obs.label, 0) + 1
+ self.confidence_scores[obs.label] = max(
+ self.confidence_scores.get(obs.label, 0), obs.confidence
+ )
+ self.points_count[obs.label] = (
+ self.points_count.get(obs.label, 0) + len(obs.points)
+ )
+
+ # Add pose
+ self.robot_poses.append({"R": obs.robot_R.copy(), "t": obs.robot_t.copy()})
+
+ # Add track ID if new
+ if obs.track_id not in self.track_ids:
+ self.track_ids.append(obs.track_id)
+
+ self._invalidate_cache()
+
+ def merge(self, other: MapObject) -> None:
+ """Merge another MapObject into this one.
+
+ Args:
+ other: Object to merge from.
+ """
+ # Merge track IDs
+ for tid in other.track_ids:
+ if tid not in self.track_ids:
+ self.track_ids.append(tid)
+
+ # Merge voxel managers
+ self.voxel_manager.merge_from(other.voxel_manager)
+ self._needs_regularization = True
+
+ # Merge class statistics
+ for label, count in other.class_votes.items():
+ self.class_votes[label] = self.class_votes.get(label, 0) + count
+ self.confidence_scores[label] = max(
+ self.confidence_scores.get(label, 0),
+ other.confidence_scores.get(label, 0),
+ )
+ self.points_count[label] = (
+ self.points_count.get(label, 0) + other.points_count.get(label, 0)
+ )
+
+ # Merge poses (only unique ones)
+ for pose in other.robot_poses:
+ if not self._is_similar_pose(pose["R"], pose["t"]):
+ self.robot_poses.append(pose)
+
+ # Update lifecycle
+ self.detection_count += other.detection_count
+ self.last_update_time = max(self.last_update_time, other.last_update_time)
+ self.is_asked_vlm = self.is_asked_vlm or other.is_asked_vlm
+
+ self._invalidate_cache()
+
+ @property
+ def label(self) -> str:
+ """Get dominant class label by weighted vote.
+
+ Returns:
+ Most likely class label based on points_count * confidence.
+ """
+ if not self.class_votes:
+ return "unknown"
+
+ # Compute weighted scores
+ weighted_scores = {}
+ for lbl in self.class_votes:
+ pts = self.points_count.get(lbl, 0)
+ conf = self.confidence_scores.get(lbl, 0)
+ weighted_scores[lbl] = pts * conf
+
+ return max(weighted_scores, key=weighted_scores.get)
+
+ @property
+ def confidence(self) -> float:
+ """Get confidence of the dominant label."""
+ return self.confidence_scores.get(self.label, 0.0)
+
+ @property
+ def centroid(self) -> Optional[np.ndarray]:
+ """Compute centroid weighted by observation angles.
+
+ Returns:
+ Centroid position (3,) or None if no valid voxels.
+ """
+ if self._needs_recompute or self._centroid is None:
+ self._compute_geometry()
+ return self._centroid
+
+ @property
+ def bbox3d_corners(self) -> Optional[np.ndarray]:
+ """Compute axis-aligned 3D bounding box corners.
+
+ Returns:
+ 8 corner points (8, 3) or None if no valid voxels.
+ """
+ if self._needs_recompute or self._bbox3d_corners is None:
+ self._compute_geometry()
+ return self._bbox3d_corners
+
+ def _compute_geometry(self) -> None:
+ """Compute cached centroid and bbox from valid voxels using Open3D.
+
+ Returns:
+ None. Updates cached centroid and bbox in-place.
+ """
+ # Get valid voxels (regularized first, then fallback to all)
+ valid_voxels = self.voxel_manager.get_valid_voxels(
+ diversity_percentile=self._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(valid_voxels) == 0:
+ valid_voxels = self.voxel_manager.get_valid_voxels(
+ diversity_percentile=self._diversity_percentile,
+ use_regularized=False,
+ )
+
+ if len(valid_voxels) < 4:
+ self._centroid = compute_centroid(valid_voxels) if len(valid_voxels) > 0 else None
+ self._bbox3d_corners = None
+ else:
+ pcd = create_pointcloud(valid_voxels)
+ obb = pcd.get_minimal_oriented_bounding_box()
+ self._centroid = np.asarray(obb.center)
+ half_extent = 0.5 * np.asarray(obb.extent)
+ local_corners = np.array(
+ [
+ [-half_extent[0], -half_extent[1], -half_extent[2]],
+ [half_extent[0], -half_extent[1], -half_extent[2]],
+ [half_extent[0], half_extent[1], -half_extent[2]],
+ [-half_extent[0], half_extent[1], -half_extent[2]],
+ [-half_extent[0], -half_extent[1], half_extent[2]],
+ [half_extent[0], -half_extent[1], half_extent[2]],
+ [half_extent[0], half_extent[1], half_extent[2]],
+ [-half_extent[0], half_extent[1], half_extent[2]],
+ ],
+ dtype=float,
+ )
+ self._bbox3d_corners = local_corners @ np.asarray(obb.R).T + np.asarray(obb.center)
+
+ self._needs_recompute = False
+
+ def regularize_shape(self, percentile: float = 0.85) -> None:
+ """Apply vote-weighted regularization with light outlier removal."""
+ if not self._needs_regularization:
+ return
+ voxels = self.voxel_manager.voxels
+ if len(voxels) < 3:
+ self.voxel_manager.regularized_mask = np.ones(len(voxels), dtype=bool)
+ self._needs_regularization = False
+ self._invalidate_cache()
+ return
+
+ weights = np.sum(self.voxel_manager.observation_angles, axis=1) * self.voxel_manager.votes
+ keep_ratio = percentile if percentile is not None else 1.0
+ if keep_ratio >= 1.0:
+ keep_mask = np.ones(voxels.shape[0], dtype=bool)
+ else:
+ cutoff = float(np.quantile(weights, 1.0 - keep_ratio))
+ keep_mask = weights >= cutoff
+
+ if not np.any(keep_mask):
+ keep_mask = np.ones(voxels.shape[0], dtype=bool)
+
+ selected_indices = np.nonzero(keep_mask)[0]
+ points_sel = voxels[selected_indices]
+ weights_sel = weights[selected_indices]
+ if points_sel.shape[0] >= 3:
+ voxel_size = self._regularize_voxel_size
+ keys = np.floor(points_sel / voxel_size).astype(np.int32)
+ voxel_map: dict[tuple[int, int, int], list[int]] = {}
+ for idx, key in enumerate(map(tuple, keys)):
+ voxel_map.setdefault(key, []).append(idx)
+
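+            # Flood-fill over occupied voxel cells (26-connectivity) to extract connected components.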
+ neighbor_offsets = [
+ (dx, dy, dz)
+ for dx in (-1, 0, 1)
+ for dy in (-1, 0, 1)
+ for dz in (-1, 0, 1)
+ if not (dx == 0 and dy == 0 and dz == 0)
+ ]
+ visited: set[tuple[int, int, int]] = set()
+ components: list[tuple[np.ndarray, float, np.ndarray]] = []
+ for key in voxel_map:
+ if key in visited:
+ continue
+ stack = [key]
+ visited.add(key)
+ comp_indices: list[int] = []
+ while stack:
+ current = stack.pop()
+ comp_indices.extend(voxel_map[current])
+ for dx, dy, dz in neighbor_offsets:
+ neighbor = (current[0] + dx, current[1] + dy, current[2] + dz)
+ if neighbor in voxel_map and neighbor not in visited:
+ visited.add(neighbor)
+ stack.append(neighbor)
+
+ comp_idx = np.array(comp_indices, dtype=int)
+ comp_weight = float(np.sum(weights_sel[comp_idx]))
+ comp_centroid = compute_centroid(points_sel[comp_idx])
+ components.append((comp_idx, comp_weight, comp_centroid))
+
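+            # Keep the heaviest component plus any component whose centroid lies within
+            # dist_thresh of it, so thin attached parts survive while distant speckle is dropped.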
+ if components:
+ main_idx = int(np.argmax([c[1] for c in components]))
+ main_points = points_sel[components[main_idx][0]]
+ main_centroid = components[main_idx][2]
+ extent = np.max(main_points, axis=0) - np.min(main_points, axis=0)
+ dist_thresh = max(0.3, 0.5 * float(np.max(extent)))
+
+ keep_sel_mask = np.zeros(points_sel.shape[0], dtype=bool)
+ for comp_idx, _, comp_centroid in components:
+ if np.linalg.norm(comp_centroid - main_centroid) <= dist_thresh:
+ keep_sel_mask[comp_idx] = True
+ if np.any(keep_sel_mask):
+ keep_mask = np.zeros(voxels.shape[0], dtype=bool)
+ keep_mask[selected_indices[keep_sel_mask]] = True
+
+ self.voxel_manager.regularized_mask = keep_mask
+
+ self._needs_regularization = False
+ self._invalidate_cache()
diff --git a/semantic_mapping/semantic_mapping/mapper.py b/semantic_mapping/semantic_mapping/mapper.py
new file mode 100644
index 0000000..cc67cf1
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/mapper.py
@@ -0,0 +1,275 @@
+"""Object map manager for semantic mapping."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
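+    # Mode-dependent defaults: bag playback implies sim time; bag and sim sources publish compressed images.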
+import threading
+import time
+import uuid
+
+from .utils.matching import compute_bbox, compute_iou_3d_with_ratios, find_best_match
+from .map_object import MapObject, ObjectStatus, Observation
+
+
+@dataclass
+class UpdateStats:
+ """Summary of updates applied in a mapper step."""
+
+ created: int = 0
+ updated: int = 0
+ merged: int = 0
+ deleted: int = 0
+ promoted: int = 0
+ total: int = 0
+
+
+class ObjectMapper:
+ """Manages the semantic object map with two-tier state management."""
+
+ def __init__(
+ self,
+ voxel_size: float = 0.03,
+ regularize_voxel_size: float = 0.06,
+ confidence_threshold: float = 0.30,
+ distance_threshold: float = 0.5,
+ iou_threshold: float = 0.2,
+ min_detections_for_permanent: int = 3,
+ pending_ttl_s: float = 10.0,
+ num_angle_bins: int = 15,
+ ) -> None:
+ """Initialize the object mapper and its thresholds.
+
+ Args:
+ voxel_size: Voxel grid cell size in meters.
+ regularize_voxel_size: Voxel size for regularization clustering.
+ confidence_threshold: Minimum detection confidence to accept.
+ distance_threshold: Max centroid distance for spatial matching.
+ iou_threshold: IoU threshold for merge checks.
+ min_detections_for_permanent: Detections before promotion.
+ pending_ttl_s: Time-to-live for pending objects.
+ num_angle_bins: Observation angle bins for voxel manager.
+ """
+ self._pending_objects: dict[str, MapObject] = {}
+ self._permanent_objects: dict[str, MapObject] = {}
+ self._track_id_map: dict[int, str] = {}
+ self._lock = threading.RLock()
+
+ self._voxel_size = voxel_size
+ self._regularize_voxel_size = regularize_voxel_size
+ self._confidence_threshold = confidence_threshold
+ self._distance_threshold = distance_threshold
+ self._iou_threshold = iou_threshold
+ self._min_detections_for_permanent = min_detections_for_permanent
+ self._pending_ttl_s = pending_ttl_s
+ self._num_angle_bins = num_angle_bins
+
+ def update(self, observations: list[Observation], target_object: str | None = None) -> UpdateStats:
+ """Update the map with new observations.
+
+ Args:
+ observations: List of new observations to integrate.
+ target_object: Optional label to protect from pruning.
+
+ Returns:
+ UpdateStats summarizing changes applied.
+ """
+ stats = UpdateStats()
+ now = max((obs.timestamp for obs in observations), default=time.time())
+ updated_ids: set[str] = set()
+ regularized_ids: set[str] = set()
+
+ with self._lock:
+ for obs in observations:
+ if obs.confidence < self._confidence_threshold:
+ continue
+
+ obj = self._match_observation(obs)
+ if obj is None:
+ obj = self._create_object(obs)
+ self._pending_objects[obj.object_id] = obj
+ stats.created += 1
+ else:
+ obj.update(obs)
+ stats.updated += 1
+
+ updated_ids.add(obj.object_id)
+ for track_id in obj.track_ids:
+ self._track_id_map[track_id] = obj.object_id
+
+ if obj.status == ObjectStatus.PENDING:
+ if self._check_promotion(obj):
+ stats.promoted += 1
+
+ if obj.object_id not in regularized_ids and obj.inactive_frames == 0:
+ obj.regularize_shape(percentile=obj._diversity_percentile)
+ regularized_ids.add(obj.object_id)
+
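+            # Track staleness for anything not refreshed this cycle.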
+ for obj in list(self._pending_objects.values()) + list(self._permanent_objects.values()):
+ if obj.object_id not in updated_ids:
+ obj.inactive_frames += 1
+
+ deleted = self._prune_inactive(target_object, now)
+ stats.deleted = len(deleted)
+ stats.merged += self.merge_objects()
+ stats.total = len(self._permanent_objects)
+
+ return stats
+
+ def _match_observation(self, obs: Observation) -> MapObject | None:
+ """Match an observation to an existing object."""
+ if obs.track_id in self._track_id_map:
+ obj_id = self._track_id_map[obs.track_id]
+ if obj_id in self._permanent_objects:
+ return self._permanent_objects[obj_id]
+ if obj_id in self._pending_objects:
+ return self._pending_objects[obj_id]
+
+ candidates = list(self._permanent_objects.values()) + list(self._pending_objects.values())
+ return find_best_match(
+ obs=obs,
+ candidates=candidates,
+ distance_threshold=self._distance_threshold,
+ iou_threshold=self._iou_threshold,
+ )
+
+ def _create_object(self, obs: Observation) -> MapObject:
+ """Create a new MapObject from an observation."""
+ obj = MapObject.from_observation(
+ object_id=str(uuid.uuid4()),
+ obs=obs,
+ voxel_size=self._voxel_size,
+ regularize_voxel_size=self._regularize_voxel_size,
+ num_angle_bins=self._num_angle_bins,
+ )
+ self._track_id_map[obs.track_id] = obj.object_id
+ return obj
+
+ def _check_promotion(self, obj: MapObject) -> bool:
+ """Promote object from pending to permanent if threshold met."""
+ if obj.status != ObjectStatus.PENDING:
+ return False
+ if obj.detection_count < self._min_detections_for_permanent:
+ return False
+ if obj.object_id in self._pending_objects:
+ self._pending_objects.pop(obj.object_id)
+ obj.status = ObjectStatus.PERMANENT
+ self._permanent_objects[obj.object_id] = obj
+ return True
+
+ def merge_objects(self) -> int:
+ """Merge overlapping permanent objects.
+
+ Returns:
+ Number of merges performed.
+ """
+ merged = 0
+ objs = list(self._permanent_objects.values())
+ i = 0
+ while i < len(objs):
+ obj = objs[i]
+ j = i + 1
+ while j < len(objs):
+ other = objs[j]
+                should_merge = False
+
+                # Compute the centroid distance up front: it is needed for the
+                # distance-based merge check below even when track IDs already overlap.
+                if obj.centroid is None or other.centroid is None:
+                    j += 1
+                    continue
+                dist = float(((obj.centroid - other.centroid) ** 2).sum() ** 0.5)
+
+                if set(obj.track_ids) & set(other.track_ids):
+                    should_merge = True
+                elif dist > self._distance_threshold:
+                    j += 1
+                    continue
+
+ obj_points = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(obj_points) == 0:
+ obj_points = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ other_points = other.voxel_manager.get_valid_voxels(
+ diversity_percentile=other._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(other_points) == 0:
+ other_points = other.voxel_manager.get_valid_voxels(
+ diversity_percentile=other._diversity_percentile,
+ use_regularized=False,
+ )
+
+ obj_bbox = compute_bbox(obj_points) if len(obj_points) >= 3 else None
+ other_bbox = compute_bbox(other_points) if len(other_points) >= 3 else None
+ obj_corners = obj_bbox[0] if obj_bbox is not None else None
+ other_corners = other_bbox[0] if other_bbox is not None else None
+ obj_extent = obj_bbox[1] if obj_bbox is not None else None
+ other_extent = other_bbox[1] if other_bbox is not None else None
+
+ dist_merge = dist < 0.25
+ if obj_extent is not None and other_extent is not None:
+ dist_thresh = (((obj_extent / 2 + other_extent / 2) / 2) ** 2).sum() ** 0.5
+ dist_merge = dist_merge or dist < dist_thresh * 0.5
+
+ if dist_merge:
+ should_merge = True
+ elif obj_corners is not None and other_corners is not None:
+ iou, ratio_obj, ratio_other = compute_iou_3d_with_ratios(
+ obj_corners, other_corners
+ )
+ if (
+ iou > self._iou_threshold
+ or ratio_obj > 0.4
+ or ratio_other > 0.4
+ ):
+ should_merge = True
+
+ if should_merge:
+ obj.merge(other)
+ for track_id in other.track_ids:
+ self._track_id_map[track_id] = obj.object_id
+ self._permanent_objects.pop(other.object_id, None)
+ objs.pop(j)
+ merged += 1
+ else:
+ j += 1
+ i += 1
+
+ return merged
+
+ def _prune_inactive(self, target_object: str | None, now: float) -> list[MapObject]:
+ """Remove inactive pending/permanent objects."""
+ deleted: list[MapObject] = []
+
+ for obj_id, obj in list(self._pending_objects.items()):
+ if (now - obj.last_update_time) > self._pending_ttl_s:
+ deleted.append(self._pending_objects.pop(obj_id))
+
+ for obj_id, obj in list(self._permanent_objects.items()):
+ if target_object and obj.label == target_object:
+ continue
+ if obj.status == ObjectStatus.DELETED:
+ deleted.append(self._permanent_objects.pop(obj_id))
+
+ for obj in deleted:
+ for track_id in obj.track_ids:
+ if self._track_id_map.get(track_id) == obj.object_id:
+ self._track_id_map.pop(track_id, None)
+
+ return deleted
+
+ def get_objects(self, include_pending: bool = False) -> list[MapObject]:
+ """Return current objects.
+
+ Args:
+ include_pending: If True, include pending objects.
+
+ Returns:
+ List of MapObject instances.
+ """
+ if include_pending:
+ return list(self._permanent_objects.values()) + list(self._pending_objects.values())
+ return list(self._permanent_objects.values())
diff --git a/semantic_mapping/semantic_mapping/mapping_node.py b/semantic_mapping/semantic_mapping/mapping_node.py
new file mode 100644
index 0000000..16b5649
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/mapping_node.py
@@ -0,0 +1,861 @@
+"""ROS2 mapping node for semantic 3D object mapping."""
+
+from __future__ import annotations
+
+from collections import deque
+from pathlib import Path
+import time
+
+import numpy as np
+import yaml
+from cv_bridge import CvBridge
+import cv2
+import rclpy
+from rclpy.node import Node
+from rclpy.qos import QoSProfile, ReliabilityPolicy
+from sensor_msgs.msg import CompressedImage, PointCloud2
+from sensor_msgs_py import point_cloud2
+from nav_msgs.msg import Odometry
+from std_msgs.msg import Header
+from visualization_msgs.msg import MarkerArray
+from scipy.spatial.transform import Rotation, Slerp
+
+from detect_anything.msg import DetectionResult
+from semantic_mapping.msg import ObjectType, ObjectNode, TargetObjectInstruction, ViewpointRep
+
+from vector_perception_utils.pointcloud_utils import (
+ project_pointcloud_to_image,
+ segment_pointclouds_from_projection,
+ transform_segmented_clouds_to_world,
+)
+from vector_perception_utils.transform_utils import discretize_angles
+
+from .map_object import Observation
+from .utils.map_object_utils import (
+ create_bbox_markers,
+ create_delete_markers,
+ create_label_markers,
+ create_object_pointcloud,
+ map_object_to_object_node,
+)
+from .mapper import ObjectMapper
+from .utils.storage import ImageSaveQueue
+
+
+class SemanticMappingNode(Node):
+ """ROS2 node for semantic object mapping.
+
+ This node fuses 2D detections and LiDAR point clouds into persistent 3D objects:
+ - Buffers detections, clouds, and odometry with timestamps
+ - Interpolates odometry at detection time (position + orientation)
+ - Segments point clouds by detection masks and projects to world frame
+ - Updates the object mapper and publishes ROS messages
+ - Queues cropped images for VLM queries
+ """
+
+ def __init__(self) -> None:
+ """Initialize the mapping node, parameters, buffers, and ROS I/O."""
+ super().__init__("semantic_mapping_node")
+ self._bridge = CvBridge()
+
+ self.declare_parameter("detection_topic", "/detection_result")
+ self.declare_parameter("cloud_topic", "/registered_scan")
+ self.declare_parameter("odom_topic", "/state_estimation")
+ self.declare_parameter("viewpoint_topic", "/viewpoint_rep_header")
+ self.declare_parameter("object_type_answer_topic", "/object_type_answer")
+ self.declare_parameter("object_type_query_topic", "/object_type_query")
+ self.declare_parameter("target_object_instruction_topic", "/target_object_instruction")
+ self.declare_parameter("object_nodes_topic", "/object_nodes")
+ self.declare_parameter("obj_points_topic", "/obj_points")
+ self.declare_parameter("obj_boxes_topic", "/obj_boxes")
+ self.declare_parameter("obj_labels_topic", "/obj_labels")
+ self.declare_parameter("map_frame", "map")
+
+ self.declare_parameter("confidence_threshold", 0.3)
+ self.declare_parameter("voxel_size", 0.03)
+ self.declare_parameter("regularize_voxel_size", 0.06)
+ self.declare_parameter("distance_threshold", 0.5)
+ self.declare_parameter("iou_threshold", 0.2)
+ self.declare_parameter("min_detections_for_permanent", 3)
+ self.declare_parameter("pending_ttl_s", 10.0)
+ self.declare_parameter("num_angle_bins", 15)
+
+ self.declare_parameter("detection_linear_state_time_bias", 0.0)
+ self.declare_parameter("detection_angular_state_time_bias", 0.0)
+ self.declare_parameter("cloud_window_before", 0.5)
+ self.declare_parameter("cloud_window_after", 0.1)
+
+ self.declare_parameter("projection", "equirect")
+ self.declare_parameter("axis_mode", "xz")
+ self.declare_parameter("image_width", 1920)
+ self.declare_parameter("image_height", 640)
+ self.declare_parameter("depth_filter", False)
+ self.declare_parameter("depth_filter_margin", 0.15)
+ self.declare_parameter("depth_jump_threshold", 0.3)
+ self.declare_parameter("sensor_to_camera_tf", "config/sensor_to_camera_tf.yaml")
+ self.declare_parameter("mask_erode_kernel", 3)
+ self.declare_parameter("mask_erode_iterations", 2)
+ self.declare_parameter("max_cloud_distance", 8.0)
+ self.declare_parameter("visualization_max_objects", 50)
+ self.declare_parameter("ground_filter", True)
+ self.declare_parameter("ground_radius", 2.0)
+ self.declare_parameter("ground_z_clearance", 0.0)
+
+ self.declare_parameter("target_object", "")
+ self.declare_parameter("save_png", False)
+ self.declare_parameter("output_dir", "output/object_images")
+
+ detection_topic = str(self.get_parameter("detection_topic").value)
+ cloud_topic = str(self.get_parameter("cloud_topic").value)
+ odom_topic = str(self.get_parameter("odom_topic").value)
+ viewpoint_topic = str(self.get_parameter("viewpoint_topic").value)
+ object_type_answer_topic = str(self.get_parameter("object_type_answer_topic").value)
+ object_type_query_topic = str(self.get_parameter("object_type_query_topic").value)
+ target_object_instruction_topic = str(self.get_parameter("target_object_instruction_topic").value)
+ object_nodes_topic = str(self.get_parameter("object_nodes_topic").value)
+ obj_points_topic = str(self.get_parameter("obj_points_topic").value)
+ obj_boxes_topic = str(self.get_parameter("obj_boxes_topic").value)
+ obj_labels_topic = str(self.get_parameter("obj_labels_topic").value)
+ self._map_frame = str(self.get_parameter("map_frame").value)
+
+ self._confidence_threshold = float(self.get_parameter("confidence_threshold").value)
+ self._detection_linear_bias = float(self.get_parameter("detection_linear_state_time_bias").value)
+ self._detection_angular_bias = float(self.get_parameter("detection_angular_state_time_bias").value)
+ self._cloud_window_before = float(self.get_parameter("cloud_window_before").value)
+ self._cloud_window_after = float(self.get_parameter("cloud_window_after").value)
+
+ self._projection = str(self.get_parameter("projection").value)
+ self._axis_mode = str(self.get_parameter("axis_mode").value)
+ self._image_width = int(self.get_parameter("image_width").value)
+ self._image_height = int(self.get_parameter("image_height").value)
+ self._depth_filter = bool(self.get_parameter("depth_filter").value)
+ self._depth_filter_margin = float(self.get_parameter("depth_filter_margin").value)
+ self._depth_jump_threshold = float(self.get_parameter("depth_jump_threshold").value)
+ self._sensor_to_camera = self._load_sensor_to_camera(
+ str(self.get_parameter("sensor_to_camera_tf").value)
+ )
+ self._mask_erode_kernel_size = max(1, int(self.get_parameter("mask_erode_kernel").value))
+ self._mask_erode_iterations = max(0, int(self.get_parameter("mask_erode_iterations").value))
+ self._mask_erode_kernel = cv2.getStructuringElement(
+ cv2.MORPH_RECT,
+ (self._mask_erode_kernel_size, self._mask_erode_kernel_size),
+ )
+ self._max_cloud_distance = float(self.get_parameter("max_cloud_distance").value)
+ self._visualization_max_objects = int(
+ self.get_parameter("visualization_max_objects").value
+ )
+ self._ground_filter = bool(self.get_parameter("ground_filter").value)
+ self._ground_radius = float(self.get_parameter("ground_radius").value)
+ self._ground_z_clearance = float(self.get_parameter("ground_z_clearance").value)
+ self._ground_percentile = 0.05
+ self._ground_min_points = 80
+
+ self._target_object = str(self.get_parameter("target_object").value)
+ self._save_png = bool(self.get_parameter("save_png").value)
+ self._output_dir = Path(str(self.get_parameter("output_dir").value))
+
+ self._mapper = ObjectMapper(
+ voxel_size=float(self.get_parameter("voxel_size").value),
+ regularize_voxel_size=float(self.get_parameter("regularize_voxel_size").value),
+ confidence_threshold=self._confidence_threshold,
+ distance_threshold=float(self.get_parameter("distance_threshold").value),
+ iou_threshold=float(self.get_parameter("iou_threshold").value),
+ min_detections_for_permanent=int(self.get_parameter("min_detections_for_permanent").value),
+ pending_ttl_s=float(self.get_parameter("pending_ttl_s").value),
+ num_angle_bins=int(self.get_parameter("num_angle_bins").value),
+ )
+
+ self._image_saver = ImageSaveQueue(
+ output_dir=self._output_dir,
+ max_queue_size=100,
+ save_png=self._save_png,
+ )
+ self._image_saver.start()
+
+ self._cloud_buffer: deque[tuple[float, np.ndarray]] = deque(maxlen=200)
+ self._odom_buffer: deque[tuple[float, dict]] = deque(maxlen=400)
+ self._detection_buffer: deque[DetectionResult] = deque(maxlen=100)
+ self._viewpoint_queue: deque[tuple[float, int]] = deque(maxlen=50)
+ self._processed_viewpoints: set[float] = set()
+ self._published_ids: set[int] = set()
+ self._mapping_busy = False
+
+ qos_cloud = QoSProfile(depth=10, reliability=ReliabilityPolicy.BEST_EFFORT)
+ qos_default = QoSProfile(depth=10, reliability=ReliabilityPolicy.RELIABLE)
+
+ self.create_subscription(DetectionResult, detection_topic, self._detection_callback, qos_default)
+ self.create_subscription(PointCloud2, cloud_topic, self._cloud_callback, qos_cloud)
+ self.create_subscription(Odometry, odom_topic, self._odom_callback, qos_cloud)
+ self.create_subscription(ViewpointRep, viewpoint_topic, self._viewpoint_callback, qos_default)
+ self.create_subscription(ObjectType, object_type_answer_topic, self._object_type_answer_callback, qos_default)
+ self.create_subscription(
+ TargetObjectInstruction, target_object_instruction_topic, self._target_object_callback, qos_default
+ )
+
+ self._object_type_query_pub = self.create_publisher(ObjectType, object_type_query_topic, qos_default)
+ self._object_nodes_pub = self.create_publisher(ObjectNode, object_nodes_topic, qos_default)
+ self._obj_points_pub = self.create_publisher(PointCloud2, obj_points_topic, qos_default)
+ self._obj_boxes_pub = self.create_publisher(MarkerArray, obj_boxes_topic, qos_default)
+ self._obj_labels_pub = self.create_publisher(MarkerArray, obj_labels_topic, qos_default)
+
+ self.create_timer(0.5, self._mapping_callback)
+
+ def destroy_node(self) -> bool:
+ """Stop background workers and destroy the ROS node."""
+ self._image_saver.stop()
+ return super().destroy_node()
+
+ def _detection_callback(self, msg: DetectionResult) -> None:
+ """Buffer detection results for later synchronization.
+
+ Args:
+ msg: DetectionResult message with bboxes, labels, and masks.
+ """
+ self._detection_buffer.append(msg)
+
+ def _cloud_callback(self, msg: PointCloud2) -> None:
+ """Buffer incoming point clouds with timestamps.
+
+ Args:
+ msg: PointCloud2 message in the sensor frame.
+ """
+ stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9
+ points = point_cloud2.read_points_numpy(msg, field_names=("x", "y", "z"))
+ self._cloud_buffer.append((stamp, points))
+
+ def _odom_callback(self, msg: Odometry) -> None:
+ """Buffer odometry data for interpolation.
+
+ Args:
+ msg: Odometry message containing pose and twist.
+ """
+ stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9
+ odom = {
+ "position": np.array(
+ [msg.pose.pose.position.x, msg.pose.pose.position.y, msg.pose.pose.position.z]
+ ),
+ "orientation": np.array(
+ [
+ msg.pose.pose.orientation.x,
+ msg.pose.pose.orientation.y,
+ msg.pose.pose.orientation.z,
+ msg.pose.pose.orientation.w,
+ ]
+ ),
+ "linear_velocity": np.array(
+ [msg.twist.twist.linear.x, msg.twist.twist.linear.y, msg.twist.twist.linear.z]
+ ),
+ "angular_velocity": np.array(
+ [msg.twist.twist.angular.x, msg.twist.twist.angular.y, msg.twist.twist.angular.z]
+ ),
+ }
+ self._odom_buffer.append((stamp, odom))
+
+ def _viewpoint_callback(self, msg: ViewpointRep) -> None:
+ """Queue viewpoint trigger requests.
+
+ Args:
+ msg: ViewpointRep message with timestamp and viewpoint id.
+ """
+ stamp = msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9
+ if stamp in self._processed_viewpoints:
+ return
+ self._viewpoint_queue.append((stamp, msg.viewpoint_id))
+
+ def _target_object_callback(self, msg: TargetObjectInstruction) -> None:
+ """Update target object label used for VLM query decisions.
+
+ Args:
+ msg: TargetObjectInstruction message.
+ """
+ self._target_object = msg.target_object
+
+ def _object_type_answer_callback(self, msg: ObjectType) -> None:
+ """Apply VLM relabeling updates to tracked objects.
+
+ Args:
+ msg: ObjectType message with final label for a track id.
+ """
+ for obj in self._mapper.get_objects(include_pending=True):
+ if msg.object_id in obj.track_ids:
+ obj.class_votes[msg.final_label] = obj.class_votes.get(msg.final_label, 0) + 50
+ obj.confidence_scores[msg.final_label] = max(
+ obj.confidence_scores.get(msg.final_label, 0.0), 1.0
+ )
+ obj.points_count[msg.final_label] = obj.points_count.get(msg.final_label, 0)
+ obj.updated_by_vlm = True
+ obj._invalidate_cache()
+ break
+
+ def _mapping_callback(self) -> None:
+ """Main mapping loop: synchronize inputs, update map, and publish outputs."""
+ if self._mapping_busy:
+ return
+ if len(self._detection_buffer) < 1 or len(self._cloud_buffer) < 1 or len(self._odom_buffer) < 2:
+ return
+
+ self._mapping_busy = True
+ start_time = time.perf_counter()
+ try:
+ detection_msg, viewpoint_stamp, viewpoint_id = self._select_detection()
+ if detection_msg is None:
+ return
+
+ detection_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9
+ image = self._bridge.compressed_imgmsg_to_cv2(detection_msg.image, desired_encoding="bgr8")
+
+ odom = self._interpolate_odom(detection_stamp)
+ if odom is None:
+ return
+
+ neighbor_cloud = self._collect_cloud_window(detection_stamp)
+ if neighbor_cloud is None:
+ return
+
+ observations = self._build_observations(
+ detection_msg=detection_msg,
+ image=image,
+ odom=odom,
+ cloud=neighbor_cloud,
+ )
+ if len(observations) == 0:
+ return
+
+ stats = self._mapper.update(observations, target_object=self._target_object)
+ self.get_logger().info(
+ "Mapping stats:\n"
+ f" total: {stats.total}\n"
+ f" deleted: {stats.deleted}"
+ )
+ self._update_observation_angles(observations)
+ self._update_best_images(observations, image)
+ self._publish_map(detection_stamp, viewpoint_id)
+ self._publish_object_type_queries()
+
+ if viewpoint_stamp is not None:
+ self._processed_viewpoints.add(viewpoint_stamp)
+ finally:
+ elapsed_ms = (time.perf_counter() - start_time) * 1000.0
+ self.get_logger().info(f"Mapping loop time: {elapsed_ms:.2f} ms")
+ self._mapping_busy = False
+
+ def _select_detection(self) -> tuple[DetectionResult | None, float | None, int]:
+ """Pick the detection to process, optionally aligned to a viewpoint.
+
+ Returns:
+ Tuple of (detection_msg, viewpoint_stamp, viewpoint_id).
+ """
+ viewpoint_stamp = None
+ viewpoint_id = -1
+
+ if len(self._viewpoint_queue) > 0:
+ stamp, vid = self._viewpoint_queue[0]
+ detections = list(self._detection_buffer)
+ stamps = [msg.header.stamp.sec + msg.header.stamp.nanosec / 1e9 for msg in detections]
+ diffs = np.abs(np.array(stamps) - stamp)
+ idx = int(np.argmin(diffs))
+ if diffs[idx] <= 1.0:
+ detection_msg = detections[idx]
+ viewpoint_stamp = stamp
+ viewpoint_id = vid
+ self._viewpoint_queue.popleft()
+ return detection_msg, viewpoint_stamp, viewpoint_id
+ self._viewpoint_queue.popleft()
+
+        # Prefer the second-newest detection so its post-stamp cloud window is
+        # more likely to be fully buffered.
+        idx = -2 if len(self._detection_buffer) > 1 else -1
+ return self._detection_buffer[idx], viewpoint_stamp, viewpoint_id
+
+ def _interpolate_odom(self, detection_stamp: float) -> dict | None:
+ """Interpolate odometry at the detection timestamp.
+
+ Args:
+ detection_stamp: Timestamp in seconds.
+
+ Returns:
+ Interpolated odometry dict or None if interpolation is not possible.
+ """
+ target_stamp = detection_stamp + self._detection_linear_bias
+ angular_stamp = detection_stamp + self._detection_angular_bias
+ stamps = [s for s, _ in self._odom_buffer]
+ if len(stamps) < 2:
+ return None
+
+ if target_stamp < stamps[0] or target_stamp > stamps[-1]:
+ return None
+ if angular_stamp < stamps[0] or angular_stamp > stamps[-1]:
+ return None
+
+ right_idx = next(i for i, s in enumerate(stamps) if s >= target_stamp)
+ left_idx = max(right_idx - 1, 0)
+ angular_right_idx = next(i for i, s in enumerate(stamps) if s >= angular_stamp)
+ angular_left_idx = max(angular_right_idx - 1, 0)
+
+ left_stamp, left_odom = self._odom_buffer[left_idx]
+ right_stamp, right_odom = self._odom_buffer[right_idx]
+ angular_left_stamp, angular_left_odom = self._odom_buffer[angular_left_idx]
+ angular_right_stamp, angular_right_odom = self._odom_buffer[angular_right_idx]
+ denom = right_stamp - left_stamp
+ ratio = (target_stamp - left_stamp) / denom if denom > 0 else 0.5
+ angular_denom = angular_right_stamp - angular_left_stamp
+ angular_ratio = (angular_stamp - angular_left_stamp) / angular_denom if angular_denom > 0 else 0.5
+
+ camera_odom = {}
+ camera_odom["position"] = right_odom["position"] * ratio + left_odom["position"] * (1 - ratio)
+ camera_odom["linear_velocity"] = (
+ right_odom["linear_velocity"] * ratio + left_odom["linear_velocity"] * (1 - ratio)
+ )
+ rotations = Rotation.from_quat(
+ [angular_left_odom["orientation"], angular_right_odom["orientation"]]
+ )
+ slerp = Slerp([0, 1], rotations)
+ camera_odom["orientation"] = slerp(angular_ratio).as_quat()
+ camera_odom["angular_velocity"] = (
+ angular_right_odom["angular_velocity"] * angular_ratio
+ + angular_left_odom["angular_velocity"] * (1 - angular_ratio)
+ )
+ return camera_odom
+
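
The interpolation above blends position and linear velocity linearly and uses SciPy's `Slerp` for orientation. A minimal standalone sketch of that blending with made-up stamps and poses (values are illustrative only):

```python
import numpy as np
from scipy.spatial.transform import Rotation, Slerp

# Hypothetical odometry samples bracketing a detection at t = 10.3 s.
left_t, right_t = 10.0, 10.5
left_pos, right_pos = np.array([1.0, 0.0, 0.0]), np.array([1.5, 0.0, 0.0])
left_q = Rotation.from_euler("z", 0.0, degrees=True).as_quat()
right_q = Rotation.from_euler("z", 30.0, degrees=True).as_quat()

ratio = (10.3 - left_t) / (right_t - left_t)            # 0.6
position = right_pos * ratio + left_pos * (1 - ratio)   # [1.3, 0, 0]
slerp = Slerp([0, 1], Rotation.from_quat([left_q, right_q]))
orientation = slerp(ratio).as_quat()                    # yaw of roughly 18 degrees
```
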
+ def _collect_cloud_window(self, detection_stamp: float) -> np.ndarray | None:
+ """Aggregate point clouds around a detection timestamp.
+
+ Args:
+ detection_stamp: Timestamp in seconds.
+
+ Returns:
+ Concatenated point cloud or None if no data in window.
+ """
+ start = detection_stamp - self._cloud_window_before
+ end = detection_stamp + self._cloud_window_after
+ clouds = [pts for stamp, pts in self._cloud_buffer if start <= stamp <= end]
+ if len(clouds) == 0:
+ return None
+ return np.concatenate(clouds, axis=0)
+
+ def _compute_ground_z_threshold(
+ self,
+ points: np.ndarray,
+ robot_xy: np.ndarray,
+ ) -> float | None:
+ if not self._ground_filter:
+ return None
+ if points.shape[0] < self._ground_min_points:
+ return None
+ if self._ground_radius <= 0.0:
+ return None
+ if not 0.0 < self._ground_percentile < 1.0:
+ return None
+
+ diffs = points[:, :2] - robot_xy
+ dist2 = diffs[:, 0] ** 2 + diffs[:, 1] ** 2
+ radius2 = self._ground_radius ** 2
+ mask = dist2 <= radius2
+ if np.count_nonzero(mask) < self._ground_min_points:
+ return None
+
+ z_vals = points[mask, 2]
+ z_thresh = float(np.percentile(z_vals, self._ground_percentile * 100.0))
+ return z_thresh + self._ground_z_clearance
+
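
The ground filter above estimates the floor from a low z-percentile of points near the robot and drops anything below that height plus a clearance. A tiny numeric sketch of the rule, with made-up samples:

```python
import numpy as np

rng = np.random.default_rng(0)
# Hypothetical world-frame z values within ground_radius of the robot:
# mostly floor returns near z = 0, plus some object points higher up.
z_vals = np.concatenate([rng.normal(0.02, 0.01, 500), rng.normal(0.60, 0.05, 100)])

ground_percentile = 0.05
ground_z_clearance = 0.05

z_floor = float(np.percentile(z_vals, ground_percentile * 100.0))
z_threshold = z_floor + ground_z_clearance
keep = z_vals > z_threshold   # object points survive, floor returns are removed
print(z_floor, z_threshold, int(keep.sum()))
```
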
+ def _build_observations(
+ self,
+ detection_msg: DetectionResult,
+ image: np.ndarray,
+ odom: dict,
+ cloud: np.ndarray,
+ ) -> list[Observation]:
+ """Build Observation objects from detections and segmented point clouds.
+
+ Args:
+ detection_msg: DetectionResult with bboxes, labels, and masks.
+ image: RGB image used for masks and cropping.
+ odom: Interpolated odometry dict (position + orientation).
+ cloud: Aggregated point cloud in sensor frame.
+
+ Returns:
+ List of Observation instances for object updates.
+ """
+ det_stamp = detection_msg.header.stamp.sec + detection_msg.header.stamp.nanosec / 1e9
+ image_shape = image.shape[:2]
+
+ masks = []
+ labels = []
+ confidences = []
+ track_ids = []
+
+ for i in range(len(detection_msg.track_id)):
+ conf = float(detection_msg.confidence[i])
+ if conf < self._confidence_threshold:
+ continue
+ x1 = int(detection_msg.x1[i])
+ y1 = int(detection_msg.y1[i])
+ x2 = int(detection_msg.x2[i])
+ y2 = int(detection_msg.y2[i])
+ x1 = max(0, min(x1, image_shape[1] - 1))
+ x2 = max(0, min(x2, image_shape[1]))
+ y1 = max(0, min(y1, image_shape[0] - 1))
+ y2 = max(0, min(y2, image_shape[0]))
+ if x2 <= x1 or y2 <= y1:
+ continue
+
+ full_mask = np.zeros(image_shape, dtype=bool)
+ if i < len(detection_msg.masks):
+ mask_msg: CompressedImage = detection_msg.masks[i]
+ mask_crop = self._bridge.compressed_imgmsg_to_cv2(mask_msg)
+ if mask_crop.ndim == 3:
+ mask_crop = cv2.cvtColor(mask_crop, cv2.COLOR_BGR2GRAY)
+ mask_bool = mask_crop.astype(bool)
+ if self._mask_erode_iterations > 0:
+ mask_eroded = cv2.erode(
+ mask_bool.astype(np.uint8),
+ self._mask_erode_kernel,
+ iterations=self._mask_erode_iterations,
+ )
+ mask_bool = mask_eroded.astype(bool)
+ crop_h = min(mask_bool.shape[0], y2 - y1)
+ crop_w = min(mask_bool.shape[1], x2 - x1)
+ full_mask[y1 : y1 + crop_h, x1 : x1 + crop_w] = mask_bool[:crop_h, :crop_w]
+
+ masks.append(full_mask)
+ labels.append(detection_msg.label[i])
+ confidences.append(conf)
+ track_ids.append(int(detection_msg.track_id[i]))
+
+ if len(masks) == 0:
+ return []
+
+ R_b2w = Rotation.from_quat(odom["orientation"]).as_matrix()
+ t_b2w = np.array(odom["position"])
+ R_w2b = R_b2w.T
+ t_w2b = -R_w2b @ t_b2w
+ cloud_body = cloud @ R_w2b.T + t_w2b
+
+ _, pixel_idx = project_pointcloud_to_image(
+ points=cloud_body,
+ projection=self._projection,
+ image_width=self._image_width,
+ image_height=self._image_height,
+ axis_mode=self._axis_mode,
+ intrinsics=None,
+ depth_filter=self._depth_filter,
+ depth_filter_margin=self._depth_filter_margin,
+ transform=self._sensor_to_camera,
+ )
+ seg_clouds, seg_depths = segment_pointclouds_from_projection(
+ points=cloud_body,
+ masks=masks,
+ pixel_idx=pixel_idx,
+ )
+ if self._depth_filter:
+ filtered_clouds = []
+ filtered_depths = []
+            # Keep only points near the closest depth in each mask to reject
+            # background bleed-through; avoid shadowing the `cloud` argument.
+            for seg_cloud, seg_depth in zip(seg_clouds, seg_depths):
+                if seg_depth.shape[0] == 0:
+                    filtered_clouds.append(seg_cloud)
+                    filtered_depths.append(seg_depth)
+                    continue
+                min_depth = float(np.min(seg_depth))
+                keep = seg_depth[:, 0] <= (min_depth + self._depth_filter_margin)
+                filtered_clouds.append(seg_cloud[keep])
+                filtered_depths.append(seg_depth[keep])
+ seg_clouds = filtered_clouds
+ seg_depths = filtered_depths
+ world_clouds = transform_segmented_clouds_to_world(
+ clouds=seg_clouds,
+ depths=seg_depths,
+ R_b2w=R_b2w,
+ t_b2w=t_b2w,
+ depth_jump_threshold=self._depth_jump_threshold,
+ )
+ cloud_world = cloud_body @ R_b2w.T + t_b2w
+ ground_z_threshold = self._compute_ground_z_threshold(cloud_world, t_b2w[:2])
+
+ observations = []
+ for idx, world_cloud in enumerate(world_clouds):
+ if self._max_cloud_distance > 0:
+ dist = np.linalg.norm(world_cloud - t_b2w, axis=1)
+ world_cloud = world_cloud[dist < self._max_cloud_distance]
+ if ground_z_threshold is not None:
+ world_cloud = world_cloud[world_cloud[:, 2] > ground_z_threshold]
+ if world_cloud.shape[0] == 0:
+ continue
+ observations.append(
+ Observation(
+ track_id=track_ids[idx],
+ label=labels[idx],
+ confidence=confidences[idx],
+ points=world_cloud,
+ robot_R=R_b2w,
+ robot_t=t_b2w,
+ mask=masks[idx],
+ image=image,
+ timestamp=det_stamp,
+ )
+ )
+
+ return observations
+
+ def _update_observation_angles(self, observations: list[Observation]) -> None:
+ """Reproject object voxels onto the current mask to update angle votes."""
+ if len(observations) == 0:
+ return
+
+ objects = self._mapper.get_objects(include_pending=True)
+ track_to_obj = {}
+ for obj in objects:
+ for track_id in obj.track_ids:
+ track_to_obj[track_id] = obj
+
+ for obs in observations:
+ obj = track_to_obj.get(obs.track_id)
+ if obj is None or obs.mask is None:
+ continue
+ voxels = obj.voxel_manager.voxels
+ if voxels.shape[0] == 0:
+ continue
+
+ R_b2w = obs.robot_R
+ t_b2w = obs.robot_t
+ R_w2b = R_b2w.T
+ t_w2b = -R_w2b @ t_b2w
+ voxels_body = voxels @ R_w2b.T + t_w2b
+ _, pixel_idx = project_pointcloud_to_image(
+ points=voxels_body,
+ projection=self._projection,
+ image_width=self._image_width,
+ image_height=self._image_height,
+ axis_mode=self._axis_mode,
+ intrinsics=None,
+ depth_filter=self._depth_filter,
+ depth_filter_margin=self._depth_filter_margin,
+ transform=self._sensor_to_camera,
+ )
+ pix = pixel_idx.astype(int)
+ valid = (
+ (pix[:, 0] >= 0)
+ & (pix[:, 0] < obs.mask.shape[1])
+ & (pix[:, 1] >= 0)
+ & (pix[:, 1] < obs.mask.shape[0])
+ )
+ if not np.any(valid):
+ continue
+ pix = pix[valid]
+ voxel_indices = np.nonzero(valid)[0]
+ on_mask = obs.mask[pix[:, 1], pix[:, 0]].astype(bool)
+ if not np.any(on_mask):
+ continue
+ voxel_indices = voxel_indices[on_mask]
+ vectors = voxels[voxel_indices] - t_b2w
+ obs_angles = np.arctan2(vectors[:, 1], vectors[:, 0])
+ obs_bins = discretize_angles(obs_angles, obj.voxel_manager.num_angle_bins)
+ obj.voxel_manager.update_observation_angles(voxel_indices, obs_bins)
+ obj._invalidate_cache()
+
+ def _update_best_images(self, observations: list[Observation], image: np.ndarray) -> None:
+ """Update best image crops for objects based on mask area.
+
+ Args:
+ observations: Observations created for the current frame.
+ image: Full RGB image associated with the detections.
+ """
+ for obs in observations:
+ obj = None
+ for candidate in self._mapper.get_objects(include_pending=True):
+ if obs.track_id in candidate.track_ids:
+ obj = candidate
+ break
+ if obj is None or obs.mask is None:
+ continue
+
+ coords = np.argwhere(obs.mask)
+ if coords.size == 0:
+ continue
+ mask_area = coords.shape[0]
+ if mask_area <= obj.best_image_score:
+ continue
+
+ y0, x0 = coords.min(axis=0)
+ y1, x1 = coords.max(axis=0) + 1
+ padding = max(int(y1 - y0), int(x1 - x0), 80)
+ y0 = max(0, y0 - padding)
+ x0 = max(0, x0 - padding)
+ y1 = min(obs.mask.shape[0], y1 + padding)
+ x1 = min(obs.mask.shape[1], x1 + padding)
+
+ if obj.best_image_path:
+ image_path = Path(obj.best_image_path)
+ mask_path = image_path.with_name(image_path.stem + "_mask.npy")
+ if image_path.exists():
+ image_path.unlink()
+ if mask_path.exists():
+ mask_path.unlink()
+ if self._save_png:
+ png_path = image_path.with_suffix(".png")
+ mask_png_path = image_path.with_name(image_path.stem + "_mask.png")
+ if png_path.exists():
+ png_path.unlink()
+ if mask_png_path.exists():
+ mask_png_path.unlink()
+
+ cropped_image = image[y0:y1, x0:x1]
+ cropped_mask = obs.mask[y0:y1, x0:x1]
+
+ obj.best_image_path = self._image_saver.queue_save(
+ object_id=str(obs.track_id),
+ image=cropped_image,
+ mask=cropped_mask,
+ )
+ obj.best_image_score = float(mask_area)
+ obj.is_asked_vlm = False
+
+ def _publish_map(self, stamp: float, viewpoint_id: int) -> None:
+ """Publish object nodes, markers, and aggregated point clouds.
+
+ Args:
+ stamp: Timestamp (seconds) for message headers.
+ viewpoint_id: Viewpoint id for ObjectNode messages.
+ """
+ seconds = int(stamp)
+ nanoseconds = int((stamp - seconds) * 1e9)
+ header = Header()
+ header.stamp.sec = seconds
+ header.stamp.nanosec = nanoseconds
+ header.frame_id = self._map_frame
+
+ objects = self._mapper.get_objects(include_pending=True)
+ for obj in objects:
+ if obj.detection_count == 1:
+ status = "new"
+ elif obj.inactive_frames == 0:
+ status = "updated"
+ else:
+ status = "unchanged"
+ msg = map_object_to_object_node(
+ obj,
+ header,
+ status=status,
+ viewpoint_id=viewpoint_id,
+ target_object=self._target_object,
+ )
+ if msg is not None:
+ self._object_nodes_pub.publish(msg)
+
+ viz_objects = objects
+ if self._visualization_max_objects > 0 and len(objects) > self._visualization_max_objects:
+ viz_objects = sorted(
+ objects, key=lambda obj: obj.last_update_time, reverse=True
+ )[: self._visualization_max_objects]
+
+ bbox_markers = create_bbox_markers(viz_objects, header)
+ label_markers = create_label_markers(viz_objects, header)
+ self._obj_boxes_pub.publish(bbox_markers)
+ self._obj_labels_pub.publish(label_markers)
+
+ obj_cloud = create_object_pointcloud(viz_objects, header)
+ if obj_cloud is not None:
+ self._obj_points_pub.publish(obj_cloud)
+
+ current_ids = {int(obj.track_ids[0]) if obj.track_ids else 0 for obj in viz_objects}
+ deleted_ids = list(self._published_ids - current_ids)
+ if len(deleted_ids) > 0:
+ delete_markers = create_delete_markers(deleted_ids, header, namespace="bbox")
+ self._obj_boxes_pub.publish(delete_markers)
+ delete_labels = create_delete_markers(deleted_ids, header, namespace="labels")
+ self._obj_labels_pub.publish(delete_labels)
+ self._published_ids = current_ids
+
+ def _publish_object_type_queries(self) -> None:
+ """Publish VLM queries for target objects with good image crops."""
+ for obj in self._mapper.get_objects(include_pending=False):
+ if obj.label != self._target_object:
+ continue
+ if obj.is_asked_vlm:
+ continue
+ if obj.best_image_score <= 500:
+ continue
+ msg = ObjectType()
+ msg.object_id = int(obj.track_ids[0]) if obj.track_ids else -1
+ msg.img_path = obj.best_image_path or ""
+ msg.labels = list(obj.class_votes.keys())
+ self._object_type_query_pub.publish(msg)
+ obj.is_asked_vlm = True
+
+ def _load_sensor_to_camera(self, path: str) -> np.ndarray | None:
+ """Load sensor->camera transform from YAML.
+
+ Args:
+ path: Path to YAML config with translation and rpy.
+
+ Returns:
+ 4x4 transform matrix or None if config not found.
+ """
+ config_path = Path(path)
+ if not config_path.is_absolute():
+ config_path = Path(__file__).resolve().parent.parent / config_path
+ if not config_path.exists():
+ return None
+
+ data = yaml.safe_load(config_path.read_text()) or {}
+ tf_data = data.get("sensor_to_camera", {})
+ trans = tf_data.get("translation", {})
+ rpy = tf_data.get("rotation_rpy", {})
+ cam_offset = np.array(
+ [
+ float(trans.get("x", 0.0)),
+ float(trans.get("y", 0.0)),
+ float(trans.get("z", 0.0)),
+ ]
+ )
+ roll = float(rpy.get("roll", 0.0))
+ pitch = float(rpy.get("pitch", 0.0))
+ yaw = float(rpy.get("yaw", 0.0))
+ cos_roll = np.cos(roll)
+ sin_roll = np.sin(roll)
+ cos_pitch = np.cos(pitch)
+ sin_pitch = np.sin(pitch)
+ cos_yaw = np.cos(yaw)
+ sin_yaw = np.sin(yaw)
+ r_z = np.array(
+ [
+ [cos_yaw, -sin_yaw, 0.0],
+ [sin_yaw, cos_yaw, 0.0],
+ [0.0, 0.0, 1.0],
+ ]
+ )
+ r_y = np.array(
+ [
+ [cos_pitch, 0.0, sin_pitch],
+ [0.0, 1.0, 0.0],
+ [-sin_pitch, 0.0, cos_pitch],
+ ]
+ )
+ r_x = np.array(
+ [
+ [1.0, 0.0, 0.0],
+ [0.0, cos_roll, -sin_roll],
+ [0.0, sin_roll, cos_roll],
+ ]
+ )
+ cam_rotation = r_z @ r_y @ r_x
+ rotation = cam_rotation.T
+ translation = -rotation @ cam_offset
+ transform = np.eye(4)
+ transform[:3, :3] = rotation
+ transform[:3, 3] = translation
+ return transform
+
+
+def main(args: list[str] | None = None) -> None:
+ """Entry point for the semantic mapping node."""
+ rclpy.init(args=args)
+ node = SemanticMappingNode()
+ try:
+ rclpy.spin(node)
+ except KeyboardInterrupt:
+ pass
+ finally:
+ node.destroy_node()
+ rclpy.shutdown()
+
+
+if __name__ == "__main__":
+ main()
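
`_load_sensor_to_camera` expects a YAML file with a `sensor_to_camera` block holding a translation and roll/pitch/yaw, and stores the inverse transform (sensor frame to camera frame). A minimal sketch of a matching config and the same construction done with SciPy; the numbers are illustrative, and the `from_euler("xyz", ...)` call should reproduce the `r_z @ r_y @ r_x` product built in the node:

```python
import numpy as np
import yaml
from scipy.spatial.transform import Rotation

# Illustrative config with the layout the loader expects.
cfg = yaml.safe_load("""
sensor_to_camera:
  translation: {x: 0.10, y: 0.00, z: 0.05}
  rotation_rpy: {roll: 0.0, pitch: 0.0, yaw: 1.5708}
""")

tf = cfg["sensor_to_camera"]
offset = np.array([tf["translation"][k] for k in ("x", "y", "z")])
rpy = [tf["rotation_rpy"][k] for k in ("roll", "pitch", "yaw")]

# Camera pose in the sensor frame: extrinsic x-y-z, i.e. Rz(yaw) @ Ry(pitch) @ Rx(roll).
cam_rotation = Rotation.from_euler("xyz", rpy).as_matrix()

# The node keeps the inverse so points can be mapped sensor -> camera.
rotation = cam_rotation.T
transform = np.eye(4)
transform[:3, :3] = rotation
transform[:3, 3] = -rotation @ offset
print(transform)
```
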
diff --git a/semantic_mapping/semantic_mapping/utils/__init__.py b/semantic_mapping/semantic_mapping/utils/__init__.py
new file mode 100644
index 0000000..d949aeb
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/__init__.py
@@ -0,0 +1 @@
+"""Utility modules for semantic mapping."""
diff --git a/semantic_mapping/semantic_mapping/utils/map_object_utils.py b/semantic_mapping/semantic_mapping/utils/map_object_utils.py
new file mode 100644
index 0000000..5ad4163
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/map_object_utils.py
@@ -0,0 +1,264 @@
+"""ROS message helpers for MapObject publishers."""
+
+from __future__ import annotations
+
+import zlib
+from typing import Optional
+
+import numpy as np
+from geometry_msgs.msg import Point
+from sensor_msgs.msg import PointCloud2
+from std_msgs.msg import Header
+from visualization_msgs.msg import Marker, MarkerArray
+
+from semantic_mapping.msg import ObjectNode
+from vector_perception_utils.pointcloud_utils import create_pointcloud2_msg
+
+from ..map_object import MapObject
+
+
+def _axis_aligned_corners(points: np.ndarray) -> Optional[np.ndarray]:
+ if points.size == 0:
+ return None
+ min_bound = np.min(points, axis=0)
+ max_bound = np.max(points, axis=0)
+ return np.array(
+ [
+ [min_bound[0], min_bound[1], min_bound[2]],
+ [max_bound[0], min_bound[1], min_bound[2]],
+ [max_bound[0], max_bound[1], min_bound[2]],
+ [min_bound[0], max_bound[1], min_bound[2]],
+ [min_bound[0], min_bound[1], max_bound[2]],
+ [max_bound[0], min_bound[1], max_bound[2]],
+ [max_bound[0], max_bound[1], max_bound[2]],
+ [min_bound[0], max_bound[1], max_bound[2]],
+ ],
+ dtype=float,
+ )
+
+
+def get_object_seed(obj: MapObject) -> int:
+ """Generate a deterministic seed from a MapObject for color generation.
+
+ Args:
+ obj: MapObject instance.
+
+ Returns:
+ CRC32 hash of the object's identifier.
+ """
+ seed_key = obj.object_id or (str(obj.track_ids[0]) if obj.track_ids else obj.label)
+ return zlib.crc32(seed_key.encode("utf-8"))
+
+
+def map_object_to_object_node(
+ obj: MapObject,
+ header: Header,
+ status: str = "new",
+ viewpoint_id: int = -1,
+ target_object: str | None = None,
+) -> Optional[ObjectNode]:
+ """Convert a MapObject into an ObjectNode message.
+
+ Args:
+ obj: MapObject instance.
+ header: ROS header to attach to the message.
+ status: Lifecycle status string ("new", "updated", "unchanged").
+ viewpoint_id: Viewpoint identifier for image saving.
+ target_object: Target object label for potential labeling.
+
+ Returns:
+ ObjectNode message or None if required geometry is missing.
+ """
+ if obj.centroid is None or obj.bbox3d_corners is None:
+ return None
+
+ msg = ObjectNode()
+ msg.header = header
+ msg.object_id = obj.track_ids
+ if target_object and obj.label == target_object and not obj.is_asked_vlm:
+ msg.label = "Potential Target"
+ else:
+ msg.label = obj.label
+ msg.status = status
+ msg.position.x = float(obj.centroid[0])
+ msg.position.y = float(obj.centroid[1])
+ msg.position.z = float(obj.centroid[2])
+
+ for i in range(8):
+ msg.bbox3d[i].x = float(obj.bbox3d_corners[i, 0])
+ msg.bbox3d[i].y = float(obj.bbox3d_corners[i, 1])
+ msg.bbox3d[i].z = float(obj.bbox3d_corners[i, 2])
+
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(voxels) == 0:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ if len(voxels) > 0:
+ seed = get_object_seed(obj)
+ color = np.array(
+ [
+ (seed & 0xFF) / 255.0,
+ ((seed >> 8) & 0xFF) / 255.0,
+ ((seed >> 16) & 0xFF) / 255.0,
+ ],
+ dtype=np.float32,
+ )
+ colors = np.tile(color, (voxels.shape[0], 1))
+ msg.cloud = create_pointcloud2_msg(points=voxels, colors=colors, header=header)
+
+ msg.img_path = obj.best_image_path or ""
+ msg.is_asked_vlm = obj.is_asked_vlm
+ msg.viewpoint_id = -1 if obj.updated_by_vlm else viewpoint_id
+ return msg
+
+
+def create_bbox_markers(
+ objects: list[MapObject], header: Header, namespace: str = "bbox"
+) -> MarkerArray:
+ """Create wireframe bounding box markers for MapObjects."""
+ markers = MarkerArray()
+ edges = [
+ (0, 1),
+ (1, 2),
+ (2, 3),
+ (3, 0),
+ (4, 5),
+ (5, 6),
+ (6, 7),
+ (7, 4),
+ (0, 4),
+ (1, 5),
+ (2, 6),
+ (3, 7),
+ ]
+
+ for obj in objects:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(voxels) == 0:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ corners = _axis_aligned_corners(voxels) if len(voxels) > 0 else None
+ if corners is None:
+ continue
+ marker = Marker()
+ marker.header = header
+ marker.ns = namespace
+ marker.id = int(obj.track_ids[0]) if obj.track_ids else 0
+ marker.type = Marker.LINE_LIST
+ marker.action = Marker.ADD
+ marker.scale.x = 0.02
+
+ seed = get_object_seed(obj)
+ marker.color.r = (seed & 0xFF) / 255.0
+ marker.color.g = ((seed >> 8) & 0xFF) / 255.0
+ marker.color.b = ((seed >> 16) & 0xFF) / 255.0
+ marker.color.a = 1.0
+
+ for i, j in edges:
+ marker.points.append(Point(x=float(corners[i, 0]), y=float(corners[i, 1]), z=float(corners[i, 2])))
+ marker.points.append(Point(x=float(corners[j, 0]), y=float(corners[j, 1]), z=float(corners[j, 2])))
+
+ markers.markers.append(marker)
+
+ return markers
+
+
+def create_label_markers(
+ objects: list[MapObject],
+ header: Header,
+ namespace: str = "labels",
+ text_height: float = 0.4,
+) -> MarkerArray:
+ """Create text label markers for MapObjects."""
+ markers = MarkerArray()
+
+ for obj in objects:
+ if obj.centroid is None:
+ continue
+ marker = Marker()
+ marker.header = header
+ marker.ns = namespace
+ marker.id = int(obj.track_ids[0]) if obj.track_ids else 0
+ marker.type = Marker.TEXT_VIEW_FACING
+ marker.action = Marker.ADD
+ marker.scale.z = text_height
+ marker.text = obj.label
+ marker.pose.position.x = float(obj.centroid[0])
+ marker.pose.position.y = float(obj.centroid[1])
+ marker.pose.position.z = float(obj.centroid[2]) + 0.3
+
+ seed = get_object_seed(obj)
+ marker.color.r = (seed & 0xFF) / 255.0
+ marker.color.g = ((seed >> 8) & 0xFF) / 255.0
+ marker.color.b = ((seed >> 16) & 0xFF) / 255.0
+ marker.color.a = 1.0
+
+ markers.markers.append(marker)
+
+ return markers
+
+
+def create_object_pointcloud(objects: list[MapObject], header: Header) -> Optional[PointCloud2]:
+ """Create a colored point cloud for a list of MapObjects."""
+ points_list = []
+ colors_list = []
+
+ for obj in objects:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(voxels) == 0:
+ voxels = obj.voxel_manager.get_valid_voxels(
+ diversity_percentile=obj._diversity_percentile,
+ use_regularized=False,
+ )
+ if len(voxels) == 0:
+ continue
+
+ seed = get_object_seed(obj)
+ color = np.array(
+ [
+ (seed & 0xFF) / 255.0,
+ ((seed >> 8) & 0xFF) / 255.0,
+ ((seed >> 16) & 0xFF) / 255.0,
+ ],
+ dtype=np.float32,
+ )
+ colors = np.tile(color, (voxels.shape[0], 1))
+ points_list.append(voxels)
+ colors_list.append(colors)
+
+ if len(points_list) == 0:
+ return None
+
+ points = np.concatenate(points_list, axis=0)
+ colors = np.concatenate(colors_list, axis=0)
+ return create_pointcloud2_msg(points=points, colors=colors, header=header)
+
+
+def create_delete_markers(
+ deleted_ids: list[int],
+ header: Header,
+ namespace: str = "bbox",
+) -> MarkerArray:
+ """Create deletion markers for a list of marker IDs."""
+ markers = MarkerArray()
+ for marker_id in deleted_ids:
+ marker = Marker()
+ marker.header = header
+ marker.ns = namespace
+ marker.id = int(marker_id)
+ marker.action = Marker.DELETE
+ markers.markers.append(marker)
+ return markers
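
All the publishers above derive a stable per-object color from the CRC32 seed, so the same object keeps the same color in RViz across publishes. A small standalone sketch of that mapping:

```python
import zlib

import numpy as np


def seed_to_rgb(key: str) -> np.ndarray:
    """Map an object identifier to a deterministic RGB triple in [0, 1]."""
    seed = zlib.crc32(key.encode("utf-8"))
    return np.array(
        [seed & 0xFF, (seed >> 8) & 0xFF, (seed >> 16) & 0xFF], dtype=np.float32
    ) / 255.0


print(seed_to_rgb("chair_3"))   # the same key always yields the same color
```
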
diff --git a/semantic_mapping/semantic_mapping/utils/matching.py b/semantic_mapping/semantic_mapping/utils/matching.py
new file mode 100644
index 0000000..e5aaa19
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/matching.py
@@ -0,0 +1,256 @@
+"""Object matching utilities for semantic mapping."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+import cv2
+import numpy as np
+
+from vector_perception_utils.pointcloud_utils import compute_centroid
+
+from ..map_object import MapObject, Observation
+
+
+def _polygon_area(poly: np.ndarray) -> float:
+ """Compute polygon area using the shoelace formula.
+
+ Args:
+ poly: Polygon vertices in order (N, 2).
+
+ Returns:
+ Polygon area.
+ """
+ if poly.shape[0] < 3:
+ return 0.0
+ x = poly[:, 0]
+ y = poly[:, 1]
+ return 0.5 * abs(float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1))))
+
+
+def _ensure_ccw(poly: np.ndarray) -> np.ndarray:
+ """Ensure polygon vertices are ordered counter-clockwise."""
+ if poly.shape[0] < 3:
+ return poly
+ x = poly[:, 0]
+ y = poly[:, 1]
+ signed = float(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))
+ if signed < 0:
+ return poly[::-1]
+ return poly
+
+
+def compute_bbox(points: np.ndarray) -> Optional[tuple[np.ndarray, np.ndarray]]:
+ """Compute yaw-only oriented bbox corners + extents."""
+ if points.shape[0] < 3:
+ return None
+
+ xy = points[:, :2].astype(np.float32)
+ unique_xy = np.unique(xy, axis=0)
+ if unique_xy.shape[0] < 3:
+ return None
+
+ rect = cv2.minAreaRect(unique_xy)
+ (width, height) = rect[1]
+ if width == 0.0 or height == 0.0:
+ return None
+
+ box = cv2.boxPoints(rect).astype(np.float64)
+ rect_xy = _ensure_ccw(box)
+
+ zmin = float(np.min(points[:, 2]))
+ zmax = float(np.max(points[:, 2]))
+ extent = np.array([width, height, zmax - zmin], dtype=float)
+ z_center = zmax - extent[2] / 2.0
+ half_z = extent[2] / 2.0
+
+ bottom = np.hstack([rect_xy, np.full((4, 1), z_center - half_z)])
+ top = np.hstack([rect_xy, np.full((4, 1), z_center + half_z)])
+ corners = np.vstack([bottom, top])
+ return corners, extent
+
+
+def _line_intersection(p1: np.ndarray, p2: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> np.ndarray:
+ """Intersection of line p1->p2 with line cp1->cp2.
+
+ Args:
+ p1: Line segment start (2,).
+ p2: Line segment end (2,).
+ cp1: Clip edge start (2,).
+ cp2: Clip edge end (2,).
+
+ Returns:
+ Intersection point (2,).
+ """
+ dc = cp1 - cp2
+ dp = p1 - p2
+ denom = dc[0] * dp[1] - dc[1] * dp[0]
+ if denom == 0:
+ return p2
+ n1 = cp1[0] * cp2[1] - cp1[1] * cp2[0]
+ n2 = p1[0] * p2[1] - p1[1] * p2[0]
+ x = (n1 * dp[0] - n2 * dc[0]) / denom
+ y = (n1 * dp[1] - n2 * dc[1]) / denom
+ return np.array([x, y], dtype=np.float64)
+
+
+def _polygon_clip(subject: np.ndarray, clip: np.ndarray) -> np.ndarray:
+ """Clip a polygon with a convex clip polygon (Sutherland–Hodgman).
+
+ Args:
+ subject: Subject polygon vertices (N, 2).
+ clip: Clip polygon vertices (M, 2).
+
+ Returns:
+ Clipped polygon vertices (K, 2).
+ """
+ if subject.shape[0] == 0:
+ return subject
+
+ def inside(p: np.ndarray, cp1: np.ndarray, cp2: np.ndarray) -> bool:
+ return (cp2[0] - cp1[0]) * (p[1] - cp1[1]) - (cp2[1] - cp1[1]) * (
+ p[0] - cp1[0]
+ ) >= 0
+
+ output = subject
+ for i in range(len(clip)):
+ cp1 = clip[i]
+ cp2 = clip[(i + 1) % len(clip)]
+ input_list = output
+ output = []
+ if len(input_list) == 0:
+ return np.empty((0, 2), dtype=np.float64)
+ s = input_list[-1]
+ for e in input_list:
+ if inside(e, cp1, cp2):
+ if not inside(s, cp1, cp2):
+ output.append(_line_intersection(s, e, cp1, cp2))
+ output.append(e)
+ elif inside(s, cp1, cp2):
+ output.append(_line_intersection(s, e, cp1, cp2))
+ s = e
+ output = np.array(output, dtype=np.float64)
+ return np.array(output, dtype=np.float64)
+
+
+def compute_iou_3d_with_ratios(
+ bbox1: np.ndarray, bbox2: np.ndarray
+) -> tuple[float, float, float]:
+ """Compute IoU and intersection ratios using yaw-only bbox overlap.
+
+ Args:
+ bbox1: Corner points for bbox1 (N, 3).
+ bbox2: Corner points for bbox2 (N, 3).
+
+ Returns:
+ Tuple of (iou, ratio_bbox1, ratio_bbox2).
+ """
+ if bbox1.shape[0] < 4 or bbox2.shape[0] < 4:
+ return 0.0, 0.0, 0.0
+
+ rect1 = _ensure_ccw(bbox1[:4, :2])
+ rect2 = _ensure_ccw(bbox2[:4, :2])
+ if rect1.shape[0] < 3 or rect2.shape[0] < 3:
+ return 0.0, 0.0, 0.0
+
+ poly_inter = _polygon_clip(rect1, rect2)
+ inter_area = _polygon_area(poly_inter) if poly_inter.shape[0] >= 3 else 0.0
+
+ zmin1, zmax1 = float(np.min(bbox1[:, 2])), float(np.max(bbox1[:, 2]))
+ zmin2, zmax2 = float(np.min(bbox2[:, 2])), float(np.max(bbox2[:, 2]))
+ height = max(0.0, min(zmax1, zmax2) - max(zmin1, zmin2))
+ inter_vol = inter_area * height
+
+ area1 = _polygon_area(rect1)
+ area2 = _polygon_area(rect2)
+ vol1 = area1 * (zmax1 - zmin1)
+ vol2 = area2 * (zmax2 - zmin2)
+ denom = vol1 + vol2 - inter_vol
+
+ iou = inter_vol / denom if denom > 0 else 0.0
+ ratio1 = inter_vol / vol1 if vol1 > 0 else 0.0
+ ratio2 = inter_vol / vol2 if vol2 > 0 else 0.0
+ return iou, ratio1, ratio2
+
+
+def find_best_match(
+ obs: Observation,
+ candidates: list[MapObject],
+ distance_threshold: float,
+ iou_threshold: float,
+ ratio_threshold: float = 0.4,
+ min_distance_merge: float = 0.25,
+) -> Optional[MapObject]:
+ """Find best matching object by distance/IoU heuristics.
+
+ Args:
+ obs: Observation to match.
+ candidates: Candidate map objects.
+ distance_threshold: Max centroid distance to consider.
+ iou_threshold: Min IoU to accept a merge.
+ ratio_threshold: Min overlap ratio to accept a merge.
+ min_distance_merge: Absolute distance threshold for merge.
+
+ Returns:
+ Best matching object or None if no match passes thresholds.
+ """
+ if obs.points.shape[0] == 0:
+ return None
+
+ obs_center = compute_centroid(obs.points)
+ obs_bbox = compute_bbox(obs.points)
+ obs_corners = obs_bbox[0] if obs_bbox is not None else None
+ obs_extent = obs_bbox[1] if obs_bbox is not None else None
+
+ best_candidate: Optional[MapObject] = None
+ best_dist = float("inf")
+
+ for candidate in candidates:
+ if obs.track_id in candidate.track_ids:
+ return candidate
+
+ cand_center = candidate.centroid
+ if cand_center is None:
+ continue
+
+ dist = float(np.linalg.norm(obs_center - cand_center))
+ if dist > distance_threshold:
+ continue
+
+ cand_points = candidate.voxel_manager.get_valid_voxels(
+ diversity_percentile=candidate._diversity_percentile,
+ use_regularized=True,
+ )
+ if len(cand_points) == 0:
+ cand_points = candidate.voxel_manager.get_valid_voxels(
+ diversity_percentile=candidate._diversity_percentile,
+ use_regularized=False,
+ )
+ cand_bbox = compute_bbox(cand_points) if len(cand_points) >= 3 else None
+ cand_corners = cand_bbox[0] if cand_bbox is not None else None
+ cand_extent = cand_bbox[1] if cand_bbox is not None else None
+
+ dist_merge = dist < min_distance_merge
+ if obs_extent is not None and cand_extent is not None:
+ dist_thresh = np.linalg.norm((obs_extent / 2 + cand_extent / 2) / 2) * 0.5
+ dist_merge = dist_merge or dist < dist_thresh
+
+ if dist_merge:
+ if dist < best_dist:
+ best_candidate = candidate
+ best_dist = dist
+ continue
+
+ if obs_corners is None or cand_corners is None:
+ continue
+
+ iou, ratio_obs, ratio_cand = compute_iou_3d_with_ratios(
+ obs_corners, cand_corners
+ )
+
+ if iou > iou_threshold or ratio_obs > ratio_threshold or ratio_cand > ratio_threshold:
+ if dist < best_dist:
+ best_candidate = candidate
+ best_dist = dist
+
+ return best_candidate
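
As a quick sanity check on the yaw-only IoU: two unit boxes whose footprints overlap by half share an intersection volume of 0.5, giving an IoU of 1/3 and overlap ratios of 0.5. A minimal sketch, assuming the module is importable as `semantic_mapping.utils.matching`:

```python
import numpy as np

from semantic_mapping.utils.matching import compute_iou_3d_with_ratios


def make_box(x0, y0, z0, x1, y1, z1):
    """Eight axis-aligned corners, bottom face first, in CCW order."""
    return np.array(
        [
            [x0, y0, z0], [x1, y0, z0], [x1, y1, z0], [x0, y1, z0],
            [x0, y0, z1], [x1, y0, z1], [x1, y1, z1], [x0, y1, z1],
        ],
        dtype=float,
    )


box_a = make_box(0.0, 0.0, 0.0, 1.0, 1.0, 1.0)
box_b = make_box(0.5, 0.0, 0.0, 1.5, 1.0, 1.0)
iou, ratio_a, ratio_b = compute_iou_3d_with_ratios(box_a, box_b)
print(iou, ratio_a, ratio_b)    # ~0.333, 0.5, 0.5
```
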
diff --git a/semantic_mapping/semantic_mapping/utils/storage.py b/semantic_mapping/semantic_mapping/utils/storage.py
new file mode 100644
index 0000000..d2be3cc
--- /dev/null
+++ b/semantic_mapping/semantic_mapping/utils/storage.py
@@ -0,0 +1,90 @@
+"""Async image saving queue for semantic mapping."""
+
+from __future__ import annotations
+
+from pathlib import Path
+import shutil
+import queue
+import threading
+import time
+
+import cv2
+import numpy as np
+
+
+class ImageSaveQueue:
+ """Background worker for saving object images to disk."""
+
+ def __init__(self, output_dir: Path, max_queue_size: int = 100, save_png: bool = False) -> None:
+ """Initialize the image save queue.
+
+ Args:
+ output_dir: Directory where images are stored.
+ max_queue_size: Maximum number of queued save jobs.
+ save_png: If True, also save a .png copy of the image.
+ """
+ self._output_dir = Path(output_dir)
+ if self._output_dir.exists() and any(self._output_dir.iterdir()):
+ shutil.rmtree(self._output_dir)
+ self._output_dir.mkdir(parents=True, exist_ok=True)
+ self._save_png = save_png
+ self._queue: queue.Queue = queue.Queue(maxsize=max_queue_size)
+ self._stop_event = threading.Event()
+ self._thread: threading.Thread | None = None
+
+ def start(self) -> None:
+ """Start the background worker."""
+ if self._thread is not None and self._thread.is_alive():
+ return
+ self._stop_event.clear()
+ self._thread = threading.Thread(target=self._run, daemon=True)
+ self._thread.start()
+
+ def stop(self) -> None:
+ """Stop the background worker."""
+ self._stop_event.set()
+ try:
+ self._queue.put_nowait(None)
+ except queue.Full:
+ pass
+ if self._thread is not None:
+ self._thread.join(timeout=2.0)
+ self._thread = None
+
+ def queue_save(
+ self, object_id: str, image: np.ndarray, mask: np.ndarray | None = None
+ ) -> str:
+ """Queue an image (and optional mask) for saving.
+
+ Args:
+ object_id: Object identifier to embed in filename.
+ image: Image array to save.
+ mask: Optional mask array to save alongside the image.
+
+ Returns:
+ Path to the saved image (.npy).
+ """
+ timestamp_ms = int(time.time() * 1000)
+ image_path = self._output_dir / f"{object_id}_{timestamp_ms}.npy"
+ mask_path = self._output_dir / f"{object_id}_{timestamp_ms}_mask.npy"
+ self._queue.put(("save", str(image_path), str(mask_path), image, mask))
+ return str(image_path)
+
+ def _run(self) -> None:
+ while not self._stop_event.is_set():
+ item = self._queue.get()
+ if item is None:
+ self._queue.task_done()
+ break
+ op, image_path, mask_path, image, mask = item
+ if op == "save":
+ try:
+ np.save(image_path, image)
+ if mask is not None:
+ np.save(mask_path, mask)
+ if self._save_png:
+ png_path = Path(image_path).with_suffix(".png")
+ cv2.imwrite(str(png_path), image)
+ except Exception:
+ pass
+ self._queue.task_done()
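
A short usage sketch for the save queue; the output directory below is illustrative, and note that the constructor clears a non-empty output directory before use:

```python
from pathlib import Path

import numpy as np

from semantic_mapping.utils.storage import ImageSaveQueue

saver = ImageSaveQueue(output_dir=Path("/tmp/object_images"), save_png=False)
saver.start()

# Queue a BGR crop and its boolean mask; the .npy path comes back immediately
# while the actual write happens on the worker thread.
crop = np.zeros((64, 64, 3), dtype=np.uint8)
mask = np.ones((64, 64), dtype=bool)
image_path = saver.queue_save(object_id="12", image=crop, mask=mask)
print(image_path)

saver.stop()
```
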
diff --git a/vector_perception_utils/setup.cfg b/vector_perception_utils/setup.cfg
deleted file mode 100644
index 1ce0c22..0000000
--- a/vector_perception_utils/setup.cfg
+++ /dev/null
@@ -1,5 +0,0 @@
-[develop]
-script_dir=$base/lib/vector_perception_utils
-[install]
-install_scripts=$base/lib/vector_perception_utils
-
diff --git a/vector_perception_utils/vector_perception_utils/pointcloud_utils.py b/vector_perception_utils/vector_perception_utils/pointcloud_utils.py
index 835db76..e86678d 100644
--- a/vector_perception_utils/vector_perception_utils/pointcloud_utils.py
+++ b/vector_perception_utils/vector_perception_utils/pointcloud_utils.py
@@ -148,6 +148,75 @@ def remove_radius_outliers(
return points_filtered, colors_filtered, inlier_indices
+def fit_plane(
+ points: np.ndarray,
+ distance_threshold: float = 0.05,
+ ransac_n: int = 3,
+ num_iterations: int = 50,
+ max_points: int = 8000,
+ min_inlier_ratio: float = 0.0,
+ min_normal_z: float = 0.0,
+ min_above_ratio: float = 0.0,
+ prefer_up: bool = True,
+) -> Optional[np.ndarray]:
+ """Fit a plane to a point cloud using RANSAC.
+
+ Args:
+ points: Point coordinates (N, 3).
+ distance_threshold: Max distance for inliers during RANSAC.
+ ransac_n: Number of points to sample per RANSAC iteration.
+ num_iterations: RANSAC iterations.
+ max_points: Max points to sample for fitting (0 keeps all).
+ min_inlier_ratio: Minimum inlier ratio to accept the plane.
+ min_normal_z: Minimum absolute Z component of the plane normal.
+ min_above_ratio: Minimum ratio of points above the plane.
+ prefer_up: If True, flip normal to have positive Z.
+
+ Returns:
+ Plane coefficients [a, b, c, d] for ax + by + cz + d = 0, or None.
+ """
+ if points.shape[0] < 3:
+ return None
+
+ sample_points = points
+ if max_points > 0 and points.shape[0] > max_points:
+ indices = np.random.choice(points.shape[0], max_points, replace=False)
+ sample_points = points[indices]
+
+ pcd = create_pointcloud(sample_points)
+ plane_model, inliers = pcd.segment_plane(
+ distance_threshold=distance_threshold,
+ ransac_n=ransac_n,
+ num_iterations=num_iterations,
+ )
+ if len(inliers) == 0:
+ return None
+ if min_inlier_ratio > 0.0 and len(inliers) / sample_points.shape[0] < min_inlier_ratio:
+ return None
+
+ normal = np.array(plane_model[:3], dtype=float)
+ norm = float(np.linalg.norm(normal))
+ if norm == 0.0:
+ return None
+ normal /= norm
+ d = float(plane_model[3]) / norm
+
+ if min_normal_z > 0.0 and abs(normal[2]) < min_normal_z:
+ return None
+ if prefer_up and normal[2] < 0:
+ normal = -normal
+ d = -d
+
+ if min_above_ratio > 0.0:
+ check_points = sample_points
+ distances = check_points @ normal + d
+ above_ratio = float(np.mean(distances > 0.0))
+ if above_ratio < min_above_ratio:
+ return None
+
+ return np.array([normal[0], normal[1], normal[2], d], dtype=float)
+
+
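
A small sketch of `fit_plane` on a synthetic, slightly noisy ground patch; with the defaults plus a normal constraint it should return coefficients close to [0, 0, 1, 0], with `prefer_up` keeping the normal pointing up:

```python
import numpy as np

from vector_perception_utils.pointcloud_utils import fit_plane

rng = np.random.default_rng(0)
xy = rng.uniform(-2.0, 2.0, size=(2000, 2))
z = rng.normal(0.0, 0.01, size=(2000, 1))       # near-flat ground around z = 0
ground = np.hstack([xy, z])

plane = fit_plane(ground, distance_threshold=0.05, min_normal_z=0.8)
if plane is not None:
    a, b, c, d = plane                          # expect roughly [0, 0, 1, 0]
    print(a, b, c, d)
```
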
def transform_pointcloud(
points: np.ndarray,
transform: np.ndarray
@@ -199,42 +268,126 @@ def crop_pointcloud_bbox(
def compute_centroid(points: np.ndarray) -> np.ndarray:
"""
Compute centroid of point cloud.
-
+
Args:
points: Point coordinates (N, 3)
-
+
Returns:
Centroid as (3,) array
"""
return np.mean(points, axis=0)
+def get_bbox_center(bbox) -> np.ndarray:
+ """Return bounding box center for Open3D AABB/OBB variants."""
+ if hasattr(bbox, "center"):
+ return np.asarray(bbox.center)
+ if hasattr(bbox, "get_center"):
+ return np.asarray(bbox.get_center())
+ if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"):
+ min_bound = np.asarray(bbox.get_min_bound())
+ max_bound = np.asarray(bbox.get_max_bound())
+ return (min_bound + max_bound) * 0.5
+ return np.zeros(3, dtype=float)
+
+
+def get_bbox_extent(bbox) -> np.ndarray:
+ """Return bounding box extent for Open3D AABB/OBB variants."""
+ if hasattr(bbox, "extent"):
+ return np.asarray(bbox.extent)
+ if hasattr(bbox, "get_extent"):
+ return np.asarray(bbox.get_extent())
+ if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"):
+ min_bound = np.asarray(bbox.get_min_bound())
+ max_bound = np.asarray(bbox.get_max_bound())
+ return max_bound - min_bound
+ return np.zeros(3, dtype=float)
+
+
+def get_bbox_corners(bbox) -> np.ndarray:
+ """Return 8 bbox corners for Open3D AABB/OBB variants."""
+ if hasattr(bbox, "get_box_points"):
+ return np.asarray(bbox.get_box_points())
+ if hasattr(bbox, "get_min_bound") and hasattr(bbox, "get_max_bound"):
+ min_bound = np.asarray(bbox.get_min_bound())
+ max_bound = np.asarray(bbox.get_max_bound())
+ return np.array(
+ [
+ [min_bound[0], min_bound[1], min_bound[2]],
+ [max_bound[0], min_bound[1], min_bound[2]],
+ [max_bound[0], max_bound[1], min_bound[2]],
+ [min_bound[0], max_bound[1], min_bound[2]],
+ [min_bound[0], min_bound[1], max_bound[2]],
+ [max_bound[0], min_bound[1], max_bound[2]],
+ [max_bound[0], max_bound[1], max_bound[2]],
+ [min_bound[0], max_bound[1], max_bound[2]],
+ ],
+ dtype=float,
+ )
+ return np.zeros((0, 3), dtype=float)
+
+
+def get_oriented_bbox(
+ points: np.ndarray,
+ axis_aligned: bool = True,
+) -> Optional[Tuple[np.ndarray, np.ndarray]]:
+ """
+ Compute bounding box from point cloud.
+
+ Args:
+ points: Point coordinates (N, 3)
+ axis_aligned: If True, compute axis-aligned bounding box.
+ If False, compute minimal oriented bounding box.
+
+ Returns:
+ Tuple of (center, corners) where center is (3,) and corners is (8, 3),
+ or None if insufficient points.
+ """
+ if len(points) < 4:
+ return None
+
+ pcd = create_pointcloud(points)
+
+ try:
+ if axis_aligned:
+ bbox = pcd.get_axis_aligned_bounding_box()
+ else:
+ bbox = pcd.get_minimal_oriented_bounding_box()
+ except Exception:
+ return None
+
+ center = get_bbox_center(bbox)
+ corners = get_bbox_corners(bbox)
+ return center, corners
+
def pointcloud_to_bbox3d(points: np.ndarray, header=None):
"""
Convert point cloud to minimal oriented 3D bounding box (ROS2 message).
-
+
Args:
points: Point coordinates (N, 3)
header: Optional ROS2 header for the message
-
+
Returns:
vision_msgs/BoundingBox3D message or None if insufficient points
"""
-
+
if len(points) < 4:
return None
pcd = create_pointcloud(points)
-
+
try:
obb = pcd.get_minimal_oriented_bounding_box()
except:
return None
-
- center = obb.center
- extent = obb.extent
- quat = Rotation.from_matrix(obb.R).as_quat()
-
+ center = get_bbox_center(obb)
+ extent = get_bbox_extent(obb)
+ if hasattr(obb, "R"):
+ quat = Rotation.from_matrix(obb.R).as_quat()
+ else:
+ quat = np.array([0.0, 0.0, 0.0, 1.0], dtype=float)
+
bbox = BoundingBox3D()
bbox.center.position = Point(
x=float(center[0]), y=float(center[1]), z=float(center[2])
@@ -243,7 +396,7 @@ def pointcloud_to_bbox3d(points: np.ndarray, header=None):
x=float(quat[0]), y=float(quat[1]), z=float(quat[2]), w=float(quat[3])
)
bbox.size = Vector3(x=float(extent[0]), y=float(extent[1]), z=float(extent[2]))
-
+
return bbox
@@ -328,6 +481,192 @@ def pointcloud_to_depth(
return depth_image
+def project_pointcloud_to_image(
+ points: np.ndarray,
+ projection: str = "equirect",
+ image_width: int = 1920,
+ image_height: int = 640,
+ axis_mode: str = "xz",
+ intrinsics: Optional[np.ndarray] = None,
+ depth_filter: bool = False,
+ depth_filter_margin: float = 0.15,
+ transform: Optional[np.ndarray] = None,
+) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ Project a point cloud into image pixel coordinates.
+
+ Args:
+ points: Point coordinates (N, 3).
+ projection: Projection model ("equirect" or "pinhole").
+ image_width: Output image width in pixels.
+ image_height: Output image height in pixels.
+ axis_mode: Equirect axis convention ("xz" or "xy").
+ intrinsics: Pinhole intrinsics (3, 3) if using "pinhole".
+ depth_filter: Enable depth-based occlusion filtering.
+ depth_filter_margin: Depth margin for occlusion filtering (meters).
+ transform: Optional 4x4 transform applied before projection.
+
+ Returns:
+ Tuple of (camera_points, pixel_idx) where pixel_idx is (N, 3) with (u, v, depth).
+ """
+ camera_points = transform_pointcloud(points, transform) if transform is not None else points
+
+ if projection == "pinhole":
+ if intrinsics is None:
+ raise ValueError("intrinsics required for pinhole projection")
+ u = camera_points[:, 0] * intrinsics[0, 0] / (camera_points[:, 2] + 1e-6) + intrinsics[0, 2]
+ v = camera_points[:, 1] * intrinsics[1, 1] / (camera_points[:, 2] + 1e-6) + intrinsics[1, 2]
+ return camera_points, np.stack([u, v, camera_points[:, 2]], axis=-1)
+
+ if axis_mode == "xz":
+ hori_dis = np.sqrt(camera_points[:, 0] ** 2 + camera_points[:, 2] ** 2)
+ u = (
+ image_width / (2 * np.pi) * np.arctan2(camera_points[:, 0], camera_points[:, 2])
+ + image_width / 2
+ + 1
+ )
+ v = (
+ image_width / (2 * np.pi) * np.arctan(camera_points[:, 1] / hori_dis)
+ + image_height / 2
+ + 1
+ )
+ else:
+ hori_dis = np.sqrt(camera_points[:, 0] ** 2 + camera_points[:, 1] ** 2)
+ u = (
+ -image_width / (2 * np.pi) * np.arctan2(camera_points[:, 1], camera_points[:, 0])
+ + image_width / 2
+ + 1
+ )
+ v = (
+ -image_width / (2 * np.pi) * np.arctan2(camera_points[:, 2], hori_dis)
+ + image_height / 2
+ + 1
+ )
+
+ u = np.clip(u, 0, image_width - 1)
+ v = np.clip(v, 0, image_height - 1)
+
+ if depth_filter:
+ depth_map = np.full((image_height, image_width), np.inf)
+ idx = v.astype(int) * image_width + u.astype(int)
+ np.minimum.at(depth_map.ravel(), idx, hori_dis)
+ remove_mask = hori_dis >= depth_map[v.astype(int), u.astype(int)] + depth_filter_margin
+ u = u.copy()
+ v = v.copy()
+ u[remove_mask] = -1
+ v[remove_mask] = -1
+
+ return camera_points, np.stack([u, v, hori_dis], axis=-1)
+
+
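
For intuition on the equirect ("xz") branch: the horizontal angle maps to `u` at `image_width / (2 * pi)` pixels per radian around the center column, and `v` uses the elevation angle with the same scale. A hand-worked point at the default 1920x640 resolution:

```python
import numpy as np

W, H = 1920, 640
x, y, z = 0.5, 0.0, 1.0                   # 0.5 m right, 1 m ahead, on the horizon
hori = np.sqrt(x**2 + z**2)

u = W / (2 * np.pi) * np.arctan2(x, z) + W / 2 + 1     # ~1102.7 px, right of center
v = W / (2 * np.pi) * np.arctan(y / hori) + H / 2 + 1  # 321.0 px, the horizon row
print(u, v)
```
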
+def segment_pointcloud_by_mask(
+ points: np.ndarray,
+ mask: np.ndarray,
+ pixel_idx: np.ndarray,
+) -> np.ndarray:
+ """
+ Segment a point cloud using a binary mask and projected pixels.
+
+ Args:
+ points: Point coordinates (N, 3).
+ mask: Binary mask (H, W).
+ pixel_idx: Pixel coordinates (N, 3) as (u, v, depth).
+
+ Returns:
+ Masked point cloud (M, 3).
+ """
+ h, w = mask.shape[:2]
+ pixel_idx = pixel_idx.astype(int)
+ valid = (
+ (pixel_idx[:, 0] >= 0)
+ & (pixel_idx[:, 0] < w)
+ & (pixel_idx[:, 1] >= 0)
+ & (pixel_idx[:, 1] < h)
+ )
+ pixel_idx = pixel_idx[valid]
+ points = points[valid]
+ keep = mask[pixel_idx[:, 1], pixel_idx[:, 0]].astype(bool)
+ return points[keep]
+
+
+def segment_pointclouds_from_projection(
+ points: np.ndarray,
+ masks: list[np.ndarray],
+ pixel_idx: np.ndarray,
+) -> Tuple[list[np.ndarray], list[np.ndarray]]:
+ """
+ Segment point clouds from projected pixels.
+
+ Args:
+ points: Point coordinates (N, 3).
+ masks: List of binary masks (H, W).
+ pixel_idx: Pixel coordinates (N, 3) as (u, v, depth).
+
+ Returns:
+ Tuple of (clouds, depths) where each entry corresponds to a mask.
+ """
+ if masks is None or len(masks) == 0:
+ return [], []
+
+ image_shape = masks[0].shape
+ valid = (
+ (pixel_idx[:, 0] >= 0)
+ & (pixel_idx[:, 0] < image_shape[1])
+ & (pixel_idx[:, 1] >= 0)
+ & (pixel_idx[:, 1] < image_shape[0])
+ )
+ pixel_idx = pixel_idx[valid]
+ points = points[valid]
+
+ depths = pixel_idx[:, 2]
+ pixel_idx = pixel_idx.astype(int)
+
+ obj_clouds = []
+ obj_depths = []
+ for obj_mask in masks:
+ cloud_mask = obj_mask[pixel_idx[:, 1], pixel_idx[:, 0]].astype(bool)
+ obj_clouds.append(points[cloud_mask])
+ obj_depths.append(depths[cloud_mask].reshape(-1, 1))
+
+ return obj_clouds, obj_depths
+
+
+def transform_segmented_clouds_to_world(
+ clouds: list[np.ndarray],
+ depths: list[np.ndarray],
+ R_b2w: np.ndarray,
+ t_b2w: np.ndarray,
+ depth_jump_threshold: float = 0.3,
+) -> list[np.ndarray]:
+ """
+ Prune depth jumps and transform segmented clouds to world frame.
+
+ Args:
+ clouds: List of segmented point clouds (sensor frame).
+ depths: List of depth arrays aligned with clouds.
+ R_b2w: Body-to-world rotation matrix (3, 3).
+ t_b2w: Body-to-world translation vector (3,).
+ depth_jump_threshold: Depth jump threshold for pruning (meters).
+
+ Returns:
+ List of segmented object clouds in world frame.
+ """
+ obj_cloud_world_list = []
+ for obj_cloud, obj_depth in zip(clouds, depths):
+ if obj_depth.shape[0] > 1:
+ obj_depth_diff = (obj_depth[1:] - obj_depth[:-1]).squeeze()
+ if obj_depth_diff.size > 0 and np.max(obj_depth_diff) > depth_jump_threshold:
+ idx_tmp = np.ones(len(obj_depth), dtype=bool)
+ jump_idx = int(np.argmax(obj_depth_diff))
+ idx_tmp[jump_idx + 1 :] = False
+ obj_cloud = obj_cloud[idx_tmp]
+
+ obj_cloud_world = obj_cloud[:, :3] @ R_b2w.T + t_b2w
+ obj_cloud_world_list.append(obj_cloud_world)
+
+ return obj_cloud_world_list
+
+
def create_pointcloud2_msg(points: np.ndarray, colors: np.ndarray, header):
"""
Create PointCloud2 ROS message from points and colors.
diff --git a/vector_perception_utils/vector_perception_utils/transform_utils.py b/vector_perception_utils/vector_perception_utils/transform_utils.py
index 534d8d2..ef3238e 100644
--- a/vector_perception_utils/vector_perception_utils/transform_utils.py
+++ b/vector_perception_utils/vector_perception_utils/transform_utils.py
@@ -8,7 +8,7 @@
from geometry_msgs.msg import Pose, Quaternion
-def normalize_angle(angle: float) -> float:
+def normalize_angle(angle: float | np.ndarray) -> float | np.ndarray:
"""
Normalize angle to [-pi, pi] range.
@@ -21,6 +21,26 @@ def normalize_angle(angle: float) -> float:
return np.arctan2(np.sin(angle), np.cos(angle))
+def discretize_angles(angles: np.ndarray, num_bins: int) -> np.ndarray:
+ """
+ Discretize angles into bin indices.
+
+ Normalizes angles to [-pi, pi] then maps to bin indices [0, num_bins-1].
+
+ Args:
+ angles: Angles in radians (any range).
+ num_bins: Number of angle bins.
+
+ Returns:
+ Array of bin indices (0 to num_bins-1), same shape as input.
+ """
+ angles = normalize_angle(angles)
+ # Map [-pi, pi] to [0, 2*pi] then to [0, num_bins)
+ normalized = (angles + np.pi) / (2 * np.pi)
+ bins = (normalized * num_bins).astype(int)
+ return np.clip(bins, 0, num_bins - 1)
+
+
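
A quick check of the binning: with 36 bins, -pi falls in bin 0, 0 rad in bin 18, pi/2 in bin 27, and +pi is clipped into bin 35:

```python
import numpy as np

from vector_perception_utils.transform_utils import discretize_angles

angles = np.array([-np.pi, 0.0, np.pi / 2, np.pi])
print(discretize_angles(angles, num_bins=36))   # [ 0 18 27 35]
```
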
def pose_to_matrix(pose: Pose) -> np.ndarray:
"""
Convert pose to 4x4 homogeneous transform matrix.