From 5e0ae5b26c25774528ecb8d5bf2b0fe77252438b Mon Sep 17 00:00:00 2001 From: multyxu Date: Thu, 30 Oct 2025 21:51:46 -0400 Subject: [PATCH 1/9] Initial version of yolov11 --- .../semantic_inference/models/__init__.py | 1 + .../models/instance_segmenter.py | 160 ++++++++++++++ .../semantic_inference/models/wrappers.py | 49 ++++ semantic_inference_ros/CMakeLists.txt | 2 +- .../app/instance_segmentation_node | 209 ++++++++++++++++++ .../config/instance_segmentation/yolov11.yaml | 4 + .../instance_segmentation_yolov11.launch.yaml | 25 +++ 7 files changed, 449 insertions(+), 1 deletion(-) create mode 100644 semantic_inference/python/semantic_inference/models/instance_segmenter.py create mode 100755 semantic_inference_ros/app/instance_segmentation_node create mode 100644 semantic_inference_ros/config/instance_segmentation/yolov11.yaml create mode 100644 semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml diff --git a/semantic_inference/python/semantic_inference/models/__init__.py b/semantic_inference/python/semantic_inference/models/__init__.py index b6874c5..b4dc607 100644 --- a/semantic_inference/python/semantic_inference/models/__init__.py +++ b/semantic_inference/python/semantic_inference/models/__init__.py @@ -35,6 +35,7 @@ from semantic_inference.models.patch_extractor import * from semantic_inference.models.segment_refinement import * from semantic_inference.models.wrappers import * +from semantic_inference.models.instance_segmenter import * def default_device(use_cuda=True): diff --git a/semantic_inference/python/semantic_inference/models/instance_segmenter.py b/semantic_inference/python/semantic_inference/models/instance_segmenter.py new file mode 100644 index 0000000..616501d --- /dev/null +++ b/semantic_inference/python/semantic_inference/models/instance_segmenter.py @@ -0,0 +1,160 @@ +# BSD 3-Clause License +# +# Copyright (c) 2021-2024, Massachusetts Institute of Technology. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +"""Model to segment an image and encode segments with CLIP embeddings.""" + +import dataclasses +from dataclasses import dataclass, field +from typing import Any + +import numpy as np +import torch +import torch.nn.functional as F +from spark_config import Config, config_field +from torch import nn + +from semantic_inference.models.mask_functions import ConstantMask +from semantic_inference.models.patch_extractor import ( + PatchExtractor, + center_crop, + default_normalization_parameters, + get_image_preprocessor, +) +from semantic_inference.models.segment_refinement import SegmentRefinement + +def _map_opt(values, f): + return {k: v if v is None else f(v) for k, v in values.items()} + + +@dataclass +class Results: + """Openset Segmentation Results.""" + + masks: torch.Tensor + boxes: torch.Tensor # bounding boxes for the masks + categories: torch.Tensor + confidences: torch.Tensor + + @property + def instances(self): + """Get instance image (if it exists).""" + if self.masks.shape[0] == 0: + return None + + np_masks = self.masks.numpy() + img = np.zeros(np_masks[0].shape, dtype=np.uint16) + for i in range(self.masks.shape[0]): + # instance ids are 1-indexed + img[np_masks[i, ...] > 0] = i + 1 + + # TODO: 16 + 16 int for instance id and category id + + return img + + def cpu(self): + """Move results to CPU.""" + values = dataclasses.asdict(self) + return Results(**_map_opt(values, lambda v: v.cpu())) + + def to(self, *args, **kwargs): + """Forward to to all tensors.""" + values = dataclasses.asdict(self) + return Results(**_map_opt(values, lambda v: v.to(*args, **kwargs))) + + +@dataclass +class InstanceSegmenterConfig(Config): + """Main config for instance segmenter.""" + + instance_model: Any = config_field("instance_model", default="yolov11") + # relevant configs (model path, model weights) for the model + + +class InstanceSegmenter(nn.Module): + """Module to segment and encode an image.""" + + def __init__(self, config): + """Construct an instance segmenter.""" + super().__init__() + # for detecting model device + self._canary_param = nn.Parameter(torch.empty(0)) + + self.config = config + self.segmenter = self.config.instance_model.create() + # self.segment_refinement = SegmentRefinement(config.refinement) # might be useful clean up inprecise edges + + def eval(self): + """ + Override eval to avoid issues with certain models + """ + self.segmenter.eval() + + @classmethod + def construct(cls, **kwargs): + """Load model from configuration dictionary.""" + config = InstanceSegmenterConfig() + config.update(kwargs) + return cls(config) + + @torch.no_grad() + def segment(self, rgb_img, is_rgb_order=True): + """ + Segment image and compute language embeddings for each mask. + + Args: + img (np.ndarry): uint8 image of shape (R, C, 3) in rgb order + is_rgb_order (bool): whether the image is rgb order or not + + Returns: + Encoded image + """ + img = rgb_img if is_rgb_order else rgb_img[:, :, ::-1].copy() + return self(img) + + @property + def device(self): + """Get current model device.""" + return self._canary_param.device + + def forward(self, rgb_img): + """ + Segment image and compute language embeddings for each mask. 
+ + Args: + img (np.ndarray): uint8 image of shape (R, C, 3) in rgb order + + Returns: + Encoded image + """ + categories, masks, boxes, confidences = self.segmenter(rgb_img) + + # img = torch.from_numpy(rgb_img).to(self.device) + # return self.encode(img, masks, boxes) + # TODO: return the results of the actual instance segmentation model here + return Results(masks=masks, boxes=boxes, categories=categories, confidences=confidences) diff --git a/semantic_inference/python/semantic_inference/models/wrappers.py b/semantic_inference/python/semantic_inference/models/wrappers.py index 2457ae1..b1b13ff 100644 --- a/semantic_inference/python/semantic_inference/models/wrappers.py +++ b/semantic_inference/python/semantic_inference/models/wrappers.py @@ -335,3 +335,52 @@ class OpenClipConfig(Config): def load(cls, filepath): """Load config from file.""" return Config.load(cls, filepath) + + +class Yolov11InstanceSegmenterWrapper(nn.Module): + """Yolov11 instance segmentation wrapper.""" + + def __init__(self, config): + """Load Yolov11 model.""" + super().__init__() + from ultralytics import YOLO + + self.config = config + self.model = YOLO(config.model_name) + + def eval(self): + """ + override eval to avoid issues with yolo model + """ + self.model.model.eval() + + @classmethod + def construct(cls, **kwargs): + """Load model from configuration dictionary.""" + config = Yolov11InstanceSegmenterConfig() + config.update(kwargs) + return cls(config) + + def forward(self, img): + """Segment image.""" + result = self.model(img)[0] # assume batch size 1 + if result.masks is None: + return None, None, None, None + categories = result.boxes.cls # int8 + masks = result.masks.data.to(torch.bool) # + boxes = result.boxes.xyxy # float32 + confidences = result.boxes.conf # float32 + # assume the instance id is the index in the result? 
+ return categories, masks, boxes, confidences + +@register_config("instance_model", name="yolov11", constructor=Yolov11InstanceSegmenterWrapper) +@dataclasses.dataclass +class Yolov11InstanceSegmenterConfig(Config): + """Configuration for Yolov11 instance segmenter.""" + + model_name: str = "yolo11n-seg.pt" + + @classmethod + def load(cls, filepath): + """Load config from file.""" + return Config.load(cls, filepath) \ No newline at end of file diff --git a/semantic_inference_ros/CMakeLists.txt b/semantic_inference_ros/CMakeLists.txt index e71d1b7..3917bbc 100644 --- a/semantic_inference_ros/CMakeLists.txt +++ b/semantic_inference_ros/CMakeLists.txt @@ -74,7 +74,7 @@ install( LIBRARY DESTINATION lib RUNTIME DESTINATION lib/${PROJECT_NAME} ) -install(PROGRAMS app/image_embedding_node app/open_set_node app/text_embedding_node +install(PROGRAMS app/image_embedding_node app/open_set_node app/text_embedding_node app/instance_segmentation_node DESTINATION lib/${PROJECT_NAME} ) install(DIRECTORY include/${PROJECT_NAME}/ DESTINATION include/${PROJECT_NAME}/) diff --git a/semantic_inference_ros/app/instance_segmentation_node b/semantic_inference_ros/app/instance_segmentation_node new file mode 100755 index 0000000..e1f72d3 --- /dev/null +++ b/semantic_inference_ros/app/instance_segmentation_node @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +"""Node that runs openset segmentation.""" + +import pathlib +from dataclasses import dataclass, field +from typing import Any + +import rclpy +import spark_config as sc +import torch +from rclpy.node import Node +from sensor_msgs.msg import Image + +import semantic_inference.models as models +import semantic_inference_ros +from semantic_inference_msgs.msg import FeatureImage, FeatureVectorStamped +from semantic_inference_ros import Conversions, ImageWorkerConfig + +# additional imports that may not be here +import numpy as np +import cv2 + +@dataclass +class InstanceSegmentationNodeConfig(sc.Config): + """Configuration for ClipPublisherNode.""" + + worker: ImageWorkerConfig = field(default_factory=ImageWorkerConfig) + model: models.InstanceSegmenterConfig = field( + default_factory=models.InstanceSegmenterConfig + ) # create InstanceSegmenterConfig in models + # visualizer: Any = sc.config_field( + # "feature_visualizer", default="component", required=False + # ) + + +class InstanceSegmentationNode(Node): + """Node to run instance segmentation.""" + + def __init__(self): + """Start subscriber and publisher.""" + super().__init__("instance_segmentation_node") + config_path = ( + self.declare_parameter("config_path", "").get_parameter_value().string_value + ) + config_path = pathlib.Path(config_path).expanduser().absolute() + if not config_path.exists() and config_path != "": + self.get_logger().warn(f"config path '{config_path}' does not exist!") + self.config = InstanceSegmentationNodeConfig() + else: + self.config = sc.Config.load(InstanceSegmentationNodeConfig, config_path) + + self.get_logger().info(f"Initializing with {self.config.show()}") + device = models.default_device() + self._model = models.InstanceSegmenter(self.config.model).to(device) + self._model.eval() # TODO: causing issue with yolo model + self.get_logger().info("Finished initializing!") + + self._pub = self.create_publisher(Image, "semantic/image_raw", 1) # publish segmented image + self._worker = semantic_inference_ros.ImageWorker( + self, self.config.worker, "color/image_raw", self._spin_once + ) # put image in queue for processing + # self._embedder = 
semantic_inference_ros.PromptEncoder(self, self._model.encoder) + # could be relavent if use GroundedSAM, put flexible label space in config + + self._visualizer = "place_holder" + # self._visualizer = self.config.visualizer.create() + if self._visualizer is not None: + # TODO: write proper visualizer for instance segmentation + self._color_pub = self.create_publisher( + Image, "semantic_color/image_raw", 1 + ) + + def _spin_once(self, header, img): + with torch.no_grad(): + ret = self._model.segment(img, is_rgb_order=True).cpu() + + if ret.masks is None: + self.get_logger().debug("No masks detected in the image.") + return + + instance_seg_img = self.convert_to_instance_seg_img(ret) + # Convert to int32 to match 32SC1 encoding expected by cv_bridge + instance_seg_img = instance_seg_img.astype(np.int32) + msg = Conversions.to_image_msg(header, instance_seg_img, encoding="32SC1") + self._pub.publish(msg) + self.get_logger().debug("Published instance segmentation image.") + + if self._visualizer is not None: + # color_img = self._visualizer.call(ret) + color_img = self.visualizer_call(ret, img) + self._color_pub.publish( + Conversions.to_image_msg(header, color_img, encoding="rgb8") + ) + + def stop(self): + """Stop the underlying image worker.""" + self._worker.stop() + + def convert_to_instance_seg_img(self, ret): + ''' + Convert segmentation results to instance segmentation image. + Each pixel value encodes both category id and instance id. + First 16 bits are category id, last 16 bits are instance id. + ''' + masks = ret.masks.cpu().numpy() + category_ids = ret.categories.cpu().numpy() + img = np.zeros(masks[0].shape, dtype=np.uint32) + for i in range(masks.shape[0]): + category_id = int(category_ids[i]) # category id are 0-indexed + instance_id = i + 1 # instance ids are 1-indexed + combined_id = (category_id << 16) | instance_id # combine into single uint32 + img[masks[i, ...] > 0] = combined_id + + return img + + def recover_instance_and_category(self, instance_seg_img): + ''' + Recover instance ids and category ids from instance segmentation image. + ''' + instance_ids = (instance_seg_img & 0xFFFF).astype(np.uint16) # last 16 bits + category_ids = (instance_seg_img >> 16).astype(np.uint16) # first 16 bits + return instance_ids, category_ids + + def visualizer_call(self, ret, img): + ''' + Process the result from yolo instance segmenter and generate color image. + The returned color image contain bounding boxes, masks, and category labels. 
+ ''' + + categories = ret.categories + masks = ret.masks + boxes = ret.boxes + confidences = ret.confidences + + # TODO: place holder directly from model, need to be replace by proper yaml + category_names = self._model.segmenter.model.names + + # Convert RGB to BGR for OpenCV + vis_img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + + # Generate random colors for each class + np.random.seed(42) # for consistent colors + colors = np.random.randint(0, 255, size=(len(category_names), 3), dtype=np.uint8) + + # Overlay segmentation masks + if masks is not None: + for i, mask_tensor in enumerate(masks.data): + box = boxes[i] + cls = int(categories[i].cpu().numpy()) + + # Get color for the class + color = colors[cls].tolist() + + # Get mask and resize it to the image dimensions + mask_np = mask_tensor.cpu().numpy().astype(np.uint8) + mask_resized = cv2.resize(mask_np, (vis_img_bgr.shape[1], vis_img_bgr.shape[0]), interpolation=cv2.INTER_NEAREST) + + # Find contours to create a mask overlay + contours, _ = cv2.findContours(mask_resized, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + # Create a transparent overlay + overlay = vis_img_bgr.copy() + cv2.drawContours(overlay, contours, -1, color, -1) + + # Blend the overlay with the original image + alpha = 0.5 + vis_img_bgr = cv2.addWeighted(overlay, alpha, vis_img_bgr, 1 - alpha, 0) + + # Draw bounding boxes and labels + for i, box in enumerate(boxes): + x1, y1, x2, y2 = map(int, box.cpu().numpy()) + conf = confidences[i].cpu().numpy() + cls = int(categories[i].cpu().numpy()) + label = f"{category_names[cls]} {conf:.2f}" + + color = colors[cls].tolist() + + # Draw bounding box + cv2.rectangle(vis_img_bgr, (x1, y1), (x2, y2), color, 2) + + # Put label above the bounding box + (label_width, label_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2) + cv2.rectangle(vis_img_bgr, (x1, y1 - label_height - 10), (x1 + label_width, y1), color, -1) + cv2.putText(vis_img_bgr, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + + # Convert BGR back to RGB for displaying with matplotlib + vis_img_rgb = cv2.cvtColor(vis_img_bgr, cv2.COLOR_BGR2RGB) + + return vis_img_rgb + +def main(): + """Start a node.""" + rclpy.init() + + node = None + try: + node = InstanceSegmentationNode() + semantic_inference_ros.setup_ros_log_forwarding(node) + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + rclpy.try_shutdown() + if node is not None: + node.stop() + + +if __name__ == "__main__": + main() diff --git a/semantic_inference_ros/config/instance_segmentation/yolov11.yaml b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml new file mode 100644 index 0000000..8e5aab5 --- /dev/null +++ b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml @@ -0,0 +1,4 @@ +--- +instance_model: + type: yolov11 + model_name: yolo11n-seg.pt \ No newline at end of file diff --git a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml new file mode 100644 index 0000000..13036ab --- /dev/null +++ b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml @@ -0,0 +1,25 @@ +--- +launch: + - arg: {name: config_path, default: $(find-pkg-share semantic_inference_ros)/config/instance_segmentation/yolov11.yaml, description: Configuration file for instance segmentation object detector} + - arg: {name: compressed_rgb, default: 'false', description: Triggers decompression for RGB stream} + - arg: {name: log-level, 
default: info, description: Set the ROS2 log level} + - node: + if: $(var compressed_rgb) + pkg: image_transport + exec: republish + name: decompress_rgb + param: + - {name: in_transport, value: compressed} + - {name: out_transport, value: raw} + remap: + - {from: in/compressed, to: /acl_jackal/forward/color/image_raw/compressed} + - {from: out, to: color/image_raw} + - node: + pkg: semantic_inference_ros + exec: instance_segmentation_node + name: semantic_inference + on_exit: shutdown + param: + - {name: config_path, value: $(var config_path), type: str} + args: > + --ros-args --log-level $(var log-level) From 27bc19079a6dd9b4513d010c94f35668bc681811 Mon Sep 17 00:00:00 2001 From: multyxu Date: Tue, 4 Nov 2025 17:18:55 -0500 Subject: [PATCH 2/9] close set instance segmentation with python environment --- semantic_inference_ros/app/instance_segmentation_node | 7 ++++--- .../config/instance_segmentation/yolov11.yaml | 7 ++++--- .../launch/instance_segmentation_yolov11.launch.yaml | 4 +++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/semantic_inference_ros/app/instance_segmentation_node b/semantic_inference_ros/app/instance_segmentation_node index e1f72d3..0197e2b 100755 --- a/semantic_inference_ros/app/instance_segmentation_node +++ b/semantic_inference_ros/app/instance_segmentation_node @@ -47,6 +47,7 @@ class InstanceSegmentationNode(Node): self.get_logger().warn(f"config path '{config_path}' does not exist!") self.config = InstanceSegmentationNodeConfig() else: + self.get_logger().info(f"Loading config from '{config_path}'") self.config = sc.Config.load(InstanceSegmentationNodeConfig, config_path) self.get_logger().info(f"Initializing with {self.config.show()}") @@ -66,8 +67,8 @@ class InstanceSegmentationNode(Node): # self._visualizer = self.config.visualizer.create() if self._visualizer is not None: # TODO: write proper visualizer for instance segmentation - self._color_pub = self.create_publisher( - Image, "semantic_color/image_raw", 1 + self._overlay_pub = self.create_publisher( + Image, "semantic_overlay/image_raw", 1 ) def _spin_once(self, header, img): @@ -88,7 +89,7 @@ class InstanceSegmentationNode(Node): if self._visualizer is not None: # color_img = self._visualizer.call(ret) color_img = self.visualizer_call(ret, img) - self._color_pub.publish( + self._overlay_pub.publish( Conversions.to_image_msg(header, color_img, encoding="rgb8") ) diff --git a/semantic_inference_ros/config/instance_segmentation/yolov11.yaml b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml index 8e5aab5..c0f5ac9 100644 --- a/semantic_inference_ros/config/instance_segmentation/yolov11.yaml +++ b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml @@ -1,4 +1,5 @@ --- -instance_model: - type: yolov11 - model_name: yolo11n-seg.pt \ No newline at end of file +model: + instance_model: + type: yolov11 + model_name: yolo11n-seg.pt \ No newline at end of file diff --git a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml index 13036ab..816a454 100644 --- a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml +++ b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml @@ -1,5 +1,6 @@ --- launch: + - arg: {name: instance_seg_env, default: /home/multyxu/environments/crisp, description: Path to instance segmentation environment} - arg: {name: config_path, default: $(find-pkg-share 
semantic_inference_ros)/config/instance_segmentation/yolov11.yaml, description: Configuration file for instance segmentation object detector} - arg: {name: compressed_rgb, default: 'false', description: Triggers decompression for RGB stream} - arg: {name: log-level, default: info, description: Set the ROS2 log level} @@ -14,10 +15,11 @@ launch: remap: - {from: in/compressed, to: /acl_jackal/forward/color/image_raw/compressed} - {from: out, to: color/image_raw} - - node: + - pyenv_node: pkg: semantic_inference_ros exec: instance_segmentation_node name: semantic_inference + pyenv: $(var instance_seg_env) on_exit: shutdown param: - {name: config_path, value: $(var config_path), type: str} From 842e48e53ba1958218c91ed305e564bb1da52f0c Mon Sep 17 00:00:00 2001 From: multyxu Date: Tue, 4 Nov 2025 20:38:24 -0500 Subject: [PATCH 3/9] pre-commit fix --- .../semantic_inference/models/__init__.py | 2 +- .../models/instance_segmenter.py | 28 ++--- .../semantic_inference/models/wrappers.py | 19 +-- .../app/instance_segmentation_node | 116 +++++++++++------- .../config/instance_segmentation/yolov11.yaml | 2 +- .../instance_segmentation_yolov11.launch.yaml | 2 +- 6 files changed, 95 insertions(+), 74 deletions(-) diff --git a/semantic_inference/python/semantic_inference/models/__init__.py b/semantic_inference/python/semantic_inference/models/__init__.py index b4dc607..0a0a069 100644 --- a/semantic_inference/python/semantic_inference/models/__init__.py +++ b/semantic_inference/python/semantic_inference/models/__init__.py @@ -30,12 +30,12 @@ import torch from semantic_inference.models.feature_visualizers import * +from semantic_inference.models.instance_segmenter import * from semantic_inference.models.mask_functions import * from semantic_inference.models.openset_segmenter import * from semantic_inference.models.patch_extractor import * from semantic_inference.models.segment_refinement import * from semantic_inference.models.wrappers import * -from semantic_inference.models.instance_segmenter import * def default_device(use_cuda=True): diff --git a/semantic_inference/python/semantic_inference/models/instance_segmenter.py b/semantic_inference/python/semantic_inference/models/instance_segmenter.py index 616501d..0c19136 100644 --- a/semantic_inference/python/semantic_inference/models/instance_segmenter.py +++ b/semantic_inference/python/semantic_inference/models/instance_segmenter.py @@ -30,23 +30,14 @@ """Model to segment an image and encode segments with CLIP embeddings.""" import dataclasses -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any import numpy as np import torch -import torch.nn.functional as F from spark_config import Config, config_field from torch import nn -from semantic_inference.models.mask_functions import ConstantMask -from semantic_inference.models.patch_extractor import ( - PatchExtractor, - center_crop, - default_normalization_parameters, - get_image_preprocessor, -) -from semantic_inference.models.segment_refinement import SegmentRefinement def _map_opt(values, f): return {k: v if v is None else f(v) for k, v in values.items()} @@ -57,9 +48,9 @@ class Results: """Openset Segmentation Results.""" masks: torch.Tensor - boxes: torch.Tensor # bounding boxes for the masks + boxes: torch.Tensor # bounding boxes for the masks categories: torch.Tensor - confidences: torch.Tensor + confidences: torch.Tensor @property def instances(self): @@ -74,7 +65,7 @@ def instances(self): img[np_masks[i, ...] 
> 0] = i + 1 # TODO: 16 + 16 int for instance id and category id - + return img def cpu(self): @@ -107,10 +98,9 @@ def __init__(self, config): self.config = config self.segmenter = self.config.instance_model.create() - # self.segment_refinement = SegmentRefinement(config.refinement) # might be useful clean up inprecise edges - + def eval(self): - """ + """ Override eval to avoid issues with certain models """ self.segmenter.eval() @@ -153,8 +143,10 @@ def forward(self, rgb_img): Encoded image """ categories, masks, boxes, confidences = self.segmenter(rgb_img) - + # img = torch.from_numpy(rgb_img).to(self.device) # return self.encode(img, masks, boxes) # TODO: return the results of the actual instance segmentation model here - return Results(masks=masks, boxes=boxes, categories=categories, confidences=confidences) + return Results( + masks=masks, boxes=boxes, categories=categories, confidences=confidences + ) diff --git a/semantic_inference/python/semantic_inference/models/wrappers.py b/semantic_inference/python/semantic_inference/models/wrappers.py index b1b13ff..98c6d51 100644 --- a/semantic_inference/python/semantic_inference/models/wrappers.py +++ b/semantic_inference/python/semantic_inference/models/wrappers.py @@ -347,7 +347,7 @@ def __init__(self, config): self.config = config self.model = YOLO(config.model_name) - + def eval(self): """ override eval to avoid issues with yolo model @@ -363,17 +363,20 @@ def construct(cls, **kwargs): def forward(self, img): """Segment image.""" - result = self.model(img)[0] # assume batch size 1 + result = self.model(img)[0] # assume batch size 1 if result.masks is None: return None, None, None, None - categories = result.boxes.cls # int8 - masks = result.masks.data.to(torch.bool) # - boxes = result.boxes.xyxy # float32 - confidences = result.boxes.conf # float32 + categories = result.boxes.cls # int8 + masks = result.masks.data.to(torch.bool) # + boxes = result.boxes.xyxy # float32 + confidences = result.boxes.conf # float32 # assume the instance id is the index in the result? 
return categories, masks, boxes, confidences -@register_config("instance_model", name="yolov11", constructor=Yolov11InstanceSegmenterWrapper) + +@register_config( + "instance_model", name="yolov11", constructor=Yolov11InstanceSegmenterWrapper +) @dataclasses.dataclass class Yolov11InstanceSegmenterConfig(Config): """Configuration for Yolov11 instance segmenter.""" @@ -383,4 +386,4 @@ class Yolov11InstanceSegmenterConfig(Config): @classmethod def load(cls, filepath): """Load config from file.""" - return Config.load(cls, filepath) \ No newline at end of file + return Config.load(cls, filepath) diff --git a/semantic_inference_ros/app/instance_segmentation_node b/semantic_inference_ros/app/instance_segmentation_node index 0197e2b..eaf5f76 100755 --- a/semantic_inference_ros/app/instance_segmentation_node +++ b/semantic_inference_ros/app/instance_segmentation_node @@ -3,8 +3,11 @@ import pathlib from dataclasses import dataclass, field -from typing import Any +import cv2 + +# additional imports that may not be here +import numpy as np import rclpy import spark_config as sc import torch @@ -13,12 +16,8 @@ from sensor_msgs.msg import Image import semantic_inference.models as models import semantic_inference_ros -from semantic_inference_msgs.msg import FeatureImage, FeatureVectorStamped from semantic_inference_ros import Conversions, ImageWorkerConfig -# additional imports that may not be here -import numpy as np -import cv2 @dataclass class InstanceSegmentationNodeConfig(sc.Config): @@ -27,7 +26,7 @@ class InstanceSegmentationNodeConfig(sc.Config): worker: ImageWorkerConfig = field(default_factory=ImageWorkerConfig) model: models.InstanceSegmenterConfig = field( default_factory=models.InstanceSegmenterConfig - ) # create InstanceSegmenterConfig in models + ) # create InstanceSegmenterConfig in models # visualizer: Any = sc.config_field( # "feature_visualizer", default="component", required=False # ) @@ -53,20 +52,20 @@ class InstanceSegmentationNode(Node): self.get_logger().info(f"Initializing with {self.config.show()}") device = models.default_device() self._model = models.InstanceSegmenter(self.config.model).to(device) - self._model.eval() # TODO: causing issue with yolo model + self._model.eval() # TODO: causing issue with yolo model self.get_logger().info("Finished initializing!") - self._pub = self.create_publisher(Image, "semantic/image_raw", 1) # publish segmented image + self._pub = self.create_publisher( + Image, "semantic/image_raw", 1 + ) # publish segmented image self._worker = semantic_inference_ros.ImageWorker( self, self.config.worker, "color/image_raw", self._spin_once - ) # put image in queue for processing - # self._embedder = semantic_inference_ros.PromptEncoder(self, self._model.encoder) - # could be relavent if use GroundedSAM, put flexible label space in config - + ) # put image in queue for processing + self._visualizer = "place_holder" # self._visualizer = self.config.visualizer.create() if self._visualizer is not None: - # TODO: write proper visualizer for instance segmentation + # TODO: write proper visualizer for instance segmentation self._overlay_pub = self.create_publisher( Image, "semantic_overlay/image_raw", 1 ) @@ -74,7 +73,7 @@ class InstanceSegmentationNode(Node): def _spin_once(self, header, img): with torch.no_grad(): ret = self._model.segment(img, is_rgb_order=True).cpu() - + if ret.masks is None: self.get_logger().debug("No masks detected in the image.") return @@ -96,73 +95,83 @@ class InstanceSegmentationNode(Node): def stop(self): """Stop the underlying 
image worker.""" self._worker.stop() - + def convert_to_instance_seg_img(self, ret): - ''' + """ Convert segmentation results to instance segmentation image. Each pixel value encodes both category id and instance id. First 16 bits are category id, last 16 bits are instance id. - ''' + """ masks = ret.masks.cpu().numpy() category_ids = ret.categories.cpu().numpy() img = np.zeros(masks[0].shape, dtype=np.uint32) for i in range(masks.shape[0]): - category_id = int(category_ids[i]) # category id are 0-indexed + category_id = int(category_ids[i]) # category id are 0-indexed instance_id = i + 1 # instance ids are 1-indexed - combined_id = (category_id << 16) | instance_id # combine into single uint32 + combined_id = ( + category_id << 16 + ) | instance_id # combine into single uint32 img[masks[i, ...] > 0] = combined_id return img - + def recover_instance_and_category(self, instance_seg_img): - ''' + """ Recover instance ids and category ids from instance segmentation image. - ''' + """ instance_ids = (instance_seg_img & 0xFFFF).astype(np.uint16) # last 16 bits - category_ids = (instance_seg_img >> 16).astype(np.uint16) # first 16 bits + category_ids = (instance_seg_img >> 16).astype(np.uint16) # first 16 bits return instance_ids, category_ids def visualizer_call(self, ret, img): - ''' - Process the result from yolo instance segmenter and generate color image. + """ + Process the result from yolo instance segmenter and generate color image. The returned color image contain bounding boxes, masks, and category labels. - ''' - + """ + categories = ret.categories masks = ret.masks boxes = ret.boxes confidences = ret.confidences - + # TODO: place holder directly from model, need to be replace by proper yaml category_names = self._model.segmenter.model.names - + # Convert RGB to BGR for OpenCV vis_img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Generate random colors for each class - np.random.seed(42) # for consistent colors - colors = np.random.randint(0, 255, size=(len(category_names), 3), dtype=np.uint8) + np.random.seed(42) # for consistent colors + colors = np.random.randint( + 0, 255, size=(len(category_names), 3), dtype=np.uint8 + ) # Overlay segmentation masks if masks is not None: for i, mask_tensor in enumerate(masks.data): box = boxes[i] cls = int(categories[i].cpu().numpy()) - + # Get color for the class color = colors[cls].tolist() - + # Get mask and resize it to the image dimensions mask_np = mask_tensor.cpu().numpy().astype(np.uint8) - mask_resized = cv2.resize(mask_np, (vis_img_bgr.shape[1], vis_img_bgr.shape[0]), interpolation=cv2.INTER_NEAREST) - + mask_resized = cv2.resize( + mask_np, + (vis_img_bgr.shape[1], vis_img_bgr.shape[0]), + interpolation=cv2.INTER_NEAREST, + ) + # Find contours to create a mask overlay - contours, _ = cv2.findContours(mask_resized, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - + contours, _ = cv2.findContours( + mask_resized, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) + # Create a transparent overlay overlay = vis_img_bgr.copy() cv2.drawContours(overlay, contours, -1, color, -1) - + # Blend the overlay with the original image alpha = 0.5 vis_img_bgr = cv2.addWeighted(overlay, alpha, vis_img_bgr, 1 - alpha, 0) @@ -173,22 +182,39 @@ class InstanceSegmentationNode(Node): conf = confidences[i].cpu().numpy() cls = int(categories[i].cpu().numpy()) label = f"{category_names[cls]} {conf:.2f}" - + color = colors[cls].tolist() - + # Draw bounding box cv2.rectangle(vis_img_bgr, (x1, y1), (x2, y2), color, 2) - + # Put label above the bounding box - (label_width, 
label_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2) - cv2.rectangle(vis_img_bgr, (x1, y1 - label_height - 10), (x1 + label_width, y1), color, -1) - cv2.putText(vis_img_bgr, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + (label_width, label_height), baseline = cv2.getTextSize( + label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2 + ) + cv2.rectangle( + vis_img_bgr, + (x1, y1 - label_height - 10), + (x1 + label_width, y1), + color, + -1, + ) + cv2.putText( + vis_img_bgr, + label, + (x1, y1 - 5), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + ) # Convert BGR back to RGB for displaying with matplotlib vis_img_rgb = cv2.cvtColor(vis_img_bgr, cv2.COLOR_BGR2RGB) - + return vis_img_rgb + def main(): """Start a node.""" rclpy.init() diff --git a/semantic_inference_ros/config/instance_segmentation/yolov11.yaml b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml index c0f5ac9..e67c3d3 100644 --- a/semantic_inference_ros/config/instance_segmentation/yolov11.yaml +++ b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml @@ -2,4 +2,4 @@ model: instance_model: type: yolov11 - model_name: yolo11n-seg.pt \ No newline at end of file + model_name: yolo11n-seg.pt diff --git a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml index 816a454..261eee1 100644 --- a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml +++ b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml @@ -23,5 +23,5 @@ launch: on_exit: shutdown param: - {name: config_path, value: $(var config_path), type: str} - args: > + args: >- --ros-args --log-level $(var log-level) From 8a8610ffb1dcd33e4ad3a360d038f66171d2f0e6 Mon Sep 17 00:00:00 2001 From: multyxu Date: Thu, 13 Nov 2025 00:14:03 -0500 Subject: [PATCH 4/9] Add grounded sam 2 as an instance segmenter. Fix yaml to use environment variable for home instead of hardcoding it. --- .../models/instance_segmenter.py | 14 +- .../semantic_inference/models/wrappers.py | 188 +++++++++++++++++- .../app/instance_segmentation_node | 3 +- .../config/instance_segmentation/gdsam2.yaml | 12 ++ .../instance_segmentation_gdsam2.launch.yaml | 27 +++ .../instance_segmentation_yolov11.launch.yaml | 2 +- 6 files changed, 236 insertions(+), 10 deletions(-) create mode 100644 semantic_inference_ros/config/instance_segmentation/gdsam2.yaml create mode 100644 semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml diff --git a/semantic_inference/python/semantic_inference/models/instance_segmenter.py b/semantic_inference/python/semantic_inference/models/instance_segmenter.py index 0c19136..39c637f 100644 --- a/semantic_inference/python/semantic_inference/models/instance_segmenter.py +++ b/semantic_inference/python/semantic_inference/models/instance_segmenter.py @@ -47,10 +47,11 @@ def _map_opt(values, f): class Results: """Openset Segmentation Results.""" - masks: torch.Tensor - boxes: torch.Tensor # bounding boxes for the masks - categories: torch.Tensor - confidences: torch.Tensor + # all on cuda/tensor device? 
TODO: Maybe should move to cpu by default + masks: torch.Tensor # (n, H, W), torch.bool + boxes: torch.Tensor # (n, 4) xyxy format, torch.float32 + categories: torch.Tensor # (n,), torch.float32/int64 (doesn't matter) + confidences: torch.Tensor # (n,), torch.float32 @property def instances(self): @@ -131,6 +132,11 @@ def segment(self, rgb_img, is_rgb_order=True): def device(self): """Get current model device.""" return self._canary_param.device + + @property + def category_names(self): + """Get category names.""" + return self.segmenter.category_names def forward(self, rgb_img): """ diff --git a/semantic_inference/python/semantic_inference/models/wrappers.py b/semantic_inference/python/semantic_inference/models/wrappers.py index 98c6d51..fd5599c 100644 --- a/semantic_inference/python/semantic_inference/models/wrappers.py +++ b/semantic_inference/python/semantic_inference/models/wrappers.py @@ -31,10 +31,14 @@ import dataclasses +import os import einops +import numpy as np import torch import torch.nn as nn import torchvision +from torchvision.ops import box_convert + from spark_config import Config, register_config from semantic_inference import root_path @@ -44,6 +48,10 @@ def models_path(): """Get path to pre-trained weight storage.""" return root_path().parent.parent / "models" +def path_to_dot_semantic_inference(): + """Get path to ~/.semantic_inference directory.""" + return os.getenv("HOME") + "/.semantic_inference" + class FastSAMSegmentation(nn.Module): """Fast SAM wrapper.""" @@ -353,6 +361,11 @@ def eval(self): override eval to avoid issues with yolo model """ self.model.model.eval() + + @property + def category_names(self): + """Get category names.""" + return self.model.names @classmethod def construct(cls, **kwargs): @@ -366,10 +379,10 @@ def forward(self, img): result = self.model(img)[0] # assume batch size 1 if result.masks is None: return None, None, None, None - categories = result.boxes.cls # int8 - masks = result.masks.data.to(torch.bool) # - boxes = result.boxes.xyxy # float32 - confidences = result.boxes.conf # float32 + categories = result.boxes.cls.cpu() # int8 + masks = result.masks.data.to(torch.bool).cpu() # + boxes = result.boxes.xyxy.cpu() # float32 + confidences = result.boxes.conf.cpu() # float32 # assume the instance id is the index in the result? 
return categories, masks, boxes, confidences @@ -387,3 +400,170 @@ class Yolov11InstanceSegmenterConfig(Config): def load(cls, filepath): """Load config from file.""" return Config.load(cls, filepath) + + +class GDSam2InstanceSegmenterWrapper(nn.Module): + """Grounded SAM 2 instance segmentation wrapper.""" + + def __init__(self, config): + """Load Grounded SAM 2 model.""" + super().__init__() + from sam2.build_sam import build_sam2 + from sam2.sam2_image_predictor import SAM2ImagePredictor + from groundingdino.util.inference import load_model + + self.config = config + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.text_prompt = config.text_prompt + self.multimask_output = config.multimask_output + + sam2_model_config_path = os.path.join( + "configs/sam2.1", config.sam2_model_config + ) + sam2_checkpoint_path = os.path.join( + path_to_dot_semantic_inference(), config.sam2_checkpoint + ) # this uses hydra config packages so only need relative path to the pkg installation dir + grounding_dino_config_path = os.path.join( + path_to_dot_semantic_inference(), "gdsam2_config", config.grounding_dino_config + ) + grounding_dino_checkpoint_path = os.path.join( + path_to_dot_semantic_inference(), config.grounding_dino_checkpoint + ) + + # build SAM2 image predictor + self.sam2_model = build_sam2( + sam2_model_config_path, + sam2_checkpoint_path + ) + self.sam2_predictor = SAM2ImagePredictor(self.sam2_model) + + # build grounding dino model + self.grounding_model = load_model( + model_config_path=grounding_dino_config_path, + model_checkpoint_path=grounding_dino_checkpoint_path, + device=self.device + ) + + # convert text prompt to category names + self.category_names = self.text_prompt.lower().split('. ') + self.category_names = [cat.strip() for cat in self.category_names if len(cat.strip()) > 0] + self.category_names[-1] = self.category_names[-1].rstrip('.') # remove the last dot if any + + def preprocess_image(self, img): + """Preprocess image for Grounded SAM 2. 
+ Input: + - img np.ndarray (H, W, C) uint8 + Output: + - image_transformed torch.Tensor (C, H, W) float32 + """ + import groundingdino.datasets.transforms as T + transform = T.Compose( + [ + T.RandomResize([800], max_size=1333), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + pil_img = torchvision.transforms.ToPILImage()(img) + image_transformed, _ = transform(pil_img, None) + return image_transformed + + @classmethod + def construct(cls, **kwargs): + """Load model from configuration dictionary.""" + config = GDSam2InstanceSegmenterConfig() + config.update(kwargs) + return cls(config) + + def forward(self, img): + """Segment image.""" + from groundingdino.util.inference import load_image, predict + + # preprocess img + img_transformed = self.preprocess_image(img) + + # gdino prediction + boxes, confidences, labels = predict( + model=self.grounding_model, + image=img_transformed, + caption=self.text_prompt, + box_threshold=self.config.box_threshold, + text_threshold=self.config.text_threshold, + device=self.device + ) + + # if nothing detected + if boxes.shape[0] == 0: + return None, None, None, None + + # process the box prompt for SAM 2 + h, w, _ = img.shape + boxes = boxes * torch.Tensor([w, h, w, h]) + input_boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() + + # FIXME: figure how does this influence the G-DINO model (from offical gdsam2 demo) + # torch.autocast(device_type=self.device.type, dtype=torch.bfloat16).__enter__() + # if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8: + # # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices) + # torch.backends.cuda.matmul.allow_tf32 = True + # torch.backends.cudnn.allow_tf32 = True + + # SAM 2 predicts mask + self.sam2_predictor.set_image(img) + masks, scores, logits = self.sam2_predictor.predict( + point_coords=None, + point_labels=None, + box=input_boxes, + multimask_output=self.multimask_output, + ) + + # Sample best according to scores if multimask output + if self.multimask_output: + best = np.argmax(scores, axis=1) + masks = masks[np.arange(masks.shape[0]), best] + + # convert the shape to (n, H, W) + if masks.ndim == 4: + masks = masks.squeeze(1) + + # convert string labels to indexes based on the text prompt + categories = [] + for label in labels: + label_str = label.lower() + if label_str in self.category_names: + label_indx = self.category_names.index(label_str) + else: + label_indx = -1 # unknown + categories.append(label_indx) + categories = torch.tensor(categories) + + # convert masks to boolean + masks = masks.astype(bool) + masks = torch.tensor(masks) + + # use xyxy boxes + boxes = torch.tensor(input_boxes) + + return categories, masks, boxes, confidences + + +@register_config( + "instance_model", name="gdsam2", constructor=GDSam2InstanceSegmenterWrapper +) +@dataclasses.dataclass +class GDSam2InstanceSegmenterConfig(Config): + """Configuration for Grounded SAM 2 instance segmenter.""" + + text_prompt: str = "car. tire." 
+ sam2_checkpoint: str = "sam2.1_hiera_large.pt" + sam2_model_config: str = "sam2.1_hiera_l.yaml" + grounding_dino_config: str = "GroundingDINO_SwinT_OGC.py" + grounding_dino_checkpoint: str = "groundingdino_swint_ogc.pth" + box_threshold: float = 0.35 + text_threshold: float = 0.25 + multimask_output: bool = False + + @classmethod + def load(cls, filepath): + """Load config from file.""" + return Config.load(cls, filepath) \ No newline at end of file diff --git a/semantic_inference_ros/app/instance_segmentation_node b/semantic_inference_ros/app/instance_segmentation_node index eaf5f76..34394b1 100755 --- a/semantic_inference_ros/app/instance_segmentation_node +++ b/semantic_inference_ros/app/instance_segmentation_node @@ -135,7 +135,8 @@ class InstanceSegmentationNode(Node): confidences = ret.confidences # TODO: place holder directly from model, need to be replace by proper yaml - category_names = self._model.segmenter.model.names + # category_names = self._model.segmenter.model.names + category_names = self._model.category_names # Convert RGB to BGR for OpenCV vis_img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) diff --git a/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml b/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml new file mode 100644 index 0000000..e97db70 --- /dev/null +++ b/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml @@ -0,0 +1,12 @@ +--- +model: + instance_model: + type: gdsam2 + text_prompt: "person. bicycle. car. truck. bench. fire hydrant." + sam2_checkpoint: "sam2.1_hiera_large.pt" + sam2_model_config: "sam2.1_hiera_l.yaml" + grounding_dino_config: "GroundingDINO_SwinT_OGC.py" + grounding_dino_checkpoint: "groundingdino_swint_ogc.pth" + box_threshold: 0.35 + text_threshold: 0.25 + multimask_output: False \ No newline at end of file diff --git a/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml new file mode 100644 index 0000000..46fcac5 --- /dev/null +++ b/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml @@ -0,0 +1,27 @@ +--- +launch: + - arg: {name: gdsam2_env, default: $(env HOME)/environments/gdsam2, description: Path to instance segmentation environment} + - arg: {name: config_path, default: $(find-pkg-share semantic_inference_ros)/config/instance_segmentation/gdsam2.yaml, description: Configuration file for instance segmentation object detector} + - arg: {name: compressed_rgb, default: 'false', description: Triggers decompression for RGB stream} + - arg: {name: log-level, default: info, description: Set the ROS2 log level} + - node: + if: $(var compressed_rgb) + pkg: image_transport + exec: republish + name: decompress_rgb + param: + - {name: in_transport, value: compressed} + - {name: out_transport, value: raw} + remap: + - {from: in/compressed, to: /acl_jackal/forward/color/image_raw/compressed} + - {from: out, to: color/image_raw} + - pyenv_node: + pkg: semantic_inference_ros + exec: instance_segmentation_node + name: semantic_inference + pyenv: $(var gdsam2_env) + on_exit: shutdown + param: + - {name: config_path, value: $(var config_path), type: str} + args: >- + --ros-args --log-level $(var log-level) diff --git a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml index 261eee1..ff4c2ec 100644 --- a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml +++ 
b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml
@@ -1,6 +1,6 @@
 ---
 launch:
-  - arg: {name: instance_seg_env, default: /home/multyxu/environments/crisp, description: Path to instance segmentation environment}
+  - arg: {name: instance_seg_env, default: $(env HOME)/environments/crisp, description: Path to instance segmentation environment}
   - arg: {name: config_path, default: $(find-pkg-share semantic_inference_ros)/config/instance_segmentation/yolov11.yaml, description: Configuration file for instance segmentation object detector}
   - arg: {name: compressed_rgb, default: 'false', description: Triggers decompression for RGB stream}
   - arg: {name: log-level, default: info, description: Set the ROS2 log level}

From 3eec3e5b8a24c6b57ef2ab1693682f180dbaaf73 Mon Sep 17 00:00:00 2001
From: multyxu
Date: Fri, 14 Nov 2025 17:20:34 -0500
Subject: [PATCH 5/9] Add initial document, modify label space

---
 docs/instance_seg.md                                 | 70 +++++++++++++++++++
 .../config/instance_segmentation/gdsam2.yaml         |  2 +-
 2 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 docs/instance_seg.md

diff --git a/docs/instance_seg.md b/docs/instance_seg.md
new file mode 100644
index 0000000..74e1ae0
--- /dev/null
+++ b/docs/instance_seg.md
@@ -0,0 +1,70 @@
+# Instance Segmentation
+
+## Setting Up
+
+The instance segmentation interface works with and without ROS. For working with ROS, we assume you have already built your workspace with this repository in it beforehand (i.e., by running `colcon build`).
+
+> **Note**
+> If you intend only to use the instance segmentation interface, you may want to turn off building against TensorRT, which you can do as follows:
+> ```shell
+> colcon build --cmake-args --no-warn-unused-cli -DSEMANTIC_INFERENCE_USE_TRT=OFF
+> ```
+
+### Installing
+
+We assume you are using a virtual environment. You may want to install `virtualenv` (usually `sudo apt install python3-virtualenv`) if you haven't already.
+To set up a virtual environment for use with ROS:
+```shell
+python3 -m virtualenv -p /usr/bin/python3 --system-site-packages <path/to/environment>
+```
+Otherwise, omit the `--system-site-packages` option:
+```shell
+python3 -m virtualenv -p /usr/bin/python3 --download <path/to/environment>
+```
+
+Then, install `semantic_inference`:
+```shell
+cd <path/to/repository>
+source <path/to/environment>/bin/activate
+pip install ./semantic_inference[openset]  # note that the openset extra is required for open-set semantic segmentation
+```
+
+The above setup allows you to use `yolov11`; in order to use `grounded sam 2`, you have to install it manually:
+```shell
+# cd to your favorite path
+git clone -b more_gpu https://github.com/MultyXu/Grounded-SAM-2.git
+```
+Then follow its `README.md` to install gdsam2.
+
+

diff --git a/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml b/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml
index e97db70..6bf2649 100644
--- a/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml
+++ b/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml
@@ -2,7 +2,7 @@ model:
   instance_model:
     type: gdsam2
-    text_prompt: "person. bicycle. car. truck. bench. fire hydrant."
+    text_prompt: "bench. car. trash bin. fire hydrant."
     sam2_checkpoint: "sam2.1_hiera_large.pt"
     sam2_model_config: "sam2.1_hiera_l.yaml"
     grounding_dino_config: "GroundingDINO_SwinT_OGC.py"

From fd19d3321013db6a60c2430dfd805711ea44d605 Mon Sep 17 00:00:00 2001
From: multyxu
Date: Tue, 18 Nov 2025 16:51:09 -0500
Subject: [PATCH 6/9] Minor changes before merging hungytae's commit

---
 semantic_inference/pyproject.toml                         | 2 +-
 .../launch/instance_segmentation_gdsam2.launch.yaml       | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/semantic_inference/pyproject.toml b/semantic_inference/pyproject.toml
index 29bf976..5d931f9 100644
--- a/semantic_inference/pyproject.toml
+++ b/semantic_inference/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
     "torchvision",
     "spark_config@git+https://github.com/MIT-SPARK/Spark-Config.git",
     "numpy<2",
+    "ultralytics",
 ]
 
 [tool.setuptools.packages.find]
@@ -41,7 +42,6 @@ semantic-inference = "semantic_inference.__main__:cli"
 [project.optional-dependencies]
 dev = ["pytest"]
 openset = [
-    "ultralytics",
     "clip@git+https://github.com/openai/CLIP.git",
     "open_clip_torch",
     "numpy >= 1.20",

diff --git a/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml
index 46fcac5..04a7098 100644
--- a/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml
+++ b/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml
@@ -1,7 +1,8 @@
 ---
 launch:
   - arg: {name: gdsam2_env, default: $(env HOME)/environments/gdsam2, description: Path to instance segmentation environment}
-  - arg: {name: config_path, default: $(find-pkg-share semantic_inference_ros)/config/instance_segmentation/gdsam2.yaml, description: Configuration file for instance segmentation object detector}
+  - arg: {name: model_name, default: gdsam2, description: Name of the segmentation model to use}
+  - arg: {name: config_path, default: $(find-pkg-share semantic_inference_ros)/config/instance_segmentation/$(var model_name).yaml, description: Configuration file for instance segmentation object detector}
   - arg: {name: compressed_rgb, default: 'false', description: Triggers decompression for RGB stream}
   - arg: {name: log-level, default: info, description: Set the ROS2 log level}
   - node:

From a2cc0d5cd3d0fb657ac1b02fca66de9886489495 Mon Sep 17 00:00:00 2001
From: Multyxu
Date: Wed, 3 Dec 2025 12:10:41 -0500
Subject: [PATCH 7/9] Fix wrong comment line. Add more info to instance seg
 setup doc

---
 docs/instance_seg.md                                      | 2 ++
 .../python/semantic_inference/models/wrappers.py          | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/instance_seg.md b/docs/instance_seg.md
index 74e1ae0..219ae47 100644
--- a/docs/instance_seg.md
+++ b/docs/instance_seg.md
@@ -36,6 +36,8 @@ git clone -b more_gpu https://github.com/MultyXu/Grounded-SAM-2.git
 ```
 Then follow its `README.md` to install gdsam2.
 
+### Setup model
+Put (or symlink) `GroundingDINO_SwinT_OGC.py` under `~/.semantic_inference/gdsam2_config/`, and put `sam2.1_hiera_large.pt` and `groundingdino_swint_ogc.pth` under `~/.semantic_inference/`.
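+
+As a rough sketch, assuming the checkpoints were fetched with the download scripts in the Grounded-SAM-2 clone (the source paths below are only examples; adjust them to wherever the files actually live on your machine):
+```shell
+mkdir -p ~/.semantic_inference/gdsam2_config
+# replace these source paths with your own
+ln -s ~/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt ~/.semantic_inference/
+ln -s ~/Grounded-SAM-2/gdino_checkpoints/groundingdino_swint_ogc.pth ~/.semantic_inference/
+ln -s ~/Grounded-SAM-2/grounding_dino/groundingdino/config/GroundingDINO_SwinT_OGC.py ~/.semantic_inference/gdsam2_config/
+```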
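+
+### Running
+With the model files in place, the node can be started through the provided launch file, e.g. (a sketch; it assumes your workspace is sourced, your launch frontend supports the `pyenv_node` extension used by the file, and `gdsam2_env` points at the virtual environment created above):
+```shell
+ros2 launch semantic_inference_ros instance_segmentation_gdsam2.launch.yaml gdsam2_env:=$HOME/environments/gdsam2
+```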
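+
+### Output format
+The node publishes the segmentation on `semantic/image_raw` as a `32SC1` image; each pixel packs the category id into the upper 16 bits and the (1-indexed) instance id into the lower 16 bits. A consumer can split the two as in this sketch, which mirrors `recover_instance_and_category` in the node:
+```python
+import numpy as np
+
+
+def split_labels(label_img: np.ndarray):
+    """Recover (instance_ids, category_ids) from a packed label image."""
+    instance_ids = (label_img & 0xFFFF).astype(np.uint16)  # lower 16 bits
+    category_ids = (label_img >> 16).astype(np.uint16)  # upper 16 bits
+    return instance_ids, category_ids
+```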