From 5e0ae5b26c25774528ecb8d5bf2b0fe77252438b Mon Sep 17 00:00:00 2001 From: multyxu Date: Thu, 30 Oct 2025 21:51:46 -0400 Subject: [PATCH 1/9] Initial version of yolov11 --- .../semantic_inference/models/__init__.py | 1 + .../models/instance_segmenter.py | 160 ++++++++++++++ .../semantic_inference/models/wrappers.py | 49 ++++ semantic_inference_ros/CMakeLists.txt | 2 +- .../app/instance_segmentation_node | 209 ++++++++++++++++++ .../config/instance_segmentation/yolov11.yaml | 4 + .../instance_segmentation_yolov11.launch.yaml | 25 +++ 7 files changed, 449 insertions(+), 1 deletion(-) create mode 100644 semantic_inference/python/semantic_inference/models/instance_segmenter.py create mode 100755 semantic_inference_ros/app/instance_segmentation_node create mode 100644 semantic_inference_ros/config/instance_segmentation/yolov11.yaml create mode 100644 semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml diff --git a/semantic_inference/python/semantic_inference/models/__init__.py b/semantic_inference/python/semantic_inference/models/__init__.py index b6874c5..b4dc607 100644 --- a/semantic_inference/python/semantic_inference/models/__init__.py +++ b/semantic_inference/python/semantic_inference/models/__init__.py @@ -35,6 +35,7 @@ from semantic_inference.models.patch_extractor import * from semantic_inference.models.segment_refinement import * from semantic_inference.models.wrappers import * +from semantic_inference.models.instance_segmenter import * def default_device(use_cuda=True): diff --git a/semantic_inference/python/semantic_inference/models/instance_segmenter.py b/semantic_inference/python/semantic_inference/models/instance_segmenter.py new file mode 100644 index 0000000..616501d --- /dev/null +++ b/semantic_inference/python/semantic_inference/models/instance_segmenter.py @@ -0,0 +1,160 @@ +# BSD 3-Clause License +# +# Copyright (c) 2021-2024, Massachusetts Institute of Technology. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +"""Model to segment an image and encode segments with CLIP embeddings.""" + +import dataclasses +from dataclasses import dataclass, field +from typing import Any + +import numpy as np +import torch +import torch.nn.functional as F +from spark_config import Config, config_field +from torch import nn + +from semantic_inference.models.mask_functions import ConstantMask +from semantic_inference.models.patch_extractor import ( + PatchExtractor, + center_crop, + default_normalization_parameters, + get_image_preprocessor, +) +from semantic_inference.models.segment_refinement import SegmentRefinement + +def _map_opt(values, f): + return {k: v if v is None else f(v) for k, v in values.items()} + + +@dataclass +class Results: + """Openset Segmentation Results.""" + + masks: torch.Tensor + boxes: torch.Tensor # bounding boxes for the masks + categories: torch.Tensor + confidences: torch.Tensor + + @property + def instances(self): + """Get instance image (if it exists).""" + if self.masks.shape[0] == 0: + return None + + np_masks = self.masks.numpy() + img = np.zeros(np_masks[0].shape, dtype=np.uint16) + for i in range(self.masks.shape[0]): + # instance ids are 1-indexed + img[np_masks[i, ...] > 0] = i + 1 + + # TODO: 16 + 16 int for instance id and category id + + return img + + def cpu(self): + """Move results to CPU.""" + values = dataclasses.asdict(self) + return Results(**_map_opt(values, lambda v: v.cpu())) + + def to(self, *args, **kwargs): + """Forward to to all tensors.""" + values = dataclasses.asdict(self) + return Results(**_map_opt(values, lambda v: v.to(*args, **kwargs))) + + +@dataclass +class InstanceSegmenterConfig(Config): + """Main config for instance segmenter.""" + + instance_model: Any = config_field("instance_model", default="yolov11") + # relevant configs (model path, model weights) for the model + + +class InstanceSegmenter(nn.Module): + """Module to segment and encode an image.""" + + def __init__(self, config): + """Construct an instance segmenter.""" + super().__init__() + # for detecting model device + self._canary_param = nn.Parameter(torch.empty(0)) + + self.config = config + self.segmenter = self.config.instance_model.create() + # self.segment_refinement = SegmentRefinement(config.refinement) # might be useful clean up inprecise edges + + def eval(self): + """ + Override eval to avoid issues with certain models + """ + self.segmenter.eval() + + @classmethod + def construct(cls, **kwargs): + """Load model from configuration dictionary.""" + config = InstanceSegmenterConfig() + config.update(kwargs) + return cls(config) + + @torch.no_grad() + def segment(self, rgb_img, is_rgb_order=True): + """ + Segment image and compute language embeddings for each mask. + + Args: + img (np.ndarry): uint8 image of shape (R, C, 3) in rgb order + is_rgb_order (bool): whether the image is rgb order or not + + Returns: + Encoded image + """ + img = rgb_img if is_rgb_order else rgb_img[:, :, ::-1].copy() + return self(img) + + @property + def device(self): + """Get current model device.""" + return self._canary_param.device + + def forward(self, rgb_img): + """ + Segment image and compute language embeddings for each mask. 
+ + Args: + img (np.ndarray): uint8 image of shape (R, C, 3) in rgb order + + Returns: + Encoded image + """ + categories, masks, boxes, confidences = self.segmenter(rgb_img) + + # img = torch.from_numpy(rgb_img).to(self.device) + # return self.encode(img, masks, boxes) + # TODO: return the results of the actual instance segmentation model here + return Results(masks=masks, boxes=boxes, categories=categories, confidences=confidences) diff --git a/semantic_inference/python/semantic_inference/models/wrappers.py b/semantic_inference/python/semantic_inference/models/wrappers.py index 2457ae1..b1b13ff 100644 --- a/semantic_inference/python/semantic_inference/models/wrappers.py +++ b/semantic_inference/python/semantic_inference/models/wrappers.py @@ -335,3 +335,52 @@ class OpenClipConfig(Config): def load(cls, filepath): """Load config from file.""" return Config.load(cls, filepath) + + +class Yolov11InstanceSegmenterWrapper(nn.Module): + """Yolov11 instance segmentation wrapper.""" + + def __init__(self, config): + """Load Yolov11 model.""" + super().__init__() + from ultralytics import YOLO + + self.config = config + self.model = YOLO(config.model_name) + + def eval(self): + """ + override eval to avoid issues with yolo model + """ + self.model.model.eval() + + @classmethod + def construct(cls, **kwargs): + """Load model from configuration dictionary.""" + config = Yolov11InstanceSegmenterConfig() + config.update(kwargs) + return cls(config) + + def forward(self, img): + """Segment image.""" + result = self.model(img)[0] # assume batch size 1 + if result.masks is None: + return None, None, None, None + categories = result.boxes.cls # int8 + masks = result.masks.data.to(torch.bool) # + boxes = result.boxes.xyxy # float32 + confidences = result.boxes.conf # float32 + # assume the instance id is the index in the result? 
+ return categories, masks, boxes, confidences + +@register_config("instance_model", name="yolov11", constructor=Yolov11InstanceSegmenterWrapper) +@dataclasses.dataclass +class Yolov11InstanceSegmenterConfig(Config): + """Configuration for Yolov11 instance segmenter.""" + + model_name: str = "yolo11n-seg.pt" + + @classmethod + def load(cls, filepath): + """Load config from file.""" + return Config.load(cls, filepath) \ No newline at end of file diff --git a/semantic_inference_ros/CMakeLists.txt b/semantic_inference_ros/CMakeLists.txt index e71d1b7..3917bbc 100644 --- a/semantic_inference_ros/CMakeLists.txt +++ b/semantic_inference_ros/CMakeLists.txt @@ -74,7 +74,7 @@ install( LIBRARY DESTINATION lib RUNTIME DESTINATION lib/${PROJECT_NAME} ) -install(PROGRAMS app/image_embedding_node app/open_set_node app/text_embedding_node +install(PROGRAMS app/image_embedding_node app/open_set_node app/text_embedding_node app/instance_segmentation_node DESTINATION lib/${PROJECT_NAME} ) install(DIRECTORY include/${PROJECT_NAME}/ DESTINATION include/${PROJECT_NAME}/) diff --git a/semantic_inference_ros/app/instance_segmentation_node b/semantic_inference_ros/app/instance_segmentation_node new file mode 100755 index 0000000..e1f72d3 --- /dev/null +++ b/semantic_inference_ros/app/instance_segmentation_node @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +"""Node that runs openset segmentation.""" + +import pathlib +from dataclasses import dataclass, field +from typing import Any + +import rclpy +import spark_config as sc +import torch +from rclpy.node import Node +from sensor_msgs.msg import Image + +import semantic_inference.models as models +import semantic_inference_ros +from semantic_inference_msgs.msg import FeatureImage, FeatureVectorStamped +from semantic_inference_ros import Conversions, ImageWorkerConfig + +# additional imports that may not be here +import numpy as np +import cv2 + +@dataclass +class InstanceSegmentationNodeConfig(sc.Config): + """Configuration for ClipPublisherNode.""" + + worker: ImageWorkerConfig = field(default_factory=ImageWorkerConfig) + model: models.InstanceSegmenterConfig = field( + default_factory=models.InstanceSegmenterConfig + ) # create InstanceSegmenterConfig in models + # visualizer: Any = sc.config_field( + # "feature_visualizer", default="component", required=False + # ) + + +class InstanceSegmentationNode(Node): + """Node to run instance segmentation.""" + + def __init__(self): + """Start subscriber and publisher.""" + super().__init__("instance_segmentation_node") + config_path = ( + self.declare_parameter("config_path", "").get_parameter_value().string_value + ) + config_path = pathlib.Path(config_path).expanduser().absolute() + if not config_path.exists() and config_path != "": + self.get_logger().warn(f"config path '{config_path}' does not exist!") + self.config = InstanceSegmentationNodeConfig() + else: + self.config = sc.Config.load(InstanceSegmentationNodeConfig, config_path) + + self.get_logger().info(f"Initializing with {self.config.show()}") + device = models.default_device() + self._model = models.InstanceSegmenter(self.config.model).to(device) + self._model.eval() # TODO: causing issue with yolo model + self.get_logger().info("Finished initializing!") + + self._pub = self.create_publisher(Image, "semantic/image_raw", 1) # publish segmented image + self._worker = semantic_inference_ros.ImageWorker( + self, self.config.worker, "color/image_raw", self._spin_once + ) # put image in queue for processing + # self._embedder = 
semantic_inference_ros.PromptEncoder(self, self._model.encoder) + # could be relavent if use GroundedSAM, put flexible label space in config + + self._visualizer = "place_holder" + # self._visualizer = self.config.visualizer.create() + if self._visualizer is not None: + # TODO: write proper visualizer for instance segmentation + self._color_pub = self.create_publisher( + Image, "semantic_color/image_raw", 1 + ) + + def _spin_once(self, header, img): + with torch.no_grad(): + ret = self._model.segment(img, is_rgb_order=True).cpu() + + if ret.masks is None: + self.get_logger().debug("No masks detected in the image.") + return + + instance_seg_img = self.convert_to_instance_seg_img(ret) + # Convert to int32 to match 32SC1 encoding expected by cv_bridge + instance_seg_img = instance_seg_img.astype(np.int32) + msg = Conversions.to_image_msg(header, instance_seg_img, encoding="32SC1") + self._pub.publish(msg) + self.get_logger().debug("Published instance segmentation image.") + + if self._visualizer is not None: + # color_img = self._visualizer.call(ret) + color_img = self.visualizer_call(ret, img) + self._color_pub.publish( + Conversions.to_image_msg(header, color_img, encoding="rgb8") + ) + + def stop(self): + """Stop the underlying image worker.""" + self._worker.stop() + + def convert_to_instance_seg_img(self, ret): + ''' + Convert segmentation results to instance segmentation image. + Each pixel value encodes both category id and instance id. + First 16 bits are category id, last 16 bits are instance id. + ''' + masks = ret.masks.cpu().numpy() + category_ids = ret.categories.cpu().numpy() + img = np.zeros(masks[0].shape, dtype=np.uint32) + for i in range(masks.shape[0]): + category_id = int(category_ids[i]) # category id are 0-indexed + instance_id = i + 1 # instance ids are 1-indexed + combined_id = (category_id << 16) | instance_id # combine into single uint32 + img[masks[i, ...] > 0] = combined_id + + return img + + def recover_instance_and_category(self, instance_seg_img): + ''' + Recover instance ids and category ids from instance segmentation image. + ''' + instance_ids = (instance_seg_img & 0xFFFF).astype(np.uint16) # last 16 bits + category_ids = (instance_seg_img >> 16).astype(np.uint16) # first 16 bits + return instance_ids, category_ids + + def visualizer_call(self, ret, img): + ''' + Process the result from yolo instance segmenter and generate color image. + The returned color image contain bounding boxes, masks, and category labels. 
+ ''' + + categories = ret.categories + masks = ret.masks + boxes = ret.boxes + confidences = ret.confidences + + # TODO: place holder directly from model, need to be replace by proper yaml + category_names = self._model.segmenter.model.names + + # Convert RGB to BGR for OpenCV + vis_img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + + # Generate random colors for each class + np.random.seed(42) # for consistent colors + colors = np.random.randint(0, 255, size=(len(category_names), 3), dtype=np.uint8) + + # Overlay segmentation masks + if masks is not None: + for i, mask_tensor in enumerate(masks.data): + box = boxes[i] + cls = int(categories[i].cpu().numpy()) + + # Get color for the class + color = colors[cls].tolist() + + # Get mask and resize it to the image dimensions + mask_np = mask_tensor.cpu().numpy().astype(np.uint8) + mask_resized = cv2.resize(mask_np, (vis_img_bgr.shape[1], vis_img_bgr.shape[0]), interpolation=cv2.INTER_NEAREST) + + # Find contours to create a mask overlay + contours, _ = cv2.findContours(mask_resized, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + # Create a transparent overlay + overlay = vis_img_bgr.copy() + cv2.drawContours(overlay, contours, -1, color, -1) + + # Blend the overlay with the original image + alpha = 0.5 + vis_img_bgr = cv2.addWeighted(overlay, alpha, vis_img_bgr, 1 - alpha, 0) + + # Draw bounding boxes and labels + for i, box in enumerate(boxes): + x1, y1, x2, y2 = map(int, box.cpu().numpy()) + conf = confidences[i].cpu().numpy() + cls = int(categories[i].cpu().numpy()) + label = f"{category_names[cls]} {conf:.2f}" + + color = colors[cls].tolist() + + # Draw bounding box + cv2.rectangle(vis_img_bgr, (x1, y1), (x2, y2), color, 2) + + # Put label above the bounding box + (label_width, label_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2) + cv2.rectangle(vis_img_bgr, (x1, y1 - label_height - 10), (x1 + label_width, y1), color, -1) + cv2.putText(vis_img_bgr, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + + # Convert BGR back to RGB for displaying with matplotlib + vis_img_rgb = cv2.cvtColor(vis_img_bgr, cv2.COLOR_BGR2RGB) + + return vis_img_rgb + +def main(): + """Start a node.""" + rclpy.init() + + node = None + try: + node = InstanceSegmentationNode() + semantic_inference_ros.setup_ros_log_forwarding(node) + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + rclpy.try_shutdown() + if node is not None: + node.stop() + + +if __name__ == "__main__": + main() diff --git a/semantic_inference_ros/config/instance_segmentation/yolov11.yaml b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml new file mode 100644 index 0000000..8e5aab5 --- /dev/null +++ b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml @@ -0,0 +1,4 @@ +--- +instance_model: + type: yolov11 + model_name: yolo11n-seg.pt \ No newline at end of file diff --git a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml new file mode 100644 index 0000000..13036ab --- /dev/null +++ b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml @@ -0,0 +1,25 @@ +--- +launch: + - arg: {name: config_path, default: $(find-pkg-share semantic_inference_ros)/config/instance_segmentation/yolov11.yaml, description: Configuration file for instance segmentation object detector} + - arg: {name: compressed_rgb, default: 'false', description: Triggers decompression for RGB stream} + - arg: {name: log-level, 
default: info, description: Set the ROS2 log level} + - node: + if: $(var compressed_rgb) + pkg: image_transport + exec: republish + name: decompress_rgb + param: + - {name: in_transport, value: compressed} + - {name: out_transport, value: raw} + remap: + - {from: in/compressed, to: /acl_jackal/forward/color/image_raw/compressed} + - {from: out, to: color/image_raw} + - node: + pkg: semantic_inference_ros + exec: instance_segmentation_node + name: semantic_inference + on_exit: shutdown + param: + - {name: config_path, value: $(var config_path), type: str} + args: > + --ros-args --log-level $(var log-level) From 27bc19079a6dd9b4513d010c94f35668bc681811 Mon Sep 17 00:00:00 2001 From: multyxu Date: Tue, 4 Nov 2025 17:18:55 -0500 Subject: [PATCH 2/9] close set instance segmentation with python environment --- semantic_inference_ros/app/instance_segmentation_node | 7 ++++--- .../config/instance_segmentation/yolov11.yaml | 7 ++++--- .../launch/instance_segmentation_yolov11.launch.yaml | 4 +++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/semantic_inference_ros/app/instance_segmentation_node b/semantic_inference_ros/app/instance_segmentation_node index e1f72d3..0197e2b 100755 --- a/semantic_inference_ros/app/instance_segmentation_node +++ b/semantic_inference_ros/app/instance_segmentation_node @@ -47,6 +47,7 @@ class InstanceSegmentationNode(Node): self.get_logger().warn(f"config path '{config_path}' does not exist!") self.config = InstanceSegmentationNodeConfig() else: + self.get_logger().info(f"Loading config from '{config_path}'") self.config = sc.Config.load(InstanceSegmentationNodeConfig, config_path) self.get_logger().info(f"Initializing with {self.config.show()}") @@ -66,8 +67,8 @@ class InstanceSegmentationNode(Node): # self._visualizer = self.config.visualizer.create() if self._visualizer is not None: # TODO: write proper visualizer for instance segmentation - self._color_pub = self.create_publisher( - Image, "semantic_color/image_raw", 1 + self._overlay_pub = self.create_publisher( + Image, "semantic_overlay/image_raw", 1 ) def _spin_once(self, header, img): @@ -88,7 +89,7 @@ class InstanceSegmentationNode(Node): if self._visualizer is not None: # color_img = self._visualizer.call(ret) color_img = self.visualizer_call(ret, img) - self._color_pub.publish( + self._overlay_pub.publish( Conversions.to_image_msg(header, color_img, encoding="rgb8") ) diff --git a/semantic_inference_ros/config/instance_segmentation/yolov11.yaml b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml index 8e5aab5..c0f5ac9 100644 --- a/semantic_inference_ros/config/instance_segmentation/yolov11.yaml +++ b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml @@ -1,4 +1,5 @@ --- -instance_model: - type: yolov11 - model_name: yolo11n-seg.pt \ No newline at end of file +model: + instance_model: + type: yolov11 + model_name: yolo11n-seg.pt \ No newline at end of file diff --git a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml index 13036ab..816a454 100644 --- a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml +++ b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml @@ -1,5 +1,6 @@ --- launch: + - arg: {name: instance_seg_env, default: /home/multyxu/environments/crisp, description: Path to instance segmentation environment} - arg: {name: config_path, default: $(find-pkg-share 
semantic_inference_ros)/config/instance_segmentation/yolov11.yaml, description: Configuration file for instance segmentation object detector} - arg: {name: compressed_rgb, default: 'false', description: Triggers decompression for RGB stream} - arg: {name: log-level, default: info, description: Set the ROS2 log level} @@ -14,10 +15,11 @@ launch: remap: - {from: in/compressed, to: /acl_jackal/forward/color/image_raw/compressed} - {from: out, to: color/image_raw} - - node: + - pyenv_node: pkg: semantic_inference_ros exec: instance_segmentation_node name: semantic_inference + pyenv: $(var instance_seg_env) on_exit: shutdown param: - {name: config_path, value: $(var config_path), type: str} From 842e48e53ba1958218c91ed305e564bb1da52f0c Mon Sep 17 00:00:00 2001 From: multyxu Date: Tue, 4 Nov 2025 20:38:24 -0500 Subject: [PATCH 3/9] pre-commit fix --- .../semantic_inference/models/__init__.py | 2 +- .../models/instance_segmenter.py | 28 ++--- .../semantic_inference/models/wrappers.py | 19 +-- .../app/instance_segmentation_node | 116 +++++++++++------- .../config/instance_segmentation/yolov11.yaml | 2 +- .../instance_segmentation_yolov11.launch.yaml | 2 +- 6 files changed, 95 insertions(+), 74 deletions(-) diff --git a/semantic_inference/python/semantic_inference/models/__init__.py b/semantic_inference/python/semantic_inference/models/__init__.py index b4dc607..0a0a069 100644 --- a/semantic_inference/python/semantic_inference/models/__init__.py +++ b/semantic_inference/python/semantic_inference/models/__init__.py @@ -30,12 +30,12 @@ import torch from semantic_inference.models.feature_visualizers import * +from semantic_inference.models.instance_segmenter import * from semantic_inference.models.mask_functions import * from semantic_inference.models.openset_segmenter import * from semantic_inference.models.patch_extractor import * from semantic_inference.models.segment_refinement import * from semantic_inference.models.wrappers import * -from semantic_inference.models.instance_segmenter import * def default_device(use_cuda=True): diff --git a/semantic_inference/python/semantic_inference/models/instance_segmenter.py b/semantic_inference/python/semantic_inference/models/instance_segmenter.py index 616501d..0c19136 100644 --- a/semantic_inference/python/semantic_inference/models/instance_segmenter.py +++ b/semantic_inference/python/semantic_inference/models/instance_segmenter.py @@ -30,23 +30,14 @@ """Model to segment an image and encode segments with CLIP embeddings.""" import dataclasses -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any import numpy as np import torch -import torch.nn.functional as F from spark_config import Config, config_field from torch import nn -from semantic_inference.models.mask_functions import ConstantMask -from semantic_inference.models.patch_extractor import ( - PatchExtractor, - center_crop, - default_normalization_parameters, - get_image_preprocessor, -) -from semantic_inference.models.segment_refinement import SegmentRefinement def _map_opt(values, f): return {k: v if v is None else f(v) for k, v in values.items()} @@ -57,9 +48,9 @@ class Results: """Openset Segmentation Results.""" masks: torch.Tensor - boxes: torch.Tensor # bounding boxes for the masks + boxes: torch.Tensor # bounding boxes for the masks categories: torch.Tensor - confidences: torch.Tensor + confidences: torch.Tensor @property def instances(self): @@ -74,7 +65,7 @@ def instances(self): img[np_masks[i, ...] 
> 0] = i + 1 # TODO: 16 + 16 int for instance id and category id - + return img def cpu(self): @@ -107,10 +98,9 @@ def __init__(self, config): self.config = config self.segmenter = self.config.instance_model.create() - # self.segment_refinement = SegmentRefinement(config.refinement) # might be useful clean up inprecise edges - + def eval(self): - """ + """ Override eval to avoid issues with certain models """ self.segmenter.eval() @@ -153,8 +143,10 @@ def forward(self, rgb_img): Encoded image """ categories, masks, boxes, confidences = self.segmenter(rgb_img) - + # img = torch.from_numpy(rgb_img).to(self.device) # return self.encode(img, masks, boxes) # TODO: return the results of the actual instance segmentation model here - return Results(masks=masks, boxes=boxes, categories=categories, confidences=confidences) + return Results( + masks=masks, boxes=boxes, categories=categories, confidences=confidences + ) diff --git a/semantic_inference/python/semantic_inference/models/wrappers.py b/semantic_inference/python/semantic_inference/models/wrappers.py index b1b13ff..98c6d51 100644 --- a/semantic_inference/python/semantic_inference/models/wrappers.py +++ b/semantic_inference/python/semantic_inference/models/wrappers.py @@ -347,7 +347,7 @@ def __init__(self, config): self.config = config self.model = YOLO(config.model_name) - + def eval(self): """ override eval to avoid issues with yolo model @@ -363,17 +363,20 @@ def construct(cls, **kwargs): def forward(self, img): """Segment image.""" - result = self.model(img)[0] # assume batch size 1 + result = self.model(img)[0] # assume batch size 1 if result.masks is None: return None, None, None, None - categories = result.boxes.cls # int8 - masks = result.masks.data.to(torch.bool) # - boxes = result.boxes.xyxy # float32 - confidences = result.boxes.conf # float32 + categories = result.boxes.cls # int8 + masks = result.masks.data.to(torch.bool) # + boxes = result.boxes.xyxy # float32 + confidences = result.boxes.conf # float32 # assume the instance id is the index in the result? 
return categories, masks, boxes, confidences -@register_config("instance_model", name="yolov11", constructor=Yolov11InstanceSegmenterWrapper) + +@register_config( + "instance_model", name="yolov11", constructor=Yolov11InstanceSegmenterWrapper +) @dataclasses.dataclass class Yolov11InstanceSegmenterConfig(Config): """Configuration for Yolov11 instance segmenter.""" @@ -383,4 +386,4 @@ class Yolov11InstanceSegmenterConfig(Config): @classmethod def load(cls, filepath): """Load config from file.""" - return Config.load(cls, filepath) \ No newline at end of file + return Config.load(cls, filepath) diff --git a/semantic_inference_ros/app/instance_segmentation_node b/semantic_inference_ros/app/instance_segmentation_node index 0197e2b..eaf5f76 100755 --- a/semantic_inference_ros/app/instance_segmentation_node +++ b/semantic_inference_ros/app/instance_segmentation_node @@ -3,8 +3,11 @@ import pathlib from dataclasses import dataclass, field -from typing import Any +import cv2 + +# additional imports that may not be here +import numpy as np import rclpy import spark_config as sc import torch @@ -13,12 +16,8 @@ from sensor_msgs.msg import Image import semantic_inference.models as models import semantic_inference_ros -from semantic_inference_msgs.msg import FeatureImage, FeatureVectorStamped from semantic_inference_ros import Conversions, ImageWorkerConfig -# additional imports that may not be here -import numpy as np -import cv2 @dataclass class InstanceSegmentationNodeConfig(sc.Config): @@ -27,7 +26,7 @@ class InstanceSegmentationNodeConfig(sc.Config): worker: ImageWorkerConfig = field(default_factory=ImageWorkerConfig) model: models.InstanceSegmenterConfig = field( default_factory=models.InstanceSegmenterConfig - ) # create InstanceSegmenterConfig in models + ) # create InstanceSegmenterConfig in models # visualizer: Any = sc.config_field( # "feature_visualizer", default="component", required=False # ) @@ -53,20 +52,20 @@ class InstanceSegmentationNode(Node): self.get_logger().info(f"Initializing with {self.config.show()}") device = models.default_device() self._model = models.InstanceSegmenter(self.config.model).to(device) - self._model.eval() # TODO: causing issue with yolo model + self._model.eval() # TODO: causing issue with yolo model self.get_logger().info("Finished initializing!") - self._pub = self.create_publisher(Image, "semantic/image_raw", 1) # publish segmented image + self._pub = self.create_publisher( + Image, "semantic/image_raw", 1 + ) # publish segmented image self._worker = semantic_inference_ros.ImageWorker( self, self.config.worker, "color/image_raw", self._spin_once - ) # put image in queue for processing - # self._embedder = semantic_inference_ros.PromptEncoder(self, self._model.encoder) - # could be relavent if use GroundedSAM, put flexible label space in config - + ) # put image in queue for processing + self._visualizer = "place_holder" # self._visualizer = self.config.visualizer.create() if self._visualizer is not None: - # TODO: write proper visualizer for instance segmentation + # TODO: write proper visualizer for instance segmentation self._overlay_pub = self.create_publisher( Image, "semantic_overlay/image_raw", 1 ) @@ -74,7 +73,7 @@ class InstanceSegmentationNode(Node): def _spin_once(self, header, img): with torch.no_grad(): ret = self._model.segment(img, is_rgb_order=True).cpu() - + if ret.masks is None: self.get_logger().debug("No masks detected in the image.") return @@ -96,73 +95,83 @@ class InstanceSegmentationNode(Node): def stop(self): """Stop the underlying 
image worker.""" self._worker.stop() - + def convert_to_instance_seg_img(self, ret): - ''' + """ Convert segmentation results to instance segmentation image. Each pixel value encodes both category id and instance id. First 16 bits are category id, last 16 bits are instance id. - ''' + """ masks = ret.masks.cpu().numpy() category_ids = ret.categories.cpu().numpy() img = np.zeros(masks[0].shape, dtype=np.uint32) for i in range(masks.shape[0]): - category_id = int(category_ids[i]) # category id are 0-indexed + category_id = int(category_ids[i]) # category id are 0-indexed instance_id = i + 1 # instance ids are 1-indexed - combined_id = (category_id << 16) | instance_id # combine into single uint32 + combined_id = ( + category_id << 16 + ) | instance_id # combine into single uint32 img[masks[i, ...] > 0] = combined_id return img - + def recover_instance_and_category(self, instance_seg_img): - ''' + """ Recover instance ids and category ids from instance segmentation image. - ''' + """ instance_ids = (instance_seg_img & 0xFFFF).astype(np.uint16) # last 16 bits - category_ids = (instance_seg_img >> 16).astype(np.uint16) # first 16 bits + category_ids = (instance_seg_img >> 16).astype(np.uint16) # first 16 bits return instance_ids, category_ids def visualizer_call(self, ret, img): - ''' - Process the result from yolo instance segmenter and generate color image. + """ + Process the result from yolo instance segmenter and generate color image. The returned color image contain bounding boxes, masks, and category labels. - ''' - + """ + categories = ret.categories masks = ret.masks boxes = ret.boxes confidences = ret.confidences - + # TODO: place holder directly from model, need to be replace by proper yaml category_names = self._model.segmenter.model.names - + # Convert RGB to BGR for OpenCV vis_img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Generate random colors for each class - np.random.seed(42) # for consistent colors - colors = np.random.randint(0, 255, size=(len(category_names), 3), dtype=np.uint8) + np.random.seed(42) # for consistent colors + colors = np.random.randint( + 0, 255, size=(len(category_names), 3), dtype=np.uint8 + ) # Overlay segmentation masks if masks is not None: for i, mask_tensor in enumerate(masks.data): box = boxes[i] cls = int(categories[i].cpu().numpy()) - + # Get color for the class color = colors[cls].tolist() - + # Get mask and resize it to the image dimensions mask_np = mask_tensor.cpu().numpy().astype(np.uint8) - mask_resized = cv2.resize(mask_np, (vis_img_bgr.shape[1], vis_img_bgr.shape[0]), interpolation=cv2.INTER_NEAREST) - + mask_resized = cv2.resize( + mask_np, + (vis_img_bgr.shape[1], vis_img_bgr.shape[0]), + interpolation=cv2.INTER_NEAREST, + ) + # Find contours to create a mask overlay - contours, _ = cv2.findContours(mask_resized, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) - + contours, _ = cv2.findContours( + mask_resized, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) + # Create a transparent overlay overlay = vis_img_bgr.copy() cv2.drawContours(overlay, contours, -1, color, -1) - + # Blend the overlay with the original image alpha = 0.5 vis_img_bgr = cv2.addWeighted(overlay, alpha, vis_img_bgr, 1 - alpha, 0) @@ -173,22 +182,39 @@ class InstanceSegmentationNode(Node): conf = confidences[i].cpu().numpy() cls = int(categories[i].cpu().numpy()) label = f"{category_names[cls]} {conf:.2f}" - + color = colors[cls].tolist() - + # Draw bounding box cv2.rectangle(vis_img_bgr, (x1, y1), (x2, y2), color, 2) - + # Put label above the bounding box - (label_width, 
label_height), baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2) - cv2.rectangle(vis_img_bgr, (x1, y1 - label_height - 10), (x1 + label_width, y1), color, -1) - cv2.putText(vis_img_bgr, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + (label_width, label_height), baseline = cv2.getTextSize( + label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2 + ) + cv2.rectangle( + vis_img_bgr, + (x1, y1 - label_height - 10), + (x1 + label_width, y1), + color, + -1, + ) + cv2.putText( + vis_img_bgr, + label, + (x1, y1 - 5), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + ) # Convert BGR back to RGB for displaying with matplotlib vis_img_rgb = cv2.cvtColor(vis_img_bgr, cv2.COLOR_BGR2RGB) - + return vis_img_rgb + def main(): """Start a node.""" rclpy.init() diff --git a/semantic_inference_ros/config/instance_segmentation/yolov11.yaml b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml index c0f5ac9..e67c3d3 100644 --- a/semantic_inference_ros/config/instance_segmentation/yolov11.yaml +++ b/semantic_inference_ros/config/instance_segmentation/yolov11.yaml @@ -2,4 +2,4 @@ model: instance_model: type: yolov11 - model_name: yolo11n-seg.pt \ No newline at end of file + model_name: yolo11n-seg.pt diff --git a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml index 816a454..261eee1 100644 --- a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml +++ b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml @@ -23,5 +23,5 @@ launch: on_exit: shutdown param: - {name: config_path, value: $(var config_path), type: str} - args: > + args: >- --ros-args --log-level $(var log-level) From 8a8610ffb1dcd33e4ad3a360d038f66171d2f0e6 Mon Sep 17 00:00:00 2001 From: multyxu Date: Thu, 13 Nov 2025 00:14:03 -0500 Subject: [PATCH 4/9] Add grounded sam 2 as an instance segmenter. Fix yaml to use environment variable for home instead of hardcoding it. --- .../models/instance_segmenter.py | 14 +- .../semantic_inference/models/wrappers.py | 188 +++++++++++++++++- .../app/instance_segmentation_node | 3 +- .../config/instance_segmentation/gdsam2.yaml | 12 ++ .../instance_segmentation_gdsam2.launch.yaml | 27 +++ .../instance_segmentation_yolov11.launch.yaml | 2 +- 6 files changed, 236 insertions(+), 10 deletions(-) create mode 100644 semantic_inference_ros/config/instance_segmentation/gdsam2.yaml create mode 100644 semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml diff --git a/semantic_inference/python/semantic_inference/models/instance_segmenter.py b/semantic_inference/python/semantic_inference/models/instance_segmenter.py index 0c19136..39c637f 100644 --- a/semantic_inference/python/semantic_inference/models/instance_segmenter.py +++ b/semantic_inference/python/semantic_inference/models/instance_segmenter.py @@ -47,10 +47,11 @@ def _map_opt(values, f): class Results: """Openset Segmentation Results.""" - masks: torch.Tensor - boxes: torch.Tensor # bounding boxes for the masks - categories: torch.Tensor - confidences: torch.Tensor + # all on cuda/tensor device? 
TODO: Maybe should move to cpu by default + masks: torch.Tensor # (n, H, W), torch.bool + boxes: torch.Tensor # (n, 4) xyxy format, torch.float32 + categories: torch.Tensor # (n,), torch.float32/int64 (doesn't matter) + confidences: torch.Tensor # (n,), torch.float32 @property def instances(self): @@ -131,6 +132,11 @@ def segment(self, rgb_img, is_rgb_order=True): def device(self): """Get current model device.""" return self._canary_param.device + + @property + def category_names(self): + """Get category names.""" + return self.segmenter.category_names def forward(self, rgb_img): """ diff --git a/semantic_inference/python/semantic_inference/models/wrappers.py b/semantic_inference/python/semantic_inference/models/wrappers.py index 98c6d51..fd5599c 100644 --- a/semantic_inference/python/semantic_inference/models/wrappers.py +++ b/semantic_inference/python/semantic_inference/models/wrappers.py @@ -31,10 +31,14 @@ import dataclasses +import os import einops +import numpy as np import torch import torch.nn as nn import torchvision +from torchvision.ops import box_convert + from spark_config import Config, register_config from semantic_inference import root_path @@ -44,6 +48,10 @@ def models_path(): """Get path to pre-trained weight storage.""" return root_path().parent.parent / "models" +def path_to_dot_semantic_inference(): + """Get path to ~/.semantic_inference directory.""" + return os.getenv("HOME") + "/.semantic_inference" + class FastSAMSegmentation(nn.Module): """Fast SAM wrapper.""" @@ -353,6 +361,11 @@ def eval(self): override eval to avoid issues with yolo model """ self.model.model.eval() + + @property + def category_names(self): + """Get category names.""" + return self.model.names @classmethod def construct(cls, **kwargs): @@ -366,10 +379,10 @@ def forward(self, img): result = self.model(img)[0] # assume batch size 1 if result.masks is None: return None, None, None, None - categories = result.boxes.cls # int8 - masks = result.masks.data.to(torch.bool) # - boxes = result.boxes.xyxy # float32 - confidences = result.boxes.conf # float32 + categories = result.boxes.cls.cpu() # int8 + masks = result.masks.data.to(torch.bool).cpu() # + boxes = result.boxes.xyxy.cpu() # float32 + confidences = result.boxes.conf.cpu() # float32 # assume the instance id is the index in the result? 
return categories, masks, boxes, confidences @@ -387,3 +400,170 @@ class Yolov11InstanceSegmenterConfig(Config): def load(cls, filepath): """Load config from file.""" return Config.load(cls, filepath) + + +class GDSam2InstanceSegmenterWrapper(nn.Module): + """Grounded SAM 2 instance segmentation wrapper.""" + + def __init__(self, config): + """Load Grounded SAM 2 model.""" + super().__init__() + from sam2.build_sam import build_sam2 + from sam2.sam2_image_predictor import SAM2ImagePredictor + from groundingdino.util.inference import load_model + + self.config = config + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.text_prompt = config.text_prompt + self.multimask_output = config.multimask_output + + sam2_model_config_path = os.path.join( + "configs/sam2.1", config.sam2_model_config + ) + sam2_checkpoint_path = os.path.join( + path_to_dot_semantic_inference(), config.sam2_checkpoint + ) # this uses hydra config packages so only need relative path to the pkg installation dir + grounding_dino_config_path = os.path.join( + path_to_dot_semantic_inference(), "gdsam2_config", config.grounding_dino_config + ) + grounding_dino_checkpoint_path = os.path.join( + path_to_dot_semantic_inference(), config.grounding_dino_checkpoint + ) + + # build SAM2 image predictor + self.sam2_model = build_sam2( + sam2_model_config_path, + sam2_checkpoint_path + ) + self.sam2_predictor = SAM2ImagePredictor(self.sam2_model) + + # build grounding dino model + self.grounding_model = load_model( + model_config_path=grounding_dino_config_path, + model_checkpoint_path=grounding_dino_checkpoint_path, + device=self.device + ) + + # convert text prompt to category names + self.category_names = self.text_prompt.lower().split('. ') + self.category_names = [cat.strip() for cat in self.category_names if len(cat.strip()) > 0] + self.category_names[-1] = self.category_names[-1].rstrip('.') # remove the last dot if any + + def preprocess_image(self, img): + """Preprocess image for Grounded SAM 2. 
+ Input: + - img np.ndarray (H, W, C) uint8 + Output: + - image_transformed torch.Tensor (C, H, W) float32 + """ + import groundingdino.datasets.transforms as T + transform = T.Compose( + [ + T.RandomResize([800], max_size=1333), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + pil_img = torchvision.transforms.ToPILImage()(img) + image_transformed, _ = transform(pil_img, None) + return image_transformed + + @classmethod + def construct(cls, **kwargs): + """Load model from configuration dictionary.""" + config = GDSam2InstanceSegmenterConfig() + config.update(kwargs) + return cls(config) + + def forward(self, img): + """Segment image.""" + from groundingdino.util.inference import load_image, predict + + # preprocess img + img_transformed = self.preprocess_image(img) + + # gdino prediction + boxes, confidences, labels = predict( + model=self.grounding_model, + image=img_transformed, + caption=self.text_prompt, + box_threshold=self.config.box_threshold, + text_threshold=self.config.text_threshold, + device=self.device + ) + + # if nothing detected + if boxes.shape[0] == 0: + return None, None, None, None + + # process the box prompt for SAM 2 + h, w, _ = img.shape + boxes = boxes * torch.Tensor([w, h, w, h]) + input_boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() + + # FIXME: figure how does this influence the G-DINO model (from offical gdsam2 demo) + # torch.autocast(device_type=self.device.type, dtype=torch.bfloat16).__enter__() + # if torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8: + # # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices) + # torch.backends.cuda.matmul.allow_tf32 = True + # torch.backends.cudnn.allow_tf32 = True + + # SAM 2 predicts mask + self.sam2_predictor.set_image(img) + masks, scores, logits = self.sam2_predictor.predict( + point_coords=None, + point_labels=None, + box=input_boxes, + multimask_output=self.multimask_output, + ) + + # Sample best according to scores if multimask output + if self.multimask_output: + best = np.argmax(scores, axis=1) + masks = masks[np.arange(masks.shape[0]), best] + + # convert the shape to (n, H, W) + if masks.ndim == 4: + masks = masks.squeeze(1) + + # convert string labels to indexes based on the text prompt + categories = [] + for label in labels: + label_str = label.lower() + if label_str in self.category_names: + label_indx = self.category_names.index(label_str) + else: + label_indx = -1 # unknown + categories.append(label_indx) + categories = torch.tensor(categories) + + # convert masks to boolean + masks = masks.astype(bool) + masks = torch.tensor(masks) + + # use xyxy boxes + boxes = torch.tensor(input_boxes) + + return categories, masks, boxes, confidences + + +@register_config( + "instance_model", name="gdsam2", constructor=GDSam2InstanceSegmenterWrapper +) +@dataclasses.dataclass +class GDSam2InstanceSegmenterConfig(Config): + """Configuration for Grounded SAM 2 instance segmenter.""" + + text_prompt: str = "car. tire." 
+ sam2_checkpoint: str = "sam2.1_hiera_large.pt" + sam2_model_config: str = "sam2.1_hiera_l.yaml" + grounding_dino_config: str = "GroundingDINO_SwinT_OGC.py" + grounding_dino_checkpoint: str = "groundingdino_swint_ogc.pth" + box_threshold: float = 0.35 + text_threshold: float = 0.25 + multimask_output: bool = False + + @classmethod + def load(cls, filepath): + """Load config from file.""" + return Config.load(cls, filepath) \ No newline at end of file diff --git a/semantic_inference_ros/app/instance_segmentation_node b/semantic_inference_ros/app/instance_segmentation_node index eaf5f76..34394b1 100755 --- a/semantic_inference_ros/app/instance_segmentation_node +++ b/semantic_inference_ros/app/instance_segmentation_node @@ -135,7 +135,8 @@ class InstanceSegmentationNode(Node): confidences = ret.confidences # TODO: place holder directly from model, need to be replace by proper yaml - category_names = self._model.segmenter.model.names + # category_names = self._model.segmenter.model.names + category_names = self._model.category_names # Convert RGB to BGR for OpenCV vis_img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) diff --git a/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml b/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml new file mode 100644 index 0000000..e97db70 --- /dev/null +++ b/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml @@ -0,0 +1,12 @@ +--- +model: + instance_model: + type: gdsam2 + text_prompt: "person. bicycle. car. truck. bench. fire hydrant." + sam2_checkpoint: "sam2.1_hiera_large.pt" + sam2_model_config: "sam2.1_hiera_l.yaml" + grounding_dino_config: "GroundingDINO_SwinT_OGC.py" + grounding_dino_checkpoint: "groundingdino_swint_ogc.pth" + box_threshold: 0.35 + text_threshold: 0.25 + multimask_output: False \ No newline at end of file diff --git a/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml new file mode 100644 index 0000000..46fcac5 --- /dev/null +++ b/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml @@ -0,0 +1,27 @@ +--- +launch: + - arg: {name: gdsam2_env, default: $(env HOME)/environments/gdsam2, description: Path to instance segmentation environment} + - arg: {name: config_path, default: $(find-pkg-share semantic_inference_ros)/config/instance_segmentation/gdsam2.yaml, description: Configuration file for instance segmentation object detector} + - arg: {name: compressed_rgb, default: 'false', description: Triggers decompression for RGB stream} + - arg: {name: log-level, default: info, description: Set the ROS2 log level} + - node: + if: $(var compressed_rgb) + pkg: image_transport + exec: republish + name: decompress_rgb + param: + - {name: in_transport, value: compressed} + - {name: out_transport, value: raw} + remap: + - {from: in/compressed, to: /acl_jackal/forward/color/image_raw/compressed} + - {from: out, to: color/image_raw} + - pyenv_node: + pkg: semantic_inference_ros + exec: instance_segmentation_node + name: semantic_inference + pyenv: $(var gdsam2_env) + on_exit: shutdown + param: + - {name: config_path, value: $(var config_path), type: str} + args: >- + --ros-args --log-level $(var log-level) diff --git a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml index 261eee1..ff4c2ec 100644 --- a/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml +++ 
b/semantic_inference_ros/launch/instance_segmentation_yolov11.launch.yaml
@@ -1,6 +1,6 @@
 ---
 launch:
-  - arg: {name: instance_seg_env, default: /home/multyxu/environments/crisp, description: Path to instance segmentation environment}
+  - arg: {name: instance_seg_env, default: $(env HOME)/environments/crisp, description: Path to instance segmentation environment}
   - arg: {name: config_path, default: $(find-pkg-share semantic_inference_ros)/config/instance_segmentation/yolov11.yaml, description: Configuration file for instance segmentation object detector}
   - arg: {name: compressed_rgb, default: 'false', description: Triggers decompression for RGB stream}
   - arg: {name: log-level, default: info, description: Set the ROS2 log level}

From 3eec3e5b8a24c6b57ef2ab1693682f180dbaaf73 Mon Sep 17 00:00:00 2001
From: multyxu
Date: Fri, 14 Nov 2025 17:20:34 -0500
Subject: [PATCH 5/9] Add initial document, modify label space

---
 docs/instance_seg.md                                 | 70 +++++++++++++++++++
 .../config/instance_segmentation/gdsam2.yaml         |  2 +-
 2 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 docs/instance_seg.md

diff --git a/docs/instance_seg.md b/docs/instance_seg.md
new file mode 100644
index 0000000..74e1ae0
--- /dev/null
+++ b/docs/instance_seg.md
@@ -0,0 +1,70 @@
+# Instance Segmentation
+
+## Setting Up
+
+The instance segmentation interface works with and without ROS. For working with ROS, we assume you have already built your workspace with this repository in it beforehand (i.e., by running `colcon build`).
+
+> **Note**
+> If you intend only to use the instance segmentation interface, you may want to turn off building against TensorRT, which you can do as follows:
+> ```shell
+> colcon build --cmake-args --no-warn-unused-cli -DSEMANTIC_INFERENCE_USE_TRT=OFF
+> ```
+
+### Installing
+
+We assume you are using a virtual environment. You may want to install `virtualenv` (usually `sudo apt install python3-virtualenv`) if you haven't already.
+To set up a virtual environment for use with ROS:
+```shell
+python3 -m virtualenv -p /usr/bin/python3 --system-site-packages <path/to/environment>
+```
+Otherwise, omit the `--system-site-packages` option:
+```shell
+python3 -m virtualenv -p /usr/bin/python3 --download <path/to/environment>
+```
+
+Then, install `semantic_inference`:
+```shell
+cd <path/to/repository>
+source <path/to/environment>/bin/activate
+pip install ./semantic_inference[openset]  # note that the openset extra is required for open-set semantic segmentation
+```
+
+The above setup allows you to use `yolov11`; in order to use `grounded sam 2`, you have to install it manually:
+```shell
+# cd to your favorite path
+git clone -b more_gpu https://github.com/MultyXu/Grounded-SAM-2.git
+```
+Then follow its `README.md` to install gdsam2.
+
+

diff --git a/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml b/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml
index e97db70..6bf2649 100644
--- a/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml
+++ b/semantic_inference_ros/config/instance_segmentation/gdsam2.yaml
@@ -2,7 +2,7 @@ model:
   instance_model:
     type: gdsam2
-    text_prompt: "person. bicycle. car. truck. bench. fire hydrant."
+    text_prompt: "bench. car. trash bin. fire hydrant."
     sam2_checkpoint: "sam2.1_hiera_large.pt"
     sam2_model_config: "sam2.1_hiera_l.yaml"
     grounding_dino_config: "GroundingDINO_SwinT_OGC.py"

From fd19d3321013db6a60c2430dfd805711ea44d605 Mon Sep 17 00:00:00 2001
From: multyxu
Date: Tue, 18 Nov 2025 16:51:09 -0500
Subject: [PATCH 6/9] Minor changes before merging hungytae's commit

---
 semantic_inference/pyproject.toml                         | 2 +-
 .../launch/instance_segmentation_gdsam2.launch.yaml       | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/semantic_inference/pyproject.toml b/semantic_inference/pyproject.toml
index 29bf976..5d931f9 100644
--- a/semantic_inference/pyproject.toml
+++ b/semantic_inference/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
     "torchvision",
     "spark_config@git+https://github.com/MIT-SPARK/Spark-Config.git",
     "numpy<2",
+    "ultralytics",
 ]
 
 [tool.setuptools.packages.find]
@@ -41,7 +42,6 @@ semantic-inference = "semantic_inference.__main__:cli"
 [project.optional-dependencies]
 dev = ["pytest"]
 openset = [
-    "ultralytics",
     "clip@git+https://github.com/openai/CLIP.git",
     "open_clip_torch",
     "numpy >= 1.20",

diff --git a/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml b/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml
index 46fcac5..04a7098 100644
--- a/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml
+++ b/semantic_inference_ros/launch/instance_segmentation_gdsam2.launch.yaml
@@ -1,7 +1,8 @@
 ---
 launch:
   - arg: {name: gdsam2_env, default: $(env HOME)/environments/gdsam2, description: Path to instance segmentation environment}
-  - arg: {name: config_path, default: $(find-pkg-share semantic_inference_ros)/config/instance_segmentation/gdsam2.yaml, description: Configuration file for instance segmentation object detector}
+  - arg: {name: model_name, default: gdsam2, description: Name of the segmentation model to use}
+  - arg: {name: config_path, default: $(find-pkg-share semantic_inference_ros)/config/instance_segmentation/$(var model_name).yaml, description: Configuration file for instance segmentation object detector}
   - arg: {name: compressed_rgb, default: 'false', description: Triggers decompression for RGB stream}
   - arg: {name: log-level, default: info, description: Set the ROS2 log level}
   - node:

From a2cc0d5cd3d0fb657ac1b02fca66de9886489495 Mon Sep 17 00:00:00 2001
From: Multyxu
Date: Wed, 3 Dec 2025 12:10:41 -0500
Subject: [PATCH 7/9] Fix wrong comment line. Add more info to instance seg
 setup doc

---
 docs/instance_seg.md                                      | 2 ++
 .../python/semantic_inference/models/wrappers.py          | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/instance_seg.md b/docs/instance_seg.md
index 74e1ae0..219ae47 100644
--- a/docs/instance_seg.md
+++ b/docs/instance_seg.md
@@ -36,6 +36,8 @@ git clone -b more_gpu https://github.com/MultyXu/Grounded-SAM-2.git
 ```
 Then follow its `README.md` to install gdsam2.
 
+### Setup model
+Put (or symlink) `GroundingDINO_SwinT_OGC.py` under `~/.semantic_inference/gdsam2_config/`, and put `sam2.1_hiera_large.pt` and `groundingdino_swint_ogc.pth` under `~/.semantic_inference/`.
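+
+As a rough sketch, assuming the checkpoints were fetched with the download scripts in the Grounded-SAM-2 clone (the source paths below are only examples; adjust them to wherever the files actually live on your machine):
+```shell
+mkdir -p ~/.semantic_inference/gdsam2_config
+# replace these source paths with your own
+ln -s ~/Grounded-SAM-2/checkpoints/sam2.1_hiera_large.pt ~/.semantic_inference/
+ln -s ~/Grounded-SAM-2/gdino_checkpoints/groundingdino_swint_ogc.pth ~/.semantic_inference/
+ln -s ~/Grounded-SAM-2/grounding_dino/groundingdino/config/GroundingDINO_SwinT_OGC.py ~/.semantic_inference/gdsam2_config/
+```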
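+
+### Running
+With the model files in place, the node can be started through the provided launch file, e.g. (a sketch; it assumes your workspace is sourced, your launch frontend supports the `pyenv_node` extension used by the file, and `gdsam2_env` points at the virtual environment created above):
+```shell
+ros2 launch semantic_inference_ros instance_segmentation_gdsam2.launch.yaml gdsam2_env:=$HOME/environments/gdsam2
+```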
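+
+### Output format
+The node publishes the segmentation on `semantic/image_raw` as a `32SC1` image; each pixel packs the category id into the upper 16 bits and the (1-indexed) instance id into the lower 16 bits. A consumer can split the two as in this sketch, which mirrors `recover_instance_and_category` in the node:
+```python
+import numpy as np
+
+
+def split_labels(label_img: np.ndarray):
+    """Recover (instance_ids, category_ids) from a packed label image."""
+    instance_ids = (label_img & 0xFFFF).astype(np.uint16)  # lower 16 bits
+    category_ids = (label_img >> 16).astype(np.uint16)  # upper 16 bits
+    return instance_ids, category_ids
+```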