74 changes: 74 additions & 0 deletions docs/instance_seg.md
@@ -0,0 +1,74 @@
# Instance Segmentation

## Setting Up

The open-set segmentation interface works both with and without ROS. When working with ROS, we assume you have already built a workspace containing this repository (i.e., by running `colcon build`).

> **Note**<br>
> If you only intend to use the open-set segmentation interface, you may want to disable building against TensorRT, which you can do as follows:
> ```shell
> colcon build --cmake-args --no-warn-unused-cli -DSEMANTIC_INFERENCE_USE_TRT=OFF
> ```

### Installing

We assume you are using a virtual environment. You may want to install `virtualenv` (usually `sudo apt install python3-virtualenv`) if you haven't already.
To set up a virtual environment for use with ROS:
```shell
python3 -m virtualenv -p /usr/bin/python3 --system-site-packages <DESIRED_PATH_TO_ENVIRONMENT>
```
Otherwise, omit the `--system-site-packages` option:
```shell
python3 -m virtualenv -p /usr/bin/python3 --download <DESIRED_PATH_TO_ENVIRONMENT>
```

> NOTE: the virtual environment name currently defaults to `gdsam2`.

Then, install `semantic_inference`:
```shell
cd <PATH_TO_REPO>
source <PATH_TO_ENVIRONMENT>/bin/activate
pip install ./semantic_inference[openset] # note that the openset extra is required for open-set semantic segmentation
```
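
For a quick sanity check that the install picked up the expected packages, you can run something like the following inside the activated environment (a sketch; the module names come from the dependency lists in this repository):
```python
# Sanity check (sketch): these imports should all succeed inside the venv.
import clip          # pulled in by the [openset] extra
import ultralytics   # core dependency used by the yolov11 segmenter
import semantic_inference

print("semantic_inference loaded from", semantic_inference.__file__)
```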

The above setup allows you to use `yolov11`. To use `grounded sam 2`, you have to install it manually:
```shell
# cd to a directory of your choice; `~/.semantic_inference/` is the default
git clone -b more_gpu https://github.com/MultyXu/Grounded-SAM-2.git
```
Then follow its `README.md` to install gdsam2.

### Setting up the model

Put (or symlink) `GroundingDINO_SwinT_OGC.py` under `~/.semantic_inference/gdsam2_config/`, and put `sam2.1_hiera_large.pt` and `groundingdino_swint_ogc.pth` under `~/.semantic_inference/`.
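
For reference, a small check that the files are where they are expected (a sketch using only the paths listed above; adjust it if you place the files elsewhere):
```python
from pathlib import Path

base = Path.home() / ".semantic_inference"
expected = [
    base / "gdsam2_config" / "GroundingDINO_SwinT_OGC.py",
    base / "sam2.1_hiera_large.pt",
    base / "groundingdino_swint_ogc.pth",
]
for path in expected:
    print(("found   " if path.exists() else "MISSING ") + str(path))
```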

<!-- ## Models

Note that both CLIP and FastSAM automatically download the relevant model weights when they are first run.
Running with the original SAM may require downloading the model weights. See the official SAM repository [here](https://github.com/facebookresearch/segment-anything) for more details.

## Trying out open-set segmentation nodes

Similar to the example [here](../README.md#usage), you can run any of the open-set launch files:

```shell
source <PATH_TO_ENVIRONMENT>/bin/activate
## this example just produces an embedding vector per image
# ros2 launch semantic_inference_ros image_embedding_node.launch.yaml
ros2 launch semantic_inference_ros open_set.launch.yaml
```
and then run
```shell
ros2 bag play PATH_TO_BAG --remap INPUT_TOPIC:=/color/image_raw
```

You should see a single embedding vector published under `/semantic/feature` and (if running the full open-set segmenter), the segmentation results under `/semantic/image_raw` and a visualization of the results under `/semantic_color/image_raw` and `/semantic_overlay/image_raw`.

## Using open-set segmentation online

To use the open-set segmentation as part of a larger system, include [open_set.launch.yaml](../semantic_inference_ros/launch/open_set.launch.yaml) in your launch file. Often this will look like this:
```yaml
launch:
# ... rest of launch file ...
- set_remap: {from: "color/image_raw", to: "YOUR_INPUT_TOPIC_HERE"}
- include: {file: "$(find-pkg-share semantic_inference_ros)/launch/open_set.launch.yaml"}
``` -->
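
Once the weights are in place, the instance segmentation interface added in this change can be exercised directly from Python. A minimal sketch (assumptions: `InstanceSegmenter` is re-exported from `semantic_inference.models` via the updated `__init__.py`, and the default `yolov11` weights are available or auto-downloaded by `ultralytics`):
```python
import numpy as np

from semantic_inference.models import InstanceSegmenter

# Construct with the default config; instance_model defaults to "yolov11".
segmenter = InstanceSegmenter.construct()

# Stand-in for a real RGB frame; replace with an actual uint8 image.
rgb = np.zeros((480, 640, 3), dtype=np.uint8)

results = segmenter.segment(rgb, is_rgb_order=True)
print("detections:", results.masks.shape[0])
if results.masks.shape[0] > 0:
    # uint32 image where each pixel packs (category_id << 16) | instance_id
    print(results.instance_seg_img.shape, results.instance_seg_img.dtype)
```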
2 changes: 1 addition & 1 deletion semantic_inference/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
"torchvision",
"spark_config@git+https://github.com/MIT-SPARK/Spark-Config.git",
"numpy<2",
"ultralytics",
]

[tool.setuptools.packages.find]
@@ -41,7 +42,6 @@ semantic-inference = "semantic_inference.__main__:cli"
[project.optional-dependencies]
dev = ["pytest"]
openset = [
"ultralytics",
"clip@git+https://github.com/openai/CLIP.git",
"open_clip_torch",
"numpy >= 1.20",
@@ -30,6 +30,7 @@
import torch

from semantic_inference.models.feature_visualizers import *
from semantic_inference.models.instance_segmenter import *
from semantic_inference.models.mask_functions import *
from semantic_inference.models.openset_segmenter import *
from semantic_inference.models.patch_extractor import *
@@ -0,0 +1,159 @@
# BSD 3-Clause License
#
# Copyright (c) 2021-2024, Massachusetts Institute of Technology.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
"""Model to segment an image and encode segments with CLIP embeddings."""

import dataclasses
from dataclasses import dataclass
from typing import Any

import numpy as np
import torch
from spark_config import Config, config_field
from torch import nn


def _map_opt(values, f):
return {k: v if v is None else f(v) for k, v in values.items()}


@dataclass
class Results:
"""Openset Segmentation Results."""

# all on cuda/tensor device? TODO: Maybe should move to cpu by default
masks: torch.Tensor # (n, H, W), torch.bool
boxes: torch.Tensor # (n, 4) xyxy format, torch.float32
categories: torch.Tensor # (n,), torch.float32/int64 (doesn't matter)
confidences: torch.Tensor # (n,), torch.float32

@property
def instance_seg_img(self):
"""
Convert segmentation results to instance segmentation image.
Each pixel value encodes both category id and instance id.
First 16 bits are category id, last 16 bits are instance id.
"""
masks = self.masks.cpu().numpy()
category_ids = self.categories.cpu().numpy()
img = np.zeros(masks[0].shape, dtype=np.uint32)
for i in range(masks.shape[0]):
            category_id = int(category_ids[i])  # category ids are 0-indexed
instance_id = i + 1 # instance ids are 1-indexed
combined_id = (
category_id << 16
) | instance_id # combine into single uint32
img[masks[i, ...] > 0] = combined_id

return img

def cpu(self):
"""Move results to CPU."""
values = dataclasses.asdict(self)
return Results(**_map_opt(values, lambda v: v.cpu()))

def to(self, *args, **kwargs):
"""Forward to to all tensors."""
values = dataclasses.asdict(self)
return Results(**_map_opt(values, lambda v: v.to(*args, **kwargs)))


@dataclass
class InstanceSegmenterConfig(Config):
"""Main config for instance segmenter."""

instance_model: Any = config_field("instance_model", default="yolov11")
# relevant configs (model path, model weights) for the model
> **Collaborator:** (minor) not needed at the moment?

> **Collaborator:** Also, looking at the yolov11 wrapper, I'd consider adding a minimum confidence score for the detected objects; I'd assume most instance segmenters return some sort of 0-1 confidence score per mask.
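
Along those lines, one possible shape for such a confidence filter over the `Results` type defined above (a sketch only; `filter_by_confidence` and the 0.5 default threshold are hypothetical, not part of this change):
```python
from semantic_inference.models.instance_segmenter import Results


def filter_by_confidence(results: Results, min_confidence: float = 0.5) -> Results:
    """Keep only detections whose confidence meets the threshold."""
    keep = results.confidences >= min_confidence  # boolean mask over the n detections
    return Results(
        masks=results.masks[keep],
        boxes=results.boxes[keep],
        categories=results.categories[keep],
        confidences=results.confidences[keep],
    )
```
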
class InstanceSegmenter(nn.Module):
"""Module to segment and encode an image."""

def __init__(self, config):
"""Construct an instance segmenter."""
super().__init__()
# for detecting model device
self._canary_param = nn.Parameter(torch.empty(0))

self.config = config
self.segmenter = self.config.instance_model.create()

def eval(self):
"""
        Override eval to avoid issues with certain models.
"""
self.segmenter.eval()

@classmethod
def construct(cls, **kwargs):
"""Load model from configuration dictionary."""
config = InstanceSegmenterConfig()
config.update(kwargs)
return cls(config)

@torch.no_grad()
def segment(self, rgb_img, is_rgb_order=True):
"""
        Segment an image into instance masks.

Args:
            rgb_img (np.ndarray): uint8 image of shape (R, C, 3) in RGB order
is_rgb_order (bool): whether the image is rgb order or not

Returns:
            Results with masks, boxes, categories, and confidences per detection
"""
img = rgb_img if is_rgb_order else rgb_img[:, :, ::-1].copy()
return self(img)

@property
def device(self):
"""Get current model device."""
return self._canary_param.device

@property
def category_names(self):
"""Get category names."""
return self.segmenter.category_names

def forward(self, rgb_img):
"""
        Segment an image into instance masks.

Args:
            rgb_img (np.ndarray): uint8 image of shape (R, C, 3) in RGB order

Returns:
            Results with masks, boxes, categories, and confidences per detection
"""
categories, masks, boxes, confidences = self.segmenter(rgb_img)

return Results(
masks=masks, boxes=boxes, categories=categories, confidences=confidences
)