From c586171a39f5b0ce56137e88695dc0d265ce2b33 Mon Sep 17 00:00:00 2001 From: sinha7y <61509723+sinha7y@users.noreply.github.com> Date: Tue, 6 Jan 2026 16:11:38 -0800 Subject: [PATCH 01/13] added Agentic Detection to navigate to objects registered in ObjectDB Former-commit-id: 94ad22c3a7712302b7e39f8758fb32b5e368d2fb --- dimos/agents/skills/navigation.py | 58 +++++++++++++++++++ dimos/perception/detection/moduleDB.py | 19 +++++- dimos/robot/all_blueprints.py | 1 + .../unitree_webrtc/unitree_go2_blueprints.py | 11 ++-- .../package-lock.json | 23 +++++++- 5 files changed, 104 insertions(+), 8 deletions(-) diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py index 054246d6ee..ad29ce0e32 100644 --- a/dimos/agents/skills/navigation.py +++ b/dimos/agents/skills/navigation.py @@ -52,6 +52,7 @@ class NavigationSkillContainer(SkillModule): "WavefrontFrontierExplorer.stop_exploration", "WavefrontFrontierExplorer.explore", "WavefrontFrontierExplorer.is_exploration_active", + "ObjectDBModule.lookup", ] color_image: In[Image] @@ -113,6 +114,63 @@ def tag_location(self, location_name: str) -> str: logger.info(f"Tagged {location}") return f"Tagged '{location_name}': ({position.x},{position.y})." + @skill() + def navigate_to_detected_object(self, object_name: str) -> str: + """Navigate to an object that was detected by the vision system. + + This uses ObjectDB to find objects that were previously seen and tracked. + Only use this for specific objects like "cup", "red box", "person", "chair". + + The object must have been previously detected by the camera system. + + Args: + object_name: Name/class of the object to navigate to + + Returns: + Success or failure message + """ + if not self._skill_started: + raise ValueError(f"{self} has not been started.") + + try: + lookup_rpc = self.get_rpc_calls("ObjectDBModule.lookup") + except Exception: + logger.error("ObjectDBModule not connected") + return "Error: ObjectDB module not connected. Make sure detection blueprint is loaded." + + try: + objects = lookup_rpc(object_name) + except Exception as e: + logger.error(f"Failed to query ObjectDB for '{object_name}': {e}") + return f"Error querying ObjectDB for '{object_name}'" + + if not objects or len(objects) == 0: + logger.warning(f"No objects matching '{object_name}' found in ObjectDB") + return f"No detected object matching '{object_name}'. The object may not have been seen yet." + + if len(objects) > 1: + logger.info( + f"Found {len(objects)} instances of '{object_name}', navigating to first one" + ) + + # Get pose from first matching object + try: + pose = objects[0].to_pose() + except Exception as e: + logger.error(f"Failed to get pose for object '{object_name}': {e}") + return f"Error: Could not determine location of '{object_name}'" + + logger.info( + f"Navigating to detected '{object_name}' at ({pose.position.x:.2f}, {pose.position.y:.2f}, {pose.position.z:.2f})" + ) + + result = self._navigate_to(pose) + + if result: + return f"Successfully navigated to '{object_name}'" + else: + return f"Failed to reach '{object_name}'. Navigation was cancelled or failed." + @skill() def navigate_with_text(self, query: str) -> str: """Navigate to a location by querying the existing semantic map using natural language. 
diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py index c37dff8dea..52c9a808fe 100644 --- a/dimos/perception/detection/moduleDB.py +++ b/dimos/perception/detection/moduleDB.py @@ -267,9 +267,22 @@ def agent_encode(self) -> str: # return ret[0] if ret else None - def lookup(self, label: str) -> list[Detection3DPC]: - """Look up a detection by label.""" - return [] + @rpc + def lookup(self, label: str) -> list[Object3D]: + """Look up objects by label/name. + + Args: + label: Name/class of object to search for + + Returns: + List of Object3D instances matching the label + """ + matching = [] + for obj in self.objects.values(): + if obj.name and label.lower() in obj.name.lower(): + if obj.detections >= 3: # Filter out noise + matching.append(obj) + return matching @rpc def stop(self): # type: ignore[no-untyped-def] diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index f989098f05..53e29d949c 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -36,6 +36,7 @@ "unitree-g1-joystick": "dimos.robot.unitree_webrtc.unitree_g1_blueprints:with_joystick", "unitree-g1-full": "dimos.robot.unitree_webrtc.unitree_g1_blueprints:full_featured", "unitree-g1-detection": "dimos.robot.unitree_webrtc.unitree_g1_blueprints:detection", + "unitree-go2-agentic-detection": "dimos.robot.unitree_webrtc.unitree_go2_blueprints:agentic_with_detection", # xArm manipulator blueprints "xarm-servo": "dimos.hardware.manipulators.xarm.xarm_blueprints:xarm_servo", "xarm5-servo": "dimos.hardware.manipulators.xarm.xarm_blueprints:xarm5_servo", diff --git a/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py b/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py index 7629644ed6..bd88cf58a7 100644 --- a/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py +++ b/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py @@ -168,10 +168,13 @@ _common_agentic, ) -agentic_mcp = autoconnect( - agentic, - MCPModule.blueprint(), -) +agentic_detection = autoconnect( + detection, # Has: nav + detectionDB_module + remappings + transports + spatial_memory(), + utilization(), + llm_agent(), + _common_agentic, +).global_config(n_dask_workers=8) agentic_ollama = autoconnect( spatial, diff --git a/dimos/web/command-center-extension/package-lock.json b/dimos/web/command-center-extension/package-lock.json index 09f9be88b4..48bfef6d0c 100644 --- a/dimos/web/command-center-extension/package-lock.json +++ b/dimos/web/command-center-extension/package-lock.json @@ -61,6 +61,7 @@ "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.28.5.tgz", "integrity": "sha512-e7jT4DxYvIDLk1ZHmU/m/mB19rex9sv0c2ftBtjSBv+kVM/902eh0fINUzD7UwLLNR+jU585GxUJ8/EBfAM5fw==", "dev": true, + "peer": true, "dependencies": { "@babel/code-frame": "^7.27.1", "@babel/generator": "^7.28.5", @@ -1842,6 +1843,7 @@ "integrity": "sha512-FXx2pKgId/WyYo2jXw63kk7/+TY7u7AziEJxJAnSFzHlqTAS3Ync6SvgYAN/k4/PQpnnVuzoMuVnByKK2qp0ag==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@types/estree": "*", "@types/json-schema": "*" @@ -1942,6 +1944,7 @@ "integrity": "sha512-0dLEBsA1kI3OezMBF8nSsb7Nk19ZnsyE1LLhB8r27KbgU5H4pvuqZLdtE+aUkJVoXgTVuA+iLIwmZ0TuK4tx6A==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@types/prop-types": "*", "csstype": "^3.0.2" @@ -1963,6 +1966,7 @@ "integrity": "sha512-w/EboPlBwnmOBtRbiOvzjD+wdiZdgFeo17lkltrtn7X37vagKKWJABvyfsJXTlHe6XBzugmYgd4A4nW+k8Mixw==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { 
"@eslint-community/regexpp": "^4.10.0", "@typescript-eslint/scope-manager": "8.40.0", @@ -2003,6 +2007,7 @@ "integrity": "sha512-jCNyAuXx8dr5KJMkecGmZ8KI61KBUhkCob+SD+C+I5+Y1FWI2Y3QmY4/cxMCC5WAsZqoEtEETVhUiUMIGCf6Bw==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.40.0", "@typescript-eslint/types": "8.40.0", @@ -2403,6 +2408,7 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", + "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -2426,6 +2432,7 @@ "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", @@ -2784,6 +2791,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001735", "electron-to-chromium": "^1.5.204", @@ -3353,6 +3361,7 @@ "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz", "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", "license": "ISC", + "peer": true, "engines": { "node": ">=12" } @@ -3950,6 +3959,7 @@ "integrity": "sha512-RNCHRX5EwdrESy3Jc9o8ie8Bog+PeYvvSR8sDGoZxNFTvZ4dlxUB3WzQ3bQMztFrSRODGrLLj8g6OFuGY/aiQg==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.12.1", @@ -4011,6 +4021,7 @@ "integrity": "sha512-iI1f+D2ViGn+uvv5HuHVUamg8ll4tN+JRHGc6IJi4TP9Kl976C57fzPXgseXNs8v0iA8aSJpHsTWjDb9QJamGQ==", "dev": true, "license": "MIT", + "peer": true, "bin": { "eslint-config-prettier": "bin/cli.js" }, @@ -5807,7 +5818,8 @@ "node_modules/leaflet": { "version": "1.9.4", "resolved": "https://registry.npmjs.org/leaflet/-/leaflet-1.9.4.tgz", - "integrity": "sha512-nxS1ynzJOmOlHp+iL3FyWqK89GtNL8U8rvlMOsQdTTssxZwCXh8N2NB3GDQOL+YR3XnWyZAxwQixURb+FA74PA==" + "integrity": "sha512-nxS1ynzJOmOlHp+iL3FyWqK89GtNL8U8rvlMOsQdTTssxZwCXh8N2NB3GDQOL+YR3XnWyZAxwQixURb+FA74PA==", + "peer": true }, "node_modules/levn": { "version": "0.4.1", @@ -6520,6 +6532,7 @@ "integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==", "dev": true, "license": "MIT", + "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, @@ -6608,6 +6621,7 @@ "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", "license": "MIT", + "peer": true, "dependencies": { "loose-envify": "^1.1.0" }, @@ -6620,6 +6634,7 @@ "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", "license": "MIT", + "peer": true, "dependencies": { "loose-envify": "^1.1.0", "scheduler": "^0.23.2" @@ -7697,6 +7712,7 @@ "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", @@ -7793,6 +7809,7 @@ "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz", "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, + "peer": true, "engines": { "node": ">=12" }, @@ 
-7997,6 +8014,7 @@
       "integrity": "sha512-CWBzXQrc/qOkhidw1OzBTQuYRbfyxDXJMVJ1XNwUHGROVmuaeiEm3OslpZ1RV96d7SKKjZKrSJu3+t/xlw3R9A==",
       "dev": true,
       "license": "Apache-2.0",
+      "peer": true,
       "bin": {
         "tsc": "bin/tsc",
         "tsserver": "bin/tsserver"
@@ -8115,6 +8133,7 @@
       "resolved": "https://registry.npmjs.org/vite/-/vite-6.4.1.tgz",
       "integrity": "sha512-+Oxm7q9hDoLMyJOYfUYBuHQo+dkAloi33apOPP56pzj+vsdJDzr+j1NISE5pyaAuKL4A3UD34qd0lx5+kfKp2g==",
       "dev": true,
+      "peer": true,
       "dependencies": {
         "esbuild": "^0.25.0",
         "fdir": "^6.4.4",
@@ -8206,6 +8225,7 @@
       "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.3.tgz",
       "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
       "dev": true,
+      "peer": true,
       "engines": {
         "node": ">=12"
       },
@@ -8240,6 +8260,7 @@
       "integrity": "sha512-l2LlBSvVZGhL4ZrPwyr8+37AunkcYj5qh8o6u2/2rzoPc8gxFJkLj1WxNgooi9pnoc06jh0BjuXnamM4qlujZA==",
       "dev": true,
       "license": "MIT",
+      "peer": true,
       "dependencies": {
         "@types/eslint-scope": "^3.7.7",
         "@types/estree": "^1.0.6",

From 3fdd9643ecffbdf361088531aae892dd03c917c7 Mon Sep 17 00:00:00 2001
From: Dimensional
Date: Tue, 6 Jan 2026 18:28:27 -0800
Subject: [PATCH 02/13] Minor changes after testing

Former-commit-id: 8f0368b2754b15dcf77fc88ca512b7bcd4612578
---
 dimos/agents/skills/navigation.py | 2 +-
 dimos/robot/all_blueprints.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py
index ad29ce0e32..6e4a45e724 100644
--- a/dimos/agents/skills/navigation.py
+++ b/dimos/agents/skills/navigation.py
@@ -116,7 +116,7 @@ def tag_location(self, location_name: str) -> str:
 
     @skill()
     def navigate_to_detected_object(self, object_name: str) -> str:
-        """Navigate to an object that was detected by the vision system.
+        """Navigate to an object or person that was detected by the vision system.
 
         This uses ObjectDB to find objects that were previously seen and tracked.
         Only use this for specific objects like "cup", "red box", "person", "chair".
diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py
index 53e29d949c..75cd5652f4 100644
--- a/dimos/robot/all_blueprints.py
+++ b/dimos/robot/all_blueprints.py
@@ -36,7 +36,7 @@
     "unitree-g1-joystick": "dimos.robot.unitree_webrtc.unitree_g1_blueprints:with_joystick",
     "unitree-g1-full": "dimos.robot.unitree_webrtc.unitree_g1_blueprints:full_featured",
     "unitree-g1-detection": "dimos.robot.unitree_webrtc.unitree_g1_blueprints:detection",
-    "unitree-go2-agentic-detection": "dimos.robot.unitree_webrtc.unitree_go2_blueprints:agentic_with_detection",
+    "unitree-go2-agentic-detection": "dimos.robot.unitree_webrtc.unitree_go2_blueprints:agentic_detection",
     # xArm manipulator blueprints
     "xarm-servo": "dimos.hardware.manipulators.xarm.xarm_blueprints:xarm_servo",
     "xarm5-servo": "dimos.hardware.manipulators.xarm.xarm_blueprints:xarm5_servo",

From d081bc9f9ec6e72328d6b2e9b028767edef2f25f Mon Sep 17 00:00:00 2001
From: sinha7y <61509723+sinha7y@users.noreply.github.com>
Date: Tue, 6 Jan 2026 20:36:32 -0800
Subject: [PATCH 03/13] updated debugging for spatial perception
Former-commit-id: d62355a2fbaa50951b120ccd64f6295647abc4ea
---
 dimos/perception/detection/moduleDB.py | 2 +-
 dimos/perception/spatial_perception.py | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py
index 52c9a808fe..8c844454ba 100644
--- a/dimos/perception/detection/moduleDB.py
+++ b/dimos/perception/detection/moduleDB.py
@@ -280,7 +280,7 @@ def lookup(self, label: str) -> list[Object3D]:
         matching = []
         for obj in self.objects.values():
             if obj.name and label.lower() in obj.name.lower():
-                if obj.detections >= 3:  # Filter out noise
+                if obj.detections >= 1:  # Filter out noise
                     matching.append(obj)
         return matching
diff --git a/dimos/perception/spatial_perception.py b/dimos/perception/spatial_perception.py
index 013d242ba8..d2632c827d 100644
--- a/dimos/perception/spatial_perception.py
+++ b/dimos/perception/spatial_perception.py
@@ -189,6 +189,7 @@ def start(self) -> None:
 
         # Subscribe to LCM streams
         def set_video(image_msg: Image) -> None:
+            logger.info("Received camera frame")
             # Convert Image message to numpy array
             if hasattr(image_msg, "data"):
                 frame = image_msg.data
@@ -218,8 +219,15 @@ def _process_frame(self) -> None:
         """Process the latest frame with pose data if available."""
         tf = self.tf.get("map", "base_link")
         if self._latest_video_frame is None or tf is None:
+            logger.warning(" No video frame available")
             return
 
+        if tf is None:
+            logger.warning("No TF transform (map -> base_link) available")
+            return
+
+        logger.info(f"Processing frame with TF at ({tf.translation.x:.2f}, {tf.translation.y:.2f})")
+
         # Create Pose object with position and orientation
         current_pose = tf.to_pose()
 

From f3aa43e76ed39ef9b99cf2cb8e0956a399d8ca0f Mon Sep 17 00:00:00 2001
From: Dimensional
Date: Tue, 6 Jan 2026 22:15:47 -0800
Subject: [PATCH 04/13] spatial memory working

Former-commit-id: bd20bd7c84b670bb60c7fca732bd2ea2dd3a5615
---
 dimos/agents/skills/navigation.py      | 4 ++--
 dimos/perception/spatial_perception.py | 8 ++++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py
index 6e4a45e724..da7670c8dc 100644
--- a/dimos/agents/skills/navigation.py
+++ b/dimos/agents/skills/navigation.py
@@ -94,7 +94,7 @@ def tag_location(self, location_name: str) -> str:
         if not self._skill_started:
             raise ValueError(f"{self} has not been started.")
 
-        tf = self.tf.get("map", "base_link", time_tolerance=2.0)
+        tf = self.tf.get("world", "base_link", time_tolerance=2.0)
         if not tf:
             return "Could not get the robot's current transform."
 
@@ -114,7 +114,7 @@ def tag_location(self, location_name: str) -> str:
         logger.info(f"Tagged {location}")
         return f"Tagged '{location_name}': ({position.x},{position.y})."
 
-    @skill()
+    #@skill()
     def navigate_to_detected_object(self, object_name: str) -> str:
         """Navigate to an object or person that was detected by the vision system.
diff --git a/dimos/perception/spatial_perception.py b/dimos/perception/spatial_perception.py index d2632c827d..0e6c69ea75 100644 --- a/dimos/perception/spatial_perception.py +++ b/dimos/perception/spatial_perception.py @@ -33,6 +33,7 @@ from dimos.agents_deprecated.memory.visual_memory import VisualMemory from dimos.constants import DIMOS_PROJECT_ROOT from dimos.core import DimosCluster, In, Module, rpc +from dimos.core.skill_module import SkillModule from dimos.msgs.sensor_msgs import Image from dimos.types.robot_location import RobotLocation from dimos.utils.logging_config import setup_logger @@ -50,7 +51,7 @@ logger = setup_logger() -class SpatialMemory(Module): +class SpatialMemory(SkillModule): """ A Dask module for building and querying Robot spatial memory. @@ -186,6 +187,9 @@ def __init__( @rpc def start(self) -> None: super().start() + _ = self.tf + import time + time.sleep(1.0) # Subscribe to LCM streams def set_video(image_msg: Image) -> None: @@ -217,7 +221,7 @@ def stop(self) -> None: def _process_frame(self) -> None: """Process the latest frame with pose data if available.""" - tf = self.tf.get("map", "base_link") + tf = self.tf.get("world", "base_link") if self._latest_video_frame is None or tf is None: logger.warning(" No video frame available") return From a9b722d23868167d7cb5c7f7fca3371e8d620576 Mon Sep 17 00:00:00 2001 From: Dimensional Date: Wed, 7 Jan 2026 21:34:37 -0800 Subject: [PATCH 05/13] function: spatial memory and detection-based navigation Former-commit-id: 908c2056a26689e8710e558773bf0e2188f3e8c2 --- dimos/agents/skills/navigation.py | 123 +++++++++--------- .../memory/spatial_vector_db.py | 6 +- dimos/perception/detection/moduleDB.py | 60 ++++++++- dimos/perception/spatial_perception.py | 18 +-- 4 files changed, 124 insertions(+), 83 deletions(-) diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py index da7670c8dc..95d01dbb7b 100644 --- a/dimos/agents/skills/navigation.py +++ b/dimos/agents/skills/navigation.py @@ -31,7 +31,6 @@ logger = setup_logger() - class NavigationSkillContainer(SkillModule): _latest_image: Image | None = None _latest_odom: PoseStamped | None = None @@ -114,95 +113,95 @@ def tag_location(self, location_name: str) -> str: logger.info(f"Tagged {location}") return f"Tagged '{location_name}': ({position.x},{position.y})." - #@skill() + @skill() def navigate_to_detected_object(self, object_name: str) -> str: """Navigate to an object or person that was detected by the vision system. - + This uses ObjectDB to find objects that were previously seen and tracked. Only use this for specific objects like "cup", "red box", "person", "chair". - The object must have been previously detected by the camera system. - + Args: object_name: Name/class of the object to navigate to - + Returns: Success or failure message """ if not self._skill_started: raise ValueError(f"{self} has not been started.") - + + # Get RPC handle try: lookup_rpc = self.get_rpc_calls("ObjectDBModule.lookup") except Exception: logger.error("ObjectDBModule not connected") return "Error: ObjectDB module not connected. Make sure detection blueprint is loaded." 
- + + # Query ObjectDB try: objects = lookup_rpc(object_name) except Exception as e: - logger.error(f"Failed to query ObjectDB for '{object_name}': {e}") return f"Error querying ObjectDB for '{object_name}'" - + + # Check if we found anything if not objects or len(objects) == 0: - logger.warning(f"No objects matching '{object_name}' found in ObjectDB") return f"No detected object matching '{object_name}'. The object may not have been seen yet." - + if len(objects) > 1: logger.info( f"Found {len(objects)} instances of '{object_name}', navigating to first one" ) - - # Get pose from first matching object + + # Extract pose from dict (not Object3D anymore!) try: - pose = objects[0].to_pose() + obj = objects[0] # First match (now a dict) + + # Create pose from dict fields + goal_pose = PoseStamped( + position=make_vector3(obj["pos_x"], obj["pos_y"], obj["pos_z"]), + orientation=Quaternion(), + frame_id=obj["frame_id"] + ) + + logger.info( + f"Navigating to detected '{obj['name']}' at ({obj['pos_x']:.2f}, {obj['pos_y']:.2f}, {obj['pos_z']:.2f})" + ) + logger.info( + f"Object info: {obj['detections']} detections, confidence: {obj['confidence']:.2f}, last seen: {obj['last_seen']:.1f}s ago" + ) + + except (KeyError, TypeError) as e: + return f"Error: Could not determine location of '{object_name}'" except Exception as e: - logger.error(f"Failed to get pose for object '{object_name}': {e}") return f"Error: Could not determine location of '{object_name}'" - - logger.info( - f"Navigating to detected '{object_name}' at ({pose.position.x:.2f}, {pose.position.y:.2f}, {pose.position.z:.2f})" - ) - - result = self._navigate_to(pose) - + + # Navigate to the object + result = self._navigate_to(goal_pose) + if result: return f"Successfully navigated to '{object_name}'" else: return f"Failed to reach '{object_name}'. Navigation was cancelled or failed." - - @skill() + + #@skill() def navigate_with_text(self, query: str) -> str: - """Navigate to a location by querying the existing semantic map using natural language. - - First attempts to locate an object in the robot's camera view using vision. - If the object is found, navigates to it. If not, falls back to querying the - semantic map for a location matching the description. - CALL THIS SKILL FOR ONE SUBJECT AT A TIME. For example: "Go to the person wearing a blue shirt in the living room", - you should call this skill twice, once for the person wearing a blue shirt and once for the living room. - Args: - query: Text query to search for in the semantic map - """ - + """Navigate to a location by querying the existing semantic map...""" if not self._skill_started: raise ValueError(f"{self} has not been started.") + success_msg = self._navigate_by_tagged_location(query) if success_msg: return success_msg - - logger.info(f"No tagged location found for {query}") - + success_msg = self._navigate_to_object(query) if success_msg: return success_msg - - logger.info(f"No object in view found for {query}") - + success_msg = self._navigate_using_semantic_map(query) if success_msg: return success_msg - - return f"No tagged location called '{query}'. No object in view matching '{query}'. No matching location found in semantic map for '{query}'." 
+ + return f"Could not find '{query}' using any method" def _navigate_by_tagged_location(self, query: str) -> str | None: try: @@ -220,7 +219,7 @@ def _navigate_by_tagged_location(self, query: str) -> str | None: goal_pose = PoseStamped( position=make_vector3(*robot_location.position), orientation=Quaternion.from_euler(Vector3(*robot_location.rotation)), - frame_id="map", + frame_id="world", ) result = self._navigate_to(goal_pose) @@ -330,30 +329,30 @@ def _get_bbox_for_current_frame(self, query: str) -> BBox | None: return get_object_bbox_from_image(self._vl_model, self._latest_image, query) def _navigate_using_semantic_map(self, query: str) -> str: - try: - query_by_text_rpc = self.get_rpc_calls("SpatialMemory.query_by_text") - except Exception: - return "Error: The SpatialMemory module is not connected." + try: + query_by_text_rpc = self.get_rpc_calls("SpatialMemory.query_by_text") + except Exception: + return "Error: The SpatialMemory module is not connected." - results = query_by_text_rpc(query) + results = query_by_text_rpc(query) - if not results: - return f"No matching location found in semantic map for '{query}'" + if not results: + return f"No matching location found in semantic map for '{query}'" - best_match = results[0] + best_match = results[0] - goal_pose = self._get_goal_pose_from_result(best_match) + goal_pose = self._get_goal_pose_from_result(best_match) - print("Goal pose for semantic nav:", goal_pose) - if not goal_pose: - return f"Found a result for '{query}' but it didn't have a valid position." + print("Goal pose for semantic nav:", goal_pose) + if not goal_pose: + return f"Found a result for '{query}' but it didn't have a valid position." - result = self._navigate_to(goal_pose) + result = self._navigate_to(goal_pose) - if not result: - return f"Failed to navigate for '{query}'" + if not result: + return f"Failed to navigate for '{query}'" - return f"Successfuly arrived at '{query}'" + return f"Successfuly arrived at '{query}'" @skill() def follow_human(self, person: str) -> str: @@ -451,7 +450,7 @@ def _get_goal_pose_from_result(self, result: dict[str, Any]) -> PoseStamped | No return PoseStamped( position=make_vector3(pos_x, pos_y, 0), orientation=Quaternion.from_euler(make_vector3(0, 0, theta)), - frame_id="map", + frame_id="world", ) diff --git a/dimos/agents_deprecated/memory/spatial_vector_db.py b/dimos/agents_deprecated/memory/spatial_vector_db.py index 0c8774cd95..1f02bb7ebf 100644 --- a/dimos/agents_deprecated/memory/spatial_vector_db.py +++ b/dimos/agents_deprecated/memory/spatial_vector_db.py @@ -132,7 +132,7 @@ def add_image_vector( ids=[vector_id], embeddings=[embedding.tolist()], metadatas=[metadata] ) - logger.info(f"Added image vector {vector_id} with metadata: {metadata}") + #logger.info(f"Added image vector {vector_id} with metadata: {metadata}") def query_by_embedding(self, embedding: np.ndarray, limit: int = 5) -> list[dict]: # type: ignore[type-arg] """ @@ -225,8 +225,8 @@ def _process_query_results(self, results) -> list[dict]: # type: ignore[no-unty ) # Get the image from visual memory - image = self.visual_memory.get(lookup_id) - result["image"] = image + #image = self.visual_memory.get(lookup_id) + #result["image"] = image processed_results.append(result) diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py index 8c844454ba..d6184f3cae 100644 --- a/dimos/perception/detection/moduleDB.py +++ b/dimos/perception/detection/moduleDB.py @@ -30,6 +30,9 @@ from dimos.perception.detection.module3D import 
Detection3DModule from dimos.perception.detection.type import ImageDetections3DPC, TableStr from dimos.perception.detection.type.detection3d import Detection3DPC +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() # Represents an object in space, as collection of 3d detections over time @@ -267,21 +270,64 @@ def agent_encode(self) -> str: # return ret[0] if ret else None + @rpc - def lookup(self, label: str) -> list[Object3D]: + def lookup(self, label: str, min_detections: int = 0.5) -> list[dict]: """Look up objects by label/name. - + + Returns lightweight dict instead of full Object3D to avoid RPC timeout. + Args: - label: Name/class of object to search for - + label: Name/class to search for + min_detections: Minimum number of detections required (default: 1) + Returns: - List of Object3D instances matching the label + List of dicts with object info (track_id, name, position, etc.) """ + import time + + # DEBUG: Log search details + logger.error(f"Lookup called for '{label}'") + logger.error(f"Total objects in DB: {len(self.objects)}") + + # DEBUG: Log ALL object names in database + all_names = [obj.name for obj in self.objects.values() if obj.name] + logger.error(f"All object names in DB: {all_names}") + matching = [] + for obj in self.objects.values(): + # DEBUG: Log each comparison + if obj.name: + logger.debug(f" Checking: '{obj.name}' vs '{label}' (detections: {obj.detections})") + + # Check name match if obj.name and label.lower() in obj.name.lower(): - if obj.detections >= 1: # Filter out noise - matching.append(obj) + + # Check detection threshold + if obj.detections >= min_detections: + + # Create lightweight dict (NO pointcloud, NO heavy data) + try: + pose = obj.to_pose() + result = { + "track_id": obj.track_id, + "name": obj.name, + "detections": obj.detections, + "confidence": obj.confidence, + "pos_x": pose.position.x, + "pos_y": pose.position.y, + "pos_z": pose.position.z, + "frame_id": pose.frame_id, + "last_seen": time.time() - obj.ts, + } + matching.append(result) + logger.error(f"Added to results: {result}") + except Exception as e: + logger.error(f"Failed to get pose for {obj.track_id}: {e}") + continue + + logger.error(f"Returning {len(matching)} matches") return matching @rpc diff --git a/dimos/perception/spatial_perception.py b/dimos/perception/spatial_perception.py index 0e6c69ea75..cb35a22343 100644 --- a/dimos/perception/spatial_perception.py +++ b/dimos/perception/spatial_perception.py @@ -193,8 +193,6 @@ def start(self) -> None: # Subscribe to LCM streams def set_video(image_msg: Image) -> None: - logger.info("Received camera frame") - # Convert Image message to numpy array if hasattr(image_msg, "data"): frame = image_msg.data frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) @@ -222,7 +220,7 @@ def stop(self) -> None: def _process_frame(self) -> None: """Process the latest frame with pose data if available.""" tf = self.tf.get("world", "base_link") - if self._latest_video_frame is None or tf is None: + if self._latest_video_frame is None: logger.warning(" No video frame available") return @@ -230,8 +228,6 @@ def _process_frame(self) -> None: logger.warning("No TF transform (map -> base_link) available") return - logger.info(f"Processing frame with TF at ({tf.translation.x:.2f}, {tf.translation.y:.2f})") - # Create Pose object with position and orientation current_pose = tf.to_pose() @@ -297,11 +293,11 @@ def _process_frame(self) -> None: self.last_record_time = current_time self.stored_frame_count += 1 - logger.info( - f"Stored frame 
at position ({current_pose.position.x:.2f}, {current_pose.position.y:.2f}, {current_pose.position.z:.2f}), " - f"rotation ({euler.x:.2f}, {euler.y:.2f}, {euler.z:.2f}) " - f"stored {self.stored_frame_count}/{self.frame_count} frames" - ) + #logger.info( + # f"Stored frame at position ({current_pose.position.x:.2f}, {current_pose.position.y:.2f}, {current_pose.position.z:.2f}), " + # f"rotation ({euler.x:.2f}, {euler.y:.2f}, {euler.z:.2f}) " + # f"stored {self.stored_frame_count}/{self.frame_count} frames" + #) # Periodically save visual memory to disk if self._visual_memory is not None and self.visual_memory_path is not None: @@ -513,7 +509,7 @@ def add_named_location( Returns: True if successfully added, False otherwise """ - tf = self.tf.get("map", "base_link") + tf = self.tf.get("world", "base_link") if not tf: logger.error("No position available for robot location") return False From a3b4046eabdaaab76ce067fb659c0ce787cbff2b Mon Sep 17 00:00:00 2001 From: sinha7y <61509723+sinha7y@users.noreply.github.com> Date: Thu, 8 Jan 2026 05:41:21 +0000 Subject: [PATCH 06/13] CI code cleanup Former-commit-id: aad0f0cb7816db2f391b90e40ee7acb7cbdb0bb8 --- dimos/agents/skills/navigation.py | 79 +++++++++++++------------- dimos/perception/detection/moduleDB.py | 25 ++++---- dimos/perception/spatial_perception.py | 5 +- 3 files changed, 55 insertions(+), 54 deletions(-) diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py index 95d01dbb7b..5078f5ae22 100644 --- a/dimos/agents/skills/navigation.py +++ b/dimos/agents/skills/navigation.py @@ -31,6 +31,7 @@ logger = setup_logger() + class NavigationSkillContainer(SkillModule): _latest_image: Image | None = None _latest_odom: PoseStamped | None = None @@ -116,91 +117,91 @@ def tag_location(self, location_name: str) -> str: @skill() def navigate_to_detected_object(self, object_name: str) -> str: """Navigate to an object or person that was detected by the vision system. - + This uses ObjectDB to find objects that were previously seen and tracked. Only use this for specific objects like "cup", "red box", "person", "chair". The object must have been previously detected by the camera system. - + Args: object_name: Name/class of the object to navigate to - + Returns: Success or failure message """ if not self._skill_started: raise ValueError(f"{self} has not been started.") - + # Get RPC handle try: lookup_rpc = self.get_rpc_calls("ObjectDBModule.lookup") except Exception: logger.error("ObjectDBModule not connected") return "Error: ObjectDB module not connected. Make sure detection blueprint is loaded." - + # Query ObjectDB try: objects = lookup_rpc(object_name) - except Exception as e: + except Exception: return f"Error querying ObjectDB for '{object_name}'" - + # Check if we found anything if not objects or len(objects) == 0: return f"No detected object matching '{object_name}'. The object may not have been seen yet." - + if len(objects) > 1: logger.info( f"Found {len(objects)} instances of '{object_name}', navigating to first one" ) - + # Extract pose from dict (not Object3D anymore!) 
try: obj = objects[0] # First match (now a dict) - + # Create pose from dict fields goal_pose = PoseStamped( position=make_vector3(obj["pos_x"], obj["pos_y"], obj["pos_z"]), orientation=Quaternion(), - frame_id=obj["frame_id"] + frame_id=obj["frame_id"], ) - + logger.info( f"Navigating to detected '{obj['name']}' at ({obj['pos_x']:.2f}, {obj['pos_y']:.2f}, {obj['pos_z']:.2f})" ) logger.info( f"Object info: {obj['detections']} detections, confidence: {obj['confidence']:.2f}, last seen: {obj['last_seen']:.1f}s ago" ) - - except (KeyError, TypeError) as e: + + except (KeyError, TypeError): return f"Error: Could not determine location of '{object_name}'" - except Exception as e: + except Exception: return f"Error: Could not determine location of '{object_name}'" - + # Navigate to the object result = self._navigate_to(goal_pose) - + if result: return f"Successfully navigated to '{object_name}'" else: return f"Failed to reach '{object_name}'. Navigation was cancelled or failed." - - #@skill() + + # @skill() def navigate_with_text(self, query: str) -> str: """Navigate to a location by querying the existing semantic map...""" if not self._skill_started: raise ValueError(f"{self} has not been started.") - + success_msg = self._navigate_by_tagged_location(query) if success_msg: return success_msg - + success_msg = self._navigate_to_object(query) if success_msg: return success_msg - + success_msg = self._navigate_using_semantic_map(query) if success_msg: return success_msg - + return f"Could not find '{query}' using any method" def _navigate_by_tagged_location(self, query: str) -> str | None: @@ -329,30 +330,30 @@ def _get_bbox_for_current_frame(self, query: str) -> BBox | None: return get_object_bbox_from_image(self._vl_model, self._latest_image, query) def _navigate_using_semantic_map(self, query: str) -> str: - try: - query_by_text_rpc = self.get_rpc_calls("SpatialMemory.query_by_text") - except Exception: - return "Error: The SpatialMemory module is not connected." + try: + query_by_text_rpc = self.get_rpc_calls("SpatialMemory.query_by_text") + except Exception: + return "Error: The SpatialMemory module is not connected." - results = query_by_text_rpc(query) + results = query_by_text_rpc(query) - if not results: - return f"No matching location found in semantic map for '{query}'" + if not results: + return f"No matching location found in semantic map for '{query}'" - best_match = results[0] + best_match = results[0] - goal_pose = self._get_goal_pose_from_result(best_match) + goal_pose = self._get_goal_pose_from_result(best_match) - print("Goal pose for semantic nav:", goal_pose) - if not goal_pose: - return f"Found a result for '{query}' but it didn't have a valid position." + print("Goal pose for semantic nav:", goal_pose) + if not goal_pose: + return f"Found a result for '{query}' but it didn't have a valid position." 
-            result = self._navigate_to(goal_pose)
+        result = self._navigate_to(goal_pose)
 
-            if not result:
-                return f"Failed to navigate for '{query}'"
+        if not result:
+            return f"Failed to navigate for '{query}'"
 
-            return f"Successfuly arrived at '{query}'"
+        return f"Successfully arrived at '{query}'"
 
     @skill()
     def follow_human(self, person: str) -> str:
diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py
index d6184f3cae..1d8093c5da 100644
--- a/dimos/perception/detection/moduleDB.py
+++ b/dimos/perception/detection/moduleDB.py
@@ -270,43 +270,42 @@ def agent_encode(self) -> str:
     #     return ret[0] if ret else None
 
-
     @rpc
     def lookup(self, label: str, min_detections: int = 0.5) -> list[dict]:
         """Look up objects by label/name.
-
+
         Returns lightweight dict instead of full Object3D to avoid RPC timeout.
-
+
         Args:
             label: Name/class to search for
             min_detections: Minimum number of detections required (default: 1)
-
+
         Returns:
             List of dicts with object info (track_id, name, position, etc.)
         """
         import time
-
+
         # DEBUG: Log search details
         logger.error(f"Lookup called for '{label}'")
         logger.error(f"Total objects in DB: {len(self.objects)}")
-
+
         # DEBUG: Log ALL object names in database
        all_names = [obj.name for obj in self.objects.values() if obj.name]
        logger.error(f"All object names in DB: {all_names}")
-
+
        matching = []
-
+
        for obj in self.objects.values():
            # DEBUG: Log each comparison
            if obj.name:
-                logger.debug(f"  Checking: '{obj.name}' vs '{label}' (detections: {obj.detections})")
-
+                logger.debug(
+                    f"  Checking: '{obj.name}' vs '{label}' (detections: {obj.detections})"
+                )
+
            # Check name match
            if obj.name and label.lower() in obj.name.lower():
-
                # Check detection threshold
                if obj.detections >= min_detections:
-
                    # Create lightweight dict (NO pointcloud, NO heavy data)
                    try:
                        pose = obj.to_pose()
@@ -326,7 +325,7 @@ def lookup(self, label: str, min_detections: int = 0.5) -> list[dict]:
                    except Exception as e:
                        logger.error(f"Failed to get pose for {obj.track_id}: {e}")
                        continue
-
+
        logger.error(f"Returning {len(matching)} matches")
        return matching
diff --git a/dimos/perception/spatial_perception.py b/dimos/perception/spatial_perception.py
index cb35a22343..f8791387e8 100644
--- a/dimos/perception/spatial_perception.py
+++ b/dimos/perception/spatial_perception.py
@@ -189,6 +189,7 @@ def start(self) -> None:
         super().start()
         _ = self.tf
         import time
+
         time.sleep(1.0)
 
         # Subscribe to LCM streams
@@ -293,11 +294,11 @@ def _process_frame(self) -> None:
             self.last_record_time = current_time
             self.stored_frame_count += 1
 
-            #logger.info(
+            # logger.info(
             #    f"Stored frame at position ({current_pose.position.x:.2f}, {current_pose.position.y:.2f}, {current_pose.position.z:.2f}), "
             #    f"rotation ({euler.x:.2f}, {euler.y:.2f}, {euler.z:.2f}) "
             #    f"stored {self.stored_frame_count}/{self.frame_count} frames"
-            #)
+            # )
 
             # Periodically save visual memory to disk
             if self._visual_memory is not None and self.visual_memory_path is not None:

From cd47a33356fadf82aec994e62024b4a56f14225c Mon Sep 17 00:00:00 2001
From: sinha7y <61509723+sinha7y@users.noreply.github.com>
Date: Wed, 7 Jan 2026 22:16:13 -0800
Subject: [PATCH 07/13] updated module DB for rich labels

Former-commit-id: 6e192a4726f54570045a5dffa101b57eed1490f6
---
 dimos/perception/detection/moduleDB.py | 90 ++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 5 deletions(-)

diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py
index 1d8093c5da..2c1a2bbdbe 100644
---
a/dimos/perception/detection/moduleDB.py +++ b/dimos/perception/detection/moduleDB.py @@ -24,12 +24,12 @@ from reactivex.observable import Observable from dimos.core import In, Out, rpc +from dimos.models.vl.qwen import QwenVlModel # ← Correct path from dimos.msgs.geometry_msgs import PoseStamped, Quaternion, Transform, Vector3 from dimos.msgs.sensor_msgs import Image, PointCloud2 from dimos.msgs.vision_msgs import Detection2DArray from dimos.perception.detection.module3D import Detection3DModule from dimos.perception.detection.type import ImageDetections3DPC, TableStr -from dimos.perception.detection.type.detection3d import Detection3DPC from dimos.utils.logging_config import setup_logger logger = setup_logger() @@ -41,6 +41,8 @@ class Object3D(Detection3DPC): center: Vector3 | None = None # type: ignore[assignment] track_id: str | None = None # type: ignore[assignment] detections: int = 0 + yolo_label: str | None = None # Original YOLO detection class + vlm_label: str | None = None # VLM-enriched description def to_repr_dict(self) -> dict[str, Any]: if self.center is None: @@ -94,6 +96,9 @@ def __add__(self, detection: Detection3DPC) -> "Object3D": else: new_object.best_detection = self.best_detection + new_object.yolo_label = self.yolo_label + new_object.vlm_label = self.vlm_label + return new_object def get_image(self) -> Image | None: @@ -145,6 +150,9 @@ class ObjectDBModule(Detection3DModule, TableStr): goto: Callable[[PoseStamped], Any] | None = None + _vlm_model: QwenVlModel | None = None + enable_vlm_enrichment: bool = True + color_image: In[Image] pointcloud: In[PointCloud2] @@ -189,9 +197,53 @@ def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] self.objects = {} self.remembered_locations = {} + def vlm_model(self): + """Lazy load VLM model.""" + if self._vlm_model is None and self.enable_vlm_enrichment: + logger.info("Initializing VLM model for label enrichment") + self._vlm_model = QwenVlModel() + return self._vlm_model + + def _enrich_with_vlm(self, obj: Object3D) -> str: + """Generate rich description using VLM. + + Args: + obj: Object3D with detection + + Returns: + Rich description string + """ + if not self.enable_vlm_enrichment or self.vlm_model is None: + return obj.yolo_label # Fall back to YOLO label + + try: + # Get image from best detection + image = obj.get_image() + if image is None: + logger.warning(f"No image for {obj.track_id}, using YOLO label") + return obj.yolo_label + + # Generate rich description with VLM + prompt = f"Describe this {obj.yolo_label} in detail. Include color, appearance, and distinguishing features. Keep it concise (under 10 words)." 
+ + description = self.vlm_model.query(image, prompt) + + # Clean up the description + rich_label = description.strip() + + logger.info(f"VLM enrichment: '{obj.yolo_label}' → '{rich_label}'") + return rich_label + + except Exception as e: + logger.error(f"VLM enrichment failed for {obj.track_id}: {e}") + return obj.yolo_label # Fall back to YOLO label + def closest_object(self, detection: Detection3DPC) -> Object3D | None: - # Filter objects to only those with matching names - matching_objects = [obj for obj in self.objects.values() if obj.name == detection.name] + matching_objects = [ + obj + for obj in self.objects.values() + if obj.yolo_label == detection.name # ← Use yolo_label for tracking + ] if not matching_objects: return None @@ -214,17 +266,45 @@ def add_detection(self, detection: Detection3DPC): # type: ignore[no-untyped-de else: return self.create_new_object(detection) - def add_to_object(self, closest: Object3D, detection: Detection3DPC): # type: ignore[no-untyped-def] + def add_to_object(self, closest: Object3D, detection: Detection3DPC): + """Add detection to existing object.""" new_object = closest + detection + + # Re-enrich every 5 detections + if self.enable_vlm_enrichment and new_object.detections % 5 == 0: + logger.info( + f"Re-enriching {new_object.track_id} after {new_object.detections} detections" + ) + rich_label = self._enrich_with_vlm(new_object) + new_object.vlm_label = rich_label + new_object.name = rich_label + else: + new_object.name = new_object.vlm_label or new_object.yolo_label + if closest.track_id is not None: self.objects[closest.track_id] = new_object + return new_object - def create_new_object(self, detection: Detection3DPC): # type: ignore[no-untyped-def] + def create_new_object(self, detection: Detection3DPC): + """Create new object and enrich with VLM.""" new_object = Object3D(f"obj_{self.cnt}", detection) + + # Enrich label with VLM + if self.enable_vlm_enrichment and new_object.track_id is not None: + rich_label = self._enrich_with_vlm(new_object) + new_object.vlm_label = rich_label + new_object.name = rich_label + else: + new_object.name = new_object.yolo_label + if new_object.track_id is not None: self.objects[new_object.track_id] = new_object + self.cnt += 1 + logger.info( + f"Created object {new_object.track_id}: YOLO='{new_object.yolo_label}' VLM='{new_object.vlm_label}'" + ) return new_object def agent_encode(self) -> str: From d129ede7c41682fffaf02796809357509054ca18 Mon Sep 17 00:00:00 2001 From: Dimensional Date: Thu, 8 Jan 2026 12:41:36 -0800 Subject: [PATCH 08/13] Added detection answering ability for agent Former-commit-id: d000bc853ea844e79e7d110017806c68d4bf300b --- dimos/agents/skills/navigation.py | 50 +++++++++++++++++++++++++- dimos/perception/detection/moduleDB.py | 40 +++++++++++++++++++-- 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py index 5078f5ae22..61555da4cc 100644 --- a/dimos/agents/skills/navigation.py +++ b/dimos/agents/skills/navigation.py @@ -53,6 +53,7 @@ class NavigationSkillContainer(SkillModule): "WavefrontFrontierExplorer.explore", "WavefrontFrontierExplorer.is_exploration_active", "ObjectDBModule.lookup", + "ObjectDBModule.get_all_detected_objects", ] color_image: In[Image] @@ -184,7 +185,54 @@ def navigate_to_detected_object(self, object_name: str) -> str: else: return f"Failed to reach '{object_name}'. Navigation was cancelled or failed." 
- # @skill() + @skill() + def list_detected_objects(self) -> str: + """List all objects and people detected by the vision system. + + Use this skill when the user asks questions about what the robot can see + or what objects are in the environment, such as: + - "What do you see?" + - "What have you detected?" + - "What objects are around?" + - "List everything you can see" + - "What's in the room?" + - "Are there any people nearby?" + + This provides situational awareness of the robot's surroundings based on + what the camera system has detected and tracked. + + Returns: + A description of all detected objects with their names and detection counts + """ + if not self._skill_started: + raise ValueError(f"{self} has not been started.") + + # Get objects + try: + get_all_rpc = self.get_rpc_calls("ObjectDBModule.get_all_detected_objects") + except Exception as e: + logger.error(f"ObjectDBModule not connected: {e}") + return "Error: ObjectDB module not connected. Detection system may not be running." + + try: + objects = get_all_rpc() + except Exception as e: + logger.error(f"Failed to query ObjectDB: {e}") + return "Error: Cannot access detection system" + + if not objects: + return "I haven't detected any objects yet." + + # Build list of detected objects + lines = [f"I've detected {len(objects)} object(s):"] + for obj in objects[:10]: # Limit to 10 most recent + lines.append(f"- {obj['name']} ({obj['detections']} detections)") + + if len(objects) > 10: + lines.append(f"... and {len(objects) - 10} more objects") + + return "\n".join(lines) + def navigate_with_text(self, query: str) -> str: """Navigate to a location by querying the existing semantic map...""" if not self._skill_started: diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py index 2c1a2bbdbe..47fc913abe 100644 --- a/dimos/perception/detection/moduleDB.py +++ b/dimos/perception/detection/moduleDB.py @@ -24,12 +24,13 @@ from reactivex.observable import Observable from dimos.core import In, Out, rpc -from dimos.models.vl.qwen import QwenVlModel # ← Correct path +from dimos.models.vl.qwen import QwenVlModel from dimos.msgs.geometry_msgs import PoseStamped, Quaternion, Transform, Vector3 from dimos.msgs.sensor_msgs import Image, PointCloud2 from dimos.msgs.vision_msgs import Detection2DArray from dimos.perception.detection.module3D import Detection3DModule from dimos.perception.detection.type import ImageDetections3DPC, TableStr +from dimos.perception.detection.type.detection3d import Detection3DPC from dimos.utils.logging_config import setup_logger logger = setup_logger() @@ -197,6 +198,7 @@ def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] self.objects = {} self.remembered_locations = {} + @property def vlm_model(self): """Lazy load VLM model.""" if self._vlm_model is None and self.enable_vlm_enrichment: @@ -271,7 +273,7 @@ def add_to_object(self, closest: Object3D, detection: Detection3DPC): new_object = closest + detection # Re-enrich every 5 detections - if self.enable_vlm_enrichment and new_object.detections % 5 == 0: + if self.enable_vlm_enrichment and new_object.detections % 10 == 0: logger.info( f"Re-enriching {new_object.track_id} after {new_object.detections} detections" ) @@ -319,6 +321,40 @@ def agent_encode(self) -> str: return "No objects detected yet." 
return "\n".join(ret) + @rpc + def get_all_detected_objects(self) -> list[dict]: + """Get all detected objects with their details.""" + import time + + results = [] + current_time = time.time() + + for obj in self.objects.values(): + if not obj.name: + continue + + try: + pose = obj.to_pose() + except Exception as e: + logger.warning(f"Failed to get pose for {obj.track_id}: {e}") + continue + + results.append({ + "track_id": obj.track_id, + "name": obj.name, + "yolo_label": obj.yolo_label, + "vlm_label": obj.vlm_label, + "detections": obj.detections, + "confidence": obj.confidence, + "pos_x": pose.position.x, + "pos_y": pose.position.y, + "pos_z": pose.position.z, + "last_seen": current_time - obj.ts, + }) + + results.sort(key=lambda x: x["last_seen"]) + return results + # @rpc # def vlm_query(self, description: str) -> Object3D | None: # imageDetections2D = super().ask_vlm(description) From 0dcfe6ce35790a287b84ea1c1d5cb4a4fb68d841 Mon Sep 17 00:00:00 2001 From: sinha7y <61509723+sinha7y@users.noreply.github.com> Date: Thu, 8 Jan 2026 20:46:11 +0000 Subject: [PATCH 09/13] CI code cleanup Former-commit-id: 479754b0715b6fe8b2368da2be140e7502187f9b --- dimos/agents/skills/navigation.py | 18 ++++++------ dimos/perception/detection/moduleDB.py | 38 ++++++++++++++------------ 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py index 61555da4cc..5befc50951 100644 --- a/dimos/agents/skills/navigation.py +++ b/dimos/agents/skills/navigation.py @@ -188,7 +188,7 @@ def navigate_to_detected_object(self, object_name: str) -> str: @skill() def list_detected_objects(self) -> str: """List all objects and people detected by the vision system. - + Use this skill when the user asks questions about what the robot can see or what objects are in the environment, such as: - "What do you see?" @@ -197,40 +197,40 @@ def list_detected_objects(self) -> str: - "List everything you can see" - "What's in the room?" - "Are there any people nearby?" - + This provides situational awareness of the robot's surroundings based on what the camera system has detected and tracked. - + Returns: A description of all detected objects with their names and detection counts """ if not self._skill_started: raise ValueError(f"{self} has not been started.") - + # Get objects try: get_all_rpc = self.get_rpc_calls("ObjectDBModule.get_all_detected_objects") except Exception as e: logger.error(f"ObjectDBModule not connected: {e}") return "Error: ObjectDB module not connected. Detection system may not be running." - + try: objects = get_all_rpc() except Exception as e: logger.error(f"Failed to query ObjectDB: {e}") return "Error: Cannot access detection system" - + if not objects: return "I haven't detected any objects yet." - + # Build list of detected objects lines = [f"I've detected {len(objects)} object(s):"] for obj in objects[:10]: # Limit to 10 most recent lines.append(f"- {obj['name']} ({obj['detections']} detections)") - + if len(objects) > 10: lines.append(f"... 
and {len(objects) - 10} more objects") - + return "\n".join(lines) def navigate_with_text(self, query: str) -> str: diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py index 47fc913abe..b3d63d62d5 100644 --- a/dimos/perception/detection/moduleDB.py +++ b/dimos/perception/detection/moduleDB.py @@ -24,7 +24,7 @@ from reactivex.observable import Observable from dimos.core import In, Out, rpc -from dimos.models.vl.qwen import QwenVlModel +from dimos.models.vl.qwen import QwenVlModel from dimos.msgs.geometry_msgs import PoseStamped, Quaternion, Transform, Vector3 from dimos.msgs.sensor_msgs import Image, PointCloud2 from dimos.msgs.vision_msgs import Detection2DArray @@ -325,33 +325,35 @@ def agent_encode(self) -> str: def get_all_detected_objects(self) -> list[dict]: """Get all detected objects with their details.""" import time - + results = [] current_time = time.time() - + for obj in self.objects.values(): if not obj.name: continue - + try: pose = obj.to_pose() except Exception as e: logger.warning(f"Failed to get pose for {obj.track_id}: {e}") continue - - results.append({ - "track_id": obj.track_id, - "name": obj.name, - "yolo_label": obj.yolo_label, - "vlm_label": obj.vlm_label, - "detections": obj.detections, - "confidence": obj.confidence, - "pos_x": pose.position.x, - "pos_y": pose.position.y, - "pos_z": pose.position.z, - "last_seen": current_time - obj.ts, - }) - + + results.append( + { + "track_id": obj.track_id, + "name": obj.name, + "yolo_label": obj.yolo_label, + "vlm_label": obj.vlm_label, + "detections": obj.detections, + "confidence": obj.confidence, + "pos_x": pose.position.x, + "pos_y": pose.position.y, + "pos_z": pose.position.z, + "last_seen": current_time - obj.ts, + } + ) + results.sort(key=lambda x: x["last_seen"]) return results From d998f67529f72ec620c9a302bad049f2052cb9fd Mon Sep 17 00:00:00 2001 From: sinha7y <61509723+sinha7y@users.noreply.github.com> Date: Thu, 8 Jan 2026 13:13:26 -0800 Subject: [PATCH 10/13] clean up code Former-commit-id: eca3684186fdeb8c227f4b805c8dbab4ff1f4aaa --- dimos/agents/skills/navigation.py | 4 +- dimos/perception/detection/moduleDB.py | 84 ++++++++++---------------- 2 files changed, 34 insertions(+), 54 deletions(-) diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py index 5befc50951..73e09b3c56 100644 --- a/dimos/agents/skills/navigation.py +++ b/dimos/agents/skills/navigation.py @@ -145,7 +145,6 @@ def navigate_to_detected_object(self, object_name: str) -> str: except Exception: return f"Error querying ObjectDB for '{object_name}'" - # Check if we found anything if not objects or len(objects) == 0: return f"No detected object matching '{object_name}'. The object may not have been seen yet." @@ -156,7 +155,7 @@ def navigate_to_detected_object(self, object_name: str) -> str: # Extract pose from dict (not Object3D anymore!) 
try: - obj = objects[0] # First match (now a dict) + obj = objects[0] # Create pose from dict fields goal_pose = PoseStamped( @@ -177,7 +176,6 @@ def navigate_to_detected_object(self, object_name: str) -> str: except Exception: return f"Error: Could not determine location of '{object_name}'" - # Navigate to the object result = self._navigate_to(goal_pose) if result: diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py index b3d63d62d5..1d8cfba437 100644 --- a/dimos/perception/detection/moduleDB.py +++ b/dimos/perception/detection/moduleDB.py @@ -42,8 +42,8 @@ class Object3D(Detection3DPC): center: Vector3 | None = None # type: ignore[assignment] track_id: str | None = None # type: ignore[assignment] detections: int = 0 - yolo_label: str | None = None # Original YOLO detection class - vlm_label: str | None = None # VLM-enriched description + yolo_label: str | None = None + vlm_label: str | None = None def to_repr_dict(self) -> dict[str, Any]: if self.center is None: @@ -108,7 +108,7 @@ def get_image(self) -> Image | None: def scene_entity_label(self) -> str: return f"{self.name} ({self.detections})" - def agent_encode(self): # type: ignore[no-untyped-def] + def agent_encode(self): return { "id": self.track_id, "name": self.name, @@ -128,8 +128,6 @@ def to_pose(self) -> PoseStamped: child_frame_id="camera_optical", ).inverse() - print("transform is", self.best_detection.transform) - global_transform = optical_inverse + self.best_detection.transform print("inverse optical is", global_transform) @@ -192,7 +190,7 @@ def scene_thread() -> None: self.detection_stream_3d.subscribe(update_objects) - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] + def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) self.goto = None self.objects = {} @@ -219,18 +217,15 @@ def _enrich_with_vlm(self, obj: Object3D) -> str: return obj.yolo_label # Fall back to YOLO label try: - # Get image from best detection image = obj.get_image() if image is None: logger.warning(f"No image for {obj.track_id}, using YOLO label") return obj.yolo_label - # Generate rich description with VLM prompt = f"Describe this {obj.yolo_label} in detail. Include color, appearance, and distinguishing features. Keep it concise (under 10 words)." description = self.vlm_model.query(image, prompt) - # Clean up the description rich_label = description.strip() logger.info(f"VLM enrichment: '{obj.yolo_label}' → '{rich_label}'") @@ -272,7 +267,7 @@ def add_to_object(self, closest: Object3D, detection: Detection3DPC): """Add detection to existing object.""" new_object = closest + detection - # Re-enrich every 5 detections + # Re-enrich every 10 detections if self.enable_vlm_enrichment and new_object.detections % 10 == 0: logger.info( f"Re-enriching {new_object.track_id} after {new_object.detections} detections" @@ -312,11 +307,9 @@ def create_new_object(self, detection: Detection3DPC): def agent_encode(self) -> str: ret = [] for obj in copy(self.objects).values(): - # we need at least 3 detectieons to consider it a valid object - # for this to be serious we need a ratio of detections within the window of observations - if len(obj.detections) < 4: # type: ignore[arg-type] + if len(obj.detections) < 4: continue - ret.append(str(obj.agent_encode())) # type: ignore[no-untyped-call] + ret.append(str(obj.agent_encode())) if not ret: return "No objects detected yet." 
return "\n".join(ret) @@ -389,7 +382,7 @@ def get_all_detected_objects(self) -> list[dict]: # return ret[0] if ret else None @rpc - def lookup(self, label: str, min_detections: int = 0.5) -> list[dict]: + def lookup(self, label: str, min_detections: int = 1) -> list[dict]: """Look up objects by label/name. Returns lightweight dict instead of full Object3D to avoid RPC timeout. @@ -403,48 +396,37 @@ def lookup(self, label: str, min_detections: int = 0.5) -> list[dict]: """ import time - # DEBUG: Log search details - logger.error(f"Lookup called for '{label}'") - logger.error(f"Total objects in DB: {len(self.objects)}") - - # DEBUG: Log ALL object names in database - all_names = [obj.name for obj in self.objects.values() if obj.name] - logger.error(f"All object names in DB: {all_names}") + logger.info(f"Looking up '{label}' (min_detections: {min_detections})") + logger.debug(f"Total objects in DB: {len(self.objects)}") matching = [] for obj in self.objects.values(): - # DEBUG: Log each comparison - if obj.name: - logger.debug( - f" Checking: '{obj.name}' vs '{label}' (detections: {obj.detections})" + if not obj.name or label.lower() not in obj.name.lower(): + continue + + if obj.detections < min_detections: + continue + + try: + pose = obj.to_pose() + matching.append( + { + "track_id": obj.track_id, + "name": obj.name, + "detections": obj.detections, + "confidence": obj.confidence, + "pos_x": pose.position.x, + "pos_y": pose.position.y, + "pos_z": pose.position.z, + "frame_id": pose.frame_id, + "last_seen": time.time() - obj.ts, + } ) + except Exception as e: + logger.warning(f"Failed to get pose for {obj.track_id}: {e}") - # Check name match - if obj.name and label.lower() in obj.name.lower(): - # Check detection threshold - if obj.detections >= min_detections: - # Create lightweight dict (NO pointcloud, NO heavy data) - try: - pose = obj.to_pose() - result = { - "track_id": obj.track_id, - "name": obj.name, - "detections": obj.detections, - "confidence": obj.confidence, - "pos_x": pose.position.x, - "pos_y": pose.position.y, - "pos_z": pose.position.z, - "frame_id": pose.frame_id, - "last_seen": time.time() - obj.ts, - } - matching.append(result) - logger.error(f"Added to results: {result}") - except Exception as e: - logger.error(f"Failed to get pose for {obj.track_id}: {e}") - continue - - logger.error(f"Returning {len(matching)} matches") + logger.info(f"Found {len(matching)} objects matching '{label}'") return matching @rpc From 2dec4f739413115388c4bf613a57c26bae2acba4 Mon Sep 17 00:00:00 2001 From: Yash Sinha Date: Thu, 8 Jan 2026 17:07:58 -0800 Subject: [PATCH 11/13] Bug fix for objectDB labels --- dimos/perception/detection/moduleDB.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py index 1d8cfba437..8a323eb773 100644 --- a/dimos/perception/detection/moduleDB.py +++ b/dimos/perception/detection/moduleDB.py @@ -67,6 +67,7 @@ def __init__( # type: ignore[no-untyped-def] self.track_id = track_id self.class_id = detection.class_id self.name = detection.name + self.yolo_label = detection.name self.confidence = detection.confidence self.pointcloud = detection.pointcloud self.bbox = detection.bbox From 779ec10320905d3118b921d03e122e7bedb7c0d5 Mon Sep 17 00:00:00 2001 From: Yash Sinha Date: Fri, 9 Jan 2026 16:30:42 -0800 Subject: [PATCH 12/13] - Move list_detected_objects to ObjectDBModule with SkillModule inheritance - Add DetectedObjectInfo dataclass for passing object data to navigation (can move to 
dimos.msgs if needed)
- Improve lookup with hybrid substring/token matching
- Clean up skill logging
---
 dimos/agents/skills/navigation.py      | 138 +++-------
 dimos/perception/detection/moduleDB.py | 185 ++++++-------
 2 files changed, 120 insertions(+), 203 deletions(-)

diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py
index 73e09b3c56..780fb96656 100644
--- a/dimos/agents/skills/navigation.py
+++ b/dimos/agents/skills/navigation.py
@@ -53,7 +53,6 @@ class NavigationSkillContainer(SkillModule):
         "WavefrontFrontierExplorer.explore",
         "WavefrontFrontierExplorer.is_exploration_active",
         "ObjectDBModule.lookup",
-        "ObjectDBModule.get_all_detected_objects",
     ]
 
     color_image: In[Image]
@@ -117,120 +116,45 @@ def tag_location(self, location_name: str) -> str:
 
     @skill()
     def navigate_to_detected_object(self, object_name: str) -> str:
-        """Navigate to an object or person that was detected by the vision system.
-
-        This uses ObjectDB to find objects that were previously seen and tracked.
-        Only use this for specific objects like "cup", "red box", "person", "chair".
-        The object must have been previously detected by the camera system.
-
+        """Navigate to an object detected by the vision system.
+
+        Use this skill to navigate to specific objects that have been detected
+        by the robot's cameras, such as:
+        - "Navigate to person in white"
+        - "Go to red coffee mug"
+        - "Move to the wooden chair"
+
         Args:
-            object_name: Name/class of the object to navigate to
-
+            object_name: Description or name of the object to navigate to
+
         Returns:
-            Success or failure message
+            Status message indicating success or failure
         """
         if not self._skill_started:
             raise ValueError(f"{self} has not been started.")
-
-        # Get RPC handle
-        try:
-            lookup_rpc = self.get_rpc_calls("ObjectDBModule.lookup")
-        except Exception:
-            logger.error("ObjectDBModule not connected")
-            return "Error: ObjectDB module not connected. Make sure detection blueprint is loaded."
-
-        # Query ObjectDB
-        try:
-            objects = lookup_rpc(object_name)
-        except Exception:
-            return f"Error querying ObjectDB for '{object_name}'"
-
-        if not objects or len(objects) == 0:
-            return f"No detected object matching '{object_name}'. The object may not have been seen yet."
-
-        if len(objects) > 1:
-            logger.info(
-                f"Found {len(objects)} instances of '{object_name}', navigating to first one"
-            )
-
-        # Extract pose from dict (not Object3D anymore!)
- try: - obj = objects[0] - - # Create pose from dict fields - goal_pose = PoseStamped( - position=make_vector3(obj["pos_x"], obj["pos_y"], obj["pos_z"]), - orientation=Quaternion(), - frame_id=obj["frame_id"], - ) - - logger.info( - f"Navigating to detected '{obj['name']}' at ({obj['pos_x']:.2f}, {obj['pos_y']:.2f}, {obj['pos_z']:.2f})" - ) - logger.info( - f"Object info: {obj['detections']} detections, confidence: {obj['confidence']:.2f}, last seen: {obj['last_seen']:.1f}s ago" - ) - - except (KeyError, TypeError): - return f"Error: Could not determine location of '{object_name}'" - except Exception: - return f"Error: Could not determine location of '{object_name}'" - + + lookup_rpc = self.get_rpc_calls("ObjectDBModule.lookup") + objects = lookup_rpc(object_name) + + if not objects: + return f"No objects found matching '{object_name}'" + + obj = objects[0] + + goal_pose = PoseStamped( + position=obj.pose.position, + orientation=Quaternion(), + frame_id=obj.pose.frame_id + ) + result = self._navigate_to(goal_pose) - - if result: - return f"Successfully navigated to '{object_name}'" + + if result == "SUCCESS": + return f"Successfully navigated to '{obj.name}'" else: - return f"Failed to reach '{object_name}'. Navigation was cancelled or failed." - - @skill() - def list_detected_objects(self) -> str: - """List all objects and people detected by the vision system. - - Use this skill when the user asks questions about what the robot can see - or what objects are in the environment, such as: - - "What do you see?" - - "What have you detected?" - - "What objects are around?" - - "List everything you can see" - - "What's in the room?" - - "Are there any people nearby?" - - This provides situational awareness of the robot's surroundings based on - what the camera system has detected and tracked. - - Returns: - A description of all detected objects with their names and detection counts - """ - if not self._skill_started: - raise ValueError(f"{self} has not been started.") - - # Get objects - try: - get_all_rpc = self.get_rpc_calls("ObjectDBModule.get_all_detected_objects") - except Exception as e: - logger.error(f"ObjectDBModule not connected: {e}") - return "Error: ObjectDB module not connected. Detection system may not be running." - - try: - objects = get_all_rpc() - except Exception as e: - logger.error(f"Failed to query ObjectDB: {e}") - return "Error: Cannot access detection system" - - if not objects: - return "I haven't detected any objects yet." - - # Build list of detected objects - lines = [f"I've detected {len(objects)} object(s):"] - for obj in objects[:10]: # Limit to 10 most recent - lines.append(f"- {obj['name']} ({obj['detections']} detections)") - - if len(objects) > 10: - lines.append(f"... 
and {len(objects) - 10} more objects") - - return "\n".join(lines) + return f"Failed to reach '{obj.name}'" + #@skill() def navigate_with_text(self, query: str) -> str: """Navigate to a location by querying the existing semantic map...""" if not self._skill_started: diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py index 8a323eb773..9f8ce63dd3 100644 --- a/dimos/perception/detection/moduleDB.py +++ b/dimos/perception/detection/moduleDB.py @@ -16,6 +16,7 @@ import threading import time from typing import Any +from dataclasses import dataclass from dimos_lcm.foxglove_msgs.ImageAnnotations import ( ImageAnnotations, @@ -31,10 +32,22 @@ from dimos.perception.detection.module3D import Detection3DModule from dimos.perception.detection.type import ImageDetections3DPC, TableStr from dimos.perception.detection.type.detection3d import Detection3DPC +from dimos.core.skill_module import SkillModule +from dimos.protocol.skill.skill import skill from dimos.utils.logging_config import setup_logger logger = setup_logger() +@dataclass +class DetectedObjectInfo: + """Lightweight detected object information for RPC.""" + track_id: str + name: str + pose: PoseStamped + detections: int + confidence: float + last_seen: float + # Represents an object in space, as collection of 3d detections over time class Object3D(Detection3DPC): @@ -143,7 +156,7 @@ def to_pose(self) -> PoseStamped: ) -class ObjectDBModule(Detection3DModule, TableStr): +class ObjectDBModule(Detection3DModule, SkillModule, TableStr): cnt: int = 0 objects: dict[str, Object3D] object_stream: Observable[Object3D] | None = None @@ -206,41 +219,34 @@ def vlm_model(self): return self._vlm_model def _enrich_with_vlm(self, obj: Object3D) -> str: - """Generate rich description using VLM. - - Args: - obj: Object3D with detection - - Returns: - Rich description string - """ + """Generate rich description using VLM.""" if not self.enable_vlm_enrichment or self.vlm_model is None: - return obj.yolo_label # Fall back to YOLO label - + return obj.yolo_label or "unknown" # Fallback for None + try: image = obj.get_image() if image is None: - logger.warning(f"No image for {obj.track_id}, using YOLO label") - return obj.yolo_label - - prompt = f"Describe this {obj.yolo_label} in detail. Include color, appearance, and distinguishing features. Keep it concise (under 10 words)." - + return obj.yolo_label or "unknown" + + if obj.yolo_label: + prompt = f"Describe this {obj.yolo_label} in detail. Include color, appearance, and distinguishing features. Keep it concise (under 10 words)." + else: + prompt = "Describe the main object in this image in detail. Include color, appearance, and distinguishing features. Keep it concise (under 10 words)." + description = self.vlm_model.query(image, prompt) - rich_label = description.strip() - + logger.info(f"VLM enrichment: '{obj.yolo_label}' → '{rich_label}'") return rich_label - + except Exception as e: - logger.error(f"VLM enrichment failed for {obj.track_id}: {e}") - return obj.yolo_label # Fall back to YOLO label + return obj.yolo_label or "unknown" def closest_object(self, detection: Detection3DPC) -> Object3D | None: matching_objects = [ obj for obj in self.objects.values() - if obj.yolo_label == detection.name # ← Use yolo_label for tracking + if obj.yolo_label == detection.name ] if not matching_objects: @@ -315,42 +321,6 @@ def agent_encode(self) -> str: return "No objects detected yet." 
return "\n".join(ret) - @rpc - def get_all_detected_objects(self) -> list[dict]: - """Get all detected objects with their details.""" - import time - - results = [] - current_time = time.time() - - for obj in self.objects.values(): - if not obj.name: - continue - - try: - pose = obj.to_pose() - except Exception as e: - logger.warning(f"Failed to get pose for {obj.track_id}: {e}") - continue - - results.append( - { - "track_id": obj.track_id, - "name": obj.name, - "yolo_label": obj.yolo_label, - "vlm_label": obj.vlm_label, - "detections": obj.detections, - "confidence": obj.confidence, - "pos_x": pose.position.x, - "pos_y": pose.position.y, - "pos_z": pose.position.z, - "last_seen": current_time - obj.ts, - } - ) - - results.sort(key=lambda x: x["last_seen"]) - return results - # @rpc # def vlm_query(self, description: str) -> Object3D | None: # imageDetections2D = super().ask_vlm(description) @@ -382,54 +352,77 @@ def get_all_detected_objects(self) -> list[dict]: # return ret[0] if ret else None - @rpc - def lookup(self, label: str, min_detections: int = 1) -> list[dict]: - """Look up objects by label/name. - - Returns lightweight dict instead of full Object3D to avoid RPC timeout. - - Args: - label: Name/class to search for - min_detections: Minimum number of detections required (default: 1) + def _to_detection_info(self, obj: Object3D, current_time: float) -> DetectedObjectInfo | None: + """Convert Object3D to DetectedObjectInfo.""" + try: + return DetectedObjectInfo( + track_id=obj.track_id, + name=obj.name, + pose=obj.to_pose(), + detections=obj.detections, + confidence=obj.confidence, + last_seen=current_time - obj.ts, + ) + except Exception: + return None - Returns: - List of dicts with object info (track_id, name, position, etc.) - """ + @rpc + def lookup(self, label: str, min_detections: int = 1) -> list[DetectedObjectInfo]: + """Look up objects by label/name.""" import time - - logger.info(f"Looking up '{label}' (min_detections: {min_detections})") - logger.debug(f"Total objects in DB: {len(self.objects)}") - + + current_time = time.time() matching = [] - + + label_lower = label.lower() + for obj in self.objects.values(): - if not obj.name or label.lower() not in obj.name.lower(): + if not obj.name: continue - if obj.detections < min_detections: continue + + obj_name_lower = obj.name.lower() - try: - pose = obj.to_pose() - matching.append( - { - "track_id": obj.track_id, - "name": obj.name, - "detections": obj.detections, - "confidence": obj.confidence, - "pos_x": pose.position.x, - "pos_y": pose.position.y, - "pos_z": pose.position.z, - "frame_id": pose.frame_id, - "last_seen": time.time() - obj.ts, - } - ) - except Exception as e: - logger.warning(f"Failed to get pose for {obj.track_id}: {e}") - - logger.info(f"Found {len(matching)} objects matching '{label}'") + is_match = ( + label_lower in obj_name_lower + or + any(token in obj_name_lower for token in label_lower.split() if len(token) > 2) + ) + + if is_match: + info = self._to_detection_info(obj, current_time) + if info: + matching.append(info) + return matching + @skill() + def list_detected_objects(self) -> str: + """List all objects detected by the vision system. + + Use this skill when the user asks questions about what the robot can see + or what objects are in the environment, such as: + - "What do you see?" + - "What have you detected?" + - "What objects are around?" + - "List everything you can see" + - "What's in the room?" + - "Are there any people nearby?" 
+ """ + if not self.objects: + return "I haven't detected any objects yet." + + lines = [f"I've detected {len(self.objects)} object(s):"] + for obj in list(self.objects.values())[:10]: + if obj.name: + lines.append(f"- {obj.name} ({obj.detections} detections)") + + if len(self.objects) > 10: + lines.append(f"... and {len(self.objects) - 10} more objects") + + return "\n".join(lines) + @rpc def stop(self): # type: ignore[no-untyped-def] return super().stop() diff --git a/ed navigation and fixed semantic navigation -967 b/ed navigation and fixed semantic navigation -967 new file mode 100644 index 0000000000..333a0b576c --- /dev/null +++ b/ed navigation and fixed semantic navigation -967 @@ -0,0 +1,258 @@ + + SSUUMMMMAARRYY OOFF LLEESSSS CCOOMMMMAANNDDSS + + Commands marked with * may be preceded by a number, _N. + Notes in parentheses indicate the behavior if _N is given. + A key preceded by a caret indicates the Ctrl key; thus ^K is ctrl-K. + + h H Display this help. + q :q Q :Q ZZ Exit. + --------------------------------------------------------------------------- + + MMOOVVIINNGG + + e ^E j ^N CR * Forward one line (or _N lines). + y ^Y k ^K ^P * Backward one line (or _N lines). + f ^F ^V SPACE * Forward one window (or _N lines). + b ^B ESC-v * Backward one window (or _N lines). + z * Forward one window (and set window to _N). + w * Backward one window (and set window to _N). + ESC-SPACE * Forward one window, but don't stop at end-of-file. + d ^D * Forward one half-window (and set half-window to _N). + u ^U * Backward one half-window (and set half-window to _N). + ESC-) RightArrow * Right one half screen width (or _N positions). + ESC-( LeftArrow * Left one half screen width (or _N positions). + ESC-} ^RightArrow Right to last column displayed. + ESC-{ ^LeftArrow Left to first column. + F Forward forever; like "tail -f". + ESC-F Like F but stop when search pattern is found. + r ^R ^L Repaint screen. + R Repaint screen, discarding buffered input. + --------------------------------------------------- + Default "window" is the screen height. + Default "half-window" is half of the screen height. + --------------------------------------------------------------------------- + + SSEEAARRCCHHIINNGG + + /_p_a_t_t_e_r_n * Search forward for (_N-th) matching line. + ?_p_a_t_t_e_r_n * Search backward for (_N-th) matching line. + n * Repeat previous search (for _N-th occurrence). + N * Repeat previous search in reverse direction. + ESC-n * Repeat previous search, spanning files. + ESC-N * Repeat previous search, reverse dir. & spanning files. + ESC-u Undo (toggle) search highlighting. + ESC-U Clear search highlighting. + &_p_a_t_t_e_r_n * Display only matching lines. + --------------------------------------------------- + A search pattern may begin with one or more of: + ^N or ! Search for NON-matching lines. + ^E or * Search multiple files (pass thru END OF FILE). + ^F or @ Start search at FIRST file (for /) or last file (for ?). + ^K Highlight matches, but don't move (KEEP position). + ^R Don't use REGULAR EXPRESSIONS. + ^W WRAP search if no match found. + --------------------------------------------------------------------------- + + JJUUMMPPIINNGG + + g < ESC-< * Go to first line in file (or line _N). + G > ESC-> * Go to last line in file (or line _N). + p % * Go to beginning of file (or _N percent into file). + t * Go to the (_N-th) next tag. + T * Go to the (_N-th) previous tag. + { ( [ * Find close bracket } ) ]. + } ) ] * Find open bracket { ( [. 
From b74bb4004b79551f185729c8826af445cfc1d35e Mon Sep 17 00:00:00 2001
From: Yash Sinha
Date: Sat, 10 Jan 2026 15:39:03 -0800
Subject: [PATCH 13/13] Greptile bug fixes

---
 dimos/agents/skills/navigation.py      | 4 ++--
 dimos/perception/detection/moduleDB.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dimos/agents/skills/navigation.py b/dimos/agents/skills/navigation.py
index 780fb96656..f6a7fadb67 100644
--- a/dimos/agents/skills/navigation.py
+++ b/dimos/agents/skills/navigation.py
@@ -148,8 +148,8 @@ def navigate_to_detected_object(self, object_name: str) -> str:
         )
 
         result = self._navigate_to(goal_pose)
-
-        if result == "SUCCESS":
+
+        if result:
             return f"Successfully navigated to '{obj.name}'"
         else:
             return f"Failed to reach '{obj.name}'"
diff --git a/dimos/perception/detection/moduleDB.py b/dimos/perception/detection/moduleDB.py
index 9f8ce63dd3..75a9993520 100644
--- a/dimos/perception/detection/moduleDB.py
+++ b/dimos/perception/detection/moduleDB.py
@@ -314,7 +314,7 @@ def create_new_object(self, detection: Detection3DPC):
     def agent_encode(self) -> str:
         ret = []
         for obj in copy(self.objects).values():
-            if len(obj.detections) < 4:
+            if obj.detections < 4:
                 continue
             ret.append(str(obj.agent_encode()))
         if not ret:
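
Appendix A (illustrative, not part of any patch): the hybrid matching rule that
PATCH 12 adds to ObjectDBModule.lookup can be read in isolation as the small
predicate below. This is a sketch extracted from the patch, not dimos API; the
standalone function name `is_match` and the sample labels are invented here.

    # Sketch of the is_match expression from ObjectDBModule.lookup (PATCH 12):
    # a query matches an object name if the full query is a substring of the
    # name, or if any query token longer than 2 characters appears in it.
    def is_match(label: str, obj_name: str) -> bool:
        label_lower = label.lower()
        obj_name_lower = obj_name.lower()
        return (
            label_lower in obj_name_lower
            or any(
                token in obj_name_lower
                for token in label_lower.split()
                if len(token) > 2
            )
        )

    assert is_match("mug", "red coffee mug")                            # whole-query substring
    assert is_match("person in white", "person wearing a white shirt")  # token hit on "person"/"white"
    assert not is_match("tv", "television stand")                       # tokens of <= 2 chars are skipped

The token-length cutoff keeps stopwords like "in" or "to" from matching every
object, at the cost of ignoring genuinely short labels such as "tv".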
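
Appendix B (illustrative, not part of any patch): the end-to-end shape of
navigate_to_detected_object after PATCH 13, with stand-in types. Vec3, Pose,
and the lookup/navigate_to callables are stubs invented for this sketch; only
the control flow and DetectedObjectInfo fields mirror the patches.

    from dataclasses import dataclass, field

    @dataclass
    class Vec3:
        x: float = 0.0
        y: float = 0.0
        z: float = 0.0

    @dataclass
    class Pose:
        position: Vec3 = field(default_factory=Vec3)
        frame_id: str = "map"

    @dataclass
    class DetectedObjectInfo:  # mirrors the dataclass added in PATCH 12
        track_id: str
        name: str
        pose: Pose
        detections: int
        confidence: float
        last_seen: float

    def navigate_to_detected_object(object_name, lookup, navigate_to) -> str:
        objects = lookup(object_name)   # stands in for the ObjectDBModule.lookup RPC
        if not objects:
            return f"No objects found matching '{object_name}'"
        obj = objects[0]                # first match; lookup does not rank results
        result = navigate_to(obj.pose)  # truthiness check, per the PATCH 13 fix
        if result:
            return f"Successfully navigated to '{obj.name}'"
        return f"Failed to reach '{obj.name}'"

    # Stubbed usage: a fake lookup returning one object, a navigator that succeeds.
    cup = DetectedObjectInfo("t1", "red coffee mug", Pose(Vec3(1.2, 0.4, 0.0)), 5, 0.9, 2.1)
    print(navigate_to_detected_object("mug", lambda q: [cup], lambda p: True))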