Skip to content

Commit a9c3562

Browse files
authored
Merge pull request #1180 from Shared-Reality-Lab/fix-llm-loop
Fix llm looping on complex graphics
2 parents c5d44a3 + 85c8220 commit a9c3562

3 files changed

Lines changed: 79 additions & 94 deletions

File tree

preprocessors/object-detection-llm/object-detection-llm.py

Lines changed: 46 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -68,33 +68,30 @@ def normalize_bbox(bbox, width, height):
6868
]
6969

7070

71-
def process_objects(objects, threshold):
71+
def process_objects(qwen_output, width, height, threshold):
7272
"""
73-
Process detected objects by filtering, transforming, and enriching them.
73+
Transform Qwen object detection output to IMAGE schema format.
7474
75-
- Filters objects by confidence threshold
75+
- Transforms from Qwen format (bbox_2d, label) to IMAGE format
76+
- Normalizes bounding boxes to [0,1] range
77+
- Assigns confidence threshold to all objects
7678
- Normalizes labels (replaces underscores with spaces)
77-
- Renumbers IDs sequentially
7879
- Calculates geometric properties (area, centroid)
80+
- Filters objects by confidence threshold
7981
8082
Args:
81-
objects (list): List of detected objects with confidence scores
83+
qwen_output (list): Qwen detection output with bbox_2d and label
84+
width (int): Image width in pixels for normalization
85+
height (int): Image height in pixels for normalization
8286
threshold (float): Minimum confidence score (0-1)
8387
8488
Returns:
8589
list: Processed objects with computed properties
8690
"""
8791
processed = []
88-
for obj in objects:
89-
if obj.get("confidence", 0) >= threshold:
90-
obj['type'] = obj['type'].replace('_', ' ')
91-
processed.append(obj)
92-
93-
# Renumber IDs sequentially after filtering
94-
for idx, obj in enumerate(processed):
95-
obj['ID'] = idx
96-
97-
x1, y1, x2, y2 = obj["dimensions"]
92+
for idx, item in enumerate(qwen_output):
93+
# Normalize bounding box
94+
x1, y1, x2, y2 = normalize_bbox(item["bbox_2d"], width, height)
9895

9996
# Calculate area (width * height)
10097
area = (x2 - x1) * (y2 - y1)
@@ -103,13 +100,20 @@ def process_objects(objects, threshold):
103100
centroid_x = (x1 + x2) / 2
104101
centroid_y = (y1 + y2) / 2
105102

106-
# Create object entry according to schema
107-
obj["area"] = area
108-
obj["centroid"] = [centroid_x, centroid_y]
103+
# Create object entry according to IMAGE schema
104+
obj = {
105+
"ID": idx,
106+
"type": item["label"].replace('_', ' '),
107+
"dimensions": [x1, y1, x2, y2],
108+
"confidence": threshold,
109+
"area": area,
110+
"centroid": [centroid_x, centroid_y]
111+
}
112+
113+
processed.append(obj)
109114

110115
logging.debug(
111-
f"Processed {len(objects)} objects to {len(processed)} "
112-
f"objects with confidence >= {threshold}"
116+
f"Processed {len(qwen_output)} objects from Qwen output"
113117
)
114118
return processed
115119

@@ -155,35 +159,42 @@ def detect_objects():
155159
if error:
156160
return jsonify(error), error["code"]
157161

162+
stop_tokens = [
163+
"<|im_end|>", # Qwen's end token
164+
"<|endoftext|>", # Alternative end token
165+
"\n\n\n", # Triple newline
166+
"```", # Code block end
167+
]
168+
158169
try:
159170
# Get object info
160-
object_json = llm_client.chat_completion(
171+
qwen_output = llm_client.chat_completion(
161172
prompt=OBJECT_DETECTION_PROMPT,
162173
image_base64=base64_image,
163174
json_schema=BBOX_RESPONSE_SCHEMA,
164-
temperature=0.0,
165-
parse_json=True
175+
temperature=0.5,
176+
parse_json=True,
177+
stop=stop_tokens
166178
)
167179

168-
if object_json is None or len(object_json.get("objects", [])) == 0:
180+
logging.debug(f"Qwen output received: {qwen_output}")
181+
182+
if qwen_output is None or len(qwen_output) == 0:
169183
logging.error("Failed to extract objects from the graphic.")
170184
return jsonify({"error": "No objects extracted"}), 204
171185

172-
# Normalize bounding boxes
186+
# Transform Qwen format to IMAGE schema format
173187
width, height = pil_image.size
174-
for obj in object_json["objects"]:
175-
# Normalize bounding boxes
176-
obj["dimensions"] = normalize_bbox(
177-
obj["dimensions"], width, height
178-
)
179-
180-
# Filter objects by confidence threshold, add area and centroid,
181-
# remove underscores from labels, and renumber IDs
182-
object_json["objects"] = process_objects(
183-
object_json["objects"],
188+
processed_objects = process_objects(
189+
qwen_output,
190+
width,
191+
height,
184192
CONF_THRESHOLD
185193
)
186194

195+
# Wrap in "objects" for schema compliance
196+
object_json = {"objects": processed_objects}
197+
187198
logging.pii(f"Normalized output: {object_json}")
188199

189200
# Data schema validation
Lines changed: 18 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,23 @@
11
{
22
"$schema": "http://json-schema.org/draft-07/schema",
3-
"type": "object",
3+
"type": "array",
44
"title": "Object Detection Data",
5-
"description": "Detected object data with bounding boxes.",
6-
"definitions": {
7-
"object": {
8-
"type": "object",
9-
"title": "BoundingBoxItem",
10-
"properties": {
11-
"ID": {
12-
"description": "A number identifying this object in the set.",
13-
"type": "integer"
14-
},
15-
"type": {
16-
"description": "The type of object detected (e.g., 'person', 'car').",
17-
"type": "string"
18-
},
19-
"dimensions": {
20-
"description": "Bounding box coordinates of this object [x1, y1, x2, y2].",
21-
"type": "array",
22-
"items": { "type": "number" },
23-
"minItems": 4,
24-
"maxItems": 4,
25-
"additionalItems": false
26-
},
27-
"confidence": {
28-
"description": "Confidence in the correctness of this object's data (0-1).",
29-
"type": "number",
30-
"minimum": 0,
31-
"maximum": 1
32-
}
5+
"description": "Detected object data with bounding boxes in Qwen format.",
6+
"items": {
7+
"type": "object",
8+
"properties": {
9+
"bbox_2d": {
10+
"description": "Bounding box coordinates [x1, y1, x2, y2].",
11+
"type": "array",
12+
"items": { "type": "number" },
13+
"minItems": 4,
14+
"maxItems": 4
3315
},
34-
"required": ["ID", "type", "dimensions", "confidence"]
35-
}
36-
},
37-
"properties": {
38-
"objects": {
39-
"description": "The set of detected objects in the image.",
40-
"type": "array",
41-
"items": { "$ref": "#/definitions/object" }
42-
}
43-
},
44-
"required": ["objects"]
16+
"label": {
17+
"description": "The type of object detected (e.g., 'person', 'car').",
18+
"type": "string"
19+
}
20+
},
21+
"required": ["bbox_2d", "label"]
22+
}
4523
}

utils/llm/prompts.py

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,39 +10,35 @@
1010
"""
1111
# Object detection
1212
OBJECT_DETECTION_PROMPT = """
13-
Give the bounding boxes for the objects found in this image.
13+
Step 1:
14+
Determine from 0 to 10 major and important objects in the image.
15+
Focus ONLY on the objects that are clearly visible and identifiable.
16+
17+
Step 2:
18+
Give the bounding boxes for the objects determined in the first step.
1419
Output only a JSON list of bounding boxes where each entry contains:
15-
- the unique numeric ID in the key "ID",
16-
- the object label in the key "type",
17-
- the pixel coordinates of a 2D bounding box in the key "dimensions",
18-
- and the confidence score in the key "confidence".
20+
- the pixel coordinates of a 2D bounding box in the key "bbox_2d",
21+
- the object label in the key "label".
1922
2023
Example:
2124
```json
22-
{
23-
"objects": [
25+
[
2426
{
25-
"ID": 0,
26-
"type": "car",
27-
"dimensions": [120, 200, 300, 450],
28-
"confidence": 0.92
27+
"bbox_2d": [120, 200, 300, 450],
28+
"label": "car"
2929
},
3030
{
31-
"ID": 1,
32-
"type": "person",
33-
"dimensions": [50, 100, 120, 300],
34-
"confidence": 0.95
31+
"bbox_2d": [50, 100, 120, 300],
32+
"label": "person"
3533
}
36-
]
37-
}
38-
34+
]
3935
```
4036
Ensure that the bounding boxes are in the format [x1, y1, x2, y2].
4137
4238
Rules:
4339
1. Focus ONLY on the major and important objects in the image.
4440
2. The graphic can contain any number of objects, from zero to many.
45-
3. If no objects are detected, return an empty list: {"objects": []}.
41+
3. If no objects are detected, return an empty list: [].
4642
4. Use simple and common object labels (e.g., "car", "person", "tree").
4743
5. Include ONLY objects that are clearly visible and identifiable.
4844
6. Multiple objects can have the same confidence score.

0 commit comments

Comments
 (0)