Skip to content

Commit a9c3562

Browse files
authored
Merge pull request #1180 from Shared-Reality-Lab/fix-llm-loop
Fix llm looping on complex graphics
2 parents c5d44a3 + 85c8220 commit a9c3562

3 files changed

Lines changed: 79 additions & 94 deletions

File tree

preprocessors/object-detection-llm/object-detection-llm.py

Lines changed: 46 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -68,33 +68,30 @@ def normalize_bbox(bbox, width, height):
6868
]
6969

7070

71-
def process_objects(objects, threshold):
71+
def process_objects(qwen_output, width, height, threshold):
7272
"""
73-
Process detected objects by filtering, transforming, and enriching them.
73+
Transform Qwen object detection output to IMAGE schema format.
7474
75-
- Filters objects by confidence threshold
75+
- Transforms from Qwen format (bbox_2d, label) to IMAGE format
76+
- Normalizes bounding boxes to [0,1] range
77+
- Assigns confidence threshold to all objects
7678
- Normalizes labels (replaces underscores with spaces)
77-
- Renumbers IDs sequentially
7879
- Calculates geometric properties (area, centroid)
80+
- Filters objects by confidence threshold
7981
8082
Args:
81-
objects (list): List of detected objects with confidence scores
83+
qwen_output (list): Qwen detection output with bbox_2d and label
84+
width (int): Image width in pixels for normalization
85+
height (int): Image height in pixels for normalization
8286
threshold (float): Minimum confidence score (0-1)
8387
8488
Returns:
8589
list: Processed objects with computed properties
8690
"""
8791
processed = []
88-
for obj in objects:
89-
if obj.get("confidence", 0) >= threshold:
90-
obj['type'] = obj['type'].replace('_', ' ')
91-
processed.append(obj)
92-
93-
# Renumber IDs sequentially after filtering
94-
for idx, obj in enumerate(processed):
95-
obj['ID'] = idx
96-
97-
x1, y1, x2, y2 = obj["dimensions"]
92+
for idx, item in enumerate(qwen_output):
93+
# Normalize bounding box
94+
x1, y1, x2, y2 = normalize_bbox(item["bbox_2d"], width, height)
9895

9996
# Calculate area (width * height)
10097
area = (x2 - x1) * (y2 - y1)
@@ -103,13 +100,20 @@ def process_objects(objects, threshold):
103100
centroid_x = (x1 + x2) / 2
104101
centroid_y = (y1 + y2) / 2
105102

106-
# Create object entry according to schema
107-
obj["area"] = area
108-
obj["centroid"] = [centroid_x, centroid_y]
103+
# Create object entry according to IMAGE schema
104+
obj = {
105+
"ID": idx,
106+
"type": item["label"].replace('_', ' '),
107+
"dimensions": [x1, y1, x2, y2],
108+
"confidence": threshold,
109+
"area": area,
110+
"centroid": [centroid_x, centroid_y]
111+
}
112+
113+
processed.append(obj)
109114

110115
logging.debug(
111-
f"Processed {len(objects)} objects to {len(processed)} "
112-
f"objects with confidence >= {threshold}"
116+
f"Processed {len(qwen_output)} objects from Qwen output"
113117
)
114118
return processed
115119

@@ -155,35 +159,42 @@ def detect_objects():
155159
if error:
156160
return jsonify(error), error["code"]
157161

162+
stop_tokens = [
163+
"<|im_end|>", # Qwen's end token
164+
"<|endoftext|>", # Alternative end token
165+
"\n\n\n", # Triple newline
166+
"```", # Code block end
167+
]
168+
158169
try:
159170
# Get object info
160-
object_json = llm_client.chat_completion(
171+
qwen_output = llm_client.chat_completion(
161172
prompt=OBJECT_DETECTION_PROMPT,
162173
image_base64=base64_image,
163174
json_schema=BBOX_RESPONSE_SCHEMA,
164-
temperature=0.0,
165-
parse_json=True
175+
temperature=0.5,
176+
parse_json=True,
177+
stop=stop_tokens
166178
)
167179

168-
if object_json is None or len(object_json.get("objects", [])) == 0:
180+
logging.debug(f"Qwen output received: {qwen_output}")
181+
182+
if qwen_output is None or len(qwen_output) == 0:
169183
logging.error("Failed to extract objects from the graphic.")
170184
return jsonify({"error": "No objects extracted"}), 204
171185

172-
# Normalize bounding boxes
186+
# Transform Qwen format to IMAGE schema format
173187
width, height = pil_image.size
174-
for obj in object_json["objects"]:
175-
# Normalize bounding boxes
176-
obj["dimensions"] = normalize_bbox(
177-
obj["dimensions"], width, height
178-
)
179-
180-
# Filter objects by confidence threshold, add area and centroid,
181-
# remove underscores from labels, and renumber IDs
182-
object_json["objects"] = process_objects(
183-
object_json["objects"],
188+
processed_objects = process_objects(
189+
qwen_output,
190+
width,
191+
height,
184192
CONF_THRESHOLD
185193
)
186194

195+
# Wrap in "objects" for schema compliance
196+
object_json = {"objects": processed_objects}
197+
187198
logging.pii(f"Normalized output: {object_json}")
188199

189200
# Data schema validation
Lines changed: 18 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,23 @@
11
{
22
"$schema": "http://json-schema.org/draft-07/schema",
3-
"type": "object",
3+
"type": "array",
44
"title": "Object Detection Data",
5-
"description": "Detected object data with bounding boxes.",
6-
"definitions": {
7-
"object": {
8-
"type": "object",
9-
"title": "BoundingBoxItem",
10-
"properties": {
11-
"ID": {
12-
"description": "A number identifying this object in the set.",
13-
"type": "integer"
14-
},
15-
"type": {
16-
"description": "The type of object detected (e.g., 'person', 'car').",
17-
"type": "string"
18-
},
19-
"dimensions": {
20-
"description": "Bounding box coordinates of this object [x1, y1, x2, y2].",
21-
"type": "array",
22-
"items": { "type": "number" },
23-
"minItems": 4,
24-
"maxItems": 4,
25-
"additionalItems": false
26-
},
27-
"confidence": {
28-
"description": "Confidence in the correctness of this object's data (0-1).",
29-
"type": "number",
30-
"minimum": 0,
31-
"maximum": 1
32-
}
5+
"description": "Detected object data with bounding boxes in Qwen format.",
6+
"items": {
7+
"type": "object",
8+
"properties": {
9+
"bbox_2d": {
10+
"description": "Bounding box coordinates [x1, y1, x2, y2].",
11+
"type": "array",
12+
"items": { "type": "number" },
13+
"minItems": 4,
14+
"maxItems": 4
3315
},
34-
"required": ["ID", "type", "dimensions", "confidence"]
35-
}
36-
},
37-
"properties": {
38-
"objects": {
39-
"description": "The set of detected objects in the image.",
40-
"type": "array",
41-
"items": { "$ref": "#/definitions/object" }
42-
}
43-
},
44-
"required": ["objects"]
16+
"label": {
17+
"description": "The type of object detected (e.g., 'person', 'car').",
18+
"type": "string"
19+
}
20+
},
21+
"required": ["bbox_2d", "label"]
22+
}
4523
}

utils/llm/prompts.py

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,39 +10,35 @@
1010
"""
1111
# Object detection
1212
OBJECT_DETECTION_PROMPT = """
13-
Give the bounding boxes for the objects found in this image.
13+
Step 1:
14+
Determine from 0 to 10 major and important objects in the image.
15+
Focus ONLY on the objects that are clearly visible and identifiable.
16+
17+
Step 2:
18+
Give the bounding boxes for the objects determined in the first step.
1419
Output only a JSON list of bounding boxes where each entry contains:
15-
- the unique numeric ID in the key "ID",
16-
- the object label in the key "type",
17-
- the pixel coordinates of a 2D bounding box in the key "dimensions",
18-
- and the confidence score in the key "confidence".
20+
- the pixel coordinates of a 2D bounding box in the key "bbox_2d",
21+
- the object label in the key "label".
1922
2023
Example:
2124
```json
22-
{
23-
"objects": [
25+
[
2426
{
25-
"ID": 0,
26-
"type": "car",
27-
"dimensions": [120, 200, 300, 450],
28-
"confidence": 0.92
27+
"bbox_2d": [120, 200, 300, 450],
28+
"label": "car"
2929
},
3030
{
31-
"ID": 1,
32-
"type": "person",
33-
"dimensions": [50, 100, 120, 300],
34-
"confidence": 0.95
31+
"bbox_2d": [50, 100, 120, 300],
32+
"label": "person"
3533
}
36-
]
37-
}
38-
34+
]
3935
```
4036
Ensure that the bounding boxes are in the format [x1, y1, x2, y2].
4137
4238
Rules:
4339
1. Focus ONLY on the major and important objects in the image.
4440
2. The graphic can contain any number of objects, from zero to many.
45-
3. If no objects are detected, return an empty list: {"objects": []}.
41+
3. If no objects are detected, return an empty list: [].
4642
4. Use simple and common object labels (e.g., "car", "person", "tree").
4743
5. Include ONLY objects that are clearly visible and identifiable.
4844
6. Multiple objects can have the same confidence score.

0 commit comments

Comments
 (0)