diff --git a/.gitignore b/.gitignore index afe2fff..f544566 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,12 @@ build/ venv/ env/ +# IDE configurations +.vscode/ +.idea/ +*.sublime-project +*.sublime-workspace + # Test / tooling caches .pytest_cache/ .mypy_cache/ diff --git a/interface/config.py b/interface/config.py index d99c2e0..81f239f 100644 --- a/interface/config.py +++ b/interface/config.py @@ -9,8 +9,10 @@ class ExperimentConfig: """Selects one implementation along each experimental axis.""" prompting: Literal["minimal", "standard", "verbose"] = "standard" - observation: Literal["text_only", "image_text", "image_only"] = "image_text" - context_window: Literal["current", "last3"] = "last3" + observation: Literal["text_only", "image_text", "image_only"] = "image_only" + include_current_observation_description: bool = False + observation_text_includes_facing: bool = False + context_window: Literal["current", "last3"] = "current" querying: Literal["step_by_step", "subgoal", "full_trajectory"] = "step_by_step" chat_history: Literal["stateless", "rolling", "full"] = "stateless" chat_turns_max: int = 3 diff --git a/interface/coords.py b/interface/coords.py index 6511c2c..e3b3153 100644 --- a/interface/coords.py +++ b/interface/coords.py @@ -4,6 +4,7 @@ from gridworld.backends.base import GridState from gridworld.task_spec import Position, TaskSpecification +from prompting_experiments.prompt_templates import observation as observation_templates FACING_ORDER = ["NORTH", "EAST", "SOUTH", "WEST"] @@ -126,29 +127,42 @@ def describe_cell( cols: int, ) -> str: if row < 1 or row > rows or col < 1 or col > cols: - return "out of bounds" + return observation_templates.CELL_OUT_OF_BOUNDS if (row, col) in walls: - return "wall" + return observation_templates.CELL_WALL if (row, col) == goal: - return f"GOAL ({row},{col})" + return observation_templates.CELL_GOAL.format(row=row, col=col) key_color = key_at_cell(task_spec, state, row, col) if key_color: - return f"{key_color} key ({row},{col})" + return observation_templates.CELL_KEY.format( + key_color=key_color, + row=row, + col=col, + ) for door in task_spec.mechanisms.doors: if to_row_col(door.position) == (row, col): status = "open" if door.id in state.open_doors else door.initial_state - return f"{status} {door.requires_key} door ({row},{col})" + return observation_templates.CELL_DOOR.format( + status=status, + requires_key=door.requires_key, + row=row, + col=col, + ) for gate in task_spec.mechanisms.gates: if to_row_col(gate.position) == (row, col): cur = "open" if gate.id in state.open_gates else gate.initial_state - return f"{cur} gate ({row},{col})" + return observation_templates.CELL_GATE.format(state=cur, row=row, col=col) for switch in task_spec.mechanisms.switches: if to_row_col(switch.position) == (row, col): on_off = "on" if switch.id in state.active_switches else switch.initial_state - return f"switch ({on_off}) ({row},{col})" + return observation_templates.CELL_SWITCH.format( + state=on_off, + row=row, + col=col, + ) - return f"open ({row},{col})" + return observation_templates.CELL_OPEN.format(row=row, col=col) diff --git a/interface/feedback.py b/interface/feedback.py index 18cc3aa..95416df 100644 --- a/interface/feedback.py +++ b/interface/feedback.py @@ -15,6 +15,7 @@ switch_at_cell, switches_controlling_gate, ) +from prompting_experiments.prompt_templates import feedback as feedback_templates def infer_step_outcome( @@ -35,13 +36,17 @@ def infer_step_outcome( door = next((d for d in task_spec.mechanisms.doors if d.id == door_id), None) color = door.requires_key if door else "matching" if action == "MOVE_FORWARD" and prev_pos != curr_pos: - return "OPENED", f"Opened {color} door {door_id} and moved to {curr_pos}." - return "OPENED", f"Opened {color} door {door_id}." + return "OPENED", feedback_templates.OPENED_AND_MOVED.format( + color=color, + door_id=door_id, + position=curr_pos, + ) + return "OPENED", feedback_templates.OPENED_DOOR.format(color=color, door_id=door_id) if action in ("TURN_LEFT", "TURN_RIGHT"): if prev.agent_direction != curr.agent_direction: - return "TURNED", f"Now facing {agent_facing(curr)}." - return "NOTHING", f"{action} had no effect." + return "TURNED", feedback_templates.NOW_FACING.format(facing=agent_facing(curr)) + return "NOTHING", feedback_templates.ACTION_NO_EFFECT.format(action=action) if action == "MOVE_FORWARD": if prev_pos == curr_pos: @@ -50,9 +55,10 @@ def infer_step_outcome( if key_color: return ( "BLOCKED", - f"MOVE_FORWARD blocked by a {key_color} key at {fwd}. " - "Keys occupy their cell; you cannot walk onto them. " - "Face the key and use PICKUP from your current cell.", + feedback_templates.MOVE_BLOCKED_BY_KEY.format( + key_color=key_color, + position=fwd, + ), ) gate = gate_at_cell(task_spec, prev, fwd[0], fwd[1]) if gate and not gate["open"]: @@ -61,17 +67,23 @@ def infer_step_outcome( switch_list = ", ".join(controllers) return ( "BLOCKED", - f"MOVE_FORWARD blocked by closed gate {gate['id']} at {fwd}. " - f"Activate switch(es) {switch_list} to open it.", + feedback_templates.MOVE_BLOCKED_BY_GATE_WITH_SWITCHES.format( + gate_id=gate["id"], + position=fwd, + switches=switch_list, + ), ) return ( "BLOCKED", - f"MOVE_FORWARD blocked by closed gate {gate['id']} at {fwd}.", + feedback_templates.MOVE_BLOCKED_BY_GATE.format( + gate_id=gate["id"], + position=fwd, + ), ) - return "BLOCKED", "MOVE_FORWARD blocked by wall or closed door/gate." + return "BLOCKED", feedback_templates.MOVE_BLOCKED_GENERIC if terminated and reward > 0 and curr_pos == goal: - return "DONE", f"Reached goal at {goal}." - return "MOVED", f"Moved to {curr_pos}." + return "DONE", feedback_templates.REACHED_GOAL.format(goal=goal) + return "MOVED", feedback_templates.MOVED_TO.format(position=curr_pos) if action == "PICKUP": if ( @@ -79,15 +91,15 @@ def infer_step_outcome( or len(curr.collected_keys) > len(prev.collected_keys) ): carried = curr.agent_carrying or "a" - return "PICKUP", f"Picked up {carried} key." - return "NOTHING", "Nothing to pick up here." + return "PICKUP", feedback_templates.PICKED_UP_KEY.format(key_color=carried) + return "NOTHING", feedback_templates.NOTHING_TO_PICK_UP if action == "TOGGLE": if ( prev.active_switches != curr.active_switches or prev.open_gates != curr.open_gates ): - return "TOGGLED", "Toggled switch or gate state changed." + return "TOGGLED", feedback_templates.TOGGLED_STATE_CHANGED fwd = forward_cell(prev) switch_ahead = switch_at_cell(task_spec, fwd[0], fwd[1]) switch_here = switch_at_cell(task_spec, prev_pos[0], prev_pos[1]) @@ -96,12 +108,11 @@ def infer_step_outcome( if switch_ahead["switch_type"] == "hold": return ( "NOTHING", - f"TOGGLE had no effect. MOVE_FORWARD onto the switch at {fwd} " - "(hold switches activate while you stand on them).", + feedback_templates.TOGGLE_HOLD_SWITCH_HINT.format(position=fwd), ) return ( "NOTHING", - f"TOGGLE had no effect. MOVE_FORWARD onto the switch at {fwd}, then TOGGLE.", + feedback_templates.TOGGLE_SWITCH_HINT.format(position=fwd), ) if gate_ahead and not gate_ahead["open"]: controllers = switches_controlling_gate(task_spec, str(gate_ahead["id"])) @@ -109,21 +120,22 @@ def infer_step_outcome( switch_list = ", ".join(controllers) return ( "NOTHING", - "Gates cannot be toggled directly. " - f"Activate switch(es) {switch_list} instead.", + feedback_templates.GATE_TOGGLE_WITH_SWITCHES.format( + switches=switch_list, + ), ) - return "NOTHING", "Gates cannot be toggled directly. Activate a linked switch instead." + return "NOTHING", feedback_templates.GATE_TOGGLE_GENERIC return ( "NOTHING", - "TOGGLE had no effect. Stand on a switch and TOGGLE, or use PICKUP/keys for doors.", + feedback_templates.TOGGLE_NO_EFFECT, ) if action == "DONE": if terminated and reward > 0 and curr_pos == goal: - return "DONE", f"Task complete at {goal}." - return "WRONG_DONE", f"DONE called but not at goal {goal}." + return "DONE", feedback_templates.TASK_COMPLETE.format(goal=goal) + return "WRONG_DONE", feedback_templates.WRONG_DONE.format(goal=goal) - return "INVALID", f"Unknown or unsupported action {action}." + return "INVALID", feedback_templates.UNKNOWN_ACTION.format(action=action) def format_step_feedback( @@ -139,23 +151,27 @@ def format_step_feedback( ) prev_pos = agent_row_col(prev) if event_type == "BLOCKED": - return f"BLOCKED — {action}: {event_message} You remain at {prev_pos}.", event_type + return feedback_templates.BLOCKED_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type if event_type == "TURNED": - return f"TURNED — {action}: {event_message}", event_type + return feedback_templates.TURNED_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "MOVED": - return f"MOVED — {action}: {event_message}", event_type + return feedback_templates.MOVED_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "DONE": - return f"SUCCESS — {action}: {event_message}", event_type + return feedback_templates.SUCCESS_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "PICKUP": - return f"PICKUP — {action}: {event_message}", event_type + return feedback_templates.PICKUP_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "NOTHING": - return f"NOTHING — {action}: {event_message} You remain at {prev_pos}.", event_type + return feedback_templates.NOTHING_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type if event_type == "OPENED": - return f"OPENED — {action}: {event_message}", event_type + return feedback_templates.OPENED_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "TOGGLED": - return f"TOGGLED — {action}: {event_message}", event_type + return feedback_templates.TOGGLED_FEEDBACK.format(action=action, message=event_message), event_type if event_type == "WRONG_DONE": - return f"WRONG DONE — {action}: {event_message} You remain at {prev_pos}.", event_type + return feedback_templates.WRONG_DONE_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type if event_type == "INVALID": - return f"INVALID — {action}: {event_message} You remain at {prev_pos}.", event_type - return f"{event_type} — {action}: {event_message}", event_type + return feedback_templates.INVALID_FEEDBACK.format(action=action, message=event_message, position=prev_pos), event_type + return feedback_templates.DEFAULT_FEEDBACK.format( + event_type=event_type, + action=action, + message=event_message, + ), event_type diff --git a/interface/observation.py b/interface/observation.py index d898d26..87fa51d 100644 --- a/interface/observation.py +++ b/interface/observation.py @@ -3,7 +3,7 @@ History when ``context_window == "last3"`` (last 3 executed steps, oldest first): * **text_only** — full text history only (position, facing, action, feedback). -* **image_only** — prior decision-frame PNGs + ``Action: …`` labels (no text history). +* **image_only** — prior decision-frame PNGs + inventory/action labels (no text history). * **image_text** — full text history **and** prior decision-frame PNGs. History is derived from enriched ``transcript`` step records. @@ -18,7 +18,12 @@ from gridworld.backends.base import GridState from gridworld.task_spec import TaskSpecification -from interface.renderer import render_user_observation_text, rgb_to_image_block +from interface.renderer import ( + render_current_inventory_text, + render_user_observation_text, + rgb_to_image_block, +) +from prompting_experiments.prompt_templates import observation as observation_templates ObservationMode = Literal["text_only", "image_text", "image_only"] ContextWindow = Literal["current", "last3"] @@ -51,11 +56,17 @@ def history_text( if not recs: return "" - lines = ["Recent history (last 3 steps, oldest first):"] + lines = [observation_templates.RECENT_HISTORY_HEADER] for rec in recs: row, col = rec["position_after"] lines.append( - f" ({int(row)}, {int(col)}) facing {rec['facing_after']} -> {rec['action']} -> {rec['prompt_feedback']}" + observation_templates.RECENT_HISTORY_STEP.format( + row=int(row), + col=int(col), + facing=rec["facing_after"], + action=rec["action"], + feedback=rec["prompt_feedback"], + ) ) return "\n".join(lines) @@ -77,17 +88,24 @@ def history_content_blocks( if rgb is None: continue blocks.append(rgb_to_image_block(rgb)) - if observation == "image_only": - blocks.append({"type": "text", "text": f"Action: {rec['action']}\n\n"}) + inventory = _history_record_inventory(rec) + text = ( + observation_templates.IMAGE_HISTORY_INVENTORY_ACTION.format( + inventory=inventory, + action=rec["action"], + ) + if observation == "image_only" + else observation_templates.IMAGE_HISTORY_INVENTORY.format(inventory=inventory) + ) + blocks.append({"type": "text", "text": text}) if not blocks: return [] intro = ( - "Recent steps (oldest first). Each image is the maze view from which the " - "following action was chosen; infer pose and environment state from the image.\n\n" + observation_templates.IMAGE_ONLY_HISTORY_INTRO if observation == "image_only" - else "Recent step views (oldest first):\n\n" + else observation_templates.IMAGE_TEXT_HISTORY_INTRO ) return [{"type": "text", "text": intro}] + blocks @@ -96,13 +114,34 @@ def current_observation_text( observation: ObservationMode, task_spec: TaskSpecification, state: GridState, + *, + include_description: bool = False, + include_facing: bool = False, ) -> str: if observation == "image_only": + return render_current_inventory_text(state) + if not include_description: return "" - return render_user_observation_text(task_spec, state) + return render_user_observation_text(task_spec, state, include_facing=include_facing) def current_image_blocks(observation: ObservationMode, rgb: np.ndarray | None) -> list[dict]: if observation == "text_only" or rgb is None: return [] return [rgb_to_image_block(rgb)] + + +def _history_record_inventory(rec: dict[str, Any]) -> str: + state_before = rec.get("state_before") + if isinstance(state_before, dict): + inventory = state_before.get("inventory") + if isinstance(inventory, list): + return ", ".join(str(item) for item in inventory) or "empty" + + state_after = rec.get("state_after") + if isinstance(state_after, dict): + inventory = state_after.get("inventory") + if isinstance(inventory, list): + return ", ".join(str(item) for item in inventory) or "empty" + + return "unknown" diff --git a/interface/prompt_strategies.py b/interface/prompt_strategies.py index 64580cc..b6f9b05 100644 --- a/interface/prompt_strategies.py +++ b/interface/prompt_strategies.py @@ -6,44 +6,15 @@ from gridworld.task_spec import TaskSpecification from interface.coords import ( - FACING_ORDER, - FACING_TO_DELTA, agent_facing, agent_row_col, - describe_cell, goal_row_col, - inventory_list, - maze_rows_cols, - wall_cells, ) +from prompting_experiments.prompt_templates import system as system_templates +from prompting_experiments.prompt_templates import user as user_templates -MECHANISM_LIST = ( - "The environment may contain:\n" - "- Keys: pick them up to open doors of the matching color\n" - "- Doors: blocked passages that require a matching key\n" - "- Switches: step onto them to activate (hold) or TOGGLE while standing on them\n" - "- Gates: blocked passages controlled by switches\n" -) - -MECHANISM_RULES = ( - "RULES (domain logic):\n" - " - PICKUP: pick up a key from the adjacent cell you are facing. Keys block movement — you\n" - " cannot MOVE_FORWARD onto a key; stand beside it, face it, and PICKUP.\n" - " - Doors: face a locked door with the matching key in inventory and TOGGLE to open it, then\n" - " MOVE_FORWARD through the open door. MOVE_FORWARD alone does not open a locked door.\n" - " - Switches: MOVE_FORWARD onto the switch cell, then TOGGLE (toggle/one-shot types). Hold-type\n" - " switches activate automatically while you stand on them. Only switches are toggled. Linked\n" - " gates are open if at least one linked switch is on, and closed if all are off.\n" - " - Gates: you cannot TOGGLE a gate. CLOSED gates block movement; OPEN gates do not.\n" - " - Closed gates and doors you lack a key for block movement like walls until resolved.\n" - " - Use DONE only when you are standing on the goal cell." -) - -FINAL_OUTPUT_INSTRUCTION = ( - "On the last line, output exactly:\n" - "FINAL_OUTPUT: or FINAL_OUTPUT: , , ... " - "(comma-separated; one or more valid actions)" -) +MECHANISM_LIST = system_templates.MECHANISM_LIST +MECHANISM_RULES = system_templates.MECHANISM_RULES class MinimalPromptStrategy: @@ -51,12 +22,13 @@ def __init__(self, actions_hint: str) -> None: self._actions_hint = actions_hint def build_system_prompt(self, querying_suffix: str = "") -> str: - return ( - "Task: move to the goal cell in the grid.\n" - f"Valid actions: {self._actions_hint}.\n" - f"{FINAL_OUTPUT_INSTRUCTION}" - + (f"\n\n{querying_suffix}" if querying_suffix else "") - ) + del querying_suffix + chunks = [ + system_templates.TASK_PREFIX, + MECHANISM_LIST, + system_templates.VALID_ACTIONS_TEMPLATE.format(actions_hint=self._actions_hint), + ] + return "\n".join(chunks) def build_user_prompt( self, @@ -65,39 +37,41 @@ def build_user_prompt( task_spec: TaskSpecification, state: GridState, last_feedback: str, + *, + include_status_footer: bool = False, ) -> str: - history_block = f"{history_text}\n\n" if history_text else "" - obs_block = f"Observation:\n{obs_text}\n\n" if obs_text else "" + obs_block = ( + user_templates.OBSERVATION_SECTION.format(obs_text=obs_text) + if obs_text + else "" + ) pos = agent_row_col(state) goal = goal_row_col(task_spec) - return ( - f"{history_block}" - f"{obs_block}" - f"Position: {pos} | Facing: {agent_facing(state)} | Goal: {goal} | " - f"Step {state.step_count + 1}/{state.max_steps}\n" - f"Last result: {last_feedback}\n" - "What is your next action?" + status_block = _status_block( + include_status_footer, + position=pos, + facing=agent_facing(state), + goal=goal, + last_feedback=last_feedback, + ) + prompt = user_templates.STANDARD_USER_PROMPT.format( + obs_block=obs_block, + status_block=status_block, ) + return _with_history(prompt, history_text) class StandardPromptStrategy(MinimalPromptStrategy): - def build_system_prompt(self, querying_suffix: str = "") -> str: - return ( - "Task: move to the goal cell in the grid.\n" - f"{MECHANISM_LIST}\n" - f"Valid actions: {self._actions_hint}.\n" - f"{FINAL_OUTPUT_INSTRUCTION}" - + (f"\n\n{querying_suffix}" if querying_suffix else "") - ) + pass class VerbosePromptStrategy(StandardPromptStrategy): + include_mechanism_hints = False + def build_system_prompt(self, querying_suffix: str = "") -> str: - std = StandardPromptStrategy.build_system_prompt(self, "").rstrip() - chunks = [std, MECHANISM_RULES] - if querying_suffix: - chunks.append(querying_suffix) - return "\n\n".join(chunks) + del querying_suffix + std = StandardPromptStrategy.build_system_prompt(self).rstrip() + return "\n\n".join([std, MECHANISM_RULES]) def build_user_prompt( self, @@ -106,76 +80,68 @@ def build_user_prompt( task_spec: TaskSpecification, state: GridState, last_feedback: str, + *, + include_status_footer: bool = False, ) -> str: - steps_left = state.max_steps - state.step_count - budget_warn = ( - f" WARNING: Only {steps_left} steps remaining!\n" - if steps_left <= max(5, state.max_steps // 5) + mechanism_block = ( + _mechanism_hints_text(task_spec) if self.include_mechanism_hints else "" + ) + obs_block = ( + user_templates.OBSERVATION_SECTION.format(obs_text=obs_text) + if obs_text else "" ) - row, col = agent_row_col(state) - grow, gcol = goal_row_col(task_spec) - manhattan = abs(row - grow) + abs(col - gcol) - rows, cols = maze_rows_cols(task_spec) - walls = wall_cells(task_spec) - - facing_idx = FACING_ORDER.index(agent_facing(state)) - rel_dirs = [ - ("AHEAD", FACING_ORDER[facing_idx % 4]), - ("RIGHT", FACING_ORDER[(facing_idx + 1) % 4]), - ("BEHIND", FACING_ORDER[(facing_idx + 2) % 4]), - ("LEFT", FACING_ORDER[(facing_idx + 3) % 4]), - ] - neighbour_lines = [] - for rel, cardinal in rel_dirs: - dr, dc = FACING_TO_DELTA[cardinal] - nr, nc = row + dr, col + dc - desc = describe_cell( - task_spec, - state, - nr, - nc, - walls=walls, - goal=(grow, gcol), - rows=rows, - cols=cols, - ) - neighbour_lines.append(f" {rel}: {desc}") - neighbour_block = "From your perspective:\n" + "\n".join(neighbour_lines) + "\n" - mechanism_block = _mechanism_hints_text(task_spec) - history_block = f"{history_text}\n\n" if history_text else "" - obs_block = f"Observation:\n{obs_text}\n\n" if obs_text else "" - inventory_str = ", ".join(inventory_list(state)) or "none" - - return ( - f"{history_block}" - f"{obs_block}" - f"Position: {row, col} | Facing: {agent_facing(state)} | Goal: {(grow, gcol)} | " - f"Manhattan: {manhattan} | Step {state.step_count + 1}/{state.max_steps} ({steps_left} left)\n" - f"Inventory: {inventory_str}\n" - f"{budget_warn}" - f"{neighbour_block}" - f"{mechanism_block}" - f"Last result: {last_feedback}\n" - "What is your next action?" + pos = agent_row_col(state) + goal = goal_row_col(task_spec) + status_block = _status_block( + include_status_footer, + position=pos, + facing=agent_facing(state), + goal=goal, + last_feedback=last_feedback, ) + prompt = user_templates.VERBOSE_USER_PROMPT.format( + obs_block=obs_block, + mechanism_block=mechanism_block, + status_block=status_block, + ) + return _with_history(prompt, history_text) + PromptStrategy = MinimalPromptStrategy +def _with_history(prompt: str, history_text: str) -> str: + if not history_text: + return prompt + return f"{history_text}\n\n{prompt}" + + +def _status_block( + include: bool, + *, + position: tuple[int, int], + facing: str, + goal: tuple[int, int], + last_feedback: str, +) -> str: + if not include: + return "" + return user_templates.STATUS_BLOCK.format( + position=position, + facing=facing, + goal=goal, + last_feedback=last_feedback, + ) + + def _mechanism_hints_text(task_spec: TaskSpecification) -> str: lines = [] if task_spec.mechanisms.keys or task_spec.mechanisms.doors: - lines.append( - " - Face an adjacent key and PICKUP (do not walk onto the key). " - "Face a locked door with the matching key and TOGGLE to open it, then MOVE_FORWARD through." - ) + lines.append(user_templates.KEY_DOOR_HINT) if task_spec.mechanisms.switches or task_spec.mechanisms.gates: - lines.append( - " - MOVE_FORWARD onto a switch, then TOGGLE (hold switches activate on step). " - "Gates cannot be toggled — activate their linked switch(es)." - ) + lines.append(user_templates.SWITCH_GATE_HINT) if not lines: return "" - return "Hints:\n" + "\n".join(lines) + "\n" + return user_templates.MECHANISM_HINTS_HEADER + "\n".join(lines) + "\n" diff --git a/interface/querying.py b/interface/querying.py index daa4117..6f6921e 100644 --- a/interface/querying.py +++ b/interface/querying.py @@ -4,6 +4,7 @@ from typing import List, Literal from interface.parser import normalize_action, parse_final_output +from prompting_experiments.prompt_templates import querying as querying_templates QueryingKind = Literal["step_by_step", "subgoal", "full_trajectory"] @@ -47,22 +48,27 @@ def parse_actions(self, model_text: str) -> List[str]: self._trajectory_loaded = True return actions - def system_prompt_suffix(self) -> str: + def user_prompt_suffix(self) -> str: if self.kind == "step_by_step": return "" if self.kind == "subgoal": - return ( - "For each turn output:\n" - " SUB_GOAL: \n" - " ACTIONS: " - ) - return ( - "Output your complete trajectory once as:\n" - " SUB_GOAL: \n" - " ACTIONS: \n" - "The last action in ACTIONS should be DONE (when you expect to be at the goal).\n" - "You will not be queried again — this is your only planning turn." - ) + return querying_templates.SUBGOAL_SUFFIX + return querying_templates.FULL_TRAJECTORY_SUFFIX + + def user_prompt_question(self) -> str: + if self.kind == "full_trajectory": + return querying_templates.FULL_TRAJECTORY_QUESTION + return "" + + def final_output_instruction(self) -> str: + if self.kind == "full_trajectory": + return querying_templates.FULL_TRAJECTORY_FINAL_OUTPUT_INSTRUCTION + if self.kind == "subgoal": + return querying_templates.SUBGOAL_FINAL_OUTPUT_INSTRUCTION + return querying_templates.SINGLE_ACTION_FINAL_OUTPUT_INSTRUCTION + + def system_prompt_suffix(self) -> str: + return "" def step_metadata(self) -> dict: if self.kind == "step_by_step": diff --git a/interface/renderer.py b/interface/renderer.py index 34881d3..e2440a5 100644 --- a/interface/renderer.py +++ b/interface/renderer.py @@ -18,11 +18,13 @@ to_row_col, wall_cells, ) +from prompting_experiments.prompt_templates import observation as observation_templates if TYPE_CHECKING: from gridworld.backends.base import GridState from gridworld.task_spec import TaskSpecification + #TODO: Move to utils.py def rgb_to_png_bytes(rgb: np.ndarray) -> bytes: img = Image.fromarray(np.asarray(rgb, dtype=np.uint8)) @@ -43,13 +45,11 @@ def _static_layout_lines(task_spec: TaskSpecification) -> list[str]: start = to_row_col(task_spec.maze.start) goal = goal_row_col(task_spec) return [ - f"The world is a {rows} by {cols} grid.", - "Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``;" - " tuples in this text use ``(row, column)`` matching env state (row southward, column east)." - " So ``x`` = column index, ``y`` = row index.", - f"The start is at {start}.", - f"The goal is at {goal}.", - f"The following cells are walls: {wall_str}.", + observation_templates.WORLD_SIZE_LINE.format(rows=rows, cols=cols), + observation_templates.COORDINATE_EXPLANATION, + observation_templates.START_LINE.format(start=start), + observation_templates.GOAL_LINE.format(goal=goal), + observation_templates.WALLS_LINE.format(walls=wall_str) ] @@ -64,14 +64,20 @@ def _mechanism_lines(task_spec: TaskSpecification, state: GridState | None = Non if key.id in collected: continue row, col = to_row_col(key.position) - parts.append(f"There is a {key.color} key at ({row},{col}).") + parts.append( + observation_templates.KEY_LINE.format(color=key.color, row=row, col=col) + ) for door in task_spec.mechanisms.doors: row, col = to_row_col(door.position) status = "open" if door.id in open_doors else door.initial_state parts.append( - f"There is a {status} {door.requires_key} door at ({row},{col})." - f" It requires the {door.requires_key} key to open." + observation_templates.DOOR_LINE.format( + status=status, + requires_key=door.requires_key, + row=row, + col=col, + ) ) for switch in task_spec.mechanisms.switches: @@ -79,16 +85,26 @@ def _mechanism_lines(task_spec: TaskSpecification, state: GridState | None = Non on_off = "on" if switch.id in active else switch.initial_state controls = ", ".join(switch.controls) parts.append( - f"There is a {switch.switch_type} switch at ({row},{col}) (currently {on_off})." - f" It controls: {controls}." + observation_templates.SWITCH_LINE.format( + switch_type=switch.switch_type, + row=row, + col=col, + state=on_off, + controls=controls, + ) ) for gate in task_spec.mechanisms.gates: row, col = to_row_col(gate.position) cur = "open" if gate.id in open_gates else gate.initial_state parts.append( - f"There is a gate ({gate.id}) at ({row},{col})." - f" It is currently {cur} (initially {gate.initial_state})." + observation_templates.GATE_LINE.format( + gate_id=gate.id, + row=row, + col=col, + state=cur, + initial_state=gate.initial_state, + ) ) return parts @@ -97,23 +113,37 @@ def render_initial_maze_text(task_spec: TaskSpecification) -> str: return "\n".join(_static_layout_lines(task_spec) + _mechanism_lines(task_spec)) -def render_user_observation_text(task_spec: TaskSpecification, state: GridState) -> str: - goal = goal_row_col(task_spec) +def render_user_observation_text( + task_spec: TaskSpecification, + state: GridState, + *, + include_facing: bool = False, +) -> str: pos = agent_row_col(state) inv = ", ".join(inventory_list(state)) or "empty" + agent_line = ( + observation_templates.CURRENT_AGENT_LINE.format( + position=pos, + facing=agent_facing(state), + ) + if include_facing + else observation_templates.CURRENT_AGENT_POSITION_LINE.format(position=pos) + ) head = [ - "Current situation (this step):", - f"The goal is at {goal}.", - f"You are at {pos} facing {agent_facing(state)}.", - f"Environment steps used so far: {state.step_count} (max {state.max_steps} before timeout).", - f"Your inventory: {inv}.", + observation_templates.CURRENT_SITUATION_HEADER, + agent_line, + observation_templates.CURRENT_INVENTORY_LINE.format(inventory=inv), "", - "Map contents as of this step (keys on the ground, doors, switches, gates):", + observation_templates.CURRENT_MAP_CONTENTS_HEADER, ] mech = _mechanism_lines(task_spec, state) if mech: head.extend(mech) else: - head.append("(No keys on the ground, doors, switches, or gates in the current state description.)") + head.append(observation_templates.NO_MECHANISMS_LINE) return "\n".join(head) + +def render_current_inventory_text(state: GridState) -> str: + inv = ", ".join(inventory_list(state)) or "empty" + return observation_templates.CURRENT_INVENTORY_LINE.format(inventory=inv) diff --git a/interface/runner.py b/interface/runner.py index 19fb907..a5260f6 100644 --- a/interface/runner.py +++ b/interface/runner.py @@ -23,6 +23,7 @@ current_observation_text, history_content_blocks, history_text, + recent_history_steps, ) from interface.parser import ACTIONS_HINT from interface.prompt_strategies import ( @@ -33,6 +34,8 @@ ) from interface.querying import QueryingMode from interface.renderer import render_initial_maze_text +from prompting_experiments.prompt_templates import feedback as feedback_templates +from prompting_experiments.prompt_templates import system as system_templates logger = logging.getLogger(__name__) @@ -57,6 +60,26 @@ def _trim_rolling_chat(messages: List[dict], max_pairs: int) -> None: del messages[1 : 1 + (tail_len - cap)] +def _replace_current_question(prompt_text: str, question: str) -> str: + standard_question = "What is your next action?" + before, match, after = prompt_text.rpartition(standard_question) + if not match: + return prompt_text + return f"{before}{question}{after}" + + +def _append_after_current_question(prompt_text: str, instruction: str) -> str: + questions = ( + "What is the full sequence of actions you will take to complete the task?", + "What is your next action?", + ) + for question in questions: + before, match, after = prompt_text.rpartition(question) + if match: + return f"{before}{match}\n\n{instruction}{after}" + return f"{prompt_text}\n\n{instruction}" + + def build_runner( config: ExperimentConfig, backend: MiniGridBackend, @@ -87,6 +110,18 @@ def __init__( self.querying = querying self.last_rgb: np.ndarray | None = None + def build_prompt_message( + self, + state, + last_feedback: str, + transcript: List[dict], + ) -> tuple[str, dict]: + return self.prompt.build_system_prompt(), self._build_message( + state, + last_feedback, + transcript, + ) + def run( self, agent: Callable[[List[dict]], str], @@ -97,18 +132,13 @@ def run( self.last_rgb, state, reset_info = self.backend.reset(seed=self.task_spec.seed) self.querying.reset() - system_prompt = self.prompt.build_system_prompt(self.querying.system_prompt_suffix()) - if self.config.observation in ("text_only", "image_text"): - system_prompt = ( - f"{system_prompt}\n\nInitial maze (fixed for this episode):\n" - f"{render_initial_maze_text(self.task_spec)}" - ) + system_prompt = self.prompt.build_system_prompt() system_message = {"role": "system", "content": system_prompt} chat_history = self.config.chat_history messages: List[dict] = [system_message] if chat_history in ("rolling", "full") else [] action_queue: List[str] = [] - last_feedback = "Episode start." + last_feedback = feedback_templates.INITIAL_FEEDBACK consecutive_failures = 0 transcript: List[dict] = [] max_steps = self.task_spec.max_steps @@ -202,8 +232,9 @@ def run( self.config.max_parse_retries, ) last_feedback = ( - f"Could not parse FINAL_OUTPUT (one or more valid actions). " - f"Use only: {ACTIONS_HINT}." + feedback_templates.PARSE_FAILURE_FEEDBACK.format( + actions_hint=ACTIONS_HINT + ) ) if parse_failures >= self.config.max_parse_retries: end_reason = "parse_failed" @@ -331,14 +362,40 @@ def run( def _build_message(self, state, last_feedback: str, transcript: List[dict]) -> dict: obs = self.config.observation ctx = self.config.context_window - obs_text = current_observation_text(obs, self.task_spec, state) + obs_text = current_observation_text( + obs, + self.task_spec, + state, + include_description=self.config.include_current_observation_description, + include_facing=self.config.observation_text_includes_facing, + ) prompt_text = self.prompt.build_user_prompt( obs_text, history_text(obs, ctx, transcript), self.task_spec, state, last_feedback, + include_status_footer=False, + ) + prompt_question = self.querying.user_prompt_question() + if prompt_question: + prompt_text = _replace_current_question(prompt_text, prompt_question) + prompt_text = _append_after_current_question( + prompt_text, + self.querying.final_output_instruction(), ) + sections = [] + if self.config.observation in ("text_only", "image_text"): + sections.append( + system_templates.INITIAL_MAZE_SECTION.format( + maze_text=render_initial_maze_text(self.task_spec) + ) + ) + sections.append(prompt_text) + querying_suffix = self.querying.user_prompt_suffix() + if querying_suffix: + sections.append(querying_suffix) + prompt_text = "\n\n".join(sections) hist_blocks = history_content_blocks(obs, ctx, transcript) images = current_image_blocks(obs, self.last_rgb) text_block = {"type": "text", "text": prompt_text} diff --git a/interface/smoke_tests/smoke_prompting_observation_querying.py b/interface/smoke_tests/smoke_prompting_observation_querying.py index 06b2db1..a19f948 100644 --- a/interface/smoke_tests/smoke_prompting_observation_querying.py +++ b/interface/smoke_tests/smoke_prompting_observation_querying.py @@ -72,17 +72,17 @@ def __call__(self, messages: list[dict]) -> str: isinstance(blk, dict) and blk.get("type") == "image_url" for blk in user_content ) - full_mode = "You will not be queried again" in system_text - subgoal_mode = "SUB_GOAL:" in system_text and "ACTIONS:" in system_text + full_mode = "full sequence of actions" in user_text + subgoal_mode = "SUB_GOAL:" in user_text and "valid actions to reach it" in user_text if full_mode: reply = ( "SUB_GOAL: Execute maze-aware end-to-end plan.\n" - f"ACTIONS: {', '.join(self._full_trajectory_actions)}" + f"FINAL_OUTPUT: {', '.join(self._full_trajectory_actions)}" ) elif subgoal_mode: chunk = _plan_to_goal_from_prompt(user_text, budget=4) - reply = f"SUB_GOAL: Advance toward goal.\nACTIONS: {', '.join(chunk)}" + reply = f"SUB_GOAL: Advance toward goal.\nFINAL_OUTPUT: {', '.join(chunk)}" else: step = _plan_to_goal_from_prompt(user_text, budget=1)[0] reply = f"FINAL_OUTPUT: {step}" @@ -121,12 +121,8 @@ def _collect_checks(cfg: ExperimentConfig, calls: list[dict[str, Any]]) -> list[ system = first["system"] checks: list[dict[str, Any]] = [] - if cfg.prompting == "minimal": - checks.append(_check("minimal omits mechanism list", "The environment may contain:" not in system)) - if cfg.prompting == "standard": - checks.append(_check("standard includes mechanism list", "The environment may contain:" in system)) - if cfg.prompting == "verbose": - checks.append(_check("verbose includes rules block", "RULES (domain logic):" in system)) + checks.append(_check("system includes mechanism list", "The environment may contain:" in system)) + checks.append(_check("system includes rules block", "RULES (domain logic):" in system)) if cfg.observation == "text_only": checks.append(_check("text_only user content is plain string", first["user_content_type"] == "str")) @@ -137,11 +133,11 @@ def _collect_checks(cfg: ExperimentConfig, calls: list[dict[str, Any]]) -> list[ if cfg.observation == "image_only": checks.append( - _check("image_only omits initial NL map in system", "Initial maze (fixed for this episode):" not in system) + _check("image_only omits initial NL map in user", "Initial maze (fixed for this episode):" not in first["user_text"]) ) elif cfg.observation in ("text_only", "image_text"): checks.append( - _check(f"{cfg.observation} includes initial NL map in system", "Initial maze (fixed for this episode):" in system) + _check(f"{cfg.observation} includes initial NL map in user", "Initial maze (fixed for this episode):" in first["user_text"]) ) if cfg.querying == "full_trajectory": diff --git a/ogbench b/ogbench new file mode 160000 index 0000000..84b5770 --- /dev/null +++ b/ogbench @@ -0,0 +1 @@ +Subproject commit 84b5770c8fba35d13a2693180b9f524980c3b2fc diff --git a/prompting_experiments/__init__.py b/prompting_experiments/__init__.py new file mode 100644 index 0000000..d7ccc0f --- /dev/null +++ b/prompting_experiments/__init__.py @@ -0,0 +1,5 @@ +"""Prompt condition-set configs for interface experiments.""" + +from .exp_design import CONDITION_SETS, ConditionSet, Variant, iter_condition_configs + +__all__ = ["CONDITION_SETS", "ConditionSet", "Variant", "iter_condition_configs"] diff --git a/prompting_experiments/condition_set_1_prompt.py b/prompting_experiments/condition_set_1_prompt.py new file mode 100644 index 0000000..d8d1dd2 --- /dev/null +++ b/prompting_experiments/condition_set_1_prompt.py @@ -0,0 +1,26 @@ +"""Condition set 1: prompt verbosity.""" + +from __future__ import annotations + +from .core import ConditionSet, Variant + + +CONDITION_SET = ConditionSet( + name="Prompt", + comparisons=( + "Standard: goal + mechanism descriptions + action list", + "Verbose: standard + explicit rules", + ), + decision="If delta < 5%, use standard. If > 5%, use verbose.", + variants={ + "standard": Variant( + name="standard", + description="Standard task prompt with mechanism descriptions.", + ), + "verbose": Variant( + name="verbose", + description="Standard prompt plus explicit domain rules.", + config_overrides={"prompting": "verbose"}, + ), + }, +) diff --git a/prompting_experiments/condition_set_2_observation_format.py b/prompting_experiments/condition_set_2_observation_format.py new file mode 100644 index 0000000..9c3f7ba --- /dev/null +++ b/prompting_experiments/condition_set_2_observation_format.py @@ -0,0 +1,40 @@ +"""Condition set 2: observation format.""" + +from __future__ import annotations + +from .core import ConditionSet, Variant + + +CONDITION_SET = ConditionSet( + name="Observation format", + comparisons=( + "Standard image only", + "Text only", + "Image + text", + ), + decision="Measure whether text adds meaningful signal beyond image input.", + variants={ + "standard": Variant( + name="image_only", + description="Image block with no initial natural-language maze map-same as the standard prompt.", + ), + "text_only": Variant( + name="text_only", + description="Natural-language current observation, no image blocks.", + config_overrides={ + "observation": "text_only", + "include_current_observation_description": True, + "observation_text_includes_facing": True, + }, + ), + "image_text": Variant( + name="image_text", + description="Image block plus natural-language observation.", + config_overrides={ + "observation": "image_text", + "include_current_observation_description": True, + "observation_text_includes_facing": True, + }, + ), + }, +) diff --git a/prompting_experiments/condition_set_3_context_window.py b/prompting_experiments/condition_set_3_context_window.py new file mode 100644 index 0000000..4876369 --- /dev/null +++ b/prompting_experiments/condition_set_3_context_window.py @@ -0,0 +1,32 @@ +"""Condition set 3: context window.""" + +from __future__ import annotations + +from .core import ConditionSet, Variant + + +CONDITION_SET = ConditionSet( + name="Context window", + comparisons=( + "Standard 0 history: current observation only", + "Last 3 executed steps", + "Current observation + text summary of prior actions", + ), + decision="Compare current-state-only prompting against recent history.", + variants={ + "standard": Variant( + name="current", + description="Prompt only with the current observation-same as the standard prompt.", + ), + "last3": Variant( + name="last3", + description="Include up to the last three executed steps.", + config_overrides={"context_window": "last3"}, + ), + "text_summary": Variant( + name="text_summary", + description="PR #12 design axis; no ExperimentConfig summary mode exists yet.", + implemented=False, + ), + }, +) diff --git a/prompting_experiments/condition_set_4_querying_strategy.py b/prompting_experiments/condition_set_4_querying_strategy.py new file mode 100644 index 0000000..7a4b564 --- /dev/null +++ b/prompting_experiments/condition_set_4_querying_strategy.py @@ -0,0 +1,32 @@ +"""Condition set 5: querying strategy.""" + +from __future__ import annotations + +from .core import ConditionSet, Variant + + +CONDITION_SET = ConditionSet( + name="Querying strategy", + comparisons=( + "Standard step-by-step: one action per query", + "Subgoal planning: model outputs a subgoal and action chunk", + "Full trajectory: model outputs a complete plan once", + ), + decision="Determine whether chunked or one-shot planning improves performance.", + variants={ + "standard": Variant( + name="step_by_step", + description="Ask for one action each query-same as the standard prompt.", + ), + "subgoal": Variant( + name="subgoal", + description="Ask for a short subgoal and action chunk.", + config_overrides={"querying": "subgoal"}, + ), + "full_trajectory": Variant( + name="full_trajectory", + description="Ask once for a complete action trajectory.", + config_overrides={"querying": "full_trajectory"}, + ), + }, +) diff --git a/prompting_experiments/condition_set_5_in_context_learning.py b/prompting_experiments/condition_set_5_in_context_learning.py new file mode 100644 index 0000000..4cfe4eb --- /dev/null +++ b/prompting_experiments/condition_set_5_in_context_learning.py @@ -0,0 +1,31 @@ +"""Condition set 6: in-context learning.""" + +from __future__ import annotations + +from .core import ConditionSet, Variant + + +CONDITION_SET = ConditionSet( + name="In-context learning", + comparisons=( + "Standard zero-shot: no examples", + "1-shot: one example trajectory from a different maze", + ), + decision=( + "If 1-shot dramatically improves performance, the bottleneck is likely " + "task understanding rather than navigation capability." + ), + variants={ + "standard": Variant( + name="zero_shot", + description="No examples-same as the standard prompt.", + ), + "one_shot": Variant( + name="one_shot", + description="PR #12 design axis; example selection/injection is not implemented yet.", + implemented=False, + ), + }, + implemented=False, + notes="ICL examples must not use evaluation mazes.", +) diff --git a/prompting_experiments/core.py b/prompting_experiments/core.py new file mode 100644 index 0000000..a5c1a0a --- /dev/null +++ b/prompting_experiments/core.py @@ -0,0 +1,50 @@ +"""Shared types for prompt experiment condition registries.""" + +from __future__ import annotations + +from dataclasses import dataclass, replace +from typing import TYPE_CHECKING, Iterator, Mapping + +if TYPE_CHECKING: + from interface.config import ExperimentConfig + + +@dataclass(frozen=True) +class Variant: + """One experiment variant expressed as overrides to ``ExperimentConfig``.""" + + name: str + description: str + config_overrides: Mapping[str, object] | None = None + implemented: bool = True + + def build_config(self, base: ExperimentConfig | None = None) -> ExperimentConfig: + if not self.implemented: + raise ValueError(f"Variant is not implemented in ExperimentConfig: {self.name}") + from interface.config import ExperimentConfig + + cfg = base or ExperimentConfig() + return replace(cfg, **dict(self.config_overrides or {})) + + +@dataclass(frozen=True) +class ConditionSet: + """A named experimental axis and its comparable variants.""" + + name: str + comparisons: tuple[str, ...] + decision: str + variants: Mapping[str, Variant] + implemented: bool = True + notes: str = "" + + +def iter_condition_configs( + condition: ConditionSet, + base: ExperimentConfig | None = None, +) -> Iterator[tuple[str, ExperimentConfig]]: + """Yield ``(variant_name, config)`` pairs for implemented variants.""" + + for variant_name, variant in condition.variants.items(): + if variant.implemented: + yield variant_name, variant.build_config(base) diff --git a/prompting_experiments/exp_design.py b/prompting_experiments/exp_design.py new file mode 100644 index 0000000..d9237c7 --- /dev/null +++ b/prompting_experiments/exp_design.py @@ -0,0 +1,40 @@ +"""Experiment prompt condition-set registry. + +Each condition set is split into its own module to mirror the PR #12 experiment +design while keeping runnable prompt behavior centralized in ``interface``. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Iterator, Mapping + +if TYPE_CHECKING: + from interface.config import ExperimentConfig + +from .condition_set_1_prompt import CONDITION_SET as CONDITION_SET_1 +from .condition_set_2_observation_format import CONDITION_SET as CONDITION_SET_2 +from .condition_set_3_context_window import CONDITION_SET as CONDITION_SET_3 +from .condition_set_4_querying_strategy import CONDITION_SET as CONDITION_SET_5 +from .condition_set_5_in_context_learning import CONDITION_SET as CONDITION_SET_6 +from .core import ConditionSet, Variant, iter_condition_configs as _iter_condition_configs + + +CONDITION_SETS: Mapping[str, ConditionSet] = { + CONDITION_SET_1.name: CONDITION_SET_1, + CONDITION_SET_2.name: CONDITION_SET_2, + CONDITION_SET_3.name: CONDITION_SET_3, + CONDITION_SET_5.name: CONDITION_SET_5, + CONDITION_SET_6.name: CONDITION_SET_6, +} + + +def iter_condition_configs( + condition_name: str, + base: ExperimentConfig | None = None, +) -> Iterator[tuple[str, ExperimentConfig]]: + """Yield runnable ``(variant_name, config)`` pairs for one condition set.""" + + yield from _iter_condition_configs(CONDITION_SETS[condition_name], base) + + +__all__ = ["CONDITION_SETS", "ConditionSet", "Variant", "iter_condition_configs"] diff --git a/prompting_experiments/preview_prompts.py b/prompting_experiments/preview_prompts.py new file mode 100644 index 0000000..e5e0631 --- /dev/null +++ b/prompting_experiments/preview_prompts.py @@ -0,0 +1,232 @@ +"""Generate a text preview of prompt experiment condition variants.""" + +from __future__ import annotations + +import argparse +import random +from pathlib import Path +from typing import Any + +from prompting_experiments import CONDITION_SETS +from prompting_experiments.prompt_templates import feedback as feedback_templates + + +def _content_to_text(content: Any) -> str: + if isinstance(content, str): + return content + if not isinstance(content, list): + return str(content) + + lines: list[str] = [] + image_count = 0 + for block in content: + if not isinstance(block, dict): + continue + if block.get("type") == "text": + lines.append(block.get("text", "")) + elif block.get("type") == "image_url": + image_count += 1 + lines.append(f"[image block {image_count}]") + return "\n".join(part for part in lines if part) + + +def _missing_dependency_message(exc: ModuleNotFoundError) -> str: + return ( + f"Missing dependency: {exc.name}. Install the project dependencies in this environment, " + "for example: python3 -m pip install -e '.[dev]'" + ) + + +def _rollout_preview_steps( + runner, + state, + steps: int, + seed: int, +) -> tuple[Any, str, list[dict]]: + from interface.actions_map import nlu_action_to_int + from interface.coords import agent_facing, agent_row_col + from interface.episode_log import state_snapshot + from interface.feedback import format_step_feedback + from interface.parser import ACTION_ORDER + + rng = random.Random(seed) + actions = [action for action in ACTION_ORDER if action != "DONE"] + last_feedback = feedback_templates.INITIAL_FEEDBACK + transcript: list[dict] = [] + + for step_index in range(1, steps + 1): + action = rng.choice(actions) + position_before = agent_row_col(state) + facing_before = agent_facing(state) + state_before = state_snapshot(state) + decision_frame_rgb = runner.last_rgb + prev_state = state + + runner.last_rgb, reward, terminated, truncated, state, info = runner.backend.step( + nlu_action_to_int(action) + ) + step_detail, event_type = format_step_feedback( + action, prev_state, state, reward, terminated, runner.task_spec + ) + last_feedback = step_detail + transcript.append( + { + "kind": "step", + "step_index": step_index, + "query_index": 0, + "action_queue_index": 0, + "env_step_count": state.step_count, + "action": action, + "event_type": event_type, + "feedback": step_detail, + "prompt_feedback": last_feedback, + "facing_before": facing_before, + "facing_after": agent_facing(state), + "position_before": list(position_before), + "position_after": list(agent_row_col(state)), + "state_before": state_before, + "state_after": state_snapshot(state), + "reward": reward, + "terminated": terminated, + "truncated": truncated, + "backend_info": info, + "actions_remaining_after": [], + "consecutive_failures_after": 0, + "_decision_frame_rgb": decision_frame_rgb, + "_post_step_rgb": runner.last_rgb, + } + ) + if terminated or truncated: + break + + return state, last_feedback, transcript + + +def _prompt_preview( + config, + maze_path: Path, + max_steps: int, + preview_steps: int, + rollout_seed: int, +) -> tuple[str, str]: + try: + from interface.loader import load_task + from interface.runner import build_runner + except ModuleNotFoundError as exc: + raise SystemExit(_missing_dependency_message(exc)) from exc + + backend, spec = load_task(maze_path) + spec.max_steps = max_steps + runner = build_runner(config, backend, spec) + runner.last_rgb, state, _reset_info = backend.reset(seed=spec.seed) + state, last_feedback, transcript = _rollout_preview_steps( + runner, + state, + preview_steps, + rollout_seed, + ) + system_prompt, user_message = runner.build_prompt_message( + state, + last_feedback, + transcript, + ) + return system_prompt, _content_to_text(user_message.get("content")) + + +def build_preview( + maze_path: Path, + max_steps: int, + preview_steps: int, + rollout_seed: int, +) -> str: + chunks = [ + "Prompt Experiment Preview", + f"Maze: {maze_path}", + f"Max steps: {max_steps}", + f"Preview prompt state: after {preview_steps} random steps (seed: {rollout_seed})", + "", + ] + + for idx, condition in enumerate(CONDITION_SETS.values(), start=1): + chunks.extend( + [ + "=" * 88, + f"condition set {idx}: {condition.name}", + "=" * 88, + ] + ) + for variant_name, variant in condition.variants.items(): + chunks.extend( + [ + f"variant name: {variant_name}", + f"description: {variant.description}", + "prompts:", + ] + ) + if not variant.implemented: + chunks.extend( + [ + "Status: not implemented in ExperimentConfig", + "-" * 88, + ] + ) + continue + + try: + config = variant.build_config() + except ModuleNotFoundError as exc: + raise SystemExit(_missing_dependency_message(exc)) from exc + system_prompt, user_prompt = _prompt_preview( + config, + maze_path, + max_steps, + preview_steps, + rollout_seed, + ) + chunks.extend( + [ + "[system prompt]", + system_prompt, + "", + "[user prompt]", + user_prompt, + "-" * 88, + ] + ) + + return "\n".join(chunks).rstrip() + "\n" + + +def _default_maze_path(name: str) -> Path: + return Path(__file__).resolve().parents[1] / "mazes" / "validation_10" / name + + +def main() -> None: + parser = argparse.ArgumentParser(description="Write prompt experiment previews to prompts.txt.") + parser.add_argument("--maze", default="V01_empty_room.json") + parser.add_argument("--max-steps", type=int, default=5) + parser.add_argument("--preview-steps", type=int, default=3) + parser.add_argument("--rollout-seed", type=int, default=0) + parser.add_argument( + "--output", + type=Path, + default=Path(__file__).resolve().parent / "prompts.txt", + ) + args = parser.parse_args() + + maze_path = Path(args.maze) + if not maze_path.is_file(): + maze_path = _default_maze_path(args.maze) + + preview = build_preview( + maze_path, + args.max_steps, + args.preview_steps, + args.rollout_seed, + ) + args.output.write_text(preview, encoding="utf-8") + print(f"wrote {args.output}") + + +if __name__ == "__main__": + main() diff --git a/prompting_experiments/prompt_templates/__init__.py b/prompting_experiments/prompt_templates/__init__.py new file mode 100644 index 0000000..344f4fc --- /dev/null +++ b/prompting_experiments/prompt_templates/__init__.py @@ -0,0 +1 @@ +"""Agent-facing prompt templates grouped by prompt surface.""" diff --git a/prompting_experiments/prompt_templates/feedback.py b/prompting_experiments/prompt_templates/feedback.py new file mode 100644 index 0000000..c8e854b --- /dev/null +++ b/prompting_experiments/prompt_templates/feedback.py @@ -0,0 +1,47 @@ +"""Step feedback templates.""" + +INITIAL_FEEDBACK = "Episode start." +OPENED_AND_MOVED = "Opened {color} door {door_id} and moved to {position}." +OPENED_DOOR = "Opened {color} door {door_id}." +NOW_FACING = "Now facing {facing}." +ACTION_NO_EFFECT = "{action} had no effect." +MOVE_BLOCKED_BY_KEY = ( + "MOVE_FORWARD blocked by a {key_color} key at {position}. " + "Keys occupy their cell; you cannot walk onto them. " + "Face the key and use PICKUP from your current cell." +) +MOVE_BLOCKED_BY_GATE_WITH_SWITCHES = ( + "MOVE_FORWARD blocked by closed gate {gate_id} at {position}. " + "Activate switch(es) {switches} to open it." +) +MOVE_BLOCKED_BY_GATE = "MOVE_FORWARD blocked by closed gate {gate_id} at {position}." +MOVE_BLOCKED_GENERIC = "MOVE_FORWARD blocked by wall or closed door/gate." +REACHED_GOAL = "Reached goal at {goal}." +MOVED_TO = "Moved to {position}." +PICKED_UP_KEY = "Picked up {key_color} key." +NOTHING_TO_PICK_UP = "Nothing to pick up here." +TOGGLED_STATE_CHANGED = "Toggled switch or gate state changed." +TOGGLE_HOLD_SWITCH_HINT = ( + "TOGGLE had no effect. MOVE_FORWARD onto the switch at {position} " + "(hold switches activate while you stand on them)." +) +TOGGLE_SWITCH_HINT = "TOGGLE had no effect. MOVE_FORWARD onto the switch at {position}, then TOGGLE." +GATE_TOGGLE_WITH_SWITCHES = "Gates cannot be toggled directly. Activate switch(es) {switches} instead." +GATE_TOGGLE_GENERIC = "Gates cannot be toggled directly. Activate a linked switch instead." +TOGGLE_NO_EFFECT = "TOGGLE had no effect. Stand on a switch and TOGGLE, or use PICKUP/keys for doors." +TASK_COMPLETE = "Task complete at {goal}." +WRONG_DONE = "DONE called but not at goal {goal}." +UNKNOWN_ACTION = "Unknown or unsupported action {action}." + +BLOCKED_FEEDBACK = "BLOCKED — {action}: {message} You remain at {position}." +TURNED_FEEDBACK = "TURNED — {action}: {message}" +MOVED_FEEDBACK = "MOVED — {action}: {message}" +SUCCESS_FEEDBACK = "SUCCESS — {action}: {message}" +PICKUP_FEEDBACK = "PICKUP — {action}: {message}" +NOTHING_FEEDBACK = "NOTHING — {action}: {message} You remain at {position}." +OPENED_FEEDBACK = "OPENED — {action}: {message}" +TOGGLED_FEEDBACK = "TOGGLED — {action}: {message}" +WRONG_DONE_FEEDBACK = "WRONG DONE — {action}: {message} You remain at {position}." +INVALID_FEEDBACK = "INVALID — {action}: {message} You remain at {position}." +DEFAULT_FEEDBACK = "{event_type} — {action}: {message}" +PARSE_FAILURE_FEEDBACK = "Could not parse FINAL_OUTPUT (one or more valid actions). Use only: {actions_hint}." diff --git a/prompting_experiments/prompt_templates/observation.py b/prompting_experiments/prompt_templates/observation.py new file mode 100644 index 0000000..d17bbc0 --- /dev/null +++ b/prompting_experiments/prompt_templates/observation.py @@ -0,0 +1,52 @@ +"""Observation and history prompt templates.""" + +RECENT_HISTORY_HEADER = "Recent history (last 3 steps, oldest first):" +RECENT_HISTORY_STEP = " ({row}, {col}) facing {facing} -> {action} -> {feedback}" + +IMAGE_HISTORY_INVENTORY = "Your inventory: {inventory}.\n\n" +IMAGE_HISTORY_INVENTORY_ACTION = "Your inventory: {inventory}.\nAction: {action}\n\n" +IMAGE_ONLY_HISTORY_INTRO = ( + "Recent steps (oldest first). Each image is the maze view from which the " + "following action was chosen.\n\n" +) +IMAGE_TEXT_HISTORY_INTRO = "Recent step views (oldest first):\n\n" + +WORLD_SIZE_LINE = "The world is a {rows} by {cols} grid." +COORDINATE_EXPLANATION = ( + "Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``;" + " tuples in this text use ``(row, column)`` matching env state (row southward, column east)." + " So ``x`` = column index, ``y`` = row index." +) +START_LINE = "The start is at {start}." +GOAL_LINE = "The goal is at {goal}." +WALLS_LINE = "The following cells are walls: {walls}." + +KEY_LINE = "There is a {color} key at ({row},{col})." +DOOR_LINE = ( + "There is a {status} {requires_key} door at ({row},{col})." + " It requires the {requires_key} key to open." +) +SWITCH_LINE = ( + "There is a {switch_type} switch at ({row},{col}) (currently {state})." + " It controls: {controls}." +) +GATE_LINE = ( + "There is a gate ({gate_id}) at ({row},{col})." + " It is currently {state} (initially {initial_state})." +) + +CURRENT_SITUATION_HEADER = "Current situation (this step):" +CURRENT_AGENT_LINE = "You are at {position} facing {facing}." +CURRENT_AGENT_POSITION_LINE = "You are at {position}." +CURRENT_INVENTORY_LINE = "Your inventory: {inventory}." +CURRENT_MAP_CONTENTS_HEADER = "Map contents as of this step (keys on the ground, doors, switches, gates):" +NO_MECHANISMS_LINE = "(No keys on the ground, doors, switches, or gates in the current state description.)" + +CELL_OUT_OF_BOUNDS = "out of bounds" +CELL_WALL = "wall" +CELL_GOAL = "GOAL ({row},{col})" +CELL_KEY = "{key_color} key ({row},{col})" +CELL_DOOR = "{status} {requires_key} door ({row},{col})" +CELL_GATE = "{state} gate ({row},{col})" +CELL_SWITCH = "switch ({state}) ({row},{col})" +CELL_OPEN = "open ({row},{col})" diff --git a/prompting_experiments/prompt_templates/querying.py b/prompting_experiments/prompt_templates/querying.py new file mode 100644 index 0000000..04d8764 --- /dev/null +++ b/prompting_experiments/prompt_templates/querying.py @@ -0,0 +1,25 @@ +"""Querying strategy prompt templates.""" + +SUBGOAL_SUFFIX = "" + +FULL_TRAJECTORY_QUESTION = ( + "What is the full sequence of actions you will take to complete the task?" +) + +SINGLE_ACTION_FINAL_OUTPUT_INSTRUCTION = ( + "Output exactly:\n" + "FINAL_OUTPUT: " +) + +SUBGOAL_FINAL_OUTPUT_INSTRUCTION = ( + "Output exactly:\n" + "SUB_GOAL: \n" + "FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions to reach it)" +) + +FULL_TRAJECTORY_FINAL_OUTPUT_INSTRUCTION = ( + "Output exactly:\n" + "FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions)" +) + +FULL_TRAJECTORY_SUFFIX = "" diff --git a/prompting_experiments/prompt_templates/system.py b/prompting_experiments/prompt_templates/system.py new file mode 100644 index 0000000..6ba748e --- /dev/null +++ b/prompting_experiments/prompt_templates/system.py @@ -0,0 +1,28 @@ + +TASK_PREFIX = "Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid." + +MECHANISM_LIST = ( + "The environment may contain:\n" + "- Keys: pick them up to open doors of the matching color\n" + "- Doors: blocked passages that require a matching key\n" + "- Switches: TOGGLE while standing on them\n" + "- Gates: blocked passages controlled by switches\n" +) + +MECHANISM_RULES = ( + "RULES (domain logic):\n" + " - PICKUP: pick up a key from the adjacent cell you are facing. Keys block movement — you\n" + " cannot MOVE_FORWARD onto a key; stand beside it, face it, and PICKUP.\n" + " - Doors: face a locked door with the matching key in inventory and TOGGLE to open it, then\n" + " MOVE_FORWARD through the open door. MOVE_FORWARD alone does not open a locked door.\n" + " - Switches: MOVE_FORWARD onto the switch cell, then TOGGLE (toggle/one-shot types). Hold-type\n" + " switches activate automatically while you stand on them. Only switches are toggled. Linked\n" + " gates are open if at least one linked switch is on, and closed if all are off.\n" + " - Gates: you cannot TOGGLE a gate. CLOSED gates block movement; OPEN gates do not.\n" + " - Closed gates and doors you lack a key for block movement like walls until resolved.\n" + " - Use DONE only when you are standing on the goal cell." +) + +VALID_ACTIONS_TEMPLATE = "Valid actions: {actions_hint}." + +INITIAL_MAZE_SECTION = "Initial maze (fixed for this episode):\n{maze_text}" diff --git a/prompting_experiments/prompt_templates/user.py b/prompting_experiments/prompt_templates/user.py new file mode 100644 index 0000000..a5f108a --- /dev/null +++ b/prompting_experiments/prompt_templates/user.py @@ -0,0 +1,31 @@ +"""User prompt templates.""" + +OBSERVATION_SECTION = "Observation:\n{obs_text}\n\n" + +STANDARD_USER_PROMPT = ( + "{obs_block}" + "{status_block}" + "What is your next action?" +) + +VERBOSE_USER_PROMPT = ( + "{obs_block}" + "{mechanism_block}" + "{status_block}" + "What is your next action?" +) + +STATUS_BLOCK = ( + "Position: {position} | Facing: {facing} | Goal: {goal}\n" + "Last result: {last_feedback}\n" +) + +MECHANISM_HINTS_HEADER = "Hints:\n" +KEY_DOOR_HINT = ( + " - Face an adjacent key and PICKUP (do not walk onto the key). " + "Face a locked door with the matching key and TOGGLE to open it, then MOVE_FORWARD through." +) +SWITCH_GATE_HINT = ( + " - MOVE_FORWARD onto a switch, then TOGGLE (hold switches activate on step). " + "Gates cannot be toggled — activate their linked switch(es)." +) diff --git a/prompting_experiments/prompts.txt b/prompting_experiments/prompts.txt new file mode 100644 index 0000000..c22bebf --- /dev/null +++ b/prompting_experiments/prompts.txt @@ -0,0 +1,337 @@ +Prompt Experiment Preview +Maze: /Users/helenlu/HRI/MultiNet-v2.0/mazes/validation_10/V01_empty_room.json +Max steps: 5 +Preview prompt state: after 3 random steps (seed: 0) + +======================================================================================== +condition set 1: Prompt +======================================================================================== +variant name: standard +description: Standard task prompt with mechanism descriptions. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +[user prompt] +[image block 1] +Observation: +Your inventory: empty. + +What is your next action? + +Output exactly: +FINAL_OUTPUT: +---------------------------------------------------------------------------------------- +variant name: verbose +description: Standard prompt plus explicit domain rules. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +RULES (domain logic): + - PICKUP: pick up a key from the adjacent cell you are facing. Keys block movement — you + cannot MOVE_FORWARD onto a key; stand beside it, face it, and PICKUP. + - Doors: face a locked door with the matching key in inventory and TOGGLE to open it, then + MOVE_FORWARD through the open door. MOVE_FORWARD alone does not open a locked door. + - Switches: MOVE_FORWARD onto the switch cell, then TOGGLE (toggle/one-shot types). Hold-type + switches activate automatically while you stand on them. Only switches are toggled. Linked + gates are open if at least one linked switch is on, and closed if all are off. + - Gates: you cannot TOGGLE a gate. CLOSED gates block movement; OPEN gates do not. + - Closed gates and doors you lack a key for block movement like walls until resolved. + - Use DONE only when you are standing on the goal cell. + +[user prompt] +[image block 1] +Observation: +Your inventory: empty. + +What is your next action? + +Output exactly: +FINAL_OUTPUT: +---------------------------------------------------------------------------------------- +======================================================================================== +condition set 2: Observation format +======================================================================================== +variant name: standard +description: Image block with no initial natural-language maze map-same as the standard prompt. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +[user prompt] +[image block 1] +Observation: +Your inventory: empty. + +What is your next action? + +Output exactly: +FINAL_OUTPUT: +---------------------------------------------------------------------------------------- +variant name: text_only +description: Natural-language current observation, no image blocks. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +[user prompt] +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +Observation: +Current situation (this step): +You are at (1, 1) facing NORTH. +Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +What is your next action? + +Output exactly: +FINAL_OUTPUT: +---------------------------------------------------------------------------------------- +variant name: image_text +description: Image block plus natural-language observation. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +[user prompt] +[image block 1] +Initial maze (fixed for this episode): +The world is a 8 by 8 grid. +Coordinates: JSON lists use ``[x, y]`` (east, south) from the **top-left** corner ``(1, 1)``; tuples in this text use ``(row, column)`` matching env state (row southward, column east). So ``x`` = column index, ``y`` = row index. +The start is at (1, 1). +The goal is at (6, 6). +The following cells are walls: none. + +Observation: +Current situation (this step): +You are at (1, 1) facing NORTH. +Your inventory: empty. + +Map contents as of this step (keys on the ground, doors, switches, gates): +(No keys on the ground, doors, switches, or gates in the current state description.) + +What is your next action? + +Output exactly: +FINAL_OUTPUT: +---------------------------------------------------------------------------------------- +======================================================================================== +condition set 3: Context window +======================================================================================== +variant name: standard +description: Prompt only with the current observation-same as the standard prompt. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +[user prompt] +[image block 1] +Observation: +Your inventory: empty. + +What is your next action? + +Output exactly: +FINAL_OUTPUT: +---------------------------------------------------------------------------------------- +variant name: last3 +description: Include up to the last three executed steps. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +[user prompt] +Recent steps (oldest first). Each image is the maze view from which the following action was chosen. + + +[image block 1] +Your inventory: empty. +Action: PICKUP + + +[image block 2] +Your inventory: empty. +Action: PICKUP + + +[image block 3] +Your inventory: empty. +Action: TURN_LEFT + + +[image block 4] +Observation: +Your inventory: empty. + +What is your next action? + +Output exactly: +FINAL_OUTPUT: +---------------------------------------------------------------------------------------- +variant name: text_summary +description: PR #12 design axis; no ExperimentConfig summary mode exists yet. +prompts: +Status: not implemented in ExperimentConfig +---------------------------------------------------------------------------------------- +======================================================================================== +condition set 4: Querying strategy +======================================================================================== +variant name: standard +description: Ask for one action each query-same as the standard prompt. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +[user prompt] +[image block 1] +Observation: +Your inventory: empty. + +What is your next action? + +Output exactly: +FINAL_OUTPUT: +---------------------------------------------------------------------------------------- +variant name: subgoal +description: Ask for a short subgoal and action chunk. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +[user prompt] +[image block 1] +Observation: +Your inventory: empty. + +What is your next action? + +Output exactly: +SUB_GOAL: +FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions to reach it) +---------------------------------------------------------------------------------------- +variant name: full_trajectory +description: Ask once for a complete action trajectory. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +[user prompt] +[image block 1] +Observation: +Your inventory: empty. + +What is the full sequence of actions you will take to complete the task? + +Output exactly: +FINAL_OUTPUT: , , ... (comma-separated; one or more valid actions) +---------------------------------------------------------------------------------------- +======================================================================================== +condition set 5: In-context learning +======================================================================================== +variant name: standard +description: No examples-same as the standard prompt. +prompts: +[system prompt] +Task: You are the triangular agent trying to navigate this maze. You are facing the pointy end. Move to the green goal cell in the grid. +The environment may contain: +- Keys: pick them up to open doors of the matching color +- Doors: blocked passages that require a matching key +- Switches: TOGGLE while standing on them +- Gates: blocked passages controlled by switches + +Valid actions: TURN_LEFT, TURN_RIGHT, MOVE_FORWARD, PICKUP, TOGGLE, DONE. + +[user prompt] +[image block 1] +Observation: +Your inventory: empty. + +What is your next action? + +Output exactly: +FINAL_OUTPUT: +---------------------------------------------------------------------------------------- +variant name: one_shot +description: PR #12 design axis; example selection/injection is not implemented yet. +prompts: +Status: not implemented in ExperimentConfig +---------------------------------------------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 530ffa4..8700c57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ multinet-probe-vlm = "scripts.probe_vlm:main" multinet-ollama-vision-check = "scripts.ollama_vision_check:main" multinet-ollama-maze-shape-check = "scripts.ollama_maze_shape_check:main" multinet-vlm-sanity = "scripts.vlm_sanity_check:main" +multinet-preview-prompts = "prompting_experiments.preview_prompts:main" [tool.setuptools] include-package-data = true @@ -60,6 +61,7 @@ include = [ "interface*", "mazes*", "multigrid*", + "prompting_experiments*", "scripts*", ] diff --git a/tests/test_prompt_observation_text.py b/tests/test_prompt_observation_text.py new file mode 100644 index 0000000..25c6550 --- /dev/null +++ b/tests/test_prompt_observation_text.py @@ -0,0 +1,356 @@ +from __future__ import annotations + +from dataclasses import replace + +import numpy as np + +from interface.config import ExperimentConfig +from interface.loader import default_maze_path, load_task +from interface.observation import current_observation_text, history_content_blocks +from interface.parser import ACTIONS_HINT +from interface.prompt_strategies import ( + StandardPromptStrategy, + VerbosePromptStrategy, + _mechanism_hints_text, +) +from interface.runner import build_runner +from prompting_experiments import CONDITION_SETS +from prompting_experiments.condition_set_2_observation_format import CONDITION_SET +from prompting_experiments.prompt_templates import feedback as feedback_templates + + +def _initial_spec_and_state(): + backend, spec = load_task(default_maze_path()) + _rgb, state, _info = backend.reset(seed=spec.seed) + return spec, state + + +def _initial_user_prompt_text(cfg: ExperimentConfig) -> str: + backend, spec = load_task(default_maze_path()) + runner = build_runner(cfg, backend, spec) + runner.last_rgb, state, _info = backend.reset(seed=spec.seed) + message = runner._build_message(state, feedback_templates.INITIAL_FEEDBACK, []) + content = message["content"] + if isinstance(content, list): + return "\n".join( + block["text"] + for block in content + if isinstance(block, dict) and block.get("type") == "text" + ) + return content + + +def _user_prompt_text_with_transcript( + cfg: ExperimentConfig, transcript: list[dict] +) -> str: + backend, spec = load_task(default_maze_path()) + runner = build_runner(cfg, backend, spec) + runner.last_rgb, state, _info = backend.reset(seed=spec.seed) + message = runner._build_message(state, feedback_templates.INITIAL_FEEDBACK, transcript) + content = message["content"] + if isinstance(content, list): + return "\n".join( + block["text"] + for block in content + if isinstance(block, dict) and block.get("type") == "text" + ) + return content + + +def _initial_user_prompt_text_for_maze(cfg: ExperimentConfig, maze_name: str) -> str: + backend, spec = load_task(default_maze_path(maze_name)) + runner = build_runner(cfg, backend, spec) + runner.last_rgb, state, _info = backend.reset(seed=spec.seed) + message = runner._build_message(state, feedback_templates.INITIAL_FEEDBACK, []) + content = message["content"] + if isinstance(content, list): + return "\n".join( + block["text"] + for block in content + if isinstance(block, dict) and block.get("type") == "text" + ) + return content + + +def test_current_observation_omits_description_by_default(): + spec, state = _initial_spec_and_state() + + text = current_observation_text("image_text", spec, state) + + assert text == "" + + +def test_current_observation_can_render_without_facing(): + spec, state = _initial_spec_and_state() + + text = current_observation_text( + "image_text", + spec, + state, + include_description=True, + ) + + assert "Current situation (this step):" in text + assert "The goal is at" not in text + assert "You are at (1, 1)." in text + assert "You are at (1, 1) facing EAST." not in text + + +def test_observation_format_text_variants_keep_facing(): + spec, state = _initial_spec_and_state() + base = ExperimentConfig(observation_text_includes_facing=False) + + for variant_name in ("text_only", "image_text"): + cfg = CONDITION_SET.variants[variant_name].build_config(base) + text = current_observation_text( + cfg.observation, + spec, + state, + include_description=cfg.include_current_observation_description, + include_facing=cfg.observation_text_includes_facing, + ) + + assert "Current situation (this step):" in text + assert "The goal is at" not in text + assert "You are at (1, 1) facing EAST." in text + + +def test_observation_format_image_only_includes_inventory_only_text(): + spec, state = _initial_spec_and_state() + cfg = CONDITION_SET.variants["standard"].build_config( + replace(ExperimentConfig(), observation_text_includes_facing=False) + ) + + text = current_observation_text( + cfg.observation, + spec, + state, + include_description=cfg.include_current_observation_description, + include_facing=cfg.observation_text_includes_facing, + ) + + assert text == "Your inventory: empty." + assert "Current situation (this step):" not in text + assert "You are at" not in text + + +def test_image_only_prompt_puts_inventory_text_after_current_image(): + backend, spec = load_task(default_maze_path()) + runner = build_runner(ExperimentConfig(observation="image_only"), backend, spec) + runner.last_rgb, state, _info = backend.reset(seed=spec.seed) + + message = runner._build_message(state, feedback_templates.INITIAL_FEEDBACK, []) + content = message["content"] + + assert isinstance(content, list) + assert content[0]["type"] == "image_url" + assert content[1]["type"] == "text" + assert "Your inventory: empty." in content[1]["text"] + + +def test_image_only_last3_history_puts_inventory_before_action_under_images(): + frame = np.zeros((2, 2, 3), dtype=np.uint8) + transcript = [ + { + "kind": "step", + "event_type": "VALID", + "action": "MOVE_FORWARD", + "state_before": {"inventory": []}, + "_decision_frame_rgb": frame, + }, + { + "kind": "step", + "event_type": "VALID", + "action": "PICKUP", + "state_before": {"inventory": ["red"]}, + "_decision_frame_rgb": frame, + }, + ] + + blocks = history_content_blocks("image_only", "last3", transcript) + + assert blocks[0]["type"] == "text" + assert blocks[1]["type"] == "image_url" + assert blocks[2] == { + "type": "text", + "text": "Your inventory: empty.\nAction: MOVE_FORWARD\n\n", + } + assert blocks[3]["type"] == "image_url" + assert blocks[4] == { + "type": "text", + "text": "Your inventory: red.\nAction: PICKUP\n\n", + } + + +def test_non_observation_format_conditions_omit_current_description_from_prompt(): + for condition_name, condition in CONDITION_SETS.items(): + if condition is CONDITION_SET: + continue + + for variant in condition.variants.values(): + if not variant.implemented: + continue + + backend, spec = load_task(default_maze_path()) + cfg = variant.build_config(ExperimentConfig()) + runner = build_runner(cfg, backend, spec) + runner.last_rgb, state, _info = backend.reset(seed=spec.seed) + + message = runner._build_message(state, feedback_templates.INITIAL_FEEDBACK, []) + content = message["content"] + if isinstance(content, list): + prompt_text = "\n".join( + block["text"] + for block in content + if isinstance(block, dict) and block.get("type") == "text" + ) + else: + prompt_text = content + + assert "Observation:\nCurrent situation (this step):" not in prompt_text, ( + condition_name, + variant.name, + ) + + +def test_non_observation_format_conditions_omit_initial_maze_from_prompt(): + for condition_name, condition in CONDITION_SETS.items(): + if condition is CONDITION_SET: + continue + + for variant in condition.variants.values(): + if not variant.implemented: + continue + + prompt_text = _initial_user_prompt_text(variant.build_config(ExperimentConfig())) + + assert "Initial maze (fixed for this episode):" not in prompt_text, ( + condition_name, + variant.name, + ) + + +def test_observation_format_initial_maze_only_for_text_variants(): + text_variants = {"text_only", "image_text"} + for variant_name, variant in CONDITION_SET.variants.items(): + cfg = variant.build_config(ExperimentConfig()) + prompt_text = _initial_user_prompt_text(cfg) + has_initial_maze = "Initial maze (fixed for this episode):" in prompt_text + + assert has_initial_maze is (variant_name in text_variants), variant_name + + +def test_initial_prompts_omit_current_status_footer_without_history_context(): + for variant in CONDITION_SET.variants.values(): + cfg = variant.build_config(ExperimentConfig()) + prompt_text = _initial_user_prompt_text(cfg) + + assert "Position: (1, 1) | Facing: EAST | Goal: (6, 6)" not in prompt_text + assert "Last result: Episode start." not in prompt_text + + +def test_last3_prompt_omits_current_status_footer_with_history_context(): + transcript = [ + { + "kind": "step", + "event_type": "VALID", + "position_after": (1, 2), + "facing_after": "EAST", + "action": "MOVE_FORWARD", + "prompt_feedback": "MOVED", + } + ] + cfg = ExperimentConfig(observation="text_only", context_window="last3") + + prompt_text = _user_prompt_text_with_transcript( + cfg, + transcript, + ) + + assert "Recent history (last 3 steps, oldest first):" in prompt_text + assert "Position: (1, 1) | Facing: EAST | Goal: (6, 6)" not in prompt_text + assert "Last result: Episode start." not in prompt_text + + +def test_observation_format_image_only_matches_standard_prompt_text(): + standard_text = _initial_user_prompt_text(ExperimentConfig()) + image_only_text = _initial_user_prompt_text( + CONDITION_SET.variants["standard"].build_config(ExperimentConfig()) + ) + + assert image_only_text == standard_text + + +def test_standard_variants_use_default_config_without_overrides(): + for condition in CONDITION_SETS.values(): + variant = condition.variants.get("standard") + if variant is None or not variant.implemented: + continue + + cfg = variant.build_config() + + assert cfg == ExperimentConfig() + assert variant.config_overrides is None + + +def test_implemented_non_verbose_conditions_share_standard_system_prompt(): + standard_prompt = StandardPromptStrategy(ACTIONS_HINT).build_system_prompt() + verbose_prompt = None + for condition_name, condition in CONDITION_SETS.items(): + for variant in condition.variants.values(): + if not variant.implemented: + continue + + backend, spec = load_task(default_maze_path()) + cfg = variant.build_config(ExperimentConfig()) + runner = build_runner(cfg, backend, spec) + system_prompt = runner.prompt.build_system_prompt() + + if variant.name == "verbose": + verbose_prompt = system_prompt + else: + assert system_prompt == standard_prompt, (condition_name, variant.name) + + assert verbose_prompt is not None + assert verbose_prompt != standard_prompt + + +def test_verbose_prompt_omits_mechanism_hints_by_default(): + prompt_text = _initial_user_prompt_text_for_maze( + ExperimentConfig(prompting="verbose"), + "V04_single_key.json", + ) + + assert "Hints:" not in prompt_text + assert "Face an adjacent key and PICKUP" not in prompt_text + assert "Inventory:" not in prompt_text + assert "From your perspective:" not in prompt_text + + +def test_mechanism_hint_insertion_helper_still_generates_hints(): + _backend, spec = load_task(default_maze_path("V04_single_key.json")) + hints = _mechanism_hints_text(spec) + + assert "Hints:" in hints + assert "Face an adjacent key and PICKUP" in hints + + +def test_verbose_prompt_can_insert_mechanism_hints_when_enabled(): + class HintingVerbosePromptStrategy(VerbosePromptStrategy): + include_mechanism_hints = True + + backend, spec = load_task(default_maze_path("V04_single_key.json")) + runner = build_runner(ExperimentConfig(prompting="verbose"), backend, spec) + runner.prompt = HintingVerbosePromptStrategy(ACTIONS_HINT) + runner.last_rgb, state, _info = backend.reset(seed=spec.seed) + + message = runner._build_message(state, feedback_templates.INITIAL_FEEDBACK, []) + content = message["content"] + prompt_text = "\n".join( + block["text"] + for block in content + if isinstance(block, dict) and block.get("type") == "text" + ) + + assert "Hints:" in prompt_text + assert "Face an adjacent key and PICKUP" in prompt_text