From 357c51d108c80490751d0c3d1394430b8debc249 Mon Sep 17 00:00:00 2001 From: kavan Date: Fri, 18 Jul 2025 18:13:47 -0700 Subject: [PATCH 1/7] update --- .../src/actions/desktopActions.ts | 164 ++++++++++++++++++ .../src/connectors/desktopConnector.ts | 143 +++++++++++++++ packages/magnitude-core/src/index.ts | 1 + 3 files changed, 308 insertions(+) create mode 100644 packages/magnitude-core/src/actions/desktopActions.ts create mode 100644 packages/magnitude-core/src/connectors/desktopConnector.ts diff --git a/packages/magnitude-core/src/actions/desktopActions.ts b/packages/magnitude-core/src/actions/desktopActions.ts new file mode 100644 index 00000000..740a152f --- /dev/null +++ b/packages/magnitude-core/src/actions/desktopActions.ts @@ -0,0 +1,164 @@ +import { createAction } from "."; +import { z } from "zod"; +import { DesktopConnector } from "@/connectors/desktopConnector"; + +// Mouse actions +export const desktopClickAction = createAction({ + name: 'desktop:click', + description: "Click at screen coordinates", + schema: z.object({ + x: z.number().int().describe("X coordinate"), + y: z.number().int().describe("Y coordinate"), + }), + resolver: async ({ input: { x, y }, agent }) => { + const desktop = agent.require(DesktopConnector); + await desktop.getInterface().click(x, y); + }, + render: ({ x, y }) => `⊙ click (${x}, ${y})` +}); + +export const desktopRightClickAction = createAction({ + name: 'desktop:right_click', + description: "Right-click at screen coordinates", + schema: z.object({ + x: z.number().int(), + y: z.number().int(), + }), + resolver: async ({ input: { x, y }, agent }) => { + const desktop = agent.require(DesktopConnector); + await desktop.getInterface().rightClick(x, y); + }, + render: ({ x, y }) => `⊙ right-click (${x}, ${y})` +}); + +export const desktopDoubleClickAction = createAction({ + name: 'desktop:double_click', + description: "Double-click at screen coordinates", + schema: z.object({ + x: z.number().int(), + y: z.number().int(), + 
}), + resolver: async ({ input: { x, y }, agent }) => { + const desktop = agent.require(DesktopConnector); + await desktop.getInterface().doubleClick(x, y); + }, + render: ({ x, y }) => `⊙ double-click (${x}, ${y})` +}); + +export const desktopDragAction = createAction({ + name: 'desktop:drag', + description: "Drag from one position to another", + schema: z.object({ + fromX: z.number().int(), + fromY: z.number().int(), + toX: z.number().int(), + toY: z.number().int(), + }), + resolver: async ({ input: { fromX, fromY, toX, toY }, agent }) => { + const desktop = agent.require(DesktopConnector); + await desktop.getInterface().drag(fromX, fromY, toX, toY); + }, + render: ({ fromX, fromY, toX, toY }) => `⊙ drag from (${fromX}, ${fromY}) to (${toX}, ${toY})` +}); + +export const desktopScrollAction = createAction({ + name: 'desktop:scroll', + description: "Scroll at position", + schema: z.object({ + x: z.number().int(), + y: z.number().int(), + direction: z.enum(['up', 'down']).optional(), + amount: z.number().int().optional(), + }), + resolver: async ({ input: { x, y, direction, amount }, agent }) => { + const desktop = agent.require(DesktopConnector); + await desktop.getInterface().scroll(x, y, direction, amount); + }, + render: ({ x, y, direction, amount }) => `⊙ scroll at (${x}, ${y})${direction ? ` ${direction}` : ''}${amount ? 
` ${amount}px` : ''}` +}); + +// Keyboard actions +export const desktopTypeAction = createAction({ + name: 'desktop:type', + description: "Type text at current cursor position", + schema: z.object({ + text: z.string().describe("Text to type"), + }), + resolver: async ({ input: { text }, agent }) => { + const desktop = agent.require(DesktopConnector); + await desktop.getInterface().type(text); + }, + render: ({ text }) => `⌨ type "${text}"` +}); + +export const desktopKeyAction = createAction({ + name: 'desktop:key', + description: "Press a single key", + schema: z.object({ + key: z.string().describe("Key to press (e.g., 'Return', 'Tab', 'Escape')"), + }), + resolver: async ({ input: { key }, agent }) => { + const desktop = agent.require(DesktopConnector); + await desktop.getInterface().key(key); + }, + render: ({ key }) => `⌨ key '${key}'` +}); + +export const desktopHotkeyAction = createAction({ + name: 'desktop:hotkey', + description: "Press a key combination", + schema: z.object({ + keys: z.array(z.string()).describe("Keys to press together (e.g., ['cmd', 'c'])"), + }), + resolver: async ({ input: { keys }, agent }) => { + const desktop = agent.require(DesktopConnector); + await desktop.getInterface().hotkey(keys); + }, + render: ({ keys }) => `⌨ hotkey ${keys.join('+')}` +}); + +// Navigation +export const desktopNavigateAction = createAction({ + name: 'desktop:navigate', + description: "Navigate to a URL in the browser", + schema: z.object({ + url: z.string().describe("URL to navigate to"), + }), + resolver: async ({ input: { url }, agent }) => { + const desktop = agent.require(DesktopConnector); + const interface_ = desktop.getInterface(); + if (!interface_.navigate) { + throw new Error("Desktop interface does not support navigation"); + } + await interface_.navigate(url); + }, + render: ({ url }) => `⛓ navigate to ${url}` +}); + +// Utility +export const desktopWaitAction = createAction({ + name: 'desktop:wait', + description: "Wait for specified 
milliseconds", + schema: z.object({ + ms: z.number().int().min(0).describe("Milliseconds to wait"), + }), + resolver: async ({ input: { ms }, agent }) => { + const desktop = agent.require(DesktopConnector); + await desktop.getInterface().wait(ms); + }, + render: ({ ms }) => `⏱ wait ${ms}ms` +}); + +// Export all desktop actions +export const desktopActions = [ + desktopClickAction, + desktopRightClickAction, + desktopDoubleClickAction, + desktopDragAction, + desktopScrollAction, + desktopTypeAction, + desktopKeyAction, + desktopHotkeyAction, + desktopNavigateAction, + desktopWaitAction, +]; \ No newline at end of file diff --git a/packages/magnitude-core/src/connectors/desktopConnector.ts b/packages/magnitude-core/src/connectors/desktopConnector.ts new file mode 100644 index 00000000..e70e5ffd --- /dev/null +++ b/packages/magnitude-core/src/connectors/desktopConnector.ts @@ -0,0 +1,143 @@ +import { AgentConnector } from "."; +import { ActionDefinition } from '@/actions'; +import { desktopActions } from '@/actions/desktopActions'; +import { Observation } from "@/memory/observation"; +import { Image } from "@/memory/image"; +import logger from "@/logger"; +import { Logger } from 'pino'; +import sharp from 'sharp'; + +/** + * Generic desktop automation interface. + * Implementations can use any desktop automation technology + * (Lume, PyAutoGUI, Windows UI Automation, etc.) 
+ */ +export interface DesktopInterface { + // Mouse operations + click(x: number, y: number): Promise; + rightClick(x: number, y: number): Promise; + doubleClick(x: number, y: number): Promise; + moveCursor(x: number, y: number): Promise; + drag(fromX: number, fromY: number, toX: number, toY: number): Promise; + scroll(x: number, y: number, direction?: 'up' | 'down', amount?: number): Promise; + + // Keyboard operations + type(text: string): Promise; + key(key: string): Promise; + hotkey(keys: string[]): Promise; + + // Screen operations + screenshot(): Promise; + getScreenSize(): Promise<{ width: number; height: number }>; + + // Navigation (browser-specific but common in desktop automation) + navigate?(url: string): Promise; + + // System operations + wait(ms: number): Promise; + + // Optional: Window management + getActiveWindow?(): Promise<{ title: string; app: string }>; + getOpenWindows?(): Promise>; + focusWindow?(title: string): Promise; + + // Optional: Application control + openApplication?(name: string): Promise; + closeApplication?(name: string): Promise; +} + +export interface DesktopConnectorOptions { + desktopInterface: DesktopInterface; + virtualScreenDimensions?: { width: number; height: number }; + minScreenshots?: number; +} + +export class DesktopConnector implements AgentConnector { + public readonly id: string = "desktop"; + private desktopInterface: DesktopInterface; + private options: DesktopConnectorOptions; + private logger: Logger; + + constructor(options: DesktopConnectorOptions) { + this.options = options; + this.desktopInterface = options.desktopInterface; + this.logger = logger.child({ + name: `connectors.${this.id}` + }); + } + + async onStart(): Promise { + // Desktop interface should already be initialized by the service + this.logger.info("Desktop connector started"); + } + + async onStop(): Promise { + // Cleanup handled by the interface provider + this.logger.info("Desktop connector stopped"); + } + + getActionSpace(): 
ActionDefinition[] { + return desktopActions; + } + + async collectObservations(): Promise { + const observations: Observation[] = []; + + // Always collect screenshot + const screenshot = await this.desktopInterface.screenshot(); + const sharpImage = sharp(screenshot); + const image = new Image(sharpImage); + + // Apply virtual screen dimensions if configured + const transformedImage = this.options.virtualScreenDimensions + ? await image.resize( + this.options.virtualScreenDimensions.width, + this.options.virtualScreenDimensions.height + ) + : image; + + observations.push( + Observation.fromConnector( + this.id, + transformedImage, + { + type: 'screenshot', + limit: this.options.minScreenshots ?? 2, + dedupe: true + } + ) + ); + + // Optional: Window information + if (this.desktopInterface.getOpenWindows) { + try { + const windows = await this.desktopInterface.getOpenWindows(); + const windowInfo = this.formatWindowInfo(windows); + observations.push( + Observation.fromConnector( + this.id, + windowInfo, + { type: 'window-info', limit: 1 } + ) + ); + } catch (error) { + this.logger.warn('Failed to get window information', error); + } + } + + return observations; + } + + private formatWindowInfo(windows: Array<{ title: string; app: string; isActive: boolean }>): string { + let info = "Open Windows:\n"; + windows.forEach(window => { + info += `${window.isActive ? 
'[ACTIVE] ' : ''}${window.title} (${window.app})\n`; + }); + return info; + } + + // Expose interface for actions to use + getInterface(): DesktopInterface { + return this.desktopInterface; + } +} \ No newline at end of file diff --git a/packages/magnitude-core/src/index.ts b/packages/magnitude-core/src/index.ts index 3aa4278d..2d934d5c 100644 --- a/packages/magnitude-core/src/index.ts +++ b/packages/magnitude-core/src/index.ts @@ -12,6 +12,7 @@ export * from "@/actions"; export * from "@/connectors"; export * from "@/web/browserProvider"; export * from "@/connectors/browserConnector"; +export * from "@/connectors/desktopConnector"; export * from "@/agent/errors"; export * from "@/types"; export * from "@/ai/types"; From 072d42f3bca833f126a8efd1ef98611a31519da1 Mon Sep 17 00:00:00 2001 From: kavan Date: Sat, 19 Jul 2025 17:15:07 -0700 Subject: [PATCH 2/7] PR improvements --- .../src/actions/desktopActions.ts | 29 +++++++++---------- .../src/connectors/desktopConnector.ts | 7 ++--- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/packages/magnitude-core/src/actions/desktopActions.ts b/packages/magnitude-core/src/actions/desktopActions.ts index 740a152f..b3136513 100644 --- a/packages/magnitude-core/src/actions/desktopActions.ts +++ b/packages/magnitude-core/src/actions/desktopActions.ts @@ -63,18 +63,18 @@ export const desktopDragAction = createAction({ export const desktopScrollAction = createAction({ name: 'desktop:scroll', - description: "Scroll at position", + description: "Hover mouse over target and scroll", schema: z.object({ x: z.number().int(), y: z.number().int(), - direction: z.enum(['up', 'down']).optional(), - amount: z.number().int().optional(), + deltaX: z.number().int().describe("Pixels to scroll horizontally"), + deltaY: z.number().int().describe("Pixels to scroll vertically"), }), - resolver: async ({ input: { x, y, direction, amount }, agent }) => { + resolver: async ({ input: { x, y, deltaX, deltaY }, agent }) => { const desktop = 
agent.require(DesktopConnector); - await desktop.getInterface().scroll(x, y, direction, amount); + await desktop.getInterface().scroll(x, y, deltaX, deltaY); }, - render: ({ x, y, direction, amount }) => `⊙ scroll at (${x}, ${y})${direction ? ` ${direction}` : ''}${amount ? ` ${amount}px` : ''}` + render: ({ x, y, deltaX, deltaY }) => `↕ scroll (${deltaX}px, ${deltaY}px)` }); // Keyboard actions @@ -120,9 +120,9 @@ export const desktopHotkeyAction = createAction({ // Navigation export const desktopNavigateAction = createAction({ name: 'desktop:navigate', - description: "Navigate to a URL in the browser", + description: "Open a new browser window/tab with the specified URL. For navigating within an existing browser window, interact with the address bar instead.", schema: z.object({ - url: z.string().describe("URL to navigate to"), + url: z.string().describe("URL to open in a new browser window/tab"), }), resolver: async ({ input: { url }, agent }) => { const desktop = agent.require(DesktopConnector); @@ -132,21 +132,20 @@ export const desktopNavigateAction = createAction({ } await interface_.navigate(url); }, - render: ({ url }) => `⛓ navigate to ${url}` + render: ({ url }) => `⛓ open browser with ${url}` }); // Utility export const desktopWaitAction = createAction({ name: 'desktop:wait', - description: "Wait for specified milliseconds", + description: "Actions include smart waiting automatically - so only use this when a significant additional wait is clearly required.", schema: z.object({ - ms: z.number().int().min(0).describe("Milliseconds to wait"), + seconds: z.number().describe("Seconds to wait"), }), - resolver: async ({ input: { ms }, agent }) => { - const desktop = agent.require(DesktopConnector); - await desktop.getInterface().wait(ms); + resolver: async ({ input: { seconds }, agent }) => { + await new Promise((resolve) => setTimeout(resolve, seconds * 1000)); }, - render: ({ ms }) => `⏱ wait ${ms}ms` + render: ({ seconds }) => `◴ wait ${seconds}s` }); // 
Export all desktop actions diff --git a/packages/magnitude-core/src/connectors/desktopConnector.ts b/packages/magnitude-core/src/connectors/desktopConnector.ts index e70e5ffd..cbd08dcd 100644 --- a/packages/magnitude-core/src/connectors/desktopConnector.ts +++ b/packages/magnitude-core/src/connectors/desktopConnector.ts @@ -19,7 +19,7 @@ export interface DesktopInterface { doubleClick(x: number, y: number): Promise; moveCursor(x: number, y: number): Promise; drag(fromX: number, fromY: number, toX: number, toY: number): Promise; - scroll(x: number, y: number, direction?: 'up' | 'down', amount?: number): Promise; + scroll(x: number, y: number, deltaX: number, deltaY: number): Promise; // Keyboard operations type(text: string): Promise; @@ -30,12 +30,9 @@ export interface DesktopInterface { screenshot(): Promise; getScreenSize(): Promise<{ width: number; height: number }>; - // Navigation (browser-specific but common in desktop automation) + // Browser launch (opens new browser window/tab, not navigation within existing windows) navigate?(url: string): Promise; - // System operations - wait(ms: number): Promise; - // Optional: Window management getActiveWindow?(): Promise<{ title: string; app: string }>; getOpenWindows?(): Promise>; From 7eae3677827abf6f4974aa85e43f701d79cee492 Mon Sep 17 00:00:00 2001 From: kavan Date: Sat, 19 Jul 2025 23:03:41 -0700 Subject: [PATCH 3/7] combined key and hotkey --- .../src/actions/desktopActions.ts | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/packages/magnitude-core/src/actions/desktopActions.ts b/packages/magnitude-core/src/actions/desktopActions.ts index b3136513..00400154 100644 --- a/packages/magnitude-core/src/actions/desktopActions.ts +++ b/packages/magnitude-core/src/actions/desktopActions.ts @@ -93,28 +93,29 @@ export const desktopTypeAction = createAction({ export const desktopKeyAction = createAction({ name: 'desktop:key', - description: "Press a single key", + description: "Press 
a key or key combination. Use '+' to combine keys (e.g., 'cmd+c', 'ctrl+shift+t')", schema: z.object({ - key: z.string().describe("Key to press (e.g., 'Return', 'Tab', 'Escape')"), + key: z.string().describe("Key to press (e.g., 'Return', 'Tab', 'Escape') or combination (e.g., 'cmd+c', 'ctrl+a', 'ctrl+shift+t')"), }), resolver: async ({ input: { key }, agent }) => { const desktop = agent.require(DesktopConnector); - await desktop.getInterface().key(key); - }, - render: ({ key }) => `⌨ key '${key}'` -}); - -export const desktopHotkeyAction = createAction({ - name: 'desktop:hotkey', - description: "Press a key combination", - schema: z.object({ - keys: z.array(z.string()).describe("Keys to press together (e.g., ['cmd', 'c'])"), - }), - resolver: async ({ input: { keys }, agent }) => { - const desktop = agent.require(DesktopConnector); - await desktop.getInterface().hotkey(keys); + // Check if it's a key combination (contains +) + if (key.includes('+')) { + // Parse the combination and use hotkey + const keys = key.split('+').map(k => k.trim()); + await desktop.getInterface().hotkey(keys); + } else { + // Single key press + await desktop.getInterface().key(key); + } }, - render: ({ keys }) => `⌨ hotkey ${keys.join('+')}` + render: ({ key }) => { + // Show differently based on whether it's a combination + if (key.includes('+')) { + return `⌨ hotkey ${key}`; + } + return `⌨ key '${key}'`; + } }); // Navigation @@ -157,7 +158,6 @@ export const desktopActions = [ desktopScrollAction, desktopTypeAction, desktopKeyAction, - desktopHotkeyAction, desktopNavigateAction, desktopWaitAction, ]; \ No newline at end of file From 730cbe08d89eb9c45eafa2585db391b67c0a38e8 Mon Sep 17 00:00:00 2001 From: lizziescoder Date: Sun, 20 Jul 2025 15:23:53 -0700 Subject: [PATCH 4/7] Update desktopActions.ts --- packages/magnitude-core/src/actions/desktopActions.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/magnitude-core/src/actions/desktopActions.ts 
b/packages/magnitude-core/src/actions/desktopActions.ts index 00400154..0e8752fc 100644 --- a/packages/magnitude-core/src/actions/desktopActions.ts +++ b/packages/magnitude-core/src/actions/desktopActions.ts @@ -95,7 +95,7 @@ export const desktopKeyAction = createAction({ name: 'desktop:key', description: "Press a key or key combination. Use '+' to combine keys (e.g., 'cmd+c', 'ctrl+shift+t')", schema: z.object({ - key: z.string().describe("Key to press (e.g., 'Return', 'Tab', 'Escape') or combination (e.g., 'cmd+c', 'ctrl+a', 'ctrl+shift+t')"), + key: z.string().describe("Key to press (e.g., 'return', 'tab', 'escape') or combination (e.g., 'cmd+c', 'ctrl+a', 'ctrl+shift+t')"), }), resolver: async ({ input: { key }, agent }) => { const desktop = agent.require(DesktopConnector); @@ -160,4 +160,4 @@ export const desktopActions = [ desktopKeyAction, desktopNavigateAction, desktopWaitAction, -]; \ No newline at end of file +]; From a75bcb57480ecc1c34ddd57f6f08aa472d1a7631 Mon Sep 17 00:00:00 2001 From: lizziescoder Date: Sun, 20 Jul 2025 15:24:36 -0700 Subject: [PATCH 5/7] Update desktopActions.ts --- packages/magnitude-core/src/actions/desktopActions.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/magnitude-core/src/actions/desktopActions.ts b/packages/magnitude-core/src/actions/desktopActions.ts index 0e8752fc..562a19b9 100644 --- a/packages/magnitude-core/src/actions/desktopActions.ts +++ b/packages/magnitude-core/src/actions/desktopActions.ts @@ -95,7 +95,7 @@ export const desktopKeyAction = createAction({ name: 'desktop:key', description: "Press a key or key combination. 
Use '+' to combine keys (e.g., 'cmd+c', 'ctrl+shift+t')", schema: z.object({ - key: z.string().describe("Key to press (e.g., 'return', 'tab', 'escape') or combination (e.g., 'cmd+c', 'ctrl+a', 'ctrl+shift+t')"), + key: z.string().describe("Key to press (e.g., 'enter', 'tab', 'esc') or combination (e.g., 'cmd+c', 'ctrl+a', 'ctrl+shift+t')"), }), resolver: async ({ input: { key }, agent }) => { const desktop = agent.require(DesktopConnector); From 7f27ecbed062781ea846875ed1b3a622ee45d1ca Mon Sep 17 00:00:00 2001 From: kavan Date: Sun, 20 Jul 2025 16:57:22 -0700 Subject: [PATCH 6/7] added action limit and persistent memory option --- docs/core-concepts/browser-interaction.mdx | 48 ++++++++++++++ docs/reference/browser-agent.mdx | 24 +++++++ docs/reference/test-declaration.mdx | 64 +++++++++++++++---- packages/magnitude-core/src/agent/index.ts | 45 +++++++++++-- packages/magnitude-core/src/agent/narrator.ts | 6 +- packages/magnitude-core/src/common/events.ts | 1 + 6 files changed, 168 insertions(+), 20 deletions(-) diff --git a/docs/core-concepts/browser-interaction.mdx b/docs/core-concepts/browser-interaction.mdx index 25c0543a..301953f2 100644 --- a/docs/core-concepts/browser-interaction.mdx +++ b/docs/core-concepts/browser-interaction.mdx @@ -59,6 +59,54 @@ await agent.act('create a new task', { }); ``` +### Controlling Execution Steps + +By default, each `act()` call is limited to 100 steps to prevent infinite loops. You can adjust this limit based on task complexity: + +```typescript +// Simple task with default limit +await agent.act('click the submit button'); + +// Complex task that needs more steps +await agent.act('fill out the entire application form', { + maxSteps: 200 +}); + +// Very simple task with reduced limit +await agent.act('close the modal', { + maxSteps: 3 +}); +``` + +The agent will emit a warning event if it reaches the maximum steps without completing the task, which can help identify tasks that need adjustment or debugging. 
+ +### Memory Persistence + +By default, each `act()` call starts with a fresh memory context. For tasks that build upon previous actions, you can enable memory persistence: + +```typescript +// First action creates some state +await agent.act('open the settings panel', { + reuseMemory: true // Start persistent memory +}); + +// Subsequent actions remember previous context +await agent.act('navigate to the security tab', { + reuseMemory: true // Continues with memory from previous act() +}); + +await agent.act('enable two-factor authentication', { + reuseMemory: true // Still has context from all previous actions +}); +``` + +This is particularly useful for: +- Multi-step workflows where context matters +- Complex interactions that reference previous actions +- Test scenarios that need to maintain state across steps + +Note: Memory is only persisted within the same agent instance. Creating a new agent starts fresh. + ## Navigating Directly While the agent is capable of navigating to URLs on its own, you may sometimes want to navigate to a specific URL directly. diff --git a/docs/reference/browser-agent.mdx b/docs/reference/browser-agent.mdx index a9e7c6ec..30db183d 100644 --- a/docs/reference/browser-agent.mdx +++ b/docs/reference/browser-agent.mdx @@ -112,6 +112,30 @@ await agent.act("Enter {username} into the user field", { - **`string`**: Provide additional instructions for the LLM. These are injected into the system prompt. + + Maximum number of steps the agent can take for this specific task. Defaults to 100. This prevents infinite loops and provides predictable resource usage. + + ```typescript + // Allow more steps for complex tasks + await agent.act("Complete the entire checkout process", { + maxSteps: 200 + }); + ``` + + + When true, reuses memory from previous act() calls within the same agent instance. This allows the agent to maintain context across multiple tasks. Defaults to false. 
+ + ```typescript + // Enable memory persistence for related tasks + await agent.act("Log into the application", { + reuseMemory: true + }); + + await agent.act("Navigate to the dashboard", { + reuseMemory: true // Remembers the login context + }); + ``` + ### `nav(url: string)` diff --git a/docs/reference/test-declaration.mdx b/docs/reference/test-declaration.mdx index 5a43326f..8f12ba18 100644 --- a/docs/reference/test-declaration.mdx +++ b/docs/reference/test-declaration.mdx @@ -44,27 +44,48 @@ Defines a new test case. -## `test.group(id, options?, groupFn)` - -Defines a group of test cases, allowing shared options (like `url`) to be applied to all tests within the group. - -```typescript Group Example +```typescript Example with maxSteps import { test } from 'magnitude-test'; -test.group('User Authentication Flow', { url: '/login' }, () => { - test('should display login form', async (agent) => { - await agent.check("Login form is visible"); - }); +test('should handle complex checkout flow', async (agent) => { + await agent.act('add items to cart and proceed to checkout', { + maxSteps: 200 // Allow more steps for multi-stage process + }); + await agent.check('order confirmation is displayed'); +}); - test('should allow login with valid credentials', async (agent) => { - await agent.act("Log in with valid credentials"); - await agent.check("User is redirected to dashboard"); - }); +```typescript Example with memory persistence +import { test } from 'magnitude-test'; + +test('should complete multi-step user onboarding', async (agent) => { + // Step 1: Initial setup + await agent.act('fill out basic profile information', { + maxSteps: 50, + reuseMemory: true // Start persistent memory + }); + + // Step 2: Uses context from step 1 + await agent.act('select preferences based on my profile', { + maxSteps: 30, + reuseMemory: true // Continues with previous context + }); + + // Step 3: Final verification with all context + await agent.act('review and confirm all my 
selections', { + maxSteps: 20, + reuseMemory: true // Has full context from steps 1 & 2 + }); + + await agent.check('onboarding completed successfully'); }); ``` +## `test.group(id, options?, groupFn)` + +Defines a group of test cases, allowing shared options (like `url`) to be applied to all tests within the group. + A descriptive identifier for the test group. @@ -82,3 +103,20 @@ test.group('User Authentication Flow', { url: '/login' }, () => { A synchronous function that contains the `test()` declarations belonging to this group. + + +```typescript Group Example +import { test } from 'magnitude-test'; + +test.group('User Authentication Flow', { url: '/login' }, () => { + test('should display login form', async (agent) => { + await agent.check("Login form is visible"); + }); + + test('should allow login with valid credentials', async (agent) => { + await agent.act("Log in with valid credentials"); + await agent.check("User is redirected to dashboard"); + }); +}); +``` + diff --git a/packages/magnitude-core/src/agent/index.ts b/packages/magnitude-core/src/agent/index.ts index 52234394..875fe99e 100644 --- a/packages/magnitude-core/src/agent/index.ts +++ b/packages/magnitude-core/src/agent/index.ts @@ -19,6 +19,7 @@ import { retryOnError } from '@/common'; import { renderContentParts } from '@/memory/rendering'; import { MultiModelHarness } from '@/ai/multiModelHarness'; +const DEFAULT_MAX_STEPS = 100; export interface AgentOptions { llm?: LLMClient | LLMClient[]; @@ -33,6 +34,8 @@ export interface ActOptions { prompt?: string // additional task-level system prompt instructions // TODO: reimpl, or maybe for tc agent specifically data?: RenderableContent//string | Record + maxSteps?: number; // Maximum number of steps for this act() call (default: 100) + reuseMemory?: boolean; // Reuse memory from previous act() calls within the same agent instance } // Options for the startAgent helper function @@ -71,7 +74,8 @@ export class Agent { //public readonly memory: 
AgentMemory; private doneActing: boolean; - protected latestTaskMemory: AgentMemory;// | null = null; + protected latestTaskMemory: AgentMemory; + private persistentMemory?: AgentMemory; // Memory that persists across act() calls when reuseMemory is true constructor(baseConfig: Partial = {}) { this.options = { @@ -241,7 +245,26 @@ export class Agent { ...(this.options.prompt ? [this.options.prompt] : []), ...(options.prompt ? [options.prompt] : []), ].join('\n'); - const taskMemory = new AgentMemory({ ...this.memoryOptions, instructions: instructions === '' ? undefined : instructions }); + + let taskMemory: AgentMemory; + + if (options.reuseMemory && this.persistentMemory) { + // Reuse existing memory for this execution + taskMemory = this.persistentMemory; + logger.debug('Reusing persistent memory from previous act() calls'); + } else { + // Create new memory + taskMemory = new AgentMemory({ + ...this.memoryOptions, + instructions: instructions === '' ? undefined : instructions + }); + + // Save for future reuse if requested + if (options.reuseMemory) { + this.persistentMemory = taskMemory; + logger.debug('Created new persistent memory for future reuse'); + } + } if (Array.isArray(taskOrSteps)) { const steps = taskOrSteps; @@ -303,7 +326,10 @@ export class Agent { async _act(description: string, memory: AgentMemory, options: ActOptions = {}): Promise { this.doneActing = false; - logger.info(`Act: ${description}`); + const maxSteps = options.maxSteps ?? 
DEFAULT_MAX_STEPS; // Default to 100 steps + let currentStep = 0; + + logger.info(`Act: ${description} (max steps: ${maxSteps})`); // for now simply add data to task let dataContentParts: MultiMediaContentPart[] = []; @@ -333,9 +359,9 @@ export class Agent { await this._recordConnectorObservations(memory); logger.info("Initial observations recorded"); - while (true) { + while (!this.doneActing && currentStep < maxSteps) { // Removed direct screenshot/tabState access here; it's part of memoryContext via connectors - logger.info(`Creating partial recipe`); + logger.info(`Creating partial recipe (step ${currentStep + 1}/${maxSteps})`); let reasoning: string = ""; let actions: Action[] = []; @@ -405,9 +431,16 @@ export class Agent { if (this.doneActing) { break; } + + currentStep++; + } + + if (currentStep >= maxSteps && !this.doneActing) { + logger.warn(`Reached maximum steps limit (${maxSteps}) without completing task: ${description}`); + this.events.emit('maxStepsReached', description, maxSteps); } - logger.info(`Done with step`); + logger.info(`Done with step after ${currentStep} steps`); //this.events.emit('stepSuccess'); //this.currentTaskMemory = null; } diff --git a/packages/magnitude-core/src/agent/narrator.ts b/packages/magnitude-core/src/agent/narrator.ts index c2f70ef5..3619e9f2 100644 --- a/packages/magnitude-core/src/agent/narrator.ts +++ b/packages/magnitude-core/src/agent/narrator.ts @@ -1,6 +1,6 @@ import { Action } from '@/actions/types'; import { ActOptions, Agent } from '@/agent'; -import { blueBright, bold, cyanBright, gray } from 'ansis'; +import { blueBright, bold, cyanBright, gray, yellow } from 'ansis'; import { BrowserAgent } from './browserAgent'; import { z } from 'zod'; @@ -57,6 +57,10 @@ export function narrateAgent(agent: Agent) { const actionDefinition = agent.identifyAction(action); console.log(bold` ${actionDefinition.render(action)}`); }); + + agent.events.on('maxStepsReached', (task: string, maxSteps: number) => { + 
console.log(yellow(bold(`⚠ [warning] Reached maximum steps limit (${maxSteps}) for task: "${task}"`))); + }); } export function narrateBrowserAgent(agent: BrowserAgent) { diff --git a/packages/magnitude-core/src/common/events.ts b/packages/magnitude-core/src/common/events.ts index b3854348..2d6a33ac 100644 --- a/packages/magnitude-core/src/common/events.ts +++ b/packages/magnitude-core/src/common/events.ts @@ -20,4 +20,5 @@ export interface AgentEvents { 'actionDone': (action: Action) => void; 'tokensUsed': (usage: ModelUsage) => void; + 'maxStepsReached': (task: string, maxSteps: number) => void; } \ No newline at end of file From ad0e67b18a34366e3fc4888781c46f5fa1ff64c6 Mon Sep 17 00:00:00 2001 From: kavan Date: Sun, 20 Jul 2025 16:57:22 -0700 Subject: [PATCH 7/7] added action limit and persistent memory option --- docs/core-concepts/browser-interaction.mdx | 48 ++++++++++++++ docs/reference/browser-agent.mdx | 24 +++++++ docs/reference/test-declaration.mdx | 64 +++++++++++++++---- packages/magnitude-core/src/agent/index.ts | 63 +++++++++++++++--- packages/magnitude-core/src/agent/narrator.ts | 6 +- packages/magnitude-core/src/common/events.ts | 1 + 6 files changed, 183 insertions(+), 23 deletions(-) diff --git a/docs/core-concepts/browser-interaction.mdx b/docs/core-concepts/browser-interaction.mdx index 25c0543a..301953f2 100644 --- a/docs/core-concepts/browser-interaction.mdx +++ b/docs/core-concepts/browser-interaction.mdx @@ -59,6 +59,54 @@ await agent.act('create a new task', { }); ``` +### Controlling Execution Steps + +By default, each `act()` call is limited to 100 steps to prevent infinite loops. 
You can adjust this limit based on task complexity: + +```typescript +// Simple task with default limit +await agent.act('click the submit button'); + +// Complex task that needs more steps +await agent.act('fill out the entire application form', { + maxSteps: 200 +}); + +// Very simple task with reduced limit +await agent.act('close the modal', { + maxSteps: 3 +}); +``` + +If the agent reaches the maximum steps without completing the task, it logs a warning and emits a `maxStepsReached` event (with the task description and the step limit), which can help identify tasks that need adjustment or debugging. + +### Memory Persistence + +By default, each `act()` call starts with a fresh memory context. For tasks that build upon previous actions, you can enable memory persistence: + +```typescript +// First action creates some state +await agent.act('open the settings panel', { + reuseMemory: true // Start persistent memory +}); + +// Subsequent actions remember previous context +await agent.act('navigate to the security tab', { + reuseMemory: true // Continues with memory from previous act() +}); + +await agent.act('enable two-factor authentication', { + reuseMemory: true // Still has context from all previous actions +}); +``` + +This is particularly useful for: +- Multi-step workflows where context matters +- Complex interactions that reference previous actions +- Test scenarios that need to maintain state across steps + +Note: Memory is only persisted within the same agent instance. Creating a new agent starts fresh. + ## Navigating Directly While the agent is capable of navigating to URLs on its own, you may sometimes want to navigate to a specific URL directly. diff --git a/docs/reference/browser-agent.mdx b/docs/reference/browser-agent.mdx index a9e7c6ec..30db183d 100644 --- a/docs/reference/browser-agent.mdx +++ b/docs/reference/browser-agent.mdx @@ -112,6 +112,30 @@ await agent.act("Enter {username} into the user field", { - **`string`**: Provide additional instructions for the LLM. These are injected into the system prompt. 
+ + Maximum number of steps the agent can take for this specific task. Defaults to 100. This prevents infinite loops and provides predictable resource usage. + + ```typescript + // Allow more steps for complex tasks + await agent.act("Complete the entire checkout process", { + maxSteps: 200 + }); + ``` + + + When true, reuses memory from previous act() calls within the same agent instance. This allows the agent to maintain context across multiple tasks. Defaults to false. + + ```typescript + // Enable memory persistence for related tasks + await agent.act("Log into the application", { + reuseMemory: true + }); + + await agent.act("Navigate to the dashboard", { + reuseMemory: true // Remembers the login context + }); + ``` + ### `nav(url: string)` diff --git a/docs/reference/test-declaration.mdx b/docs/reference/test-declaration.mdx index 5a43326f..8f12ba18 100644 --- a/docs/reference/test-declaration.mdx +++ b/docs/reference/test-declaration.mdx @@ -44,27 +44,49 @@ Defines a new test case. -## `test.group(id, options?, groupFn)` - -Defines a group of test cases, allowing shared options (like `url`) to be applied to all tests within the group. 
- -```typescript Group Example +```typescript Example with maxSteps import { test } from 'magnitude-test'; -test.group('User Authentication Flow', { url: '/login' }, () => { - test('should display login form', async (agent) => { - await agent.check("Login form is visible"); - }); +test('should handle complex checkout flow', async (agent) => { + await agent.act('add items to cart and proceed to checkout', { + maxSteps: 200 // Allow more steps for multi-stage process + }); + await agent.check('order confirmation is displayed'); +}); +``` - test('should allow login with valid credentials', async (agent) => { - await agent.act("Log in with valid credentials"); - await agent.check("User is redirected to dashboard"); - }); +```typescript Example with memory persistence +import { test } from 'magnitude-test'; + +test('should complete multi-step user onboarding', async (agent) => { + // Step 1: Initial setup + await agent.act('fill out basic profile information', { + maxSteps: 50, + reuseMemory: true // Start persistent memory + }); + + // Step 2: Uses context from step 1 + await agent.act('select preferences based on my profile', { + maxSteps: 30, + reuseMemory: true // Continues with previous context + }); + + // Step 3: Final verification with all context + await agent.act('review and confirm all my selections', { + maxSteps: 20, + reuseMemory: true // Has full context from steps 1 & 2 + }); + + await agent.check('onboarding completed successfully'); }); ``` +## `test.group(id, options?, groupFn)` + +Defines a group of test cases, allowing shared options (like `url`) to be applied to all tests within the group. + A descriptive identifier for the test group. @@ -82,3 +104,20 @@ test.group('User Authentication Flow', { url: '/login' }, () => { A synchronous function that contains the `test()` declarations belonging to this group. 
+ + +```typescript Group Example +import { test } from 'magnitude-test'; + +test.group('User Authentication Flow', { url: '/login' }, () => { + test('should display login form', async (agent) => { + await agent.check("Login form is visible"); + }); + + test('should allow login with valid credentials', async (agent) => { + await agent.act("Log in with valid credentials"); + await agent.check("User is redirected to dashboard"); + }); +}); +``` + diff --git a/packages/magnitude-core/src/agent/index.ts b/packages/magnitude-core/src/agent/index.ts index a99bd715..a4e5a9d2 100644 --- a/packages/magnitude-core/src/agent/index.ts +++ b/packages/magnitude-core/src/agent/index.ts @@ -19,6 +19,7 @@ import { retryOnError } from '@/common'; import { renderContentParts } from '@/memory/rendering'; import { MultiModelHarness } from '@/ai/multiModelHarness'; +const DEFAULT_MAX_STEPS = 100; export interface AgentOptions { llm?: LLMClient | LLMClient[]; @@ -32,8 +33,10 @@ export interface AgentOptions { export interface ActOptions { prompt?: string // additional task-level system prompt instructions // TODO: reimpl, or maybe for tc agent specifically - data?: RenderableContent,//string | Record - memory?: AgentMemory,// optional memory starting point + data?: RenderableContent //string | Record + memory?: AgentMemory // optional memory starting point + maxSteps?: number; // Maximum number of steps for this act() call (default: 100) + reuseMemory?: boolean; // Reuse memory from previous act() calls within the same agent instance } // Options for the startAgent helper function @@ -72,7 +75,8 @@ export class Agent { //public readonly memory: AgentMemory; private doneActing: boolean; - protected latestTaskMemory: AgentMemory;// | null = null; + protected latestTaskMemory: AgentMemory; + private persistentMemory?: AgentMemory; // Memory that persists across act() calls when reuseMemory is true constructor(baseConfig: Partial = {}) { this.options = { @@ -114,7 +118,7 @@ export class Agent 
{ //this.model = new ModelHarness({ llm: this.options.llm }); this.models = new MultiModelHarness(llms); - this.models.events.on('tokensUsed', (usage) => this.events.emit('tokensUsed', usage), this); + this.models.events.on('tokensUsed', (usage: any) => this.events.emit('tokensUsed', usage), this); this.doneActing = false; this.memoryOptions = { @@ -242,7 +246,38 @@ export class Agent { ...(this.options.prompt ? [this.options.prompt] : []), ...(options.prompt ? [options.prompt] : []), ].join('\n'); - const taskMemory = options.memory ?? new AgentMemory({ ...this.memoryOptions, instructions: instructions === '' ? undefined : instructions }); + + let taskMemory: AgentMemory; + + // First priority: use provided memory if available + if (options.memory) { + taskMemory = options.memory; + // Optionally save for future reuse if requested + if (options.reuseMemory) { + this.persistentMemory = taskMemory; + logger.debug('Using provided memory and saving for future reuse'); + } + } + // Second priority: reuse persistent memory if requested + else if (options.reuseMemory && this.persistentMemory) { + // Reuse existing memory for this execution + taskMemory = this.persistentMemory; + logger.debug('Reusing persistent memory from previous act() calls'); + } + // Default: create new memory + else { + // Create new memory + taskMemory = new AgentMemory({ + ...this.memoryOptions, + instructions: instructions === '' ? undefined : instructions + }); + + // Save for future reuse if requested + if (options.reuseMemory) { + this.persistentMemory = taskMemory; + logger.debug('Created new persistent memory for future reuse'); + } + } if (Array.isArray(taskOrSteps)) { const steps = taskOrSteps; @@ -304,7 +339,10 @@ export class Agent { async _act(description: string, memory: AgentMemory, options: ActOptions = {}): Promise { this.doneActing = false; - logger.info(`Act: ${description}`); + const maxSteps = options.maxSteps ?? 
DEFAULT_MAX_STEPS; // Default to 100 steps + let currentStep = 0; + + logger.info(`Act: ${description} (max steps: ${maxSteps})`); // for now simply add data to task let dataContentParts: MultiMediaContentPart[] = []; @@ -334,9 +372,9 @@ export class Agent { await this._recordConnectorObservations(memory); logger.info("Initial observations recorded"); - while (true) { + while (!this.doneActing && currentStep < maxSteps) { // Removed direct screenshot/tabState access here; it's part of memoryContext via connectors - logger.info(`Creating partial recipe`); + logger.info(`Creating partial recipe (step ${currentStep + 1}/${maxSteps})`); let reasoning: string = ""; let actions: Action[] = []; @@ -406,9 +444,16 @@ export class Agent { if (this.doneActing) { break; } + + currentStep++; + } + + if (currentStep >= maxSteps && !this.doneActing) { + logger.warn(`Reached maximum steps limit (${maxSteps}) without completing task: ${description}`); + this.events.emit('maxStepsReached', description, maxSteps); } - logger.info(`Done with step`); + logger.info(`Done with step after ${currentStep} steps`); //this.events.emit('stepSuccess'); //this.currentTaskMemory = null; } diff --git a/packages/magnitude-core/src/agent/narrator.ts b/packages/magnitude-core/src/agent/narrator.ts index c2f70ef5..3619e9f2 100644 --- a/packages/magnitude-core/src/agent/narrator.ts +++ b/packages/magnitude-core/src/agent/narrator.ts @@ -1,6 +1,6 @@ import { Action } from '@/actions/types'; import { ActOptions, Agent } from '@/agent'; -import { blueBright, bold, cyanBright, gray } from 'ansis'; +import { blueBright, bold, cyanBright, gray, yellow } from 'ansis'; import { BrowserAgent } from './browserAgent'; import { z } from 'zod'; @@ -57,6 +57,10 @@ export function narrateAgent(agent: Agent) { const actionDefinition = agent.identifyAction(action); console.log(bold` ${actionDefinition.render(action)}`); }); + + agent.events.on('maxStepsReached', (task: string, maxSteps: number) => { + 
console.log(yellow(bold(`⚠ [warning] Reached maximum steps limit (${maxSteps}) for task: "${task}"`))); + }); } export function narrateBrowserAgent(agent: BrowserAgent) { diff --git a/packages/magnitude-core/src/common/events.ts b/packages/magnitude-core/src/common/events.ts index b3854348..2d6a33ac 100644 --- a/packages/magnitude-core/src/common/events.ts +++ b/packages/magnitude-core/src/common/events.ts @@ -20,4 +20,5 @@ export interface AgentEvents { 'actionDone': (action: Action) => void; 'tokensUsed': (usage: ModelUsage) => void; + 'maxStepsReached': (task: string, maxSteps: number) => void; } \ No newline at end of file