diff --git a/web-app/src/hooks/useChat.ts b/web-app/src/hooks/useChat.ts
index 15d06f506..9bc550607 100644
--- a/web-app/src/hooks/useChat.ts
+++ b/web-app/src/hooks/useChat.ts
@@ -16,6 +16,7 @@ import {
   newUserThreadContent,
   postMessageProcessing,
   sendCompletion,
+  captureProactiveScreenshots,
 } from '@/lib/completion'
 import { CompletionMessagesBuilder } from '@/lib/messages'
 import { renderInstructions } from '@/lib/instructionTemplate'
@@ -419,6 +420,27 @@ export const useChat = () => {
             })
           : []
 
+        // Check if proactive mode is enabled
+        const isProactiveMode = selectedModel?.capabilities?.includes('proactive') ?? false
+
+        // Proactive mode: Capture initial screenshot/snapshot before first LLM call
+        if (isProactiveMode && availableTools.length > 0 && !abortController.signal.aborted) {
+          console.log('Proactive mode: Capturing initial screenshots before LLM call')
+          try {
+            const initialScreenshots = await captureProactiveScreenshots(abortController)
+
+            // Add initial screenshots to builder
+            for (const screenshot of initialScreenshots) {
+              // Generate unique tool call ID for initial screenshot
+              const proactiveToolCallId = `proactive_initial_${Date.now()}_${Math.random()}`
+              builder.addToolMessage(screenshot, proactiveToolCallId)
+              console.log('Initial proactive screenshot added to context')
+            }
+          } catch (e) {
+            console.warn('Failed to capture initial proactive screenshots:', e)
+          }
+        }
+
         let assistantLoopSteps = 0
 
         while (
@@ -694,6 +716,10 @@ export const useChat = () => {
             )
 
           builder.addAssistantMessage(accumulatedText, undefined, toolCalls)
+
+          // Check if proactive mode is enabled for this model
+          const isProactiveMode = selectedModel?.capabilities?.includes('proactive') ?? false
+
           const updatedMessage = await postMessageProcessing(
             toolCalls,
             builder,
@@ -701,7 +727,8 @@ export const useChat = () => {
             abortController,
             useToolApproval.getState().approvedTools,
             allowAllMCPPermissions ? undefined : showApprovalModal,
-            allowAllMCPPermissions
+            allowAllMCPPermissions,
+            isProactiveMode
           )
           addMessage(updatedMessage ?? finalContent)
           updateStreamingContent(emptyThreadContent)
diff --git a/web-app/src/lib/completion.ts b/web-app/src/lib/completion.ts
index d72234024..8b511f942 100644
--- a/web-app/src/lib/completion.ts
+++ b/web-app/src/lib/completion.ts
@@ -378,6 +378,121 @@ export const extractToolCall = (
   return calls
 }
 
+/**
+ * Lower-cased name prefixes that identify browser MCP tools.
+ * 'browser' already matches names starting with 'browserbase' or 'browsermcp'.
+ */
+const BROWSER_TOOL_PREFIXES = ['browser', 'multi_browserbase']
+
+/** Text part left behind when a tool message contained nothing but images. */
+const REMOVED_IMAGE_PLACEHOLDER = '[image removed]'
+
+/**
+ * Helper function to check if a tool call is a browser MCP tool
+ * (case-insensitive prefix match against BROWSER_TOOL_PREFIXES).
+ * @param toolName - The name of the tool
+ * @returns true if the tool is a browser-related MCP tool
+ */
+const isBrowserMCPTool = (toolName: string): boolean => {
+  const normalizedName = toolName.toLowerCase()
+  return BROWSER_TOOL_PREFIXES.some((prefix) =>
+    normalizedName.startsWith(prefix)
+  )
+}
+
+/**
+ * Calls one MCP capture tool with empty arguments.
+ * Failures are logged and swallowed so proactive capture stays best-effort.
+ * @param toolName - Exact name of the MCP tool to call
+ * @param label - Capture kind, used only in the warning message
+ * @returns the tool result, or undefined if the call failed or resolved to a
+ * falsy or string value
+ */
+const runProactiveCaptureTool = async (
+  toolName: string,
+  label: string
+): Promise<ToolResult | undefined> => {
+  try {
+    const { promise } = getServiceHub().mcp().callToolWithCancellation({
+      toolName,
+      arguments: {},
+    })
+    const captured = await promise
+    if (captured && typeof captured !== 'string') {
+      return captured as ToolResult
+    }
+  } catch (e) {
+    console.warn(`Failed to capture proactive ${label}:`, e)
+  }
+  return undefined
+}
+
+/**
+ * Helper function to capture screenshot and snapshot proactively.
+ * Calls the first available tool whose lower-cased name contains 'screenshot',
+ * then the first whose name contains 'snapshot'; a missing tool is skipped.
+ * NOTE(review): the abort signal is only checked before each call, so an
+ * in-flight tool call is not cancelled - confirm whether that is intended.
+ * @param abortController - The abort controller for cancellation
+ * @returns Promise with the results that were captured (possibly empty)
+ */
+export const captureProactiveScreenshots = async (
+  abortController: AbortController
+): Promise<ToolResult[]> => {
+  const results: ToolResult[] = []
+
+  try {
+    // Get available tools
+    const allTools = await getServiceHub().mcp().getTools()
+
+    for (const label of ['screenshot', 'snapshot']) {
+      // Stop capturing as soon as the request has been aborted
+      if (abortController.signal.aborted) break
+
+      const tool = allTools.find((t) => t.name.toLowerCase().includes(label))
+      if (!tool) continue
+
+      const captured = await runProactiveCaptureTool(tool.name, label)
+      if (captured) results.push(captured)
+    }
+  } catch (e) {
+    console.error('Failed to get MCP tools for proactive capture:', e)
+  }
+
+  return results
+}
+
+/**
+ * Helper function to strip images from the tool messages held by the builder
+ * so that stale screenshots are not sent again with the next request.
+ * Every tool message is kept: one that held only images is reduced to a text
+ * placeholder, because removing it would leave the tool call it answers
+ * without a response.
+ * @param builder - The completion messages builder
+ */
+const filterOldProactiveScreenshots = (builder: CompletionMessagesBuilder) => {
+  const filteredMessages = builder.getMessages().map((msg) => {
+    // Only tool messages with array (multimodal) content can carry images
+    if (msg.role !== 'tool' || !Array.isArray(msg.content)) return msg
+
+    const textOnly = msg.content.filter(
+      (part: any) => part.type !== 'image_url'
+    )
+    return {
+      ...msg,
+      content:
+        textOnly.length > 0
+          ? textOnly
+          : [{ type: 'text', text: REMOVED_IMAGE_PLACEHOLDER }],
+    }
+  })
+
+  // Reconstruct builder with filtered messages
+  // Note: CompletionMessagesBuilder doesn't have a setter, so as a workaround
+  // the private messages array is replaced through an any cast
+  ;(builder as any).messages = filteredMessages
+}
+
 /**
  * @fileoverview Helper function to process the completion response.
  * @param calls
@@ -387,6 +502,7 @@ export const extractToolCall = (
  * @param approvedTools
  * @param showModal
  * @param allowAllMCPPermissions
+ * @param isProactiveMode
  */
 export const postMessageProcessing = async (
   calls: ChatCompletionMessageToolCall[],
@@ -399,7 +515,8 @@ export const postMessageProcessing = async (
     threadId: string,
     toolParameters?: object
   ) => Promise<boolean>,
-  allowAllMCPPermissions: boolean = false
+  allowAllMCPPermissions: boolean = false,
+  isProactiveMode: boolean = false
 ) => {
   // Handle completed tool calls
   if (calls.length) {
@@ -455,6 +572,7 @@ export const postMessageProcessing = async (
       const toolName = toolCall.function.name
       const toolArgs = toolCall.function.arguments.length ? toolParameters : {}
       const isRagTool = ragToolNames.has(toolName)
+      const isBrowserTool = isBrowserMCPTool(toolName)
 
       // Auto-approve RAG tools (local/safe operations), require permission for MCP tools
       const approved = isRagTool
@@ -544,6 +662,29 @@ export const postMessageProcessing = async (
         ],
       }
       builder.addToolMessage(result as ToolResult, toolCall.id)
+
+      // Proactive mode: Capture screenshot/snapshot after browser tool execution
+      if (isProactiveMode && isBrowserTool && !abortController.signal.aborted) {
+        console.log('Proactive mode: Capturing screenshots after browser tool call')
+
+        // Capture first so earlier images are only dropped once a newer
+        // capture is available to replace them
+        const proactiveScreenshots = await captureProactiveScreenshots(abortController)
+
+        if (proactiveScreenshots.length > 0) {
+          // Filter out old screenshots before adding new ones
+          filterOldProactiveScreenshots(builder)
+
+          // Add proactive screenshots to builder
+          // NOTE(review): these tool messages use generated IDs with no matching
+          // assistant tool call - confirm the target providers accept that
+          for (const screenshot of proactiveScreenshots) {
+            builder.addToolMessage(screenshot, ulid())
+            console.log('Proactive screenshot captured and added to context')
+          }
+        }
+      }
+
       // update message metadata
     }
     return message