feat: Proactively take screenshot and snapshot for every browser tool call

This commit is contained in:
Vanalite 2025-10-28 11:48:55 +07:00
parent c773abb688
commit e9f469b623
2 changed files with 173 additions and 2 deletions

View File

@ -16,6 +16,7 @@ import {
newUserThreadContent, newUserThreadContent,
postMessageProcessing, postMessageProcessing,
sendCompletion, sendCompletion,
captureProactiveScreenshots,
} from '@/lib/completion' } from '@/lib/completion'
import { CompletionMessagesBuilder } from '@/lib/messages' import { CompletionMessagesBuilder } from '@/lib/messages'
import { renderInstructions } from '@/lib/instructionTemplate' import { renderInstructions } from '@/lib/instructionTemplate'
@ -419,6 +420,27 @@ export const useChat = () => {
}) })
: [] : []
// Check if proactive mode is enabled
const isProactiveMode = selectedModel?.capabilities?.includes('proactive') ?? false
// Proactive mode: Capture initial screenshot/snapshot before first LLM call
if (isProactiveMode && availableTools.length > 0 && !abortController.signal.aborted) {
console.log('Proactive mode: Capturing initial screenshots before LLM call')
try {
const initialScreenshots = await captureProactiveScreenshots(abortController)
// Add initial screenshots to builder
for (const screenshot of initialScreenshots) {
// Generate unique tool call ID for initial screenshot
const proactiveToolCallId = `proactive_initial_${Date.now()}_${Math.random()}`
builder.addToolMessage(screenshot, proactiveToolCallId)
console.log('Initial proactive screenshot added to context')
}
} catch (e) {
console.warn('Failed to capture initial proactive screenshots:', e)
}
}
let assistantLoopSteps = 0 let assistantLoopSteps = 0
while ( while (
@ -694,6 +716,10 @@ export const useChat = () => {
) )
builder.addAssistantMessage(accumulatedText, undefined, toolCalls) builder.addAssistantMessage(accumulatedText, undefined, toolCalls)
// Check if proactive mode is enabled for this model
const isProactiveMode = selectedModel?.capabilities?.includes('proactive') ?? false
const updatedMessage = await postMessageProcessing( const updatedMessage = await postMessageProcessing(
toolCalls, toolCalls,
builder, builder,
@ -701,7 +727,8 @@ export const useChat = () => {
abortController, abortController,
useToolApproval.getState().approvedTools, useToolApproval.getState().approvedTools,
allowAllMCPPermissions ? undefined : showApprovalModal, allowAllMCPPermissions ? undefined : showApprovalModal,
allowAllMCPPermissions allowAllMCPPermissions,
isProactiveMode
) )
addMessage(updatedMessage ?? finalContent) addMessage(updatedMessage ?? finalContent)
updateStreamingContent(emptyThreadContent) updateStreamingContent(emptyThreadContent)

View File

@ -378,6 +378,119 @@ export const extractToolCall = (
return calls return calls
} }
/**
 * Tells whether a tool name belongs to a browser-automation MCP tool.
 *
 * The decision is a case-insensitive prefix match against the known browser
 * tool families listed in the body.
 *
 * @param toolName - Name of the tool being called
 * @returns `true` when the lower-cased name begins with one of the prefixes
 */
const isBrowserMCPTool = (toolName: string): boolean => {
  // Normalise once instead of once per candidate prefix.
  const normalizedName = toolName.toLowerCase()
  for (const family of [
    'browser',
    'browserbase',
    'browsermcp',
    'multi_browserbase',
  ]) {
    if (normalizedName.startsWith(family)) {
      return true
    }
  }
  return false
}
/**
 * Proactively captures the current browser state through MCP tools.
 *
 * Picks the first available tool whose name contains "screenshot" and the
 * first whose name contains "snapshot" (case-insensitive) and calls each one,
 * in that order, with empty arguments. Capturing is best-effort: every failure
 * is logged and the function still resolves with whatever was collected.
 *
 * Aborting `abortController` skips captures that have not started, requests
 * cancellation of the capture that is in flight, and discards a result that
 * only arrives after the abort.
 *
 * @param abortController - The abort controller for cancellation
 * @returns The non-string tool results that were captured (possibly empty)
 */
export const captureProactiveScreenshots = async (
  abortController: AbortController
): Promise<ToolResult[]> => {
  const { signal } = abortController
  const results: ToolResult[] = []

  // Runs one capture tool and appends its result; never throws.
  const capture = async (
    toolName: string,
    label: 'screenshot' | 'snapshot'
  ): Promise<void> => {
    if (signal.aborted) return

    let onAbort: (() => void) | undefined
    try {
      // NOTE(review): relies on callToolWithCancellation exposing `cancel`
      // next to `promise` — confirm against the MCP service typings.
      const { promise, cancel } = getServiceHub()
        .mcp()
        .callToolWithCancellation({ toolName, arguments: {} })

      // Forward an abort to the in-flight tool call. The cancel request is
      // itself best-effort, so a failure is only logged.
      onAbort = () => {
        Promise.resolve()
          .then(() => cancel())
          .catch((e: unknown) =>
            console.warn(`Failed to cancel proactive ${label}:`, e)
          )
      }
      signal.addEventListener('abort', onAbort, { once: true })

      const result = await promise
      // A capture that completes after the abort is stale; drop it.
      if (signal.aborted) return
      if (result && typeof result !== 'string') {
        results.push(result as ToolResult)
      }
    } catch (e) {
      console.warn(`Failed to capture proactive ${label}:`, e)
    } finally {
      if (onAbort) signal.removeEventListener('abort', onAbort)
    }
  }

  try {
    // Resolve both tools up front so the lookup happens once per capture round.
    const allTools = await getServiceHub().mcp().getTools()
    const screenshotTool = allTools.find((t) =>
      t.name.toLowerCase().includes('screenshot')
    )
    const snapshotTool = allTools.find((t) =>
      t.name.toLowerCase().includes('snapshot')
    )

    if (screenshotTool) await capture(screenshotTool.name, 'screenshot')
    if (snapshotTool) await capture(snapshotTool.name, 'snapshot')
  } catch (e) {
    console.error('Failed to get MCP tools for proactive capture:', e)
  }

  return results
}
/**
 * Strips image parts from every tool message held by the builder, so that only
 * screenshots added afterwards carry image data.
 *
 * - Non-tool messages and tool messages with string content are kept as-is.
 * - Tool messages with array (multimodal) content keep their non-image parts.
 * - A tool message left with no parts is removed only when no assistant
 *   message declares its `tool_call_id` (a synthetic proactive capture).
 *   When it answers a declared tool call it is kept with a short text
 *   placeholder, because removing it would leave that tool call without a
 *   response in the next request.
 *
 * @param builder - The completion messages builder; its message list is replaced
 */
const filterOldProactiveScreenshots = (builder: CompletionMessagesBuilder) => {
  // Loosely typed on purpose: the parts are inspected structurally below.
  const messages: any[] = builder.getMessages()

  // Ids of tool calls that an assistant message actually issued.
  const declaredToolCallIds = new Set<string>()
  for (const msg of messages) {
    if (msg.role === 'assistant' && Array.isArray(msg.tool_calls)) {
      for (const call of msg.tool_calls) {
        if (call?.id) declaredToolCallIds.add(call.id)
      }
    }
  }

  const filteredMessages: any[] = []
  for (const msg of messages) {
    // Only multimodal tool messages can carry screenshots.
    if (msg.role !== 'tool' || !Array.isArray(msg.content)) {
      filteredMessages.push(msg)
      continue
    }

    const withoutImages = msg.content.filter(
      (part: any) => part?.type !== 'image_url'
    )
    if (withoutImages.length > 0) {
      filteredMessages.push({ ...msg, content: withoutImages })
    } else if (declaredToolCallIds.has(msg.tool_call_id)) {
      // Keep the reply to a real tool call, minus the image payload.
      filteredMessages.push({
        ...msg,
        content: [{ type: 'text', text: '[image removed]' }],
      })
    }
    // Otherwise: image-only message with an undeclared id — drop it.
  }

  // Workaround: CompletionMessagesBuilder exposes no setter, so the private
  // messages array is overwritten directly.
  ;(builder as any).messages = filteredMessages
}
/** /**
* @fileoverview Helper function to process the completion response. * @fileoverview Helper function to process the completion response.
* @param calls * @param calls
@ -387,6 +500,7 @@ export const extractToolCall = (
* @param approvedTools * @param approvedTools
* @param showModal * @param showModal
* @param allowAllMCPPermissions * @param allowAllMCPPermissions
* @param isProactiveMode
*/ */
export const postMessageProcessing = async ( export const postMessageProcessing = async (
calls: ChatCompletionMessageToolCall[], calls: ChatCompletionMessageToolCall[],
@ -399,10 +513,13 @@ export const postMessageProcessing = async (
threadId: string, threadId: string,
toolParameters?: object toolParameters?: object
) => Promise<boolean>, ) => Promise<boolean>,
allowAllMCPPermissions: boolean = false allowAllMCPPermissions: boolean = false,
isProactiveMode: boolean = false
) => { ) => {
// Handle completed tool calls // Handle completed tool calls
if (calls.length) { if (calls.length) {
// Track if any browser MCP tool was called
let hasBrowserMCPToolCall = false
// Fetch RAG tool names from RAG service // Fetch RAG tool names from RAG service
let ragToolNames = new Set<string>() let ragToolNames = new Set<string>()
try { try {
@ -455,6 +572,7 @@ export const postMessageProcessing = async (
const toolName = toolCall.function.name const toolName = toolCall.function.name
const toolArgs = toolCall.function.arguments.length ? toolParameters : {} const toolArgs = toolCall.function.arguments.length ? toolParameters : {}
const isRagTool = ragToolNames.has(toolName) const isRagTool = ragToolNames.has(toolName)
const isBrowserTool = isBrowserMCPTool(toolName)
// Auto-approve RAG tools (local/safe operations), require permission for MCP tools // Auto-approve RAG tools (local/safe operations), require permission for MCP tools
const approved = isRagTool const approved = isRagTool
@ -544,6 +662,32 @@ export const postMessageProcessing = async (
], ],
} }
builder.addToolMessage(result as ToolResult, toolCall.id) builder.addToolMessage(result as ToolResult, toolCall.id)
// Mark if we used a browser tool (for proactive mode)
if (isBrowserTool) {
hasBrowserMCPToolCall = true
}
// Proactive mode: Capture screenshot/snapshot after browser tool execution
if (isProactiveMode && isBrowserTool && !abortController.signal.aborted) {
console.log('Proactive mode: Capturing screenshots after browser tool call')
// Filter out old screenshots before adding new ones
filterOldProactiveScreenshots(builder)
// Capture new screenshots
const proactiveScreenshots = await captureProactiveScreenshots(abortController)
// Add proactive screenshots to builder
for (const screenshot of proactiveScreenshots) {
// Generate a unique tool call ID for the proactive screenshot
const proactiveToolCallId = ulid()
builder.addToolMessage(screenshot, proactiveToolCallId)
console.log('Proactive screenshot captured and added to context')
}
}
// update message metadata // update message metadata
} }
return message return message