feat: Proactively take screenshot and snapshot for every browser tool call
This commit is contained in:
parent
c773abb688
commit
e9f469b623
@ -16,6 +16,7 @@ import {
|
||||
newUserThreadContent,
|
||||
postMessageProcessing,
|
||||
sendCompletion,
|
||||
captureProactiveScreenshots,
|
||||
} from '@/lib/completion'
|
||||
import { CompletionMessagesBuilder } from '@/lib/messages'
|
||||
import { renderInstructions } from '@/lib/instructionTemplate'
|
||||
@ -419,6 +420,27 @@ export const useChat = () => {
|
||||
})
|
||||
: []
|
||||
|
||||
// Check if proactive mode is enabled
|
||||
const isProactiveMode = selectedModel?.capabilities?.includes('proactive') ?? false
|
||||
|
||||
// Proactive mode: Capture initial screenshot/snapshot before first LLM call
|
||||
if (isProactiveMode && availableTools.length > 0 && !abortController.signal.aborted) {
|
||||
console.log('Proactive mode: Capturing initial screenshots before LLM call')
|
||||
try {
|
||||
const initialScreenshots = await captureProactiveScreenshots(abortController)
|
||||
|
||||
// Add initial screenshots to builder
|
||||
for (const screenshot of initialScreenshots) {
|
||||
// Generate unique tool call ID for initial screenshot
|
||||
const proactiveToolCallId = `proactive_initial_${Date.now()}_${Math.random()}`
|
||||
builder.addToolMessage(screenshot, proactiveToolCallId)
|
||||
console.log('Initial proactive screenshot added to context')
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn('Failed to capture initial proactive screenshots:', e)
|
||||
}
|
||||
}
|
||||
|
||||
let assistantLoopSteps = 0
|
||||
|
||||
while (
|
||||
@ -694,6 +716,10 @@ export const useChat = () => {
|
||||
)
|
||||
|
||||
builder.addAssistantMessage(accumulatedText, undefined, toolCalls)
|
||||
|
||||
// Check if proactive mode is enabled for this model
|
||||
const isProactiveMode = selectedModel?.capabilities?.includes('proactive') ?? false
|
||||
|
||||
const updatedMessage = await postMessageProcessing(
|
||||
toolCalls,
|
||||
builder,
|
||||
@ -701,7 +727,8 @@ export const useChat = () => {
|
||||
abortController,
|
||||
useToolApproval.getState().approvedTools,
|
||||
allowAllMCPPermissions ? undefined : showApprovalModal,
|
||||
allowAllMCPPermissions
|
||||
allowAllMCPPermissions,
|
||||
isProactiveMode
|
||||
)
|
||||
addMessage(updatedMessage ?? finalContent)
|
||||
updateStreamingContent(emptyThreadContent)
|
||||
|
||||
@ -378,6 +378,119 @@ export const extractToolCall = (
|
||||
return calls
|
||||
}
|
||||
|
||||
/**
 * Tool-name prefixes that identify browser-automation MCP tools.
 *
 * `'browser'` also covers names starting with `browserbase` and `browsermcp`,
 * so only prefixes that do not themselves begin with `browser` need an entry.
 */
const BROWSER_TOOL_PREFIXES = ['browser', 'multi_browserbase'] as const

/**
 * Helper function to check if a tool call is a browser MCP tool.
 *
 * The comparison is case-insensitive and matches on the start of the name
 * only, so `my_browser_tool` is not considered a browser tool.
 *
 * @param toolName - The name of the tool
 * @returns true if the tool name starts with a known browser tool prefix;
 *   false otherwise, including for an empty or missing name
 */
const isBrowserMCPTool = (toolName: string): boolean => {
  // Tool names come from model output; a malformed call may carry no name.
  if (typeof toolName !== 'string' || toolName.length === 0) {
    return false
  }

  // Lower-case once instead of once per prefix.
  const normalizedName = toolName.toLowerCase()
  return BROWSER_TOOL_PREFIXES.some((prefix) =>
    normalizedName.startsWith(prefix)
  )
}
|
||||
|
||||
/**
 * Invokes one MCP capture tool with empty arguments and returns its result.
 *
 * Failures are logged and swallowed on purpose: proactive capture is
 * best-effort and must never break the surrounding chat flow.
 *
 * @param toolName - Exact name of the MCP tool to call
 * @param label - Human-readable capture kind used in log messages
 * @param abortController - The abort controller for cancellation
 * @returns The tool result, or undefined when the call was skipped, aborted,
 *   failed, or produced a plain string / empty value
 */
const runProactiveCapture = async (
  toolName: string,
  label: string,
  abortController: AbortController
): Promise<ToolResult | undefined> => {
  const { signal } = abortController
  if (signal.aborted) {
    return undefined
  }

  // NOTE(review): assumes callToolWithCancellation returns a `cancel` handle
  // next to `promise`, as its name suggests — confirm against the MCP service.
  const { promise, cancel } = getServiceHub().mcp().callToolWithCancellation({
    toolName,
    arguments: {},
  })

  // Forward an abort that happens while the tool call is in flight; without
  // this the signal is only consulted before the call starts.
  const onAbort = () => {
    void (async () => {
      try {
        if (typeof cancel === 'function') {
          await cancel()
        }
      } catch (e) {
        console.warn(`Failed to cancel proactive ${label}:`, e)
      }
    })()
  }
  signal.addEventListener('abort', onAbort, { once: true })

  try {
    const captured = await promise
    // A result that arrives after cancellation is stale; do not hand it back.
    if (signal.aborted) {
      return undefined
    }
    if (captured && typeof captured !== 'string') {
      return captured as ToolResult
    }
    return undefined
  } catch (e) {
    console.warn(`Failed to capture proactive ${label}:`, e)
    return undefined
  } finally {
    signal.removeEventListener('abort', onAbort)
  }
}

/**
 * Helper function to capture screenshot and snapshot proactively.
 *
 * Looks up the first available MCP tool whose name contains "screenshot" and
 * the first whose name contains "snapshot" (case-insensitive), then calls
 * them one after the other with no arguments. Each capture is independent:
 * one failing does not prevent the other. This function never rejects.
 *
 * @param abortController - The abort controller for cancellation
 * @returns Promise with the screenshot and snapshot results that succeeded,
 *   in that order; an empty array when nothing could be captured
 */
export const captureProactiveScreenshots = async (
  abortController: AbortController
): Promise<ToolResult[]> => {
  const results: ToolResult[] = []

  if (abortController.signal.aborted) {
    return results
  }

  try {
    // Get available tools
    const allTools = await getServiceHub().mcp().getTools()

    const findToolNameContaining = (keyword: string): string | undefined =>
      allTools.find((tool) => tool.name.toLowerCase().includes(keyword))?.name

    // Resolve both tools up front so the lookup matches the tool list as it
    // was before any capture ran.
    const captureTargets = [
      { label: 'screenshot', toolName: findToolNameContaining('screenshot') },
      { label: 'snapshot', toolName: findToolNameContaining('snapshot') },
    ]

    // Run sequentially: both tools act on the same browser session.
    for (const { label, toolName } of captureTargets) {
      if (!toolName || abortController.signal.aborted) {
        continue
      }
      const captured = await runProactiveCapture(
        toolName,
        label,
        abortController
      )
      if (captured) {
        results.push(captured)
      }
    }
  } catch (e) {
    console.error('Failed to get MCP tools for proactive capture:', e)
  }

  return results
}
|
||||
|
||||
/**
 * Helper function to strip image parts from the tool messages currently held
 * by the builder, so that only screenshots added afterwards carry image data.
 *
 * Every tool message is kept. A tool message whose content consisted solely
 * of images is reduced to a short text placeholder instead of being removed,
 * because removing it would leave the assistant tool call it answers without
 * a matching tool response. Non-tool messages and tool messages with string
 * content are left untouched.
 *
 * @param builder - The completion messages builder (its message list is replaced)
 */
const filterOldProactiveScreenshots = (builder: CompletionMessagesBuilder) => {
  const IMAGE_REMOVED_PLACEHOLDER = {
    type: 'text',
    text: '[image removed from context]',
  }

  const filteredMessages: unknown[] = []

  for (const msg of builder.getMessages()) {
    // Only multimodal (array-content) tool messages can carry images.
    if (msg.role !== 'tool' || !Array.isArray(msg.content)) {
      filteredMessages.push(msg)
      continue
    }

    const textOnly = msg.content.filter(
      (part: { type?: string }) => part.type !== 'image_url'
    )

    filteredMessages.push({
      ...msg,
      // Keep the message itself so the tool call/response pairing survives.
      content: textOnly.length > 0 ? textOnly : [IMAGE_REMOVED_PLACEHOLDER],
    })
  }

  // Reconstruct builder with filtered messages.
  // Note: This is a workaround since CompletionMessagesBuilder doesn't have a
  // setter, so the private messages array is overwritten directly.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  ;(builder as any).messages = filteredMessages
}
|
||||
|
||||
/**
|
||||
* Helper function to process the completion response.
|
||||
* @param calls
|
||||
@ -387,6 +500,7 @@ export const extractToolCall = (
|
||||
* @param approvedTools
|
||||
* @param showModal
|
||||
* @param allowAllMCPPermissions
|
||||
* @param isProactiveMode
|
||||
*/
|
||||
export const postMessageProcessing = async (
|
||||
calls: ChatCompletionMessageToolCall[],
|
||||
@ -399,10 +513,13 @@ export const postMessageProcessing = async (
|
||||
threadId: string,
|
||||
toolParameters?: object
|
||||
) => Promise<boolean>,
|
||||
allowAllMCPPermissions: boolean = false
|
||||
allowAllMCPPermissions: boolean = false,
|
||||
isProactiveMode: boolean = false
|
||||
) => {
|
||||
// Handle completed tool calls
|
||||
if (calls.length) {
|
||||
// Track if any browser MCP tool was called
|
||||
let hasBrowserMCPToolCall = false
|
||||
// Fetch RAG tool names from RAG service
|
||||
let ragToolNames = new Set<string>()
|
||||
try {
|
||||
@ -455,6 +572,7 @@ export const postMessageProcessing = async (
|
||||
const toolName = toolCall.function.name
|
||||
const toolArgs = toolCall.function.arguments.length ? toolParameters : {}
|
||||
const isRagTool = ragToolNames.has(toolName)
|
||||
const isBrowserTool = isBrowserMCPTool(toolName)
|
||||
|
||||
// Auto-approve RAG tools (local/safe operations), require permission for MCP tools
|
||||
const approved = isRagTool
|
||||
@ -544,6 +662,32 @@ export const postMessageProcessing = async (
|
||||
],
|
||||
}
|
||||
builder.addToolMessage(result as ToolResult, toolCall.id)
|
||||
|
||||
// Mark if we used a browser tool (for proactive mode)
|
||||
if (isBrowserTool) {
|
||||
hasBrowserMCPToolCall = true
|
||||
}
|
||||
|
||||
// Proactive mode: Capture screenshot/snapshot after browser tool execution
|
||||
if (isProactiveMode && isBrowserTool && !abortController.signal.aborted) {
|
||||
console.log('Proactive mode: Capturing screenshots after browser tool call')
|
||||
|
||||
// Filter out old screenshots before adding new ones
|
||||
filterOldProactiveScreenshots(builder)
|
||||
|
||||
// Capture new screenshots
|
||||
const proactiveScreenshots = await captureProactiveScreenshots(abortController)
|
||||
|
||||
// Add proactive screenshots to builder
|
||||
for (const screenshot of proactiveScreenshots) {
|
||||
// Generate a unique tool call ID for the proactive screenshot
|
||||
const proactiveToolCallId = ulid()
|
||||
builder.addToolMessage(screenshot, proactiveToolCallId)
|
||||
|
||||
console.log('Proactive screenshot captured and added to context')
|
||||
}
|
||||
}
|
||||
|
||||
// update message metadata
|
||||
}
|
||||
return message
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user