feat: support multimodal tool results and improve tool message handling (#6816)

* feat: support multimodal tool results and improve tool message handling

  - Added a temporary `ToolResult` type that mirrors the structure returned by tools (text, image data, URLs, errors).
  - Implemented `convertToolPartToApiContentPart` to translate each tool output part into the format expected by the OpenAI chat completion API.
  - Updated `CompletionMessagesBuilder.addToolMessage` to accept a full `ToolResult` in addition to a plain string and to:
    - Detect multimodal content (base64 images, image URLs) and build a structured `content` array.
    - Properly handle plain-text results, tool execution errors, and unexpected formats with sensible fallbacks.
    - Cast the final content to `any` for the `tool` role as required by the API.
  - Modified `postMessageProcessing` to pass the raw tool result (`result as ToolResult`) to `addToolMessage`, avoiding premature extraction of only the first text part.
  - Refactored several formatting and type-annotation sections:
    - Added a multiline guard for empty user messages to insert a placeholder.
    - Split the image URL construction into a clearer multiline object.
    - Adjusted method signatures and added minor line breaks for readability.
  - Included extensive comments explaining the new logic and edge-case handling.

  These changes enable the chat system to handle richer tool outputs (e.g., images, mixed content) and provide more robust error handling.

* Satisfy ts linter
* Make ts linter happy x2
* chore: update test message creation

---------

Co-authored-by: Faisal Amir <urmauur@gmail.com>
2025-10-24 20:15:15 +05:30 · 2025-10-24 20:15:15 +05:30 · 2561fcd78a
commit 2561fcd78a
parent 28ed5e2af2
2 changed files with 90 additions and 8 deletions
--- a/web-app/src/lib/completion.ts
+++ b/web-app/src/lib/completion.ts
@ -32,7 +32,7 @@ type ExtendedConfigOptions = ConfigOptions & {
 }
 import { ulid } from 'ulidx'
 import { MCPTool } from '@/types/completion'
-import { CompletionMessagesBuilder } from './messages'
+import { CompletionMessagesBuilder, ToolResult } from './messages'
 import { ChatCompletionMessageToolCall } from 'openai/resources'
 import { ExtensionManager } from './extension'
 import { useAppState } from '@/hooks/useAppState'
@ -543,7 +543,7 @@ export const postMessageProcessing = async (
          },
        ],
      }
-      builder.addToolMessage(result.content[0]?.text ?? '', toolCall.id)
+      builder.addToolMessage(result as ToolResult, toolCall.id)
      // update message metadata
    }
    return message
--- a/web-app/src/lib/messages.ts
+++ b/web-app/src/lib/messages.ts
@ -1,3 +1,4 @@
+/* eslint-disable @typescript-eslint/no-explicit-any */
 import { ChatCompletionMessageParam } from 'token.js'
 import { ChatCompletionMessageToolCall } from 'openai/resources'
 import { ThreadMessage, ContentType } from '@janhq/core'
@ -6,6 +7,48 @@ import { removeReasoningContent } from '@/utils/reasoning'

 type ThreadContent = NonNullable<ThreadMessage['content']>[number]

+// Temporary local type for the tool result shape that addToolMessage accepts; every part field is optional.
+export type ToolResult = {
+  content: Array<{
+    type?: string // read by convertToolPartToApiContentPart when picking the MIME type for `data`
+    text?: string // plain-text output
+    data?: string // base64 payload; convertToolPartToApiContentPart embeds it in a data: URL
+    image_url?: { url: string; detail?: string } // forwarded unchanged as an image_url part
+  }>
+  error?: string // addToolMessage reports it as "Tool execution failed: ..." when no usable content is found
+}
+
+// Maps one tool output part to a chat-completion content part. Precedence: text, then base64 data, then image_url; anything else is JSON-stringified into a text part.
+const convertToolPartToApiContentPart = (part: ToolResult['content'][0]) => {
+  if (part.text) { // truthiness check: an empty-string text does not match and falls through to the branches below
+    return { type: 'text', text: part.text }
+  }
+
+  // Base64 payload: wrapped in a data: URL and emitted as an image_url part
+  if (part.data) {
+    // MIME type: 'image' or a missing type defaults to image/png; any other part.type is used verbatim (NOTE(review): assumes tools put a real MIME type there — confirm)
+    const mimeType =
+      part.type === 'image' ? 'image/png' : part.type || 'image/png'
+    const dataUrl = `data:${mimeType};base64,${part.data}`
+
+    return {
+      type: 'image_url',
+      image_url: {
+        url: dataUrl,
+        detail: 'auto',
+      },
+    }
+  }
+
+  // Already an image_url object: forwarded unchanged
+  if (part.image_url) {
+    return { type: 'image_url', image_url: part.image_url }
+  }
+
+  // No text, data, or image_url: serialize the whole part as a text part
+  return { type: 'text', text: JSON.stringify(part) }
+}
+
 /**
 * @fileoverview Helper functions for creating chat completion request.
 * These functions are used to create chat completion request objects
@ -26,7 +69,11 @@ export class CompletionMessagesBuilder {
        .map<ChatCompletionMessageParam>((msg) => {
          const param = this.toCompletionParamFromThread(msg)
          // In constructor context, normalize empty user text to a placeholder
-          if (param.role === 'user' && typeof param.content === 'string' && param.content === '') {
+          if (
+            param.role === 'user' &&
+            typeof param.content === 'string' &&
+            param.content === ''
+          ) {
            return { ...param, content: '.' }
          }
          return param
@ -35,7 +82,9 @@ export class CompletionMessagesBuilder {
  }

  // Normalize a ThreadMessage into a ChatCompletionMessageParam for Token.js
-  private toCompletionParamFromThread(msg: ThreadMessage): ChatCompletionMessageParam {
+  private toCompletionParamFromThread(
+    msg: ThreadMessage
+  ): ChatCompletionMessageParam {
    if (msg.role === 'assistant') {
      return {
        role: 'assistant',
@ -60,7 +109,10 @@ export class CompletionMessagesBuilder {
        if (part.type === ContentType.Image) {
          return {
            type: 'image_url' as const,
-            image_url: { url: part.image_url?.url || '', detail: part.image_url?.detail || 'auto' },
+            image_url: {
+              url: part.image_url?.url || '',
+              detail: part.image_url?.detail || 'auto',
+            },
          }
        }
        // Fallback for unknown content types
@ -110,13 +162,43 @@ export class CompletionMessagesBuilder {

  /**
   * Add a tool message to the messages array.
-   * @param content - The content of the tool message.
+   * @param result - The tool output: either a plain string or a ToolResult object.
   * @param toolCallId - The ID of the tool call associated with the message.
   */
-  addToolMessage(content: string, toolCallId: string) {
+  addToolMessage(result: string | ToolResult, toolCallId: string) {
+    let content: string | any[] = ''
+
+    // A plain string is forwarded as-is
+    if (typeof result === 'string') {
+      content = result
+    } else {
+      // Multimodal when any part carries base64 `data` or an `image_url`
+      const hasMultimodalContent = result.content?.some(
+        (p) => p.data || p.image_url
+      )
+
+      if (hasMultimodalContent) {
+        // Convert every part (text parts included) into a structured content array
+        content = result.content.map(convertToolPartToApiContentPart)
+      } else if (result.content?.[0]?.text) {
+        // Text-only: just the first part's text is forwarded (NOTE(review): later parts are not included — confirm intended)
+        content = result.content[0].text
+      } else if (result.error) {
+        // Reached only when there is no multimodal part and the first part has no text
+        content = `Tool execution failed: ${result.error}`
+      } else {
+        // Fallback: serialize the whole result; if JSON.stringify throws, use a fixed message
+        try {
+          content = JSON.stringify(result)
+        } catch {
+          content = 'Tool call completed, unexpected output format.'
+        }
+      }
+    }
    this.messages.push({
      role: 'tool',
-      content: content,
+      // content is string | any[]; cast to any before pushing (presumably the 'tool' param type rejects the parts array — confirm)
+      content: content as any,
      tool_call_id: toolCallId,
    })
  }