From 2561fcd78aed5da8eb800e5a8424ff72859f947f Mon Sep 17 00:00:00 2001
From: Akarshan Biswas <akarshan@menlo.ai>
Date: Fri, 24 Oct 2025 20:15:15 +0530
Subject: [PATCH] feat: support multimodal tool results and improve tool
 message handling (#6816)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: support multimodal tool results and improve tool message handling

- Added a temporary `ToolResult` type that mirrors the structure returned by tools (text, image data, URLs, errors).
- Implemented `convertToolPartToApiContentPart` to translate each tool output part into the format expected by the OpenAI chat completion API.
- Updated `CompletionMessagesBuilder.addToolMessage` to accept either a plain string or a full `ToolResult`, and to:
  - Detect multimodal content (base64 images, image URLs) and build a structured `content` array.
  - Properly handle plain‑text results, tool execution errors, and unexpected formats with sensible fallbacks.
  - Cast the final content to `any` for the `tool` role as required by the API.
- Modified `postMessageProcessing` to pass the raw tool result (`result as ToolResult`) to `addToolMessage`, avoiding premature extraction of only the first text part.
- Refactored several formatting and type‑annotation sections:
  - Reformatted the existing guard that replaces empty user messages with a placeholder onto multiple lines.
  - Split the image URL construction into a clearer multiline object.
  - Adjusted method signatures and added minor line‑breaks for readability.
- Included extensive comments explaining the new logic and edge‑case handling.

These changes enable the chat system to handle richer tool outputs (e.g., images, mixed content) and provide more robust error handling.

* Satisfy ts linter

* Make ts linter happy x2

* chore: update test message creation

---------

Co-authored-by: Faisal Amir <urmauur@gmail.com>
---
 web-app/src/lib/completion.ts |  4 +-
 web-app/src/lib/messages.ts   | 94 ++++++++++++++++++++++++++++++++---
 2 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/web-app/src/lib/completion.ts b/web-app/src/lib/completion.ts
index e602ff88e..d72234024 100644
--- a/web-app/src/lib/completion.ts
+++ b/web-app/src/lib/completion.ts
@@ -32,7 +32,7 @@ type ExtendedConfigOptions = ConfigOptions & {
 }
 import { ulid } from 'ulidx'
 import { MCPTool } from '@/types/completion'
-import { CompletionMessagesBuilder } from './messages'
+import { CompletionMessagesBuilder, ToolResult } from './messages'
 import { ChatCompletionMessageToolCall } from 'openai/resources'
 import { ExtensionManager } from './extension'
 import { useAppState } from '@/hooks/useAppState'
@@ -543,7 +543,7 @@ export const postMessageProcessing = async (
           },
         ],
       }
-      builder.addToolMessage(result.content[0]?.text ?? '', toolCall.id)
+      builder.addToolMessage(result as ToolResult, toolCall.id)
       // update message metadata
     }
     return message
diff --git a/web-app/src/lib/messages.ts b/web-app/src/lib/messages.ts
index 3361e2703..c06a67bad 100644
--- a/web-app/src/lib/messages.ts
+++ b/web-app/src/lib/messages.ts
@@ -1,3 +1,4 @@
+/* eslint-disable @typescript-eslint/no-explicit-any */
 import { ChatCompletionMessageParam } from 'token.js'
 import { ChatCompletionMessageToolCall } from 'openai/resources'
 import { ThreadMessage, ContentType } from '@janhq/core'
@@ -6,6 +7,48 @@ import { removeReasoningContent } from '@/utils/reasoning'
 
 type ThreadContent = NonNullable<ThreadMessage['content']>[number]
 
+// Temporary type describing the expected shape of a tool result
+export type ToolResult = {
+  content: Array<{
+    type?: string
+    text?: string
+    data?: string
+    image_url?: { url: string; detail?: string }
+  }>
+  error?: string
+}
+
+// Helper function to convert the tool's output part into an API content part
+const convertToolPartToApiContentPart = (part: ToolResult['content'][0]) => {
+  if (part.text) {
+    return { type: 'text', text: part.text }
+  }
+
+  // Handle base64 image data
+  if (part.data) {
+    // Assume default image type, though a proper tool should return the mime type
+    const mimeType =
+      part.type === 'image' ? 'image/png' : part.type || 'image/png'
+    const dataUrl = `data:${mimeType};base64,${part.data}`
+
+    return {
+      type: 'image_url',
+      image_url: {
+        url: dataUrl,
+        detail: 'auto',
+      },
+    }
+  }
+
+  // Handle pre-formatted image URL
+  if (part.image_url) {
+    return { type: 'image_url', image_url: part.image_url }
+  }
+
+  // Fallback to text stringification for structured but unhandled data
+  return { type: 'text', text: JSON.stringify(part) }
+}
+
 /**
  * @fileoverview Helper functions for creating chat completion request.
  * These functions are used to create chat completion request objects
@@ -26,7 +69,11 @@ export class CompletionMessagesBuilder {
         .map<ChatCompletionMessageParam>((msg) => {
           const param = this.toCompletionParamFromThread(msg)
           // In constructor context, normalize empty user text to a placeholder
-          if (param.role === 'user' && typeof param.content === 'string' && param.content === '') {
+          if (
+            param.role === 'user' &&
+            typeof param.content === 'string' &&
+            param.content === ''
+          ) {
             return { ...param, content: '.' }
           }
           return param
@@ -35,7 +82,9 @@ export class CompletionMessagesBuilder {
   }
 
   // Normalize a ThreadMessage into a ChatCompletionMessageParam for Token.js
-  private toCompletionParamFromThread(msg: ThreadMessage): ChatCompletionMessageParam {
+  private toCompletionParamFromThread(
+    msg: ThreadMessage
+  ): ChatCompletionMessageParam {
     if (msg.role === 'assistant') {
       return {
         role: 'assistant',
@@ -60,7 +109,10 @@ export class CompletionMessagesBuilder {
         if (part.type === ContentType.Image) {
           return {
             type: 'image_url' as const,
-            image_url: { url: part.image_url?.url || '', detail: part.image_url?.detail || 'auto' },
+            image_url: {
+              url: part.image_url?.url || '',
+              detail: part.image_url?.detail || 'auto',
+            },
           }
         }
         // Fallback for unknown content types
@@ -110,13 +162,43 @@ export class CompletionMessagesBuilder {
 
   /**
    * Add a tool message to the messages array.
-   * @param content - The content of the tool message.
+   * @param result - The tool output: a plain string or a ToolResult object.
    * @param toolCallId - The ID of the tool call associated with the message.
    */
-  addToolMessage(content: string, toolCallId: string) {
+  addToolMessage(result: string | ToolResult, toolCallId: string) {
+    let content: string | any[] = ''
+
+    // Handle simple string case
+    if (typeof result === 'string') {
+      content = result
+    } else {
+      // Check for multimodal content (more than just a simple text string)
+      const hasMultimodalContent = result.content?.some(
+        (p) => p.data || p.image_url
+      )
+
+      if (hasMultimodalContent) {
+        // Build the structured content array
+        content = result.content.map(convertToolPartToApiContentPart)
+      } else if (result.content?.[0]?.text) {
+        // Standard text case
+        content = result.content[0].text
+      } else if (result.error) {
+        // Error case
+        content = `Tool execution failed: ${result.error}`
+      } else {
+        // Fallback: serialize the whole result structure if content is unexpected
+        try {
+          content = JSON.stringify(result)
+        } catch {
+          content = 'Tool call completed, unexpected output format.'
+        }
+      }
+    }
     this.messages.push({
       role: 'tool',
-      content: content,
+      // content is a string or an array of parts here, hence the 'as any' cast
+      content: content as any,
       tool_call_id: toolCallId,
     })
   }