feat: support multimodal tool results and improve tool message handling (#6816)

* feat: support multimodal tool results and improve tool message handling

- Added a temporary `ToolResult` type that mirrors the structure returned by tools (text, image data, URLs, errors).
- Implemented `convertToolPartToApiContentPart` to translate each tool output part into the format expected by the OpenAI chat completion API.
- Updated `CompletionMessagesBuilder.addToolMessage` to accept either a plain string or a full `ToolResult`, and to:
  - Detect multimodal content (base64 images, image URLs) and build a structured `content` array.
  - Properly handle plain‑text results, tool execution errors, and unexpected formats with sensible fallbacks.
  - Cast the final content to `any` for the `tool` role as required by the API.
- Modified `postMessageProcessing` to pass the raw tool result (`result as ToolResult`) to `addToolMessage`, avoiding premature extraction of only the first text part.
- Refactored several formatting and type‑annotation sections:
  - Added multiline guard for empty user messages to insert a placeholder.
  - Split the image URL construction into a clearer multiline object.
  - Adjusted method signatures and added minor line‑breaks for readability.
- Included extensive comments explaining the new logic and edge‑case handling.

These changes enable the chat system to handle richer tool outputs (e.g., images, mixed content) and provide more robust error handling.

* Satisfy ts linter

* Make ts linter happy x2

* chore: update test message creation

---------

Co-authored-by: Faisal Amir <urmauur@gmail.com>
This commit is contained in:
Akarshan Biswas 2025-10-24 20:15:15 +05:30 committed by GitHub
parent 28ed5e2af2
commit 2561fcd78a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 90 additions and 8 deletions

View File

@ -32,7 +32,7 @@ type ExtendedConfigOptions = ConfigOptions & {
}
import { ulid } from 'ulidx'
import { MCPTool } from '@/types/completion'
import { CompletionMessagesBuilder } from './messages'
import { CompletionMessagesBuilder, ToolResult } from './messages'
import { ChatCompletionMessageToolCall } from 'openai/resources'
import { ExtensionManager } from './extension'
import { useAppState } from '@/hooks/useAppState'
@ -543,7 +543,7 @@ export const postMessageProcessing = async (
},
],
}
builder.addToolMessage(result.content[0]?.text ?? '', toolCall.id)
builder.addToolMessage(result as ToolResult, toolCall.id)
// update message metadata
}
return message

View File

@ -1,3 +1,4 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { ChatCompletionMessageParam } from 'token.js'
import { ChatCompletionMessageToolCall } from 'openai/resources'
import { ThreadMessage, ContentType } from '@janhq/core'
@ -6,6 +7,48 @@ import { removeReasoningContent } from '@/utils/reasoning'
type ThreadContent = NonNullable<ThreadMessage['content']>[number]
/**
 * Temporary description of the result object handed back by a tool call.
 *
 * Every field of a content part is optional because tools return different
 * kinds of parts: plain text (`text`), base64-encoded binary data (`data`,
 * optionally with its `mimeType`), or an already-formed image URL
 * (`image_url`). `error` carries a tool-level failure message, if any.
 */
export type ToolResult = {
  content: Array<{
    type?: string
    text?: string
    data?: string
    mimeType?: string
    image_url?: { url: string; detail?: string }
  }>
  error?: string
}

/** Content part shapes produced for the chat completion request. */
type ToolApiContentPart =
  | { type: 'text'; text: string }
  | { type: 'image_url'; image_url: { url: string; detail?: string } }

/** MIME type assumed for base64 data when the tool does not declare one. */
const DEFAULT_TOOL_IMAGE_MIME_TYPE = 'image/png'

/** True when `value` has the `type/subtype` form of a MIME type. */
const looksLikeMimeType = (value?: string): value is string =>
  typeof value === 'string' && /^[\w.+-]+\/[\w.+-]+/.test(value)

/**
 * Pick the MIME type for a base64 data part.
 *
 * Preference order: the explicit `mimeType` field, then `type` when the tool
 * stored a MIME type there, then the PNG default. A bare tag in `type` such
 * as `image` or `resource` is never used, because it would produce an
 * invalid data URL like `data:resource;base64,...`.
 */
const resolveToolDataMimeType = (
  part: ToolResult['content'][0]
): string => {
  if (looksLikeMimeType(part.mimeType)) return part.mimeType
  if (looksLikeMimeType(part.type)) return part.type
  return DEFAULT_TOOL_IMAGE_MIME_TYPE
}

/**
 * Convert one part of a tool's output into a chat completion content part.
 *
 * @param part - A single entry of `ToolResult.content`.
 * @returns A text part when `part.text` is non-empty; otherwise an
 * `image_url` part built from base64 `data` (as a data URL) or from a
 * pre-formatted `image_url`; otherwise a text part holding the JSON
 * serialization of `part`, so unhandled structured data is not lost.
 */
const convertToolPartToApiContentPart = (
  part: ToolResult['content'][0]
): ToolApiContentPart => {
  if (part.text) {
    return { type: 'text', text: part.text }
  }

  // Base64 payload: wrap it in a data URL carrying the resolved MIME type.
  if (part.data) {
    const mimeType = resolveToolDataMimeType(part)
    return {
      type: 'image_url',
      image_url: {
        url: `data:${mimeType};base64,${part.data}`,
        detail: 'auto',
      },
    }
  }

  // Already in image_url form: pass it through untouched.
  if (part.image_url) {
    return { type: 'image_url', image_url: part.image_url }
  }

  // Fallback to text stringification for structured but unhandled data.
  return { type: 'text', text: JSON.stringify(part) }
}
/**
* @fileoverview Helper functions for creating chat completion request.
* These functions are used to create chat completion request objects
@ -26,7 +69,11 @@ export class CompletionMessagesBuilder {
.map<ChatCompletionMessageParam>((msg) => {
const param = this.toCompletionParamFromThread(msg)
// In constructor context, normalize empty user text to a placeholder
if (param.role === 'user' && typeof param.content === 'string' && param.content === '') {
if (
param.role === 'user' &&
typeof param.content === 'string' &&
param.content === ''
) {
return { ...param, content: '.' }
}
return param
@ -35,7 +82,9 @@ export class CompletionMessagesBuilder {
}
// Normalize a ThreadMessage into a ChatCompletionMessageParam for Token.js
private toCompletionParamFromThread(msg: ThreadMessage): ChatCompletionMessageParam {
private toCompletionParamFromThread(
msg: ThreadMessage
): ChatCompletionMessageParam {
if (msg.role === 'assistant') {
return {
role: 'assistant',
@ -60,7 +109,10 @@ export class CompletionMessagesBuilder {
if (part.type === ContentType.Image) {
return {
type: 'image_url' as const,
image_url: { url: part.image_url?.url || '', detail: part.image_url?.detail || 'auto' },
image_url: {
url: part.image_url?.url || '',
detail: part.image_url?.detail || 'auto',
},
}
}
// Fallback for unknown content types
@ -110,13 +162,43 @@ export class CompletionMessagesBuilder {
/**
* Add a tool message to the messages array.
* @param content - The content of the tool message.
* @param content - The content of the tool message (string or ToolResult object).
* @param toolCallId - The ID of the tool call associated with the message.
*/
addToolMessage(content: string, toolCallId: string) {
addToolMessage(result: string | ToolResult, toolCallId: string) {
let content: string | any[] = ''
// Handle simple string case
if (typeof result === 'string') {
content = result
} else {
// Check for multimodal content (more than just a simple text string)
const hasMultimodalContent = result.content?.some(
(p) => p.data || p.image_url
)
if (hasMultimodalContent) {
// Build the structured content array
content = result.content.map(convertToolPartToApiContentPart)
} else if (result.content?.[0]?.text) {
// Standard text case
content = result.content[0].text
} else if (result.error) {
// Error case
content = `Tool execution failed: ${result.error}`
} else {
// Fallback: serialize the whole result structure if content is unexpected
try {
content = JSON.stringify(result)
} catch {
content = 'Tool call completed, unexpected output format.'
}
}
}
this.messages.push({
role: 'tool',
content: content,
// content can be a string or an array of content parts, so it is cast to any for the 'tool' role
content: content as any,
tool_call_id: toolCallId,
})
}