From 3c2ba624ed5c1831fa2aa58e4dab1d9753106688 Mon Sep 17 00:00:00 2001
From: Akarshan <akarshan@menlo.ai>
Date: Tue, 28 Oct 2025 18:43:09 +0530
Subject: [PATCH] feat: Add image visualization for tool_output steps

Implement support for displaying images returned as Model Context Protocol (MCP) content parts within the `tool_output` step of the ReAct thinking block.

This change:
- Safely parses `tool_output` content to detect and extract image data (base64).
- Renders images as clickable thumbnails using data URLs.
- Integrates `ImageModal` to allow users to view the generated images in full size.
---
 web-app/src/containers/ThinkingBlock.tsx | 108 +++++++++++++++++++----
 1 file changed, 90 insertions(+), 18 deletions(-)
diff --git a/web-app/src/containers/ThinkingBlock.tsx b/web-app/src/containers/ThinkingBlock.tsx
index b5545a431..4f1e8bdeb 100644
--- a/web-app/src/containers/ThinkingBlock.tsx
+++ b/web-app/src/containers/ThinkingBlock.tsx
@@ -4,8 +4,9 @@ import { ChevronDown, ChevronUp, Loader, Check } from 'lucide-react'
 import { create } from 'zustand'
 import { RenderMarkdown } from './RenderMarkdown'
 import { useTranslation } from '@/i18n/react-i18next-compat'
-import { useMemo } from 'react'
+import { useMemo, useState } from 'react'
 import { cn } from '@/lib/utils'
+import ImageModal from '@/containers/dialogs/ImageModal'
 
 // Define ReActStep type (Reasoning-Action Step)
 type ReActStep = {
@@ -23,6 +24,21 @@ interface Props {
   duration?: number
 }
 
+// Utility function to safely parse JSON: returns null instead of throwing when the text is not valid JSON
+const safeParseJSON = (text: string) => {
+  try {
+    return JSON.parse(text)
+  } catch {
+    return null
+  }
+}
+
+// Utility to create a data URL for an image; input that already starts with 'data:' is returned unchanged
+const createDataUrl = (base64Data: string, mimeType: string): string => {
+  if (base64Data.startsWith('data:')) return base64Data
+  return `data:${mimeType};base64,${base64Data}`
+}
+
 // Zustand store for thinking block state
 type ThinkingBlockState = {
   thinkingState: { [id: string]: boolean }
@@ -58,6 +74,15 @@ const ThinkingBlock = ({
   const setThinkingState = useThinkingStore((state) => state.setThinkingState)
   const { t } = useTranslation()
 
+  // Modal state for the image currently shown in ImageModal (null when closed), kept at the top level of the component
+  const [modalImage, setModalImage] = useState<{
+    url: string
+    alt: string
+  } | null>(null)
+  const closeModal = () => setModalImage(null)
+  const handleImageClick = (url: string, alt: string) =>
+    setModalImage({ url, alt })
+
   // Actual loading state comes from prop, determined by whether final text started streaming (Req 2)
   const loading = propLoading
 
@@ -108,7 +133,12 @@ const ThinkingBlock = ({
   }
 
   // --- Rendering Functions for Expanded View ---
-  const renderStepContent = (step: ReActStep, index: number) => {
+  const renderStepContent = (
+    step: ReActStep,
+    index: number,
+    handleImageClick: (url: string, alt: string) => void,
+    t: (key: string) => string
+  ) => {
     // Updated type
     if (step.type === 'done') {
       const timeInSeconds = formatDuration(step.time ?? 0)
@@ -131,7 +161,14 @@ const ThinkingBlock = ({
       )
     }
 
-    let contentDisplay
+    const parsed = safeParseJSON(step.content)
+    const mcpContent = parsed?.content ?? []
+    const hasImages =
+      Array.isArray(mcpContent) &&
+      mcpContent.some((c) => c.type === 'image' && c.data && c.mimeType)
+
+    let contentDisplay: React.ReactNode
+
     if (step.type === 'tool_call') {
       const args = step.metadata ? step.metadata : ''
       contentDisplay = (
@@ -150,19 +187,52 @@ const ThinkingBlock = ({
         </>
       )
     } else if (step.type === 'tool_output') {
-      contentDisplay = (
-        <>
-          <p className="font-medium text-main-view-fg/90">Tool Output:</p>
-          <div className="mt-1">
-            <RenderMarkdown
-              isWrapping={true}
-              content={step.content.substring(0, 1000)}
-            />
-          </div>
-        </>
-      )
+      if (hasImages) {
+        // Display each image
+        contentDisplay = (
+          <>
+            <p className="font-medium text-main-view-fg/90">
+              Tool Output (Images):
+            </p>
+            <div className="mt-2 space-y-2">
+              {mcpContent.map((item: any, index: number) =>
+                item.type === 'image' && item.data && item.mimeType ? (
+                  <div key={index} className="my-2">
+                    <img
+                      src={createDataUrl(item.data, item.mimeType)}
+                      alt={`MCP Image ${index + 1}`}
+                      className="max-w-full max-h-64 object-contain rounded-md border border-main-view-fg/10 cursor-pointer hover:opacity-80 transition-opacity"
+                      onError={(e) => (e.currentTarget.style.display = 'none')}
+                      onClick={() =>
+                        handleImageClick(
+                          createDataUrl(item.data, item.mimeType),
+                          `MCP Image ${index + 1}`
+                        )
+                      }
+                    />
+                  </div>
+                ) : null
+              )}
+            </div>
+          </>
+        )
+      } else {
+        // Default behavior: truncate to 1000 characters, then wrap in a json code fence unless the text already contains triple backticks
+        let content = step.content.substring(0, 1000)
+        if (!content.includes('```')) {
+          content = '```json\n' + content + '\n```'
+        }
+
+        contentDisplay = (
+          <>
+            <p className="font-medium text-main-view-fg/90">Tool Output:</p>
+            <div className="mt-1">
+              <RenderMarkdown isWrapping={true} content={content} />
+            </div>
+          </>
+        )
+      }
     } else {
-      // reasoning
       contentDisplay = (
         <RenderMarkdown isWrapping={true} content={step.content} />
       )
@@ -175,7 +245,7 @@ const ThinkingBlock = ({
     )
   }
 
-  const headerTitle = useMemo(() => {
+  const headerTitle: string = useMemo(() => {
     // Check if any step was a tool call
     const hasToolCalls = steps.some((step) => step.type === 'tool_call')
     const hasReasoning = steps.some((step) => step.type === 'reasoning')
@@ -255,7 +325,7 @@ const ThinkingBlock = ({
                   )}
                 />
                 {/* Active step content */}
-                {renderStepContent(activeStep, N - 1)}
+                {renderStepContent(activeStep, N - 1, handleImageClick, t)}
               </div>
             </div>
           </div>
@@ -285,13 +355,15 @@ const ThinkingBlock = ({
                   />
 
                   {/* Step Content */}
-                  {renderStepContent(step, index)}
+                  {renderStepContent(step, index, handleImageClick, t)}
                 </div>
               ))}
             </div>
           </div>
         )}
       </div>
+      {/* Render ImageModal once at the top level */}
+      <ImageModal image={modalImage} onClose={closeModal} />
     </div>
   )
 }