From 885da29f28dc74f7d066c4a67bf62b97a32777d1 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Tue, 23 Sep 2025 07:52:19 +0530
Subject: [PATCH] feat: add getTokensCount method to compute token usage (#6467)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: add getTokensCount method to compute token usage

Implemented a new async `getTokensCount` function in the LLaMA.cpp extension.
The method validates the model session, checks process health, applies the
request template, and tokenizes the resulting prompt to return the token
count. Includes detailed error handling for crashed models and API failures,
enabling callers to assess token usage before sending completions.

* fix: typos

* chore: update UI token usage

* chore: remove unused code

* feat: add image token handling for multimodal LlamaCPP models

Implemented support for counting image tokens when using vision-enabled models:

- Extended `SessionInfo` with an optional `mmprojPath` to store the multimodal
  projector file.
- Propagated `mmproj_path` from the Tauri plugin into the session info.
- Added an import of `chatCompletionRequestMessage` and enhanced the token
  calculation logic in the LlamaCPP extension:
  - Detects image content in messages.
  - Reads GGUF metadata from `mmprojPath` to compute accurate image token counts.
  - Provides a fallback estimation if metadata reading fails.
  - Returns the sum of text and image tokens.
- Introduced helper methods `calculateImageTokens` and `estimateImageTokensFallback`.
- Minor clean-ups such as comment capitalization and debug logging.

* chore: update FE send params message to include content type image_url

* fix: mmproj path from session info and num tokens calculation

* fix: correct image token estimation calculation in llamacpp extension

This commit addresses an inaccurate token count for images in the llama.cpp
extension. The previous logic incorrectly calculated the token count based on
image patch size and dimensions. This has been replaced with a more precise
method that uses the clip.vision.projection_dim value from the model metadata.
Additionally, unnecessary debug logging was removed, and a new log was added
to show the mmproj metadata for improved visibility.
* fix per image calc * fix: crash due to force unwrap --------- Co-authored-by: Faisal Amir Co-authored-by: Louis --- .../browser/extensions/engines/AIEngine.ts | 3 +- extensions/llamacpp-extension/src/index.ts | 150 +++++++++- .../tauri-plugin-llamacpp/src/commands.rs | 20 +- .../tauri-plugin-llamacpp/src/state.rs | 2 + web-app/src/components/TokenCounter.tsx | 283 ++++++++++++++++++ web-app/src/components/ui/tooltip.tsx | 9 +- web-app/src/containers/ChatInput.tsx | 126 ++++++-- .../TokenCounterCompactSwitcher.tsx | 17 ++ web-app/src/hooks/useGeneralSetting.ts | 4 + web-app/src/hooks/useTokensCount.ts | 200 +++++++++++++ web-app/src/locales/de-DE/settings.json | 2 + web-app/src/locales/en/settings.json | 2 + web-app/src/locales/vn/settings.json | 2 + web-app/src/locales/zh-CN/settings.json | 3 +- web-app/src/routes/settings/appearance.tsx | 6 + web-app/src/services/models/default.ts | 111 +++++++ web-app/src/services/models/types.ts | 3 +- 17 files changed, 904 insertions(+), 39 deletions(-) create mode 100644 web-app/src/components/TokenCounter.tsx create mode 100644 web-app/src/containers/TokenCounterCompactSwitcher.tsx create mode 100644 web-app/src/hooks/useTokensCount.ts diff --git a/core/src/browser/extensions/engines/AIEngine.ts b/core/src/browser/extensions/engines/AIEngine.ts index 3236994b2..41de30c1b 100644 --- a/core/src/browser/extensions/engines/AIEngine.ts +++ b/core/src/browser/extensions/engines/AIEngine.ts @@ -13,7 +13,7 @@ export interface chatCompletionRequestMessage { } export interface Content { - type: 'text' | 'input_image' | 'input_audio' + type: 'text' | 'image_url' | 'input_audio' text?: string image_url?: string input_audio?: InputAudio @@ -182,6 +182,7 @@ export interface SessionInfo { model_id: string //name of the model model_path: string // path of the loaded model api_key: string + mmproj_path?: string } export interface UnloadResult { diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts index c296e06af..7229552a2 100644 --- a/extensions/llamacpp-extension/src/index.ts +++ b/extensions/llamacpp-extension/src/index.ts @@ -21,6 +21,7 @@ import { events, AppEvent, DownloadEvent, + chatCompletionRequestMessage, } from '@janhq/core' import { error, info, warn } from '@tauri-apps/plugin-log' @@ -2296,7 +2297,9 @@ export default class llamacpp_extension extends AIEngine { : Math.floor(maxContextLength) const mmprojInfo = mmprojPath - ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed(2)}MB, offloadMmproj=${offloadMmproj}` + ? `, mmprojSize=${(mmprojSize / (1024 * 1024)).toFixed( + 2 + )}MB, offloadMmproj=${offloadMmproj}` : '' logger.info( @@ -2489,8 +2492,151 @@ export default class llamacpp_extension extends AIEngine { logger.error('Failed to validate GGUF file:', error) return { isValid: false, - error: `Failed to read model metadata: ${error instanceof Error ? error.message : 'Unknown error'}`, + error: `Failed to read model metadata: ${ + error instanceof Error ? 
error.message : 'Unknown error' + }`, } } } + + async getTokensCount(opts: chatCompletionRequest): Promise { + const sessionInfo = await this.findSessionByModel(opts.model) + if (!sessionInfo) { + throw new Error(`No active session found for model: ${opts.model}`) + } + + // Check if the process is alive + const result = await invoke('plugin:llamacpp|is_process_running', { + pid: sessionInfo.pid, + }) + if (result) { + try { + await fetch(`http://localhost:${sessionInfo.port}/health`) + } catch (e) { + this.unload(sessionInfo.model_id) + throw new Error('Model appears to have crashed! Please reload!') + } + } else { + throw new Error('Model has crashed! Please reload!') + } + + const baseUrl = `http://localhost:${sessionInfo.port}` + const headers = { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${sessionInfo.api_key}`, + } + + // Count image tokens first + let imageTokens = 0 + const hasImages = opts.messages.some( + (msg) => + Array.isArray(msg.content) && + msg.content.some((content) => content.type === 'image_url') + ) + + if (hasImages) { + logger.info('Conversation has images') + try { + // Read mmproj metadata to get vision parameters + logger.info(`MMPROJ PATH: ${sessionInfo.mmproj_path}`) + + const metadata = await readGgufMetadata(sessionInfo.mmproj_path) + logger.info(`mmproj metadata: ${JSON.stringify(metadata.metadata)}`) + imageTokens = await this.calculateImageTokens( + opts.messages, + metadata.metadata + ) + } catch (error) { + logger.warn('Failed to calculate image tokens:', error) + // Fallback to a rough estimate if metadata reading fails + imageTokens = this.estimateImageTokensFallback(opts.messages) + } + } + + // Calculate text tokens + const messages = JSON.stringify({ messages: opts.messages }) + + let parseResponse = await fetch(`${baseUrl}/apply-template`, { + method: 'POST', + headers: headers, + body: messages, + }) + + if (!parseResponse.ok) { + const errorData = await parseResponse.json().catch(() => null) + throw new Error( + `API request failed with status ${ + parseResponse.status + }: ${JSON.stringify(errorData)}` + ) + } + + const parsedPrompt = await parseResponse.json() + + const response = await fetch(`${baseUrl}/tokenize`, { + method: 'POST', + headers: headers, + body: JSON.stringify({ + content: parsedPrompt.prompt, + }), + }) + + if (!response.ok) { + const errorData = await response.json().catch(() => null) + throw new Error( + `API request failed with status ${response.status}: ${JSON.stringify( + errorData + )}` + ) + } + + const dataTokens = await response.json() + const textTokens = dataTokens.tokens?.length || 0 + + return textTokens + imageTokens + } + + private async calculateImageTokens( + messages: chatCompletionRequestMessage[], + metadata: Record + ): Promise { + // Extract vision parameters from metadata + const projectionDim = Math.floor(Number(metadata['clip.vision.projection_dim']) / 10) || 256 + + // Count images in messages + let imageCount = 0 + for (const message of messages) { + if (Array.isArray(message.content)) { + imageCount += message.content.filter( + (content) => content.type === 'image_url' + ).length + } + } + + logger.info( + `Calculated ${projectionDim} tokens per image, ${imageCount} images total` + ) + return projectionDim * imageCount - imageCount // remove the lingering <__image__> placeholder token + } + + private estimateImageTokensFallback( + messages: chatCompletionRequestMessage[] + ): number { + // Fallback estimation if metadata reading fails + const estimatedTokensPerImage = 256 // 
Gemma's siglip + + let imageCount = 0 + for (const message of messages) { + if (Array.isArray(message.content)) { + imageCount += message.content.filter( + (content) => content.type === 'image_url' + ).length + } + } + + logger.warn( + `Fallback estimation: ${estimatedTokensPerImage} tokens per image, ${imageCount} images total` + ) + return imageCount * estimatedTokensPerImage - imageCount // remove the lingering <__image__> placeholder token + } } diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/commands.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/commands.rs index 79ec81f5a..96ecb36bc 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/commands.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/commands.rs @@ -12,7 +12,7 @@ use tokio::time::Instant; use crate::device::{get_devices_from_backend, DeviceInfo}; use crate::error::{ErrorCode, LlamacppError, ServerError, ServerResult}; -use crate::path::{validate_binary_path, validate_model_path, validate_mmproj_path}; +use crate::path::{validate_binary_path, validate_mmproj_path, validate_model_path}; use crate::process::{ find_session_by_model_id, get_all_active_sessions, get_all_loaded_model_ids, get_random_available_port, is_process_running_by_pid, @@ -55,7 +55,20 @@ pub async fn load_llama_model( let port = parse_port_from_args(&args); let model_path_pb = validate_model_path(&mut args)?; - let _mmproj_path_pb = validate_mmproj_path(&mut args)?; + let mmproj_path_pb = validate_mmproj_path(&mut args)?; + + let mmproj_path_string = if let Some(ref _mmproj_pb) = mmproj_path_pb { + // Find the actual mmproj path from args after validation/conversion + if let Some(mmproj_index) = args.iter().position(|arg| arg == "--mmproj") { + Some(args[mmproj_index + 1].clone()) + } else { + None + } + } else { + None + }; + + log::info!("MMPROJ Path string: {}", &mmproj_path_string.as_ref().unwrap_or(&"None".to_string())); let api_key: String; @@ -211,6 +224,7 @@ pub async fn load_llama_model( model_id: model_id, model_path: model_path_pb.display().to_string(), api_key: api_key, + mmproj_path: mmproj_path_string, }; // Insert session info to process_map @@ -265,7 +279,7 @@ pub async fn unload_llama_model( pub async fn get_devices( backend_path: &str, library_path: Option<&str>, - envs: HashMap + envs: HashMap, ) -> ServerResult> { get_devices_from_backend(backend_path, library_path, envs).await } diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/state.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/state.rs index 359a27951..2aad02ecf 100644 --- a/src-tauri/plugins/tauri-plugin-llamacpp/src/state.rs +++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/state.rs @@ -11,6 +11,8 @@ pub struct SessionInfo { pub model_id: String, pub model_path: String, // path of the loaded model pub api_key: String, + #[serde(default)] + pub mmproj_path: Option, } pub struct LLamaBackendSession { diff --git a/web-app/src/components/TokenCounter.tsx b/web-app/src/components/TokenCounter.tsx new file mode 100644 index 000000000..0863176c7 --- /dev/null +++ b/web-app/src/components/TokenCounter.tsx @@ -0,0 +1,283 @@ +import { useMemo, useEffect, useState, useRef } from 'react' +import { cn } from '@/lib/utils' +import { + Tooltip, + TooltipContent, + TooltipProvider, + TooltipTrigger, +} from '@/components/ui/tooltip' +import { useTokensCount } from '@/hooks/useTokensCount' +import { ThreadMessage } from '@janhq/core' + +interface TokenCounterProps { + messages?: ThreadMessage[] + className?: string + compact?: boolean + additionalTokens?: number // For 
vision tokens or other additions + uploadedFiles?: Array<{ + name: string + type: string + size: number + base64: string + dataUrl: string + }> +} + +export const TokenCounter = ({ + messages = [], + className, + compact = false, + additionalTokens = 0, + uploadedFiles = [], +}: TokenCounterProps) => { + const { calculateTokens, ...tokenData } = useTokensCount( + messages, + uploadedFiles + ) + + const [isAnimating, setIsAnimating] = useState(false) + const [prevTokenCount, setPrevTokenCount] = useState(0) + const [isUpdating, setIsUpdating] = useState(false) + const timersRef = useRef<{ update?: NodeJS.Timeout; anim?: NodeJS.Timeout }>( + {} + ) + + // Manual calculation - trigger on click + const handleCalculateTokens = () => { + calculateTokens() + } + + // Handle token count changes with proper debouncing and cleanup + useEffect(() => { + const currentTotal = tokenData.tokenCount + additionalTokens + const timers = timersRef.current + + // Clear any existing timers + if (timers.update) clearTimeout(timers.update) + if (timers.anim) clearTimeout(timers.anim) + + if (currentTotal !== prevTokenCount) { + setIsUpdating(true) + + // Clear updating state after a longer delay for smoother transitions + timers.update = setTimeout(() => { + setIsUpdating(false) + }, 250) + + // Only animate for significant changes and avoid animating on initial load + if (prevTokenCount > 0) { + const difference = Math.abs(currentTotal - prevTokenCount) + if (difference > 10) { + // Increased threshold to reduce micro-animations + setIsAnimating(true) + timers.anim = setTimeout(() => { + setIsAnimating(false) + }, 600) + } + } + + setPrevTokenCount(currentTotal) + } + + // Cleanup function + return () => { + if (timers.update) clearTimeout(timers.update) + if (timers.anim) clearTimeout(timers.anim) + } + }, [tokenData.tokenCount, additionalTokens, prevTokenCount]) + + const totalTokens = useMemo(() => { + return tokenData.tokenCount + additionalTokens + }, [tokenData.tokenCount, additionalTokens]) + + // Percentage calculation to match useTokensCount exactly + const adjustedPercentage = useMemo(() => { + if (!tokenData.maxTokens) return undefined + return (totalTokens / tokenData.maxTokens) * 100 + }, [totalTokens, tokenData.maxTokens]) + + // Check if percentage exceeds max (100%) + const isOverLimit = useMemo(() => { + return adjustedPercentage !== undefined && adjustedPercentage > 100 + }, [adjustedPercentage]) + + const formatNumber = (num: number) => { + if (num >= 1000000) return `${(num / 1000000).toFixed(1)}M` + if (num >= 1000) return `${(num / 1000).toFixed(1)}K` + return num.toString() + } + + if (compact) { + return ( + + + +
+ {/* Main compact display */} +
+ + {adjustedPercentage?.toFixed(1) || '0.0'}% + + +
+ + + + +
+
+
+
+ + {/* Detailed breakdown panel */} + <> + {/* Header with percentage and progress bar */} +
+
+ + {adjustedPercentage?.toFixed(1) || '0.0'}% + + + {formatNumber(totalTokens)} /{' '} + {formatNumber(tokenData.maxTokens || 0)} + +
+ + {/* Progress bar */} +
+
+
+
+ + {/* Token breakdown */} +
+
+ Text + + {formatNumber(Math.max(0, tokenData.tokenCount))} + +
+
+ + {/* Remaining tokens */} +
+
+ Remaining + + {formatNumber( + Math.max(0, (tokenData.maxTokens || 0) - totalTokens) + )} + +
+
+ + + + + ) + } + + // Non-compact: Simple inline display + return ( +
+
+ Context  + + {formatNumber(totalTokens)} + + {tokenData.maxTokens && ( + <> + / + + {formatNumber(tokenData.maxTokens)} + + + ({adjustedPercentage?.toFixed(1) || '0.0'}%) + + {isOverLimit && ( + +  {isOverLimit ? '⚠️ Over limit' : 'Tokens used'} + + )} + + )} +
+
+ ) +} diff --git a/web-app/src/components/ui/tooltip.tsx b/web-app/src/components/ui/tooltip.tsx index b7cae36a5..78e71a538 100644 --- a/web-app/src/components/ui/tooltip.tsx +++ b/web-app/src/components/ui/tooltip.tsx @@ -35,9 +35,12 @@ function TooltipTrigger({ function TooltipContent({ className, sideOffset = 0, + showArrow = true, children, ...props -}: React.ComponentProps) { +}: React.ComponentProps & { + showArrow?: boolean +}) { return ( {children} - + {showArrow && ( + + )} ) diff --git a/web-app/src/containers/ChatInput.tsx b/web-app/src/containers/ChatInput.tsx index f82d17f52..0b34d0d3a 100644 --- a/web-app/src/containers/ChatInput.tsx +++ b/web-app/src/containers/ChatInput.tsx @@ -34,6 +34,9 @@ import { ModelLoader } from '@/containers/loaders/ModelLoader' import DropdownToolsAvailable from '@/containers/DropdownToolsAvailable' import { useServiceHub } from '@/hooks/useServiceHub' import { useTools } from '@/hooks/useTools' +import { TokenCounter } from '@/components/TokenCounter' +import { useMessages } from '@/hooks/useMessages' +import { useShallow } from 'zustand/react/shallow' type ChatInputProps = { className?: string @@ -56,9 +59,21 @@ const ChatInput = ({ model, className, initialMessage }: ChatInputProps) => { const setPrompt = usePrompt((state) => state.setPrompt) const currentThreadId = useThreads((state) => state.currentThreadId) const { t } = useTranslation() - const { spellCheckChatInput } = useGeneralSetting() + const spellCheckChatInput = useGeneralSetting( + (state) => state.spellCheckChatInput + ) + const tokenCounterCompact = useGeneralSetting( + (state) => state.tokenCounterCompact + ) useTools() + // Get current thread messages for token counting + const threadMessages = useMessages( + useShallow((state) => + currentThreadId ? state.messages[currentThreadId] : [] + ) + ) + const maxRows = 10 const selectedModel = useModelProvider((state) => state.selectedModel) @@ -79,6 +94,7 @@ const ChatInput = ({ model, className, initialMessage }: ChatInputProps) => { const [connectedServers, setConnectedServers] = useState([]) const [isDragOver, setIsDragOver] = useState(false) const [hasMmproj, setHasMmproj] = useState(false) + const [hasActiveModels, setHasActiveModels] = useState(false) // Check for connected MCP servers useEffect(() => { @@ -100,6 +116,28 @@ const ChatInput = ({ model, className, initialMessage }: ChatInputProps) => { return () => clearInterval(intervalId) }, [serviceHub]) + // Check for active models + useEffect(() => { + const checkActiveModels = async () => { + try { + const activeModels = await serviceHub + .models() + .getActiveModels('llamacpp') + setHasActiveModels(activeModels.length > 0) + } catch (error) { + console.error('Failed to get active models:', error) + setHasActiveModels(false) + } + } + + checkActiveModels() + + // Poll for active models every 3 seconds + const intervalId = setInterval(checkActiveModels, 3000) + + return () => clearInterval(intervalId) + }, [serviceHub]) + // Check for mmproj existence or vision capability when model changes useEffect(() => { const checkMmprojSupport = async () => { @@ -742,35 +780,51 @@ const ChatInput = ({ model, className, initialMessage }: ChatInputProps) => {
- {streamingContent ? ( - - ) : ( - - )} + + {streamingContent ? ( + + ) : ( + + )} + @@ -792,6 +846,20 @@ const ChatInput = ({ model, className, initialMessage }: ChatInputProps) => { )} + + {selectedProvider === 'llamacpp' && + hasActiveModels && + !tokenCounterCompact && + !initialMessage && + (threadMessages?.length > 0 || prompt.trim().length > 0) && ( +
+ +
+ )} ) } diff --git a/web-app/src/containers/TokenCounterCompactSwitcher.tsx b/web-app/src/containers/TokenCounterCompactSwitcher.tsx new file mode 100644 index 000000000..3270941cd --- /dev/null +++ b/web-app/src/containers/TokenCounterCompactSwitcher.tsx @@ -0,0 +1,17 @@ +import { useGeneralSetting } from '@/hooks/useGeneralSetting' +import { Switch } from '@/components/ui/switch' + +export function TokenCounterCompactSwitcher() { + const { tokenCounterCompact, setTokenCounterCompact } = useGeneralSetting() + + const toggleTokenCounterCompact = () => { + setTokenCounterCompact(!tokenCounterCompact) + } + + return ( + + ) +} diff --git a/web-app/src/hooks/useGeneralSetting.ts b/web-app/src/hooks/useGeneralSetting.ts index b356ca8a3..e76c49017 100644 --- a/web-app/src/hooks/useGeneralSetting.ts +++ b/web-app/src/hooks/useGeneralSetting.ts @@ -6,9 +6,11 @@ import { ExtensionManager } from '@/lib/extension' type LeftPanelStoreState = { currentLanguage: Language spellCheckChatInput: boolean + tokenCounterCompact: boolean huggingfaceToken?: string setHuggingfaceToken: (token: string) => void setSpellCheckChatInput: (value: boolean) => void + setTokenCounterCompact: (value: boolean) => void setCurrentLanguage: (value: Language) => void } @@ -17,8 +19,10 @@ export const useGeneralSetting = create()( (set) => ({ currentLanguage: 'en', spellCheckChatInput: true, + tokenCounterCompact: true, huggingfaceToken: undefined, setSpellCheckChatInput: (value) => set({ spellCheckChatInput: value }), + setTokenCounterCompact: (value) => set({ tokenCounterCompact: value }), setCurrentLanguage: (value) => set({ currentLanguage: value }), setHuggingfaceToken: (token) => { set({ huggingfaceToken: token }) diff --git a/web-app/src/hooks/useTokensCount.ts b/web-app/src/hooks/useTokensCount.ts new file mode 100644 index 000000000..90f740a4a --- /dev/null +++ b/web-app/src/hooks/useTokensCount.ts @@ -0,0 +1,200 @@ +import { useCallback, useState, useRef, useEffect, useMemo } from 'react' +import { ThreadMessage, ContentType } from '@janhq/core' +import { useServiceHub } from './useServiceHub' +import { useModelProvider } from './useModelProvider' +import { usePrompt } from './usePrompt' + +export interface TokenCountData { + tokenCount: number + maxTokens?: number + percentage?: number + isNearLimit: boolean + loading: boolean + error?: string +} + +export const useTokensCount = ( + messages: ThreadMessage[] = [], + uploadedFiles?: Array<{ + name: string + type: string + size: number + base64: string + dataUrl: string + }> +) => { + const [tokenData, setTokenData] = useState({ + tokenCount: 0, + loading: false, + isNearLimit: false, + }) + + const debounceTimeoutRef = useRef(undefined) + const isIncreasingContextSize = useRef(false) + const serviceHub = useServiceHub() + const { selectedModel, selectedProvider } = useModelProvider() + const { prompt } = usePrompt() + + // Create messages with current prompt for live calculation + const messagesWithPrompt = useMemo(() => { + const result = [...messages] + if (prompt.trim() || (uploadedFiles && uploadedFiles.length > 0)) { + const content = [] + + // Add text content if prompt exists + if (prompt.trim()) { + content.push({ type: ContentType.Text, text: { value: prompt } }) + } + + // Add image content for uploaded files + if (uploadedFiles && uploadedFiles.length > 0) { + uploadedFiles.forEach((file) => { + content.push({ + type: ContentType.Image, + image_url: { + url: file.dataUrl, + detail: 'high', // Default to high detail for token calculation + }, + }) + }) + } 
+ + if (content.length > 0) { + result.push({ + id: 'temp-prompt', + thread_id: '', + role: 'user', + content, + created_at: Date.now(), + } as ThreadMessage) + } + } + return result + }, [messages, prompt, uploadedFiles]) + + // Debounced calculation that includes current prompt + const debouncedCalculateTokens = useCallback(async () => { + const modelId = selectedModel?.id + if (!modelId || selectedProvider !== 'llamacpp') { + setTokenData({ + tokenCount: 0, + loading: false, + isNearLimit: false, + }) + return + } + + // Use messages with current prompt for calculation + const messagesToCalculate = messagesWithPrompt + if (messagesToCalculate.length === 0) { + setTokenData({ + tokenCount: 0, + loading: false, + isNearLimit: false, + }) + return + } + + setTokenData((prev) => ({ ...prev, loading: true, error: undefined })) + + try { + const tokenCount = await serviceHub + .models() + .getTokensCount(modelId, messagesToCalculate) + + const maxTokensValue = + selectedModel?.settings?.ctx_len?.controller_props?.value + const maxTokensNum = + typeof maxTokensValue === 'string' + ? parseInt(maxTokensValue) + : typeof maxTokensValue === 'number' + ? maxTokensValue + : undefined + + const percentage = maxTokensNum + ? (tokenCount / maxTokensNum) * 100 + : undefined + const isNearLimit = percentage ? percentage > 85 : false + + setTokenData({ + tokenCount, + maxTokens: maxTokensNum, + percentage, + isNearLimit, + loading: false, + }) + } catch (error) { + console.error('Failed to calculate tokens:', error) + setTokenData((prev) => ({ + ...prev, + loading: false, + error: + error instanceof Error ? error.message : 'Failed to calculate tokens', + })) + } + }, [ + selectedModel?.id, + selectedProvider, + messagesWithPrompt, + serviceHub, + selectedModel?.settings?.ctx_len?.controller_props?.value, + ]) + + // Debounced effect that triggers when prompt or messages change + useEffect(() => { + // Clear existing timeout + if (debounceTimeoutRef.current) { + clearTimeout(debounceTimeoutRef.current) + } + + // Skip calculation if we're currently increasing context size + if (isIncreasingContextSize.current) { + return + } + + // Only calculate if we have messages or a prompt + if ( + messagesWithPrompt.length > 0 && + selectedProvider === 'llamacpp' && + selectedModel?.id + ) { + debounceTimeoutRef.current = setTimeout(() => { + debouncedCalculateTokens() + }, 150) // 150ms debounce for more responsive updates + } else { + // Reset immediately if no content + setTokenData({ + tokenCount: 0, + loading: false, + isNearLimit: false, + }) + } + + return () => { + if (debounceTimeoutRef.current) { + clearTimeout(debounceTimeoutRef.current) + } + } + }, [ + prompt, + messages.length, + selectedModel?.id, + selectedProvider, + messagesWithPrompt.length, + debouncedCalculateTokens, + ]) + + // Manual calculation function (for click events) + const calculateTokens = useCallback(async () => { + // Trigger the debounced calculation immediately + if (debounceTimeoutRef.current) { + clearTimeout(debounceTimeoutRef.current) + } + await debouncedCalculateTokens() + }, [debouncedCalculateTokens]) + + return { + ...tokenData, + calculateTokens, + } +} diff --git a/web-app/src/locales/de-DE/settings.json b/web-app/src/locales/de-DE/settings.json index 94c6c82a7..ec1429353 100644 --- a/web-app/src/locales/de-DE/settings.json +++ b/web-app/src/locales/de-DE/settings.json @@ -100,6 +100,8 @@ "resetAppearanceSuccessDesc": "Alle Darstellungseinstellungen wurden auf die Standardeinstellungen zurückgesetzt.", "chatWidth": "Chat 
Breite", "chatWidthDesc": "Passe die Breite der Chatansicht an.", + "tokenCounterCompact": "Kompakter Token-Zähler", + "tokenCounterCompactDesc": "Token-Zähler im Chat-Eingabefeld anzeigen. Wenn deaktiviert, wird der Token-Zähler unter dem Eingabefeld angezeigt.", "codeBlockTitle": "Code Block", "codeBlockDesc": "Wähle einen Stil zur Syntaxhervorhebung.", "showLineNumbers": "Zeilennummern anzeigen", diff --git a/web-app/src/locales/en/settings.json b/web-app/src/locales/en/settings.json index 44a56d9e0..bea43d2de 100644 --- a/web-app/src/locales/en/settings.json +++ b/web-app/src/locales/en/settings.json @@ -100,6 +100,8 @@ "resetAppearanceSuccessDesc": "All appearance settings have been restored to default.", "chatWidth": "Chat Width", "chatWidthDesc": "Customize the width of the chat view.", + "tokenCounterCompact": "Compact Token Counter", + "tokenCounterCompactDesc": "Show token counter inside chat input. When disabled, token counter appears below the input.", "codeBlockTitle": "Code Block", "codeBlockDesc": "Choose a syntax highlighting style.", "showLineNumbers": "Show Line Numbers", diff --git a/web-app/src/locales/vn/settings.json b/web-app/src/locales/vn/settings.json index 618aa046b..c7a92e348 100644 --- a/web-app/src/locales/vn/settings.json +++ b/web-app/src/locales/vn/settings.json @@ -100,6 +100,8 @@ "resetAppearanceSuccessDesc": "Tất cả cài đặt giao diện đã được khôi phục về mặc định.", "chatWidth": "Chiều rộng trò chuyện", "chatWidthDesc": "Tùy chỉnh chiều rộng của chế độ xem trò chuyện.", + "tokenCounterCompact": "Bộ đếm token nhỏ gọn", + "tokenCounterCompactDesc": "Hiển thị bộ đếm token bên trong ô nhập trò chuyện. Khi tắt, bộ đếm token sẽ xuất hiện bên dưới ô nhập.", "codeBlockTitle": "Khối mã", "codeBlockDesc": "Chọn kiểu tô sáng cú pháp.", "showLineNumbers": "Hiển thị số dòng", diff --git a/web-app/src/locales/zh-CN/settings.json b/web-app/src/locales/zh-CN/settings.json index d2dead089..805901044 100644 --- a/web-app/src/locales/zh-CN/settings.json +++ b/web-app/src/locales/zh-CN/settings.json @@ -100,6 +100,8 @@ "resetAppearanceSuccessDesc": "所有外观设置已恢复为默认值。", "chatWidth": "聊天宽度", "chatWidthDesc": "自定义聊天视图的宽度。", + "tokenCounterCompact": "紧凑令牌计数器", + "tokenCounterCompactDesc": "在聊天输入框内显示令牌计数器。禁用时,令牌计数器显示在输入框下方。", "codeBlockTitle": "代码块", "codeBlockDesc": "选择语法高亮样式。", "showLineNumbers": "显示行号", @@ -264,4 +266,3 @@ "updateError": "更新 Llamacpp 失败" } } - diff --git a/web-app/src/routes/settings/appearance.tsx b/web-app/src/routes/settings/appearance.tsx index 3cba3eed5..118f82d07 100644 --- a/web-app/src/routes/settings/appearance.tsx +++ b/web-app/src/routes/settings/appearance.tsx @@ -19,6 +19,7 @@ import { LineNumbersSwitcher } from '@/containers/LineNumbersSwitcher' import { CodeBlockExample } from '@/containers/CodeBlockExample' import { toast } from 'sonner' import { ChatWidthSwitcher } from '@/containers/ChatWidthSwitcher' +import { TokenCounterCompactSwitcher } from '@/containers/TokenCounterCompactSwitcher' // eslint-disable-next-line @typescript-eslint/no-explicit-any export const Route = createFileRoute(route.settings.appearance as any)({ @@ -115,6 +116,11 @@ function Appareances() { description={t('settings:appearance.chatWidthDesc')} /> + } + /> {/* Codeblock */} diff --git a/web-app/src/services/models/default.ts b/web-app/src/services/models/default.ts index 65fb17a8e..5a31f3993 100644 --- a/web-app/src/services/models/default.ts +++ b/web-app/src/services/models/default.ts @@ -9,6 +9,8 @@ import { SessionInfo, SettingComponentProps, modelInfo, + 
ThreadMessage, + ContentType, } from '@janhq/core' import { Model as CoreModel } from '@janhq/core' import type { @@ -544,4 +546,113 @@ export class DefaultModelsService implements ModelsService { } } } + + async getTokensCount( + modelId: string, + messages: ThreadMessage[] + ): Promise { + try { + const engine = this.getEngine('llamacpp') as AIEngine & { + getTokensCount?: (opts: { + model: string + messages: Array<{ + role: string + content: + | string + | Array<{ + type: string + text?: string + image_url?: { + detail?: string + url?: string + } + }> + }> + }) => Promise + } + + if (engine && typeof engine.getTokensCount === 'function') { + // Transform Jan's ThreadMessage format to OpenAI chat completion format + const transformedMessages = messages + .map((message) => { + // Handle different content types + let content: + | string + | Array<{ + type: string + text?: string + image_url?: { + detail?: string + url?: string + } + }> = '' + + if (message.content && message.content.length > 0) { + // Check if there are any image_url content types + const hasImages = message.content.some( + (content) => content.type === ContentType.Image + ) + + if (hasImages) { + // For multimodal messages, preserve the array structure + content = message.content.map((contentItem) => { + if (contentItem.type === ContentType.Text) { + return { + type: 'text', + text: contentItem.text?.value || '', + } + } else if (contentItem.type === ContentType.Image) { + return { + type: 'image_url', + image_url: { + detail: contentItem.image_url?.detail, + url: contentItem.image_url?.url || '', + }, + } + } + // Fallback for unknown content types + return { + type: contentItem.type, + text: contentItem.text?.value, + image_url: contentItem.image_url, + } + }) + } else { + // For text-only messages, keep the string format + const textContents = message.content + .filter( + (content) => + content.type === ContentType.Text && content.text?.value + ) + .map((content) => content.text?.value || '') + + content = textContents.join(' ') + } + } + + return { + role: message.role, + content, + } + }) + .filter((msg) => + typeof msg.content === 'string' + ? msg.content.trim() !== '' + : Array.isArray(msg.content) && msg.content.length > 0 + ) // Filter out empty messages + + return await engine.getTokensCount({ + model: modelId, + messages: transformedMessages, + }) + } + + // Fallback if method is not available + console.warn('getTokensCount method not available in llamacpp engine') + return 0 + } catch (error) { + console.error(`Error getting tokens count for model ${modelId}:`, error) + return 0 + } + } } diff --git a/web-app/src/services/models/types.ts b/web-app/src/services/models/types.ts index b7724fef2..5bf66b8bf 100644 --- a/web-app/src/services/models/types.ts +++ b/web-app/src/services/models/types.ts @@ -2,7 +2,7 @@ * Models Service Types */ -import { SessionInfo, modelInfo } from '@janhq/core' +import { SessionInfo, modelInfo, ThreadMessage } from '@janhq/core' import { Model as CoreModel } from '@janhq/core' // Types for model catalog @@ -142,4 +142,5 @@ export interface ModelsService { mmprojPath?: string, requestedCtx?: number ): Promise + getTokensCount(modelId: string, messages: ThreadMessage[]): Promise }
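
For reference, below is a minimal standalone sketch of the token-counting flow this patch implements. The /apply-template and /tokenize endpoints, the clip.vision.projection_dim heuristic, and the 256-tokens-per-image fallback are taken from the diff above; the Message shape, function name, and the idea of passing baseUrl/apiKey directly are illustrative assumptions — the real extension resolves them from the active llama.cpp session.

// Hypothetical sketch, not the extension's actual implementation.
type Message = {
  role: string
  content:
    | string
    | Array<{ type: string; text?: string; image_url?: { url: string } }>
}

async function countTokens(
  baseUrl: string, // e.g. http://localhost:<session.port> in the extension
  apiKey: string,
  messages: Message[],
  mmprojMetadata?: Record<string, unknown>
): Promise<number> {
  const headers = {
    'Content-Type': 'application/json',
    Authorization: `Bearer ${apiKey}`,
  }

  // 1. Ask the server to render its chat template for these messages.
  const templated = await fetch(`${baseUrl}/apply-template`, {
    method: 'POST',
    headers,
    body: JSON.stringify({ messages }),
  }).then((r) => r.json())

  // 2. Tokenize the rendered prompt to get the text token count.
  const tokenized = await fetch(`${baseUrl}/tokenize`, {
    method: 'POST',
    headers,
    body: JSON.stringify({ content: templated.prompt }),
  }).then((r) => r.json())
  const textTokens: number = tokenized.tokens?.length ?? 0

  // 3. Estimate image tokens from the mmproj metadata, falling back to
  //    256 per image (the patch's fallback) when metadata is unavailable.
  const imageCount = messages
    .filter((m) => Array.isArray(m.content))
    .flatMap((m) => m.content as Array<{ type: string }>)
    .filter((c) => c.type === 'image_url').length

  const perImage = mmprojMetadata
    ? Math.floor(Number(mmprojMetadata['clip.vision.projection_dim']) / 10) || 256
    : 256
  // Subtract one token per image for the lingering <__image__> placeholder,
  // as the patch does.
  const imageTokens = imageCount > 0 ? perImage * imageCount - imageCount : 0

  return textTokens + imageTokens
}

In the web app this flow is reached through ModelsService.getTokensCount(modelId, messages), which converts Jan's ThreadMessage content into the OpenAI-style message shape before delegating to the engine, while the useTokensCount hook debounces the call as the prompt and uploaded files change.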