diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index ca8e7791f..8590891b6 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -2286,38 +2286,36 @@ export default class llamacpp_extension extends AIEngine {
     }
 
     // Calculate text tokens
-    // Use a direct approach: convert messages to text and tokenize directly
-    // This avoids issues with enable_thinking and assistant prefills
-    let textToTokenize = ''
-
-    for (const msg of opts.messages) {
-      const rolePrefix =
-        msg.role === 'user'
-          ? 'User: '
-          : msg.role === 'assistant'
-            ? 'Assistant: '
-            : msg.role === 'system'
-              ? 'System: '
-              : ''
-
-      if (typeof msg.content === 'string') {
-        textToTokenize += `${rolePrefix}${msg.content}\n`
-      } else if (Array.isArray(msg.content)) {
-        for (const part of msg.content) {
-          if (part.type === 'text' && part.text) {
-            textToTokenize += part.text
-          }
-          // Skip image tokens as they're calculated separately
-        }
-        textToTokenize += '\n'
-      }
+    // Use chat_template_kwargs from opts if provided, otherwise default to disabling enable_thinking
+    const tokenizeRequest = {
+      messages: opts.messages,
+      chat_template_kwargs: opts.chat_template_kwargs || {
+        enable_thinking: false,
+      },
     }
 
+    let parseResponse = await fetch(`${baseUrl}/apply-template`, {
+      method: 'POST',
+      headers: headers,
+      body: JSON.stringify(tokenizeRequest),
+    })
+
+    if (!parseResponse.ok) {
+      const errorData = await parseResponse.json().catch(() => null)
+      throw new Error(
+        `API request failed with status ${
+          parseResponse.status
+        }: ${JSON.stringify(errorData)}`
+      )
+    }
+
+    const parsedPrompt = await parseResponse.json()
+
     const response = await fetch(`${baseUrl}/tokenize`, {
       method: 'POST',
       headers: headers,
       body: JSON.stringify({
-        content: textToTokenize,
+        content: parsedPrompt.prompt,
       }),
     })
 
diff --git a/web-app/src/services/models/default.ts b/web-app/src/services/models/default.ts
index 746f869d1..203ab5ccd 100644
--- a/web-app/src/services/models/default.ts
+++ b/web-app/src/services/models/default.ts
@@ -578,6 +578,9 @@ export class DefaultModelsService implements ModelsService {
           }
         }>
       }>
+      chat_template_kwargs?: {
+        enable_thinking: boolean
+      }
     }) => Promise<number>
   }
 
@@ -654,6 +657,9 @@ export class DefaultModelsService implements ModelsService {
     return await engine.getTokensCount({
       model: modelId,
      messages: transformedMessages,
+      chat_template_kwargs: {
+        enable_thinking: false,
+      },
     })
   }
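In short: instead of hand-concatenating `User:`/`Assistant:`-prefixed text, the extension now asks the llama.cpp server itself to render the messages. It first POSTs them to `/apply-template`, passing `chat_template_kwargs` through from the caller (defaulting to `{ enable_thinking: false }`), and then POSTs the rendered `prompt` to `/tokenize` to get the count. A minimal standalone sketch of that flow follows; the `countPromptTokens` helper, the `ChatMessage` type, and the JSON headers are illustrative assumptions, not part of the patch.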
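```ts
// Sketch of the two-step counting flow, assuming a llama.cpp server at
// `baseUrl`. Helper and type names here are illustrative, not from the patch.
type ChatMessage = {
  role: 'system' | 'user' | 'assistant'
  content: string | Array<{ type: string; text?: string }>
}

async function countPromptTokens(
  baseUrl: string,
  messages: ChatMessage[],
  chatTemplateKwargs: Record<string, unknown> = { enable_thinking: false }
): Promise<number> {
  // Step 1: render the messages through the model's own chat template so
  // role markers and special tokens match what generation will consume.
  const templateRes = await fetch(`${baseUrl}/apply-template`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      messages,
      chat_template_kwargs: chatTemplateKwargs,
    }),
  })
  if (!templateRes.ok) {
    throw new Error(`apply-template failed: ${templateRes.status}`)
  }
  const { prompt } = await templateRes.json()

  // Step 2: tokenize the rendered prompt; the server responds with
  // { tokens: number[] }, so the count is just the array length.
  const tokenizeRes = await fetch(`${baseUrl}/tokenize`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ content: prompt }),
  })
  if (!tokenizeRes.ok) {
    throw new Error(`tokenize failed: ${tokenizeRes.status}`)
  }
  const { tokens } = await tokenizeRes.json()
  return tokens.length
}
```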
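The removed comment avoided the chat template precisely because of `enable_thinking` and assistant prefills; the patch addresses that head-on instead, defaulting `chat_template_kwargs` to `{ enable_thinking: false }` while letting callers override it (as the web-app change does explicitly). The payoff is accuracy: counting the template-rendered prompt includes the role markers and special tokens that the old `User:`/`Assistant:` concatenation never accounted for, so the estimate matches what the server actually tokenizes at generation time.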