Compare commits

...

5 Commits

Author SHA1 Message Date
Akarshan
ea231676bf
fix: correct flash_attn and main_gpu flag checks in llamacpp extension
Previously the condition for `flash_attn` was always truthy, causing
unnecessary or incorrect `--flash-attn` arguments to be added. The
`main_gpu` check also used a loose inequality which could match values
that were not intended. The updated logic uses strict comparison and
correctly handles the empty-string case, ensuring the command-line
arguments are generated only when appropriate.
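
A minimal sketch of the intended guard logic, assuming the `cfg` settings object and the `args: string[]` accumulator used by the extension's argument builder (the exact expression in the compare below may differ):

// Strict comparisons keep both flags out of the command line unless they are actually set.
if (cfg.main_gpu !== undefined && cfg.main_gpu !== 0) {
  // strict !== avoids the loose != matching unintended values
  args.push('--main-gpu', String(cfg.main_gpu))
}
if (cfg.flash_attn !== undefined && cfg.flash_attn !== '') {
  // emit only when a non-empty value ('auto' | 'on' | 'off') is configured
  args.push('--flash-attn', String(cfg.flash_attn))
}
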
2025-10-30 19:49:55 +05:30
Akarshan
1f4977c1d1
fix mmap settings and adjust flash attention 2025-10-29 08:02:11 +05:30
Akarshan
7b6e4cd172
fix: compare 2025-10-29 08:02:11 +05:30
Akarshan
8b15fe4ef2
feat: Simplify backend architecture
This commit introduces a flag for embedding models and refactors the backend detection logic for a cleaner implementation.

Key changes:

 - Embedding Support: The loadLlamaModel API and SessionInfo now include an isEmbedding: boolean flag. This allows the core process to distinguish models intended for embedding tasks and initialize them correctly.

 - Backend Naming Simplification (Refactor): Consolidated the CPU-specific backend tags (e.g., win-noavx-x64, win-avx2-x64) into generic *-common_cpus-x64 variants (e.g., win-common_cpus-x64). This streamlines supported-backend detection (see the sketch after this list).

 - File Structure Update: Changed the download path for CUDA runtime libraries (cudart) to place them inside the specific backend's directory (/build/bin/) rather than a shared lib folder, improving asset isolation.
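
To make the consolidated tags concrete, here is a rough TypeScript sketch of the simplified Windows branch (tag names are taken from the diff below; `features` is the existing feature-detection result):

// The per-CPU variants (noavx/avx/avx2/avx512) collapse into a single common_cpus build,
// so only accelerator support still influences the tag list.
const supportedBackends: string[] = []
supportedBackends.push('win-common_cpus-x64')
if (features.cuda11) supportedBackends.push('win-cuda-11-common_cpus-x64')
if (features.cuda12) supportedBackends.push('win-cuda-12-common_cpus-x64')
if (features.vulkan) supportedBackends.push('win-vulkan-common_cpus-x64')
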
2025-10-29 08:02:09 +05:30
Akarshan
0c5fbc102c
refactor: Simplify Tauri plugin calls and enhance 'Flash Attention' setting
This commit introduces significant improvements to the llama.cpp extension, focusing on the 'Flash Attention' setting and refactoring Tauri plugin interactions for better code clarity and maintainability.

The backend interaction is streamlined by removing the unnecessary `libraryPath` argument from the Tauri plugin commands for loading models and listing devices.

* **Simplified API Calls:** The `loadLlamaModel`, `unloadLlamaModel`, and `get_devices` functions in both the extension and the Tauri plugin now manage the library path internally, based on the backend executable's location (see the usage sketch at the end of this message).
* **Decoupled Logic:** The extension (`src/index.ts`) now uses the new, simplified Tauri plugin functions, which enhances modularity and reduces boilerplate.
* **Type Consistency:** Added `UnloadResult` interface to `guest-js/index.ts` for consistency.

* **Updated UI Control:** The 'Flash Attention' setting in `settings.json` is changed from a boolean checkbox to a string-based dropdown, offering **'auto'**, **'on'**, and **'off'** options.
* **Improved Logic:** The extension logic in `src/index.ts` now handles the new string-based `flash_attn` configuration by passing the value (`'auto'`, `'on'`, or `'off'`) directly as a command-line argument to the llama.cpp backend; the old version-checking logic required for older llama.cpp releases is removed (see the usage sketch at the end of this message).

This refactoring cleans up the extension's codebase and moves environment and path setup concerns into the Tauri plugin where they are most relevant.
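
A hypothetical usage sketch tying these changes together; the paths and argument values are placeholders, while the wrapper signatures follow the updated `guest-js/index.ts` shown further down:

import { loadLlamaModel, unloadLlamaModel } from '@janhq/tauri-plugin-llamacpp-api'

// Placeholders for illustration; in the extension these come from getBackendExePath()
// and the model manager.
const backendPath = '/path/to/backend/build/bin/llama-server'
const modelPath = '/path/to/model.gguf'

// The 'Flash Attention' dropdown value ('auto' | 'on' | 'off') is forwarded verbatim;
// there is no longer any branching on the llama.cpp build number.
const args = ['-m', modelPath, '--flash-attn', 'auto']

// No libraryPath argument: the plugin derives the library directory from the
// backend executable's location before spawning the server.
const sInfo = await loadLlamaModel(backendPath, args, {}, /* isEmbedding */ false)

// Unloading now resolves to an UnloadResult instead of void.
const result = await unloadLlamaModel(sInfo.pid)
if (!result.success) console.error(result.error)
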
2025-10-29 08:00:57 +05:30
7 changed files with 71 additions and 95 deletions

View File

@@ -149,9 +149,14 @@
"key": "flash_attn",
"title": "Flash Attention",
"description": "Enable Flash Attention for optimized performance.",
"controllerType": "checkbox",
"controllerType": "dropdown",
"controllerProps": {
"value": false
"value": "auto",
"options": [
{ "value": "auto", "name": "Auto" },
{ "value": "on", "name": "ON" },
{ "value": "off", "name": "OFF" }
]
}
},
{

View File

@@ -102,50 +102,27 @@ export async function listSupportedBackends(): Promise<
// TODO: fetch versions from the server?
// TODO: select CUDA version based on driver version
if (sysType == 'windows-x86_64') {
// NOTE: if a machine supports AVX2, should we include noavx and avx?
supportedBackends.push('win-noavx-x64')
if (features.avx) supportedBackends.push('win-avx-x64')
if (features.avx2) supportedBackends.push('win-avx2-x64')
if (features.avx512) supportedBackends.push('win-avx512-x64')
supportedBackends.push('win-common_cpus-x64')
if (features.cuda11) {
if (features.avx512) supportedBackends.push('win-avx512-cuda-cu11.7-x64')
else if (features.avx2) supportedBackends.push('win-avx2-cuda-cu11.7-x64')
else if (features.avx) supportedBackends.push('win-avx-cuda-cu11.7-x64')
else supportedBackends.push('win-noavx-cuda-cu11.7-x64')
supportedBackends.push('win-cuda-11-common_cpus-x64')
}
if (features.cuda12) {
if (features.avx512) supportedBackends.push('win-avx512-cuda-cu12.0-x64')
else if (features.avx2) supportedBackends.push('win-avx2-cuda-cu12.0-x64')
else if (features.avx) supportedBackends.push('win-avx-cuda-cu12.0-x64')
else supportedBackends.push('win-noavx-cuda-cu12.0-x64')
supportedBackends.push('win-cuda-12-common_cpus-x64')
}
if (features.vulkan) supportedBackends.push('win-vulkan-x64')
if (features.vulkan) supportedBackends.push('win-vulkan-common_cpus-x64')
}
// not available yet, placeholder for future
else if (sysType === 'windows-aarch64' || sysType === 'windows-arm64') {
supportedBackends.push('win-arm64')
} else if (sysType === 'linux-x86_64' || sysType === 'linux-x86') {
supportedBackends.push('linux-noavx-x64')
if (features.avx) supportedBackends.push('linux-avx-x64')
if (features.avx2) supportedBackends.push('linux-avx2-x64')
if (features.avx512) supportedBackends.push('linux-avx512-x64')
supportedBackends.push('linux-common_cpus-x64')
if (features.cuda11) {
if (features.avx512)
supportedBackends.push('linux-avx512-cuda-cu11.7-x64')
else if (features.avx2)
supportedBackends.push('linux-avx2-cuda-cu11.7-x64')
else if (features.avx) supportedBackends.push('linux-avx-cuda-cu11.7-x64')
else supportedBackends.push('linux-noavx-cuda-cu11.7-x64')
supportedBackends.push('linux-cuda-11-common_cpus-x64')
}
if (features.cuda12) {
if (features.avx512)
supportedBackends.push('linux-avx512-cuda-cu12.0-x64')
else if (features.avx2)
supportedBackends.push('linux-avx2-cuda-cu12.0-x64')
else if (features.avx) supportedBackends.push('linux-avx-cuda-cu12.0-x64')
else supportedBackends.push('linux-noavx-cuda-cu12.0-x64')
supportedBackends.push('linux-cuda-12-common_cpus-x64')
}
if (features.vulkan) supportedBackends.push('linux-vulkan-x64')
if (features.vulkan) supportedBackends.push('linux-vulkan-common_cpus-x64')
}
// not available yet, placeholder for future
else if (sysType === 'linux-aarch64' || sysType === 'linux-arm64') {
@@ -230,10 +207,7 @@ export async function downloadBackend(
version: string,
source: 'github' | 'cdn' = 'github'
): Promise<void> {
const janDataFolderPath = await getJanDataFolderPath()
const llamacppPath = await joinPath([janDataFolderPath, 'llamacpp'])
const backendDir = await getBackendDir(backend, version)
const libDir = await joinPath([llamacppPath, 'lib'])
const downloadManager = window.core.extensionManager.getByName(
'@janhq/download-extension'
@@ -265,7 +239,7 @@
source === 'github'
? `https://github.com/janhq/llama.cpp/releases/download/${version}/cudart-llama-bin-${platformName}-cu11.7-x64.tar.gz`
: `https://catalog.jan.ai/llama.cpp/releases/${version}/cudart-llama-bin-${platformName}-cu11.7-x64.tar.gz`,
save_path: await joinPath([libDir, 'cuda11.tar.gz']),
save_path: await joinPath([backendDir, 'build', 'bin', 'cuda11.tar.gz']),
proxy: proxyConfig,
})
} else if (backend.includes('cu12.0') && !(await _isCudaInstalled('12.0'))) {
@@ -274,7 +248,7 @@
source === 'github'
? `https://github.com/janhq/llama.cpp/releases/download/${version}/cudart-llama-bin-${platformName}-cu12.0-x64.tar.gz`
: `https://catalog.jan.ai/llama.cpp/releases/${version}/cudart-llama-bin-${platformName}-cu12.0-x64.tar.gz`,
save_path: await joinPath([libDir, 'cuda12.tar.gz']),
save_path: await joinPath([backendDir, 'build', 'bin', 'cuda12.tar.gz']),
proxy: proxyConfig,
})
}
@@ -344,8 +318,8 @@ async function _getSupportedFeatures() {
}
// https://docs.nvidia.com/deploy/cuda-compatibility/#cuda-11-and-later-defaults-to-minor-version-compatibility
let minCuda11DriverVersion
let minCuda12DriverVersion
let minCuda11DriverVersion: string
let minCuda12DriverVersion: string
if (sysInfo.os_type === 'linux') {
minCuda11DriverVersion = '450.80.02'
minCuda12DriverVersion = '525.60.13'

View File

@@ -38,10 +38,12 @@ import { invoke } from '@tauri-apps/api/core'
import { getProxyConfig } from './util'
import { basename } from '@tauri-apps/api/path'
import {
loadLlamaModel,
readGgufMetadata,
getModelSize,
isModelSupported,
planModelLoadInternal,
unloadLlamaModel,
} from '@janhq/tauri-plugin-llamacpp-api'
import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api'
@@ -69,7 +71,7 @@ type LlamacppConfig = {
device: string
split_mode: string
main_gpu: number
flash_attn: boolean
flash_attn: string
cont_batching: boolean
no_mmap: boolean
mlock: boolean
@@ -549,9 +551,9 @@ export default class llamacpp_extension extends AIEngine {
// Helper to map backend string to a priority category
const getBackendCategory = (backendString: string): string | undefined => {
if (backendString.includes('cu12.0')) return 'cuda-cu12.0'
if (backendString.includes('cu11.7')) return 'cuda-cu11.7'
if (backendString.includes('vulkan')) return 'vulkan'
if (backendString.includes('cuda-12-common_cpus')) return 'cuda-cu12.0'
if (backendString.includes('cuda-11-common_cpus')) return 'cuda-cu11.7'
if (backendString.includes('vulkan-common_cpus')) return 'vulkan'
if (backendString.includes('avx512')) return 'avx512'
if (backendString.includes('avx2')) return 'avx2'
if (
@@ -1644,18 +1646,20 @@ export default class llamacpp_extension extends AIEngine {
if (cfg.device.length > 0) args.push('--device', cfg.device)
if (cfg.split_mode.length > 0 && cfg.split_mode != 'layer')
args.push('--split-mode', cfg.split_mode)
if (cfg.main_gpu !== undefined && cfg.main_gpu != 0)
if (cfg.main_gpu !== undefined && cfg.main_gpu !== 0)
args.push('--main-gpu', String(cfg.main_gpu))
// Note: Older llama.cpp versions are no longer supported
if (
cfg.flash_attn !== undefined ||
!cfg.flash_attn ||
cfg.flash_attn !== ''
)
args.push('--flash-attn', String(cfg.flash_attn)) //default: auto = ON when supported
// Boolean flags
if (cfg.ctx_shift) args.push('--context-shift')
if (Number(version.replace(/^b/, '')) >= 6325) {
if (!cfg.flash_attn) args.push('--flash-attn', 'off') //default: auto = ON when supported
} else {
if (cfg.flash_attn) args.push('--flash-attn')
}
if (cfg.cont_batching) args.push('--cont-batching')
args.push('--no-mmap')
if (cfg.no_mmap) args.push('--no-mmap')
if (cfg.mlock) args.push('--mlock')
if (cfg.no_kv_offload) args.push('--no-kv-offload')
if (isEmbedding) {
@@ -1667,7 +1671,7 @@ export default class llamacpp_extension extends AIEngine {
if (cfg.cache_type_k && cfg.cache_type_k != 'f16')
args.push('--cache-type-k', cfg.cache_type_k)
if (
cfg.flash_attn &&
cfg.flash_attn !== 'on' &&
cfg.cache_type_v != 'f16' &&
cfg.cache_type_v != 'f32'
) {
@@ -1688,20 +1692,9 @@ export default class llamacpp_extension extends AIEngine {
logger.info('Calling Tauri command llama_load with args:', args)
const backendPath = await getBackendExePath(backend, version)
const libraryPath = await joinPath([await this.getProviderPath(), 'lib'])
try {
// TODO: add LIBRARY_PATH
const sInfo = await invoke<SessionInfo>(
'plugin:llamacpp|load_llama_model',
{
backendPath,
libraryPath,
args,
envs,
isEmbedding,
}
)
const sInfo = await loadLlamaModel(backendPath, args, envs, isEmbedding)
return sInfo
} catch (error) {
logger.error('Error in load command:\n', error)
@@ -1717,12 +1710,7 @@ export default class llamacpp_extension extends AIEngine {
const pid = sInfo.pid
try {
// Pass the PID as the session_id
const result = await invoke<UnloadResult>(
'plugin:llamacpp|unload_llama_model',
{
pid: pid,
}
)
const result = await unloadLlamaModel(pid)
// If successful, remove from active sessions
if (result.success) {
@@ -2042,7 +2030,10 @@ export default class llamacpp_extension extends AIEngine {
if (sysInfo?.os_type === 'linux' && Array.isArray(sysInfo.gpus)) {
const usage = await getSystemUsage()
if (usage && Array.isArray(usage.gpus)) {
const uuidToUsage: Record<string, { total_memory: number; used_memory: number }> = {}
const uuidToUsage: Record<
string,
{ total_memory: number; used_memory: number }
> = {}
for (const u of usage.gpus as any[]) {
if (u && typeof u.uuid === 'string') {
uuidToUsage[u.uuid] = u
@@ -2082,7 +2073,10 @@ export default class llamacpp_extension extends AIEngine {
typeof u.used_memory === 'number'
) {
const total = Math.max(0, Math.floor(u.total_memory))
const free = Math.max(0, Math.floor(u.total_memory - u.used_memory))
const free = Math.max(
0,
Math.floor(u.total_memory - u.used_memory)
)
return { ...dev, mem: total, free }
}
}

View File

@@ -2,11 +2,18 @@ import { invoke } from '@tauri-apps/api/core'
// Types
export interface SessionInfo {
pid: number
port: number
model_id: string
model_path: string
api_key: string
pid: number;
port: number;
model_id: string;
model_path: string;
is_embedding: boolean
api_key: string;
mmproj_path?: string;
}
export interface UnloadResult {
success: boolean;
error?: string;
}
export interface DeviceInfo {
@@ -29,19 +36,19 @@ export async function cleanupLlamaProcesses(): Promise<void> {
// LlamaCpp server commands
export async function loadLlamaModel(
backendPath: string,
libraryPath?: string,
args: string[] = [],
isEmbedding: boolean = false
args: string[],
envs: Record<string, string>,
isEmbedding: boolean
): Promise<SessionInfo> {
return await invoke('plugin:llamacpp|load_llama_model', {
backendPath,
libraryPath,
args,
isEmbedding,
envs,
isEmbedding
})
}
export async function unloadLlamaModel(pid: number): Promise<void> {
export async function unloadLlamaModel(pid: number): Promise<UnloadResult> {
return await invoke('plugin:llamacpp|unload_llama_model', { pid })
}

View File

@@ -41,7 +41,6 @@ pub struct UnloadResult {
pub async fn load_llama_model<R: Runtime>(
app_handle: tauri::AppHandle<R>,
backend_path: &str,
library_path: Option<&str>,
mut args: Vec<String>,
envs: HashMap<String, String>,
is_embedding: bool,
@@ -52,7 +51,7 @@
log::info!("Attempting to launch server at path: {:?}", backend_path);
log::info!("Using arguments: {:?}", args);
validate_binary_path(backend_path)?;
let bin_path = validate_binary_path(backend_path)?;
let port = parse_port_from_args(&args);
let model_path_pb = validate_model_path(&mut args)?;
@@ -83,11 +82,11 @@
let model_id = extract_arg_value(&args, "-a");
// Configure the command to run the server
let mut command = Command::new(backend_path);
let mut command = Command::new(&bin_path);
command.args(args);
command.envs(envs);
setup_library_path(library_path, &mut command);
setup_library_path(bin_path.parent().and_then(|p| p.to_str()), &mut command);
command.stdout(Stdio::piped());
command.stderr(Stdio::piped());
setup_windows_process_flags(&mut command);
@@ -280,10 +279,9 @@ pub async fn unload_llama_model<R: Runtime>(
#[tauri::command]
pub async fn get_devices(
backend_path: &str,
library_path: Option<&str>,
envs: HashMap<String, String>,
) -> ServerResult<Vec<DeviceInfo>> {
get_devices_from_backend(backend_path, library_path, envs).await
get_devices_from_backend(backend_path, envs).await
}
/// Generate API key using HMAC-SHA256

View File

@@ -19,20 +19,19 @@ pub struct DeviceInfo {
pub async fn get_devices_from_backend(
backend_path: &str,
library_path: Option<&str>,
envs: HashMap<String, String>,
) -> ServerResult<Vec<DeviceInfo>> {
log::info!("Getting devices from server at path: {:?}", backend_path);
validate_binary_path(backend_path)?;
let bin_path = validate_binary_path(backend_path)?;
// Configure the command to run the server with --list-devices
let mut command = Command::new(backend_path);
let mut command = Command::new(&bin_path);
command.arg("--list-devices");
command.envs(envs);
// Set up library path
setup_library_path(library_path, &mut command);
setup_library_path(bin_path.parent().and_then(|p| p.to_str()), &mut command);
command.stdout(Stdio::piped());
command.stderr(Stdio::piped());
@@ -410,4 +409,4 @@ AnotherInvalid
assert_eq!(result[0].id, "Vulkan0");
assert_eq!(result[1].id, "CUDA0");
}
}
}

View File

@@ -62,7 +62,6 @@ pub async fn estimate_kv_cache_internal(
ctx_size: Option<u64>,
) -> Result<KVCacheEstimate, KVCacheError> {
log::info!("Received ctx_size parameter: {:?}", ctx_size);
log::info!("Received model metadata:\n{:?}", &meta);
let arch = meta
.get("general.architecture")
.ok_or(KVCacheError::ArchitectureNotFound)?;