Compare commits
5 Commits
dev...refactor/b
| Author | SHA1 | Date |
|---|---|---|
|  | ea231676bf |  |
|  | 1f4977c1d1 |  |
|  | 7b6e4cd172 |  |
|  | 8b15fe4ef2 |  |
|  | 0c5fbc102c |  |
@@ -149,9 +149,14 @@
    "key": "flash_attn",
    "title": "Flash Attention",
    "description": "Enable Flash Attention for optimized performance.",
    "controllerType": "checkbox",
    "controllerType": "dropdown",
    "controllerProps": {
      "value": false
      "value": "auto",
      "options": [
        { "value": "auto", "name": "Auto" },
        { "value": "on", "name": "ON" },
        { "value": "off", "name": "OFF" }
      ]
    }
  },
  {
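Consolidated, the setting's new shape after this hunk is a three-option dropdown defaulting to `auto`. A sketch of that state as a TypeScript literal, reconstructed from the added lines above (not copied from the repository):

```ts
// New shape of the Flash Attention setting, reconstructed from the hunk above.
const flashAttnSetting = {
  key: 'flash_attn',
  title: 'Flash Attention',
  description: 'Enable Flash Attention for optimized performance.',
  controllerType: 'dropdown',
  controllerProps: {
    value: 'auto',
    options: [
      { value: 'auto', name: 'Auto' },
      { value: 'on', name: 'ON' },
      { value: 'off', name: 'OFF' },
    ],
  },
}
```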
@@ -102,50 +102,27 @@ export async function listSupportedBackends(): Promise<
  // TODO: fetch versions from the server?
  // TODO: select CUDA version based on driver version
  if (sysType == 'windows-x86_64') {
    // NOTE: if a machine supports AVX2, should we include noavx and avx?
    supportedBackends.push('win-noavx-x64')
    if (features.avx) supportedBackends.push('win-avx-x64')
    if (features.avx2) supportedBackends.push('win-avx2-x64')
    if (features.avx512) supportedBackends.push('win-avx512-x64')
    supportedBackends.push('win-common_cpus-x64')
    if (features.cuda11) {
      if (features.avx512) supportedBackends.push('win-avx512-cuda-cu11.7-x64')
      else if (features.avx2) supportedBackends.push('win-avx2-cuda-cu11.7-x64')
      else if (features.avx) supportedBackends.push('win-avx-cuda-cu11.7-x64')
      else supportedBackends.push('win-noavx-cuda-cu11.7-x64')
      supportedBackends.push('win-cuda-11-common_cpus-x64')
    }
    if (features.cuda12) {
      if (features.avx512) supportedBackends.push('win-avx512-cuda-cu12.0-x64')
      else if (features.avx2) supportedBackends.push('win-avx2-cuda-cu12.0-x64')
      else if (features.avx) supportedBackends.push('win-avx-cuda-cu12.0-x64')
      else supportedBackends.push('win-noavx-cuda-cu12.0-x64')
      supportedBackends.push('win-cuda-12-common_cpus-x64')
    }
    if (features.vulkan) supportedBackends.push('win-vulkan-x64')
    if (features.vulkan) supportedBackends.push('win-vulkan-common_cpus-x64')
  }
  // not available yet, placeholder for future
  else if (sysType === 'windows-aarch64' || sysType === 'windows-arm64') {
    supportedBackends.push('win-arm64')
  } else if (sysType === 'linux-x86_64' || sysType === 'linux-x86') {
    supportedBackends.push('linux-noavx-x64')
    if (features.avx) supportedBackends.push('linux-avx-x64')
    if (features.avx2) supportedBackends.push('linux-avx2-x64')
    if (features.avx512) supportedBackends.push('linux-avx512-x64')
    supportedBackends.push('linux-common_cpus-x64')
    if (features.cuda11) {
      if (features.avx512)
        supportedBackends.push('linux-avx512-cuda-cu11.7-x64')
      else if (features.avx2)
        supportedBackends.push('linux-avx2-cuda-cu11.7-x64')
      else if (features.avx) supportedBackends.push('linux-avx-cuda-cu11.7-x64')
      else supportedBackends.push('linux-noavx-cuda-cu11.7-x64')
      supportedBackends.push('linux-cuda-11-common_cpus-x64')
    }
    if (features.cuda12) {
      if (features.avx512)
        supportedBackends.push('linux-avx512-cuda-cu12.0-x64')
      else if (features.avx2)
        supportedBackends.push('linux-avx2-cuda-cu12.0-x64')
      else if (features.avx) supportedBackends.push('linux-avx-cuda-cu12.0-x64')
      else supportedBackends.push('linux-noavx-cuda-cu12.0-x64')
      supportedBackends.push('linux-cuda-12-common_cpus-x64')
    }
    if (features.vulkan) supportedBackends.push('linux-vulkan-x64')
    if (features.vulkan) supportedBackends.push('linux-vulkan-common_cpus-x64')
  }
  // not available yet, placeholder for future
  else if (sysType === 'linux-aarch64' || sysType === 'linux-arm64') {
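The per-CPU-feature builds are consolidated into `common_cpus` variants, so the returned list gets much shorter. A worked example under an assumed feature set (the machine and its feature flags below are hypothetical):

```ts
// Assumed: a Windows x64 machine where features.cuda12 and features.vulkan are
// true and features.cuda11 is false. Following the branch above, the list would be:
const expected = [
  'win-common_cpus-x64',
  'win-cuda-12-common_cpus-x64',
  'win-vulkan-common_cpus-x64',
]
```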
@@ -230,10 +207,7 @@ export async function downloadBackend(
  version: string,
  source: 'github' | 'cdn' = 'github'
): Promise<void> {
  const janDataFolderPath = await getJanDataFolderPath()
  const llamacppPath = await joinPath([janDataFolderPath, 'llamacpp'])
  const backendDir = await getBackendDir(backend, version)
  const libDir = await joinPath([llamacppPath, 'lib'])

  const downloadManager = window.core.extensionManager.getByName(
    '@janhq/download-extension'
@@ -265,7 +239,7 @@ export async function downloadBackend(
      source === 'github'
        ? `https://github.com/janhq/llama.cpp/releases/download/${version}/cudart-llama-bin-${platformName}-cu11.7-x64.tar.gz`
        : `https://catalog.jan.ai/llama.cpp/releases/${version}/cudart-llama-bin-${platformName}-cu11.7-x64.tar.gz`,
      save_path: await joinPath([libDir, 'cuda11.tar.gz']),
      save_path: await joinPath([backendDir, 'build', 'bin', 'cuda11.tar.gz']),
      proxy: proxyConfig,
    })
  } else if (backend.includes('cu12.0') && !(await _isCudaInstalled('12.0'))) {
@@ -274,7 +248,7 @@ export async function downloadBackend(
      source === 'github'
        ? `https://github.com/janhq/llama.cpp/releases/download/${version}/cudart-llama-bin-${platformName}-cu12.0-x64.tar.gz`
        : `https://catalog.jan.ai/llama.cpp/releases/${version}/cudart-llama-bin-${platformName}-cu12.0-x64.tar.gz`,
      save_path: await joinPath([libDir, 'cuda12.tar.gz']),
      save_path: await joinPath([backendDir, 'build', 'bin', 'cuda12.tar.gz']),
      proxy: proxyConfig,
    })
  }
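Both cudart archives are now saved under the backend's own `build/bin` directory rather than the shared `lib` folder. A minimal sketch of the resulting path construction; the helper and the example directory layout below are assumptions for illustration, not code from the diff:

```ts
// Illustrative only: place the cudart archive next to the backend binaries,
// mirroring the joinPath([backendDir, 'build', 'bin', ...]) calls above.
function cudartArchivePath(backendDir: string, cudaMajor: 11 | 12): string {
  return [backendDir, 'build', 'bin', `cuda${cudaMajor}.tar.gz`].join('/')
}

// e.g. (assumed layout) cudartArchivePath('<backendDir>', 12)
//   -> '<backendDir>/build/bin/cuda12.tar.gz'
```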
@@ -344,8 +318,8 @@ async function _getSupportedFeatures() {
  }

  // https://docs.nvidia.com/deploy/cuda-compatibility/#cuda-11-and-later-defaults-to-minor-version-compatibility
  let minCuda11DriverVersion
  let minCuda12DriverVersion
  let minCuda11DriverVersion: string
  let minCuda12DriverVersion: string
  if (sysInfo.os_type === 'linux') {
    minCuda11DriverVersion = '450.80.02'
    minCuda12DriverVersion = '525.60.13'
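The driver thresholds ('450.80.02' for CUDA 11, '525.60.13' for CUDA 12 on Linux) are dotted version strings, so deciding whether an installed driver qualifies needs a component-wise numeric comparison rather than a plain string compare. A sketch of such a check; this helper is illustrative and is not the extension's actual implementation:

```ts
// Compare dotted driver versions component by component (numeric, not lexicographic).
function driverAtLeast(installed: string, minimum: string): boolean {
  const a = installed.split('.').map(Number)
  const b = minimum.split('.').map(Number)
  for (let i = 0; i < Math.max(a.length, b.length); i++) {
    const x = a[i] ?? 0
    const y = b[i] ?? 0
    if (x !== y) return x > y
  }
  return true
}

// e.g. driverAtLeast('535.104.05', '525.60.13') === true -> CUDA 12 backends can be offered
```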
@@ -38,10 +38,12 @@ import { invoke } from '@tauri-apps/api/core'
import { getProxyConfig } from './util'
import { basename } from '@tauri-apps/api/path'
import {
  loadLlamaModel,
  readGgufMetadata,
  getModelSize,
  isModelSupported,
  planModelLoadInternal,
  unloadLlamaModel,
} from '@janhq/tauri-plugin-llamacpp-api'
import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api'

@@ -69,7 +71,7 @@ type LlamacppConfig = {
  device: string
  split_mode: string
  main_gpu: number
  flash_attn: boolean
  flash_attn: string
  cont_batching: boolean
  no_mmap: boolean
  mlock: boolean
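Because `flash_attn` changes type from `boolean` to `string`, configs saved by older versions may still hold booleans. The diff does not show a migration step; the following normalization is purely an illustration of one way legacy values could be mapped onto the new `'auto' | 'on' | 'off'` domain:

```ts
// Hypothetical legacy-value normalization; not part of the diff.
function normalizeFlashAttn(value: boolean | string | undefined): 'auto' | 'on' | 'off' {
  if (value === true) return 'on'
  if (value === false) return 'off'
  if (value === 'on' || value === 'off' || value === 'auto') return value
  return 'auto'
}
```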
@@ -549,9 +551,9 @@ export default class llamacpp_extension extends AIEngine {

    // Helper to map backend string to a priority category
    const getBackendCategory = (backendString: string): string | undefined => {
      if (backendString.includes('cu12.0')) return 'cuda-cu12.0'
      if (backendString.includes('cu11.7')) return 'cuda-cu11.7'
      if (backendString.includes('vulkan')) return 'vulkan'
      if (backendString.includes('cuda-12-common_cpus')) return 'cuda-cu12.0'
      if (backendString.includes('cuda-11-common_cpus')) return 'cuda-cu11.7'
      if (backendString.includes('vulkan-common_cpus')) return 'vulkan'
      if (backendString.includes('avx512')) return 'avx512'
      if (backendString.includes('avx2')) return 'avx2'
      if (
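With the consolidated names, the category helper now matches on the `*-common_cpus` markers instead of the raw `cu12.0`/`cu11.7`/`vulkan` substrings. A short usage sketch; the ranking below is assumed for illustration and is not the extension's actual priority list:

```ts
// Illustrative: order candidate backends by the category returned above.
const assumedRank: Record<string, number> = {
  'cuda-cu12.0': 0,
  'cuda-cu11.7': 1,
  vulkan: 2,
  avx512: 3,
  avx2: 4,
}

function sortByCategory(
  backends: string[],
  categorize: (b: string) => string | undefined
): string[] {
  return [...backends].sort(
    (a, b) =>
      (assumedRank[categorize(a) ?? ''] ?? 99) - (assumedRank[categorize(b) ?? ''] ?? 99)
  )
}
```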
@@ -1644,18 +1646,20 @@ export default class llamacpp_extension extends AIEngine {
    if (cfg.device.length > 0) args.push('--device', cfg.device)
    if (cfg.split_mode.length > 0 && cfg.split_mode != 'layer')
      args.push('--split-mode', cfg.split_mode)
    if (cfg.main_gpu !== undefined && cfg.main_gpu != 0)
    if (cfg.main_gpu !== undefined && cfg.main_gpu !== 0)
      args.push('--main-gpu', String(cfg.main_gpu))
    // Note: Older llama.cpp versions are no longer supported
    if (
      cfg.flash_attn !== undefined ||
      !cfg.flash_attn ||
      cfg.flash_attn !== ''
    )
      args.push('--flash-attn', String(cfg.flash_attn)) //default: auto = ON when supported

    // Boolean flags
    if (cfg.ctx_shift) args.push('--context-shift')
    if (Number(version.replace(/^b/, '')) >= 6325) {
      if (!cfg.flash_attn) args.push('--flash-attn', 'off') //default: auto = ON when supported
    } else {
      if (cfg.flash_attn) args.push('--flash-attn')
    }
    if (cfg.cont_batching) args.push('--cont-batching')
    args.push('--no-mmap')
    if (cfg.no_mmap) args.push('--no-mmap')
    if (cfg.mlock) args.push('--mlock')
    if (cfg.no_kv_offload) args.push('--no-kv-offload')
    if (isEmbedding) {
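The old `b6325` version gate is dropped and the string value is forwarded directly. The guard in the hunk ORs three checks on `cfg.flash_attn`; a tighter form of what appears to be the intended guard is sketched below, purely for illustration (it is not the code in the diff):

```ts
// Illustrative helper: only emit --flash-attn when a concrete value is set.
// When the flag is omitted, llama.cpp defaults to 'auto' (ON when supported).
function flashAttnArgs(flashAttn: string | undefined): string[] {
  return flashAttn !== undefined && flashAttn !== '' ? ['--flash-attn', flashAttn] : []
}

// e.g. flashAttnArgs('off') -> ['--flash-attn', 'off'];  flashAttnArgs(undefined) -> []
```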
@@ -1667,7 +1671,7 @@ export default class llamacpp_extension extends AIEngine {
    if (cfg.cache_type_k && cfg.cache_type_k != 'f16')
      args.push('--cache-type-k', cfg.cache_type_k)
    if (
      cfg.flash_attn &&
      cfg.flash_attn !== 'on' &&
      cfg.cache_type_v != 'f16' &&
      cfg.cache_type_v != 'f32'
    ) {
@@ -1688,20 +1692,9 @@ export default class llamacpp_extension extends AIEngine {

    logger.info('Calling Tauri command llama_load with args:', args)
    const backendPath = await getBackendExePath(backend, version)
    const libraryPath = await joinPath([await this.getProviderPath(), 'lib'])

    try {
      // TODO: add LIBRARY_PATH
      const sInfo = await invoke<SessionInfo>(
        'plugin:llamacpp|load_llama_model',
        {
          backendPath,
          libraryPath,
          args,
          envs,
          isEmbedding,
        }
      )
      const sInfo = await loadLlamaModel(backendPath, args, envs, isEmbedding)
      return sInfo
    } catch (error) {
      logger.error('Error in load command:\n', error)
@@ -1717,12 +1710,7 @@ export default class llamacpp_extension extends AIEngine {
    const pid = sInfo.pid
    try {
      // Pass the PID as the session_id
      const result = await invoke<UnloadResult>(
        'plugin:llamacpp|unload_llama_model',
        {
          pid: pid,
        }
      )
      const result = await unloadLlamaModel(pid)

      // If successful, remove from active sessions
      if (result.success) {
@@ -2042,7 +2030,10 @@ export default class llamacpp_extension extends AIEngine {
    if (sysInfo?.os_type === 'linux' && Array.isArray(sysInfo.gpus)) {
      const usage = await getSystemUsage()
      if (usage && Array.isArray(usage.gpus)) {
        const uuidToUsage: Record<string, { total_memory: number; used_memory: number }> = {}
        const uuidToUsage: Record<
          string,
          { total_memory: number; used_memory: number }
        > = {}
        for (const u of usage.gpus as any[]) {
          if (u && typeof u.uuid === 'string') {
            uuidToUsage[u.uuid] = u
@@ -2082,7 +2073,10 @@ export default class llamacpp_extension extends AIEngine {
            typeof u.used_memory === 'number'
          ) {
            const total = Math.max(0, Math.floor(u.total_memory))
            const free = Math.max(0, Math.floor(u.total_memory - u.used_memory))
            const free = Math.max(
              0,
              Math.floor(u.total_memory - u.used_memory)
            )
            return { ...dev, mem: total, free }
          }
        }

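On Linux the static GPU list is joined with live usage by UUID, and free memory is clamped at zero. A compact sketch of that join, with hypothetical device and usage shapes standing in for the hardware plugin's real types:

```ts
// Hypothetical shapes; the real types come from the hardware plugin API.
interface GpuUsage {
  uuid: string
  total_memory: number
  used_memory: number
}

interface GpuDevice {
  uuid: string
  [key: string]: unknown
}

function joinUsageByUuid(devices: GpuDevice[], usage: GpuUsage[]) {
  const uuidToUsage: Record<string, GpuUsage> = {}
  for (const u of usage) uuidToUsage[u.uuid] = u
  return devices.map((dev) => {
    const u = uuidToUsage[dev.uuid]
    if (!u) return dev
    const mem = Math.max(0, Math.floor(u.total_memory))
    const free = Math.max(0, Math.floor(u.total_memory - u.used_memory))
    return { ...dev, mem, free }
  })
}
```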
@@ -2,11 +2,18 @@ import { invoke } from '@tauri-apps/api/core'

// Types
export interface SessionInfo {
  pid: number
  port: number
  model_id: string
  model_path: string
  api_key: string
  pid: number;
  port: number;
  model_id: string;
  model_path: string;
  is_embedding: boolean
  api_key: string;
  mmproj_path?: string;
}

export interface UnloadResult {
  success: boolean;
  error?: string;
}

export interface DeviceInfo {
@@ -29,19 +36,19 @@ export async function cleanupLlamaProcesses(): Promise<void> {
// LlamaCpp server commands
export async function loadLlamaModel(
  backendPath: string,
  libraryPath?: string,
  args: string[] = [],
  isEmbedding: boolean = false
  args: string[],
  envs: Record<string, string>,
  isEmbedding: boolean
): Promise<SessionInfo> {
  return await invoke('plugin:llamacpp|load_llama_model', {
    backendPath,
    libraryPath,
    args,
    isEmbedding,
    envs,
    isEmbedding
  })
}

export async function unloadLlamaModel(pid: number): Promise<void> {
export async function unloadLlamaModel(pid: number): Promise<UnloadResult> {
  return await invoke('plugin:llamacpp|unload_llama_model', { pid })
}

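With the wrapper signatures updated (explicit `args`, `envs`, `isEmbedding`, and `unloadLlamaModel` now returning `UnloadResult`), the extension drops its raw `invoke` calls, as seen in the earlier hunks. A usage sketch with the new signatures; the paths, arguments, and env vars below are placeholders:

```ts
import {
  loadLlamaModel,
  unloadLlamaModel,
} from '@janhq/tauri-plugin-llamacpp-api'

// Placeholder backend path, model path, and env vars, for illustration only.
async function runOnce(): Promise<void> {
  const sInfo = await loadLlamaModel(
    '/path/to/llama-server', // backendPath (placeholder)
    ['-m', '/path/to/model.gguf', '--port', '8080'], // args
    { CUDA_VISIBLE_DEVICES: '0' }, // envs
    false // isEmbedding
  )

  const result = await unloadLlamaModel(sInfo.pid)
  if (!result.success) {
    console.error('Unload failed:', result.error)
  }
}
```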
@@ -41,7 +41,6 @@ pub struct UnloadResult {
pub async fn load_llama_model<R: Runtime>(
    app_handle: tauri::AppHandle<R>,
    backend_path: &str,
    library_path: Option<&str>,
    mut args: Vec<String>,
    envs: HashMap<String, String>,
    is_embedding: bool,
@@ -52,7 +51,7 @@ pub async fn load_llama_model<R: Runtime>(
    log::info!("Attempting to launch server at path: {:?}", backend_path);
    log::info!("Using arguments: {:?}", args);

    validate_binary_path(backend_path)?;
    let bin_path = validate_binary_path(backend_path)?;

    let port = parse_port_from_args(&args);
    let model_path_pb = validate_model_path(&mut args)?;
@@ -83,11 +82,11 @@ pub async fn load_llama_model<R: Runtime>(
    let model_id = extract_arg_value(&args, "-a");

    // Configure the command to run the server
    let mut command = Command::new(backend_path);
    let mut command = Command::new(&bin_path);
    command.args(args);
    command.envs(envs);

    setup_library_path(library_path, &mut command);
    setup_library_path(bin_path.parent().and_then(|p| p.to_str()), &mut command);
    command.stdout(Stdio::piped());
    command.stderr(Stdio::piped());
    setup_windows_process_flags(&mut command);
@@ -280,10 +279,9 @@ pub async fn unload_llama_model<R: Runtime>(
#[tauri::command]
pub async fn get_devices(
    backend_path: &str,
    library_path: Option<&str>,
    envs: HashMap<String, String>,
) -> ServerResult<Vec<DeviceInfo>> {
    get_devices_from_backend(backend_path, library_path, envs).await
    get_devices_from_backend(backend_path, envs).await
}

/// Generate API key using HMAC-SHA256

@@ -19,20 +19,19 @@ pub struct DeviceInfo {

pub async fn get_devices_from_backend(
    backend_path: &str,
    library_path: Option<&str>,
    envs: HashMap<String, String>,
) -> ServerResult<Vec<DeviceInfo>> {
    log::info!("Getting devices from server at path: {:?}", backend_path);

    validate_binary_path(backend_path)?;
    let bin_path = validate_binary_path(backend_path)?;

    // Configure the command to run the server with --list-devices
    let mut command = Command::new(backend_path);
    let mut command = Command::new(&bin_path);
    command.arg("--list-devices");
    command.envs(envs);

    // Set up library path
    setup_library_path(library_path, &mut command);
    setup_library_path(bin_path.parent().and_then(|p| p.to_str()), &mut command);

    command.stdout(Stdio::piped());
    command.stderr(Stdio::piped());
@@ -410,4 +409,4 @@ AnotherInvalid
        assert_eq!(result[0].id, "Vulkan0");
        assert_eq!(result[1].id, "CUDA0");
    }
}
}

@@ -62,7 +62,6 @@ pub async fn estimate_kv_cache_internal(
    ctx_size: Option<u64>,
) -> Result<KVCacheEstimate, KVCacheError> {
    log::info!("Received ctx_size parameter: {:?}", ctx_size);
    log::info!("Received model metadata:\n{:?}", &meta);
    let arch = meta
        .get("general.architecture")
        .ok_or(KVCacheError::ArchitectureNotFound)?;