* feat: Adjust RAM/VRAM calculation for unified memory systems

  This commit refactors the logic for calculating **total RAM** and **total VRAM** in the `is_model_supported` and `plan_model_load` commands, specifically targeting systems with **unified memory** (such as modern macOS devices, where the GPU list may be empty). The changes are as follows:

  * **Total RAM calculation:** If no GPUs are detected (`sys_info.gpus.is_empty()` is true), **total RAM** is now set to 0. This avoids conflating total system memory with dedicated GPU memory when planning model placement.
  * **Total VRAM calculation:** If no GPUs are detected, **total VRAM** is calculated as the system's **total memory (RAM)**, since this shared memory acts as VRAM on unified memory architectures.

  This adjustment improves the accuracy of memory availability checks and model planning on unified memory systems (see the condensed sketch below).

* fix: total usable memory in case there is no system vram reported
* chore: temporarily change to self-hosted runner mac
* ci: revert back to github hosted runner macos

---------

Co-authored-by: Louis <louis@jan.ai>
Co-authored-by: Minh141120 <minh.itptit@gmail.com>
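For quick reference, here is a minimal sketch of the adjusted RAM/VRAM split, condensed from `plan_model_load` in the file below. The `SystemInfo`/`GpuInfo` structs and the `split_ram_vram` helper are hypothetical stand-ins for the hardware info provided by `tauri_plugin_hardware`; the `* 1024 * 1024` scaling mirrors the file and appears to convert reported MiB values to bytes.

```rust
/// Hypothetical, simplified mirrors of the hardware info consumed by plan_model_load;
/// the real types come from tauri_plugin_hardware.
struct GpuInfo {
    total_memory: u64, // per-GPU memory as reported by the hardware plugin
}

struct SystemInfo {
    total_memory: u64, // total system memory as reported by the hardware plugin
    gpus: Vec<GpuInfo>,
}

/// Mirrors the adjusted total-RAM / total-VRAM derivation used by the planner.
fn split_ram_vram(sys_info: &SystemInfo) -> (u64, u64) {
    // The planner scales reported values by 1024 * 1024 to get bytes.
    let to_bytes = |v: u64| v * 1024 * 1024;

    if sys_info.gpus.is_empty() {
        // Unified memory (e.g. Apple Silicon): count nothing as dedicated RAM
        // and treat the shared system memory as VRAM.
        (0, to_bytes(sys_info.total_memory))
    } else {
        // Discrete GPUs: RAM is system memory, VRAM is the sum over all GPUs.
        let vram: u64 = sys_info.gpus.iter().map(|g| to_bytes(g.total_memory)).sum();
        (to_bytes(sys_info.total_memory), vram)
    }
}
```

The helper returns `(total_ram, total_vram)` in bytes; the actual code then subtracts `RESERVE_BYTES` and applies the memory-mode multiplier to obtain the usable figures.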
use crate::gguf::commands::get_model_size;
use crate::gguf::utils::estimate_kv_cache_internal;
use crate::gguf::utils::read_gguf_metadata_internal;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tauri::Runtime;
use tauri_plugin_hardware::get_system_info;

#[derive(Serialize, Deserialize, Clone, Debug)]
#[serde(rename_all = "camelCase")]
pub struct ModelPlan {
    pub gpu_layers: u64,
    pub max_context_length: u64,
    pub no_offload_kv_cache: bool,
    pub offload_mmproj: bool,
    pub batch_size: u64,
    pub mode: ModelMode,
}

#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
#[serde(rename_all = "UPPERCASE")]
pub enum ModelMode {
    GPU,
    Hybrid,
    CPU,
    Unsupported,
}

#[tauri::command]
pub async fn plan_model_load<R: Runtime>(
    path: String,
    memory_mode: String,
    mmproj_path: Option<String>,
    requested_ctx: Option<u64>,
    app: tauri::AppHandle<R>,
) -> Result<ModelPlan, String> {
    let model_size = get_model_size(path.clone()).await?;
    let sys_info = get_system_info(app.clone());
    let gguf = read_gguf_metadata_internal(path.clone()).await?;

    let mut mmproj_size: u64 = 0;
    if let Some(ref mmproj) = mmproj_path {
        mmproj_size = get_model_size(mmproj.clone()).await?;
    }

    let arch = gguf
        .metadata
        .get("general.architecture")
        .ok_or("Missing architecture")?;
    let repeating_layers: u64 = gguf
        .metadata
        .get(&format!("{arch}.block_count"))
        .ok_or("Missing block_count")?
        .parse()
        .map_err(|_| "Invalid block_count")?;
    let total_layers = repeating_layers + 1;
    let layer_size = model_size / total_layers;

    let kv_cache = estimate_kv_cache_internal(gguf.metadata.clone(), None)
        .await
        .map_err(|e| e.to_string())?;
    let kv_cache_per_token = kv_cache.per_token_size;

    if model_size == 0 || layer_size == 0 || kv_cache_per_token == 0 {
        return Err("Invalid model/layer/cache sizes".into());
    }
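
    // Fixed planning constants: RESERVE_BYTES (2_288_490_189 bytes, roughly 2.13 GiB)
    // is held back from total RAM/VRAM before the memory-mode multiplier is applied;
    // MIN_CONTEXT_LENGTH is the smallest context length a plan is allowed to target.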
    const RESERVE_BYTES: u64 = 2288490189;
    const MIN_CONTEXT_LENGTH: u64 = 2048;

    let model_max_ctx: u64 = gguf
        .metadata
        .get(&format!("{arch}.context_length"))
        .and_then(|s| s.parse().ok())
        .unwrap_or(8192);

    let memory_percentages = HashMap::from([("high", 0.7), ("medium", 0.5), ("low", 0.4)]);
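
    // The planner may use this fraction of the post-reserve memory,
    // selected by the caller-supplied memory_mode ("high" | "medium" | "low").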
    let multiplier = *memory_percentages
        .get(memory_mode.as_str())
        .ok_or("Invalid memory mode")?;

    log::info!("Got GPUs:\n{:?}", &sys_info.gpus);

    let total_ram: u64 = match sys_info.gpus.is_empty() {
        // Consider RAM as 0 for unified memory
        true => 0,
        false => sys_info.total_memory * 1024 * 1024,
    };

    // Calculate total VRAM from all GPUs
    let total_vram: u64 = match sys_info.gpus.is_empty() {
        true => {
            log::info!("No GPUs detected (likely unified memory system), using total RAM as VRAM");
            sys_info.total_memory * 1024 * 1024
        }
        false => sys_info
            .gpus
            .iter()
            .map(|g| g.total_memory * 1024 * 1024)
            .sum::<u64>(),
    };
    log::info!("Total RAM reported/calculated (in bytes): {}", &total_ram);
    log::info!("Total VRAM reported/calculated (in bytes): {}", &total_vram);
    let usable_vram: u64 = if total_vram > RESERVE_BYTES {
        (((total_vram - RESERVE_BYTES) as f64) * multiplier) as u64
    } else {
        0
    };
    log::info!("Usable vram calculated: {}", &usable_vram);

    let usable_ram: u64 = if total_ram > RESERVE_BYTES {
        (((total_ram - RESERVE_BYTES) as f64) * multiplier).max(0.0) as u64
    } else {
        0
    };
    log::info!("Usable ram calculated (in bytes): {}", &usable_ram);

    let mut gpu_layers = 0;
    let mut max_ctx_len = 0;
    let mut no_offload_kv_cache = false;
    let mut mode = ModelMode::Unsupported;
    let mut offload_mmproj = false;
    let mut batch_size = 2048;

    let total_available_mem = usable_vram.saturating_add(usable_ram);
    if model_size + mmproj_size > total_available_mem {
        log::info!("Model not supported in this system!");
        return Ok(ModelPlan {
            gpu_layers: 0,
            max_context_length: 0,
            no_offload_kv_cache: true,
            batch_size: 64,
            mode: ModelMode::Unsupported,
            offload_mmproj: false,
        });
    }
    if mmproj_size > 0 {
        offload_mmproj = true;
    }
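
    // Planning cascade (mirrors the log messages below): try full GPU offload first,
    // then a VRAM-maximized hybrid (KV cache entirely in VRAM), then a standard hybrid
    // (KV cache split across VRAM and RAM), and finally a CPU-only plan.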
    let kv_min_size = estimate_kv_cache_internal(gguf.metadata.clone(), Some(MIN_CONTEXT_LENGTH))
        .await
        .map_err(|e| e.to_string())?
        .size;

    if model_size + kv_min_size + mmproj_size <= usable_vram {
        log::info!("Planning mode: Full GPU offload is possible.");
        mode = ModelMode::GPU;
        gpu_layers = total_layers;
        let vram_left_for_ctx = usable_vram.saturating_sub(model_size);
        let max_ctx_by_vram = (vram_left_for_ctx / kv_cache_per_token) as u64;
        let requested_target = requested_ctx.unwrap_or(model_max_ctx).min(model_max_ctx);
        max_ctx_len = requested_target.min(max_ctx_by_vram);
        no_offload_kv_cache = false;
        offload_mmproj = true;
    } else {
        let mut found_plan = false;

        log::info!("Attempting VRAM-Maximized Hybrid plan (KV cache in VRAM only).");
        for candidate_gpu_layers in (0..=total_layers).rev() {
            let vram_used_by_layers = candidate_gpu_layers.saturating_mul(layer_size);
            if vram_used_by_layers > usable_vram {
                continue;
            }

            let ram_used_by_cpu_layers =
                (total_layers.saturating_sub(candidate_gpu_layers)).saturating_mul(layer_size);
            let ram_used_by_mmproj = if offload_mmproj { 0 } else { mmproj_size };
            let required_ram_for_model = ram_used_by_cpu_layers.saturating_add(ram_used_by_mmproj);

            if required_ram_for_model > usable_ram {
                continue;
            }

            let vram_left_for_kv = usable_vram.saturating_sub(vram_used_by_layers);
            let ctx_in_vram_only = (vram_left_for_kv / kv_cache_per_token) as u64;

            if ctx_in_vram_only >= MIN_CONTEXT_LENGTH {
                log::info!(
                    "Found VRAM-Maximized Hybrid plan with {} GPU layers.",
                    candidate_gpu_layers
                );
                mode = ModelMode::Hybrid;
                gpu_layers = candidate_gpu_layers;
                let requested_target = requested_ctx.unwrap_or(model_max_ctx).min(model_max_ctx);
                max_ctx_len = requested_target.min(ctx_in_vram_only);
                no_offload_kv_cache = false;
                found_plan = true;
                break;
            }
        }

        if !found_plan {
            log::info!("VRAM-Maximized plan not feasible. Falling back to Standard Hybrid (KV cache in VRAM+RAM).");
            for candidate_gpu_layers in (0..=total_layers).rev() {
                let vram_used_by_layers = candidate_gpu_layers.saturating_mul(layer_size);
                if vram_used_by_layers > usable_vram {
                    continue;
                }
                let vram_left_for_kv = usable_vram.saturating_sub(vram_used_by_layers);
                let kv_in_vram = (vram_left_for_kv / kv_cache_per_token) as u64;

                let ram_used_by_cpu_layers =
                    (total_layers.saturating_sub(candidate_gpu_layers)).saturating_mul(layer_size);
                let ram_used_by_mmproj = if offload_mmproj { 0 } else { mmproj_size };
                let required_ram_for_model =
                    ram_used_by_cpu_layers.saturating_add(ram_used_by_mmproj);

                if required_ram_for_model > usable_ram {
                    continue;
                }

                let available_ram_for_kv = usable_ram.saturating_sub(required_ram_for_model);
                let kv_in_ram = (available_ram_for_kv / kv_cache_per_token) as u64;

                let total_kv_tokens = kv_in_vram.saturating_add(kv_in_ram);

                if total_kv_tokens >= MIN_CONTEXT_LENGTH {
                    log::info!(
                        "Found Standard Hybrid plan with {} GPU layers.",
                        candidate_gpu_layers
                    );
                    mode = if candidate_gpu_layers > 0 {
                        ModelMode::Hybrid
                    } else {
                        ModelMode::CPU
                    };
                    gpu_layers = candidate_gpu_layers;
                    let requested_target =
                        requested_ctx.unwrap_or(model_max_ctx).min(model_max_ctx);
                    let max_possible_ctx = total_kv_tokens.min(model_max_ctx);
                    max_ctx_len = requested_target.min(max_possible_ctx);
                    no_offload_kv_cache = kv_in_ram > 0 && kv_in_vram == 0;
                    found_plan = true;
                    break;
                }
            }
        }

        if !found_plan {
            log::info!("No hybrid plan found. Attempting CPU-only plan.");
            if model_size + mmproj_size <= usable_ram {
                let available_ram_for_kv = usable_ram.saturating_sub(model_size + mmproj_size);
                let kv_tokens = (available_ram_for_kv / kv_cache_per_token) as u64;
                if kv_tokens >= MIN_CONTEXT_LENGTH {
                    mode = ModelMode::CPU;
                    gpu_layers = 0;
                    max_ctx_len = kv_tokens
                        .min(requested_ctx.unwrap_or(model_max_ctx))
                        .min(model_max_ctx);
                    no_offload_kv_cache = true;
                    offload_mmproj = false;
                }
            }
        }
    }

    if let Some(req) = requested_ctx {
        if req > 0 {
            max_ctx_len = max_ctx_len.min(req);
        }
    }
    max_ctx_len = max_ctx_len.min(model_max_ctx);
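
    // Round the planned context length down to the nearest power of two:
    // 1u64 << (63 - leading_zeros) keeps only the highest set bit of max_ctx_len.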
    if max_ctx_len > 0 {
        log::info!("Max context before power-of-2 adjustment: {}", max_ctx_len);
        max_ctx_len = 1u64 << (63 - max_ctx_len.leading_zeros());
        log::info!("Adjusted max context to power of 2: {}", max_ctx_len);
    }

    if mode == ModelMode::Unsupported {
        if max_ctx_len >= MIN_CONTEXT_LENGTH {
            // do nothing, plan is viable but wasn't assigned a mode
        } else {
            gpu_layers = 0;
            max_ctx_len = 0;
            offload_mmproj = false;
        }
    } else if max_ctx_len < MIN_CONTEXT_LENGTH {
        log::info!(
            "Final context length {} is less than minimum required {}. Marking as unsupported.",
            max_ctx_len,
            MIN_CONTEXT_LENGTH
        );
        mode = ModelMode::Unsupported;
        gpu_layers = 0;
        max_ctx_len = 0;
        offload_mmproj = false;
    }

    if mode == ModelMode::Hybrid {
        batch_size = 256;
    } else if mode == ModelMode::CPU || no_offload_kv_cache || mode == ModelMode::Unsupported {
        batch_size = 64;
    }

    if max_ctx_len > 0 {
        batch_size = batch_size.min(max_ctx_len);
    } else {
        batch_size = 64;
    }

    if mode == ModelMode::CPU || no_offload_kv_cache {
        offload_mmproj = false;
    }

    log::info!("Planned model load params: GPU Layers: {}, max_ctx_len: {}, kv_cache offload: {}, offload mmproj: {}, batch_size: {}",
        gpu_layers, max_ctx_len, !no_offload_kv_cache, offload_mmproj, batch_size);
    Ok(ModelPlan {
        gpu_layers,
        max_context_length: max_ctx_len,
        no_offload_kv_cache,
        offload_mmproj,
        batch_size,
        mode,
    })
}