Fix: Llama.cpp server hangs on model load (#6030)

* Fix: Llama.cpp server hangs on model load

Resolves an issue, described in #6021, where the llama.cpp server would hang indefinitely when loading certain models: the server's readiness message was never detected, so the application stalled waiting for it.

The previous implementation used a line-buffered reader (BufReader::lines()) to process the stderr stream. This proved unreliable for the server's output; notably, next_line() fails with an error on bytes that are not valid UTF-8, and the surrounding while-let loop exits silently on the first such error, dropping every status line that follows.

This commit refactors the stdout and stderr handling to a chunk-based approach: read_until(b'\n', ...) collects raw bytes up to each newline and String::from_utf8_lossy converts them, so malformed output can no longer abort the readers. Output is processed as it arrives, critical status messages are captured reliably, and the application no longer hangs during model initialization.
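
For illustration, here is a minimal, self-contained sketch of the chunk-based pattern, assuming a tokio runtime (process, io-util, rt, and macros features); the llama-server command line and log prefix are placeholders rather than the application's actual wiring:

use std::process::Stdio;
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::Command;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Spawn the server with stderr piped (placeholder command line).
    let mut child = Command::new("llama-server")
        .stderr(Stdio::piped())
        .spawn()?;
    let stderr = child.stderr.take().expect("stderr was requested as piped");

    let mut reader = BufReader::new(stderr);
    let mut buf = Vec::new();
    loop {
        buf.clear();
        match reader.read_until(b'\n', &mut buf).await {
            Ok(0) => break, // EOF: the child closed its stderr
            Ok(_) => {
                // Lossy conversion tolerates invalid UTF-8, which
                // lines()/next_line() would reject with an error.
                let line = String::from_utf8_lossy(&buf);
                println!("[llamacpp] {}", line.trim_end());
            }
            Err(e) => {
                eprintln!("error reading stderr: {}", e);
                break;
            }
        }
    }
    Ok(())
}

Unlike next_line(), which ends a while-let Ok(..) loop at the first invalid-UTF-8 line, this loop stops only at EOF or a genuine I/O error.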

Fixes: #6021

* Handle error gracefully with ServerError

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

* Revert "Handle error gracefully with ServerError"

This reverts commit 267a8a8a3262fbe36a445a30b8b3ba9a39697643.

* Revert "Fix: Llama.cpp server hangs on model load"

This reverts commit 44e5447f82f0ae32b6db7ffb213025f130d655c4.

* Add more guards, refactor, and fix error sending to the frontend

---------

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

@@ -185,40 +185,76 @@ pub async fn load_llama_model(
     // Spawn task to monitor stdout for readiness
     let _stdout_task = tokio::spawn(async move {
-        let mut reader = BufReader::new(stdout).lines();
-        while let Ok(Some(line)) = reader.next_line().await {
-            log::info!("[llamacpp stdout] {}", line);
+        let mut reader = BufReader::new(stdout);
+        let mut byte_buffer = Vec::new();
+        loop {
+            byte_buffer.clear();
+            match reader.read_until(b'\n', &mut byte_buffer).await {
+                Ok(0) => break, // EOF
+                Ok(_) => {
+                    let line = String::from_utf8_lossy(&byte_buffer);
+                    let line = line.trim_end();
+                    if !line.is_empty() {
+                        log::info!("[llamacpp stdout] {}", line);
+                    }
+                }
+                Err(e) => {
+                    log::error!("Error reading stdout: {}", e);
+                    break;
+                }
+            }
         }
     });
     // Spawn task to capture stderr and monitor for errors
     let stderr_task = tokio::spawn(async move {
-        let mut reader = BufReader::new(stderr).lines();
+        let mut reader = BufReader::new(stderr);
+        let mut byte_buffer = Vec::new();
         let mut stderr_buffer = String::new();
-        while let Ok(Some(line)) = reader.next_line().await {
-            log::info!("[llamacpp] {}", line); // Using your log format
-            stderr_buffer.push_str(&line);
-            stderr_buffer.push('\n');
-            // Check for critical error indicators that should stop the process
-            // TODO: check for different errors
-            if line.to_lowercase().contains("error")
-                || line.to_lowercase().contains("failed")
-                || line.to_lowercase().contains("fatal")
-                || line.contains("CUDA error")
-                || line.contains("out of memory")
-                || line.contains("failed to load")
-            {
-                let _ = error_tx.send(line.clone()).await;
-            }
-            // Check for readiness indicator - llama-server outputs this when ready
-            else if line.contains("server is listening on")
-                || line.contains("starting the main loop")
-                || line.contains("server listening on")
-            {
-                log::info!("Server appears to be ready based on stdout: '{}'", line);
-                let _ = ready_tx.send(true).await;
+        loop {
+            byte_buffer.clear();
+            match reader.read_until(b'\n', &mut byte_buffer).await {
+                Ok(0) => break, // EOF
+                Ok(_) => {
+                    let line = String::from_utf8_lossy(&byte_buffer);
+                    let line = line.trim_end();
+                    if !line.is_empty() {
+                        stderr_buffer.push_str(line);
+                        stderr_buffer.push('\n');
+                        log::info!("[llamacpp] {}", line);
+                        // Check for critical error indicators that should stop the process
+                        let line_lower = line.to_string().to_lowercase();
+                        if line_lower.contains("error loading model")
+                            || line_lower.contains("unknown model architecture")
+                            || line_lower.contains("fatal")
+                            || line_lower.contains("cuda error")
+                            || line_lower.contains("out of memory")
+                            || line_lower.contains("error")
+                            || line_lower.contains("failed")
+                        {
+                            let _ = error_tx.send(line.to_string()).await;
+                        }
+                        // Check for readiness indicator - llama-server outputs this when ready
+                        else if line.contains("server is listening on")
+                            || line.contains("starting the main loop")
+                            || line.contains("server listening on")
+                        {
+                            log::info!("Server appears to be ready based on stderr: '{}'", line);
+                            let _ = ready_tx.send(true).await;
+                        }
+                    }
+                }
+                Err(e) => {
+                    log::error!("Error reading stderr: {}", e);
+                    break;
+                }
             }
         }
         stderr_buffer
     });
@@ -226,7 +262,7 @@ pub async fn load_llama_model(
     if let Some(status) = child.try_wait()? {
         if !status.success() {
             let stderr_output = stderr_task.await.unwrap_or_default();
-            log::error!("llama.cpp exited early with code {status:?}");
+            log::error!("llama.cpp exited early with code {:?}", status);
             log::error!("--- stderr ---\n{}", stderr_output);
             return Err(ServerError::LlamacppError(stderr_output.trim().to_string()));
         }
@@ -246,25 +282,43 @@ pub async fn load_llama_model(
             // Error occurred
             Some(error_msg) = error_rx.recv() => {
                 log::error!("Server encountered an error: {}", error_msg);
-                let _ = child.kill().await;
+                // Give process a moment to exit naturally
+                tokio::time::sleep(Duration::from_millis(100)).await;
+                // Check if process already exited
+                if let Some(status) = child.try_wait()? {
+                    log::info!("Process exited with code {:?}", status);
+                    return Err(ServerError::LlamacppError(error_msg));
+                } else {
+                    log::info!("Process still running, killing it...");
+                    let _ = child.kill().await;
+                }
                 // Get full stderr output
                 let stderr_output = stderr_task.await.unwrap_or_default();
                 return Err(ServerError::LlamacppError(format!("Error: {}\n\nFull stderr:\n{}", error_msg, stderr_output)));
             }
-            // Timeout
-            _ = tokio::time::sleep(Duration::from_millis(100)) => {
+            // Check for process exit more frequently
+            _ = tokio::time::sleep(Duration::from_millis(50)) => {
+                // Check if process exited
+                if let Some(status) = child.try_wait()? {
+                    let stderr_output = stderr_task.await.unwrap_or_default();
+                    if !status.success() {
+                        log::error!("llama.cpp exited with error code {:?}", status);
+                        return Err(ServerError::LlamacppError(format!("Process exited with code {:?}\n\nStderr:\n{}", status, stderr_output)));
+                    } else {
+                        log::error!("llama.cpp exited successfully but without ready signal");
+                        return Err(ServerError::LlamacppError(format!("Process exited unexpectedly\n\nStderr:\n{}", stderr_output)));
+                    }
+                }
                 // Timeout check
                 if start_time.elapsed() > timeout_duration {
                     log::error!("Timeout waiting for server to be ready");
                     let _ = child.kill().await;
-                    return Err(ServerError::LlamacppError("Server startup timeout".to_string()));
-                }
-                // Check if process is still alive
-                if let Some(status) = child.try_wait()? {
-                    if !status.success() {
-                        let stderr_output = stderr_task.await.unwrap_or_default();
-                        log::error!("llama.cpp exited during startup with code {status:?}");
-                        return Err(ServerError::LlamacppError(format!("Process exited with code {status:?}\n\nStderr:\n{}", stderr_output)));
-                    }
+                    let stderr_output = stderr_task.await.unwrap_or_default();
+                    return Err(ServerError::LlamacppError(format!("Server startup timeout\n\nStderr:\n{}", stderr_output)));
                 }
             }
         }
@@ -331,7 +385,10 @@ pub async fn unload_llama_model(
     #[cfg(all(windows, target_arch = "x86_64"))]
     {
         if let Some(raw_pid) = child.id() {
-            log::warn!("gracefully killing is unsupported on Windows, force-killing PID {}", raw_pid);
+            log::warn!(
+                "gracefully killing is unsupported on Windows, force-killing PID {}",
+                raw_pid
+            );
             // Since we know a graceful shutdown doesn't work and there are no child processes
             // to worry about, we can use `child.kill()` directly. On Windows, this is