Fix: Llama.cpp server hangs on model load (#6030)
* Fix: Llama.cpp server hangs on model load

  Resolves an issue where the llama.cpp server would hang indefinitely when loading certain models, as described in the attached ticket. The server's readiness message was not being correctly detected, causing the application to stall.

  The previous implementation used a line-buffered reader (BufReader::lines()) to process the stderr stream. This method proved to be unreliable for the specific output of the llama.cpp server.

  This commit refactors the stderr handling logic to use a more robust, chunk-based approach (read_until(b'\n', ...)). This ensures that the output is processed as it arrives, reliably capturing critical status messages and preventing the application from hanging during model initialization.

  Fixes: #6021

* Handle error gracefully with ServerError

  Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>

* Revert "Handle error gracefully with ServerError"

  This reverts commit 267a8a8a3262fbe36a445a30b8b3ba9a39697643.

* Revert "Fix: Llama.cpp server hangs on model load"

  This reverts commit 44e5447f82f0ae32b6db7ffb213025f130d655c4.

* Add more guards, refactor and fix error sending to FE

---------

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
This commit is contained in:
parent 787c4ee073
commit b1984a452e
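Before the diff, a minimal, self-contained sketch of the chunk-based stderr monitoring this change adopts. It is illustrative only: the channel names (ready_tx, error_tx) mirror the diff below, but the standalone function signature and the exact match strings are assumptions, not the project's API.

use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::ChildStderr;
use tokio::sync::mpsc::Sender;

// Reads stderr with read_until instead of a line-buffered iterator, so output is
// handled as it arrives and non-UTF-8 bytes cannot abort the loop; returns the
// accumulated stderr text for later error reporting.
async fn monitor_stderr(
    stderr: ChildStderr,
    ready_tx: Sender<bool>,
    error_tx: Sender<String>,
) -> String {
    let mut reader = BufReader::new(stderr);
    let mut byte_buffer = Vec::new();
    let mut stderr_buffer = String::new();

    loop {
        byte_buffer.clear();
        match reader.read_until(b'\n', &mut byte_buffer).await {
            Ok(0) => break, // EOF: the child closed its stderr
            Ok(_) => {
                // Lossy decoding: invalid UTF-8 becomes U+FFFD instead of an error
                let line = String::from_utf8_lossy(&byte_buffer);
                let line = line.trim_end();
                if line.is_empty() {
                    continue;
                }
                stderr_buffer.push_str(line);
                stderr_buffer.push('\n');

                // Simplified checks; the real diff matches several error phrases
                if line.to_lowercase().contains("error") {
                    let _ = error_tx.send(line.to_string()).await;
                } else if line.contains("server is listening on") {
                    let _ = ready_tx.send(true).await;
                }
            }
            Err(e) => {
                log::error!("Error reading stderr: {}", e);
                break;
            }
        }
    }

    stderr_buffer
}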
@@ -185,40 +185,76 @@ pub async fn load_llama_model(
     // Spawn task to monitor stdout for readiness
     let _stdout_task = tokio::spawn(async move {
-        let mut reader = BufReader::new(stdout).lines();
-        while let Ok(Some(line)) = reader.next_line().await {
-            log::info!("[llamacpp stdout] {}", line);
+        let mut reader = BufReader::new(stdout);
+        let mut byte_buffer = Vec::new();
+
+        loop {
+            byte_buffer.clear();
+            match reader.read_until(b'\n', &mut byte_buffer).await {
+                Ok(0) => break, // EOF
+                Ok(_) => {
+                    let line = String::from_utf8_lossy(&byte_buffer);
+                    let line = line.trim_end();
+                    if !line.is_empty() {
+                        log::info!("[llamacpp stdout] {}", line);
+                    }
+                }
+                Err(e) => {
+                    log::error!("Error reading stdout: {}", e);
+                    break;
+                }
+            }
         }
     });

     // Spawn task to capture stderr and monitor for errors
     let stderr_task = tokio::spawn(async move {
-        let mut reader = BufReader::new(stderr).lines();
+        let mut reader = BufReader::new(stderr);
+        let mut byte_buffer = Vec::new();
         let mut stderr_buffer = String::new();
-        while let Ok(Some(line)) = reader.next_line().await {
-            log::info!("[llamacpp] {}", line); // Using your log format
-            stderr_buffer.push_str(&line);
-            stderr_buffer.push('\n');
-            // Check for critical error indicators that should stop the process
-            // TODO: check for different errors
-            if line.to_lowercase().contains("error")
-                || line.to_lowercase().contains("failed")
-                || line.to_lowercase().contains("fatal")
-                || line.contains("CUDA error")
-                || line.contains("out of memory")
-                || line.contains("failed to load")
-            {
-                let _ = error_tx.send(line.clone()).await;
-            }
-            // Check for readiness indicator - llama-server outputs this when ready
-            else if line.contains("server is listening on")
-                || line.contains("starting the main loop")
-                || line.contains("server listening on")
-            {
-                log::info!("Server appears to be ready based on stdout: '{}'", line);
-                let _ = ready_tx.send(true).await;
+
+        loop {
+            byte_buffer.clear();
+            match reader.read_until(b'\n', &mut byte_buffer).await {
+                Ok(0) => break, // EOF
+                Ok(_) => {
+                    let line = String::from_utf8_lossy(&byte_buffer);
+                    let line = line.trim_end();
+
+                    if !line.is_empty() {
+                        stderr_buffer.push_str(line);
+                        stderr_buffer.push('\n');
+                        log::info!("[llamacpp] {}", line);
+
+                        // Check for critical error indicators that should stop the process
+                        let line_lower = line.to_string().to_lowercase();
+                        if line_lower.contains("error loading model")
+                            || line_lower.contains("unknown model architecture")
+                            || line_lower.contains("fatal")
+                            || line_lower.contains("cuda error")
+                            || line_lower.contains("out of memory")
+                            || line_lower.contains("error")
+                            || line_lower.contains("failed")
+                        {
+                            let _ = error_tx.send(line.to_string()).await;
+                        }
+                        // Check for readiness indicator - llama-server outputs this when ready
+                        else if line.contains("server is listening on")
+                            || line.contains("starting the main loop")
+                            || line.contains("server listening on")
+                        {
+                            log::info!("Server appears to be ready based on stderr: '{}'", line);
+                            let _ = ready_tx.send(true).await;
+                        }
+                    }
+                }
+                Err(e) => {
+                    log::error!("Error reading stderr: {}", e);
+                    break;
+                }
             }
         }

         stderr_buffer
     });
@@ -226,7 +262,7 @@ pub async fn load_llama_model(
     if let Some(status) = child.try_wait()? {
         if !status.success() {
             let stderr_output = stderr_task.await.unwrap_or_default();
-            log::error!("llama.cpp exited early with code {status:?}");
+            log::error!("llama.cpp exited early with code {:?}", status);
             log::error!("--- stderr ---\n{}", stderr_output);
             return Err(ServerError::LlamacppError(stderr_output.trim().to_string()));
         }
@@ -246,25 +282,43 @@ pub async fn load_llama_model(
             // Error occurred
             Some(error_msg) = error_rx.recv() => {
                 log::error!("Server encountered an error: {}", error_msg);
-                let _ = child.kill().await;
-                return Err(ServerError::LlamacppError(error_msg));
+
+                // Give process a moment to exit naturally
+                tokio::time::sleep(Duration::from_millis(100)).await;
+
+                // Check if process already exited
+                if let Some(status) = child.try_wait()? {
+                    log::info!("Process exited with code {:?}", status);
+                } else {
+                    log::info!("Process still running, killing it...");
+                    let _ = child.kill().await;
+                }
+
+                // Get full stderr output
+                let stderr_output = stderr_task.await.unwrap_or_default();
+                return Err(ServerError::LlamacppError(format!("Error: {}\n\nFull stderr:\n{}", error_msg, stderr_output)));
             }
-            // Timeout
-            _ = tokio::time::sleep(Duration::from_millis(100)) => {
+            // Check for process exit more frequently
+            _ = tokio::time::sleep(Duration::from_millis(50)) => {
+                // Check if process exited
+                if let Some(status) = child.try_wait()? {
+                    let stderr_output = stderr_task.await.unwrap_or_default();
+                    if !status.success() {
+                        log::error!("llama.cpp exited with error code {:?}", status);
+                        return Err(ServerError::LlamacppError(format!("Process exited with code {:?}\n\nStderr:\n{}", status, stderr_output)));
+                    } else {
+                        log::error!("llama.cpp exited successfully but without ready signal");
+                        return Err(ServerError::LlamacppError(format!("Process exited unexpectedly\n\nStderr:\n{}", stderr_output)));
+                    }
+                }
+
+                // Timeout check
                 if start_time.elapsed() > timeout_duration {
                     log::error!("Timeout waiting for server to be ready");
                     let _ = child.kill().await;
-                    return Err(ServerError::LlamacppError("Server startup timeout".to_string()));
-                }
-                // Check if process is still alive
-                if let Some(status) = child.try_wait()? {
-                    if !status.success() {
-                        let stderr_output = stderr_task.await.unwrap_or_default();
-                        log::error!("llama.cpp exited during startup with code {status:?}");
-                        return Err(ServerError::LlamacppError(format!("Process exited with code {status:?}\n\nStderr:\n{}", stderr_output)));
-                    }
+                    let stderr_output = stderr_task.await.unwrap_or_default();
+                    return Err(ServerError::LlamacppError(format!("Server startup timeout\n\nStderr:\n{}", stderr_output)));
                 }
             }
         }
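For context, the branches in the hunk above live inside a readiness wait loop driven by tokio::select!. A hedged sketch of that overall pattern follows; the standalone signature is an assumption (the real function also owns the child process and the stderr task), but the channel and timing shape mirror the diff.

use std::time::{Duration, Instant};
use tokio::sync::mpsc::Receiver;

// Polls every 50 ms so process exit and the startup timeout are noticed quickly,
// while readiness and error signals from the log-monitoring task win immediately.
async fn wait_for_ready(
    mut ready_rx: Receiver<bool>,
    mut error_rx: Receiver<String>,
    timeout_duration: Duration,
) -> Result<(), String> {
    let start_time = Instant::now();
    loop {
        tokio::select! {
            // Readiness signal parsed from the server logs
            Some(true) = ready_rx.recv() => return Ok(()),
            // Fatal error reported by the stderr task
            Some(error_msg) = error_rx.recv() => return Err(error_msg),
            // Periodic wake-up for exit and timeout checks
            _ = tokio::time::sleep(Duration::from_millis(50)) => {
                if start_time.elapsed() > timeout_duration {
                    return Err("Server startup timeout".to_string());
                }
            }
        }
    }
}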
@@ -331,7 +385,10 @@ pub async fn unload_llama_model(
     #[cfg(all(windows, target_arch = "x86_64"))]
     {
         if let Some(raw_pid) = child.id() {
-            log::warn!("gracefully killing is unsupported on Windows, force-killing PID {}", raw_pid);
+            log::warn!(
+                "gracefully killing is unsupported on Windows, force-killing PID {}",
+                raw_pid
+            );
+
+            // Since we know a graceful shutdown doesn't work and there are no child processes
+            // to worry about, we can use `child.kill()` directly. On Windows, this is
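For completeness, a small sketch of the Windows force-kill path the truncated hunk above describes. The helper name is hypothetical; only tokio's Child::id and Child::kill are assumed.

use tokio::process::Child;

#[cfg(all(windows, target_arch = "x86_64"))]
async fn force_kill(child: &mut Child) -> std::io::Result<()> {
    if let Some(raw_pid) = child.id() {
        log::warn!(
            "gracefully killing is unsupported on Windows, force-killing PID {}",
            raw_pid
        );
    }
    // No graceful shutdown signal is available here and there are no child
    // processes to reap, so terminate the process outright.
    child.kill().await
}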