use crate::RagError; use std::fs; use std::io::{Read, Cursor}; use zip::read::ZipArchive; use quick_xml::events::Event; use quick_xml::Reader; use csv as csv_crate; use calamine::{Reader as _, open_workbook_auto, DataType}; use html2text; use chardetng::EncodingDetector; use infer; use std::borrow::Cow; pub fn parse_pdf(file_path: &str) -> Result { let bytes = fs::read(file_path)?; let text = pdf_extract::extract_text_from_mem(&bytes) .map_err(|e| RagError::ParseError(format!("PDF parse error: {}", e)))?; // Validate that the PDF has extractable text (not image-based/scanned) // Count meaningful characters (excluding whitespace) let meaningful_chars = text.chars() .filter(|c| !c.is_whitespace()) .count(); // Require at least 50 non-whitespace characters to consider it a text PDF // This threshold filters out PDFs that are purely images or scanned documents if meaningful_chars < 50 { return Err(RagError::ParseError( "PDF appears to be image-based or scanned. OCR is not supported yet. Please use a text-based PDF.".to_string() )); } Ok(text) } pub fn parse_text(file_path: &str) -> Result { read_text_auto(file_path) } pub fn parse_document(file_path: &str, file_type: &str) -> Result { match file_type.to_lowercase().as_str() { "pdf" | "application/pdf" => parse_pdf(file_path), "txt" | "text/plain" | "md" | "text/markdown" => parse_text(file_path), "csv" | "text/csv" => parse_csv(file_path), // Excel family via calamine "xlsx" | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "xls" | "application/vnd.ms-excel" | "ods" | "application/vnd.oasis.opendocument.spreadsheet" => parse_spreadsheet(file_path), // PowerPoint "pptx" | "application/vnd.openxmlformats-officedocument.presentationml.presentation" => parse_pptx(file_path), // HTML "html" | "htm" | "text/html" => parse_html(file_path), "docx" | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => { parse_docx(file_path) } other => { // Try MIME sniffing when extension or MIME is unknown if let Ok(Some(k)) = infer::get_from_path(file_path) { let mime = k.mime_type(); return parse_document(file_path, mime); } Err(RagError::UnsupportedFileType(other.to_string())) } } } fn parse_docx(file_path: &str) -> Result { let file = std::fs::File::open(file_path)?; let mut zip = ZipArchive::new(file).map_err(|e| RagError::ParseError(e.to_string()))?; // Standard DOCX stores document text at word/document.xml let mut doc_xml = match zip.by_name("word/document.xml") { Ok(f) => f, Err(_) => return Err(RagError::ParseError("document.xml not found".into())), }; let mut xml_content = String::new(); doc_xml .read_to_string(&mut xml_content) .map_err(|e| RagError::ParseError(e.to_string()))?; // Parse XML and extract text from w:t nodes; add newlines on w:p boundaries let mut reader = Reader::from_str(&xml_content); reader.trim_text(true); let mut buf = Vec::new(); let mut result = String::new(); let mut in_text = false; loop { match reader.read_event_into(&mut buf) { Ok(Event::Start(e)) => { let name: String = reader .decoder() .decode(e.name().as_ref()) .unwrap_or(Cow::Borrowed("")) .into_owned(); if name.ends_with(":t") || name == "w:t" || name == "t" { in_text = true; } } Ok(Event::End(e)) => { let name: String = reader .decoder() .decode(e.name().as_ref()) .unwrap_or(Cow::Borrowed("")) .into_owned(); if name.ends_with(":t") || name == "w:t" || name == "t" { in_text = false; result.push(' '); } if name.ends_with(":p") || name == "w:p" || name == "p" { // Paragraph end – add newline result.push_str("\n\n"); } } Ok(Event::Text(t)) => { if in_text { let text = t.unescape().unwrap_or_default(); result.push_str(&text); } } Ok(Event::Eof) => break, Err(e) => return Err(RagError::ParseError(e.to_string())), _ => {} } } // Normalize whitespace let normalized = result .lines() .map(|l| l.trim()) .filter(|l| !l.is_empty()) .collect::>() .join("\n"); Ok(normalized) } fn parse_csv(file_path: &str) -> Result { let mut rdr = csv_crate::ReaderBuilder::new() .has_headers(false) .flexible(true) .from_path(file_path) .map_err(|e| RagError::ParseError(e.to_string()))?; let mut out = String::new(); for rec in rdr.records() { let rec = rec.map_err(|e| RagError::ParseError(e.to_string()))?; out.push_str(&rec.iter().collect::>().join(", ")); out.push('\n'); } Ok(out) } fn parse_spreadsheet(file_path: &str) -> Result { let mut workbook = open_workbook_auto(file_path) .map_err(|e| RagError::ParseError(e.to_string()))?; let mut out = String::new(); for sheet_name in workbook.sheet_names().to_owned() { if let Ok(range) = workbook.worksheet_range(&sheet_name) { out.push_str(&format!("# Sheet: {}\n", sheet_name)); for row in range.rows() { let cells = row .iter() .map(|c| match c { DataType::Empty => "".to_string(), DataType::String(s) => s.to_string(), DataType::Float(f) => format!("{}", f), DataType::Int(i) => i.to_string(), DataType::Bool(b) => b.to_string(), DataType::DateTime(f) => format!("{}", f), other => other.to_string(), }) .collect::>() .join("\t"); out.push_str(&cells); out.push('\n'); } out.push_str("\n"); } } Ok(out) } fn parse_pptx(file_path: &str) -> Result { let file = std::fs::File::open(file_path)?; let mut zip = ZipArchive::new(file).map_err(|e| RagError::ParseError(e.to_string()))?; // Collect slide files: ppt/slides/slide*.xml let mut slides = Vec::new(); for i in 0..zip.len() { let name = zip.by_index(i).map(|f| f.name().to_string()).unwrap_or_default(); if name.starts_with("ppt/slides/") && name.ends_with(".xml") { slides.push(name); } } slides.sort(); let mut output = String::new(); for slide_name in slides { let mut file = zip.by_name(&slide_name).map_err(|e| RagError::ParseError(e.to_string()))?; let mut xml = String::new(); file.read_to_string(&mut xml).map_err(|e| RagError::ParseError(e.to_string()))?; output.push_str(&extract_pptx_text(&xml)); output.push_str("\n\n"); } Ok(output) } fn extract_pptx_text(xml: &str) -> String { let mut reader = Reader::from_str(xml); reader.trim_text(true); let mut buf = Vec::new(); let mut result = String::new(); let mut in_text = false; loop { match reader.read_event_into(&mut buf) { Ok(Event::Start(e)) => { let name: String = reader .decoder() .decode(e.name().as_ref()) .unwrap_or(Cow::Borrowed("")) .into_owned(); if name.ends_with(":t") || name == "a:t" || name == "t" { in_text = true; } } Ok(Event::End(e)) => { let name: String = reader .decoder() .decode(e.name().as_ref()) .unwrap_or(Cow::Borrowed("")) .into_owned(); if name.ends_with(":t") || name == "a:t" || name == "t" { in_text = false; result.push(' '); } } Ok(Event::Text(t)) => { if in_text { let text = t.unescape().unwrap_or_default(); result.push_str(&text); } } Ok(Event::Eof) => break, Err(_) => break, _ => {} } } result } fn parse_html(file_path: &str) -> Result { let html = read_text_auto(file_path)?; // 80-column wrap default Ok(html2text::from_read(Cursor::new(html), 80)) } fn read_text_auto(file_path: &str) -> Result { let bytes = fs::read(file_path)?; // Detect encoding let mut detector = EncodingDetector::new(); detector.feed(&bytes, true); let enc = detector.guess(None, true); let (decoded, _, had_errors) = enc.decode(&bytes); if had_errors { // fallback to UTF-8 lossy Ok(String::from_utf8_lossy(&bytes).to_string()) } else { Ok(decoded.to_string()) } }