275 lines
9.5 KiB
Rust
275 lines
9.5 KiB
Rust
use crate::RagError;
|
||
use std::fs;
|
||
use std::io::{Read, Cursor};
|
||
use zip::read::ZipArchive;
|
||
use quick_xml::events::Event;
|
||
use quick_xml::Reader;
|
||
use csv as csv_crate;
|
||
use calamine::{Reader as _, open_workbook_auto, DataType};
|
||
use html2text;
|
||
use chardetng::EncodingDetector;
|
||
use infer;
|
||
use std::borrow::Cow;
|
||
|
||
pub fn parse_pdf(file_path: &str) -> Result<String, RagError> {
|
||
let bytes = fs::read(file_path)?;
|
||
let text = pdf_extract::extract_text_from_mem(&bytes)
|
||
.map_err(|e| RagError::ParseError(format!("PDF parse error: {}", e)))?;
|
||
|
||
// Validate that the PDF has extractable text (not image-based/scanned)
|
||
// Count meaningful characters (excluding whitespace)
|
||
let meaningful_chars = text.chars()
|
||
.filter(|c| !c.is_whitespace())
|
||
.count();
|
||
|
||
// Require at least 50 non-whitespace characters to consider it a text PDF
|
||
// This threshold filters out PDFs that are purely images or scanned documents
|
||
if meaningful_chars < 50 {
|
||
return Err(RagError::ParseError(
|
||
"PDF appears to be image-based or scanned. OCR is not supported yet. Please use a text-based PDF.".to_string()
|
||
));
|
||
}
|
||
|
||
Ok(text)
|
||
}
|
||
|
||
pub fn parse_text(file_path: &str) -> Result<String, RagError> {
|
||
read_text_auto(file_path)
|
||
}
|
||
|
||
pub fn parse_document(file_path: &str, file_type: &str) -> Result<String, RagError> {
|
||
match file_type.to_lowercase().as_str() {
|
||
"pdf" | "application/pdf" => parse_pdf(file_path),
|
||
"txt" | "text/plain" | "md" | "text/markdown" => parse_text(file_path),
|
||
"csv" | "text/csv" => parse_csv(file_path),
|
||
// Excel family via calamine
|
||
"xlsx"
|
||
| "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||
| "xls"
|
||
| "application/vnd.ms-excel"
|
||
| "ods"
|
||
| "application/vnd.oasis.opendocument.spreadsheet" => parse_spreadsheet(file_path),
|
||
// PowerPoint
|
||
"pptx"
|
||
| "application/vnd.openxmlformats-officedocument.presentationml.presentation" => parse_pptx(file_path),
|
||
// HTML
|
||
"html" | "htm" | "text/html" => parse_html(file_path),
|
||
"docx"
|
||
| "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
|
||
parse_docx(file_path)
|
||
}
|
||
other => {
|
||
// Try MIME sniffing when extension or MIME is unknown
|
||
if let Ok(Some(k)) = infer::get_from_path(file_path) {
|
||
let mime = k.mime_type();
|
||
return parse_document(file_path, mime);
|
||
}
|
||
Err(RagError::UnsupportedFileType(other.to_string()))
|
||
}
|
||
}
|
||
}
|
||
|
||
fn parse_docx(file_path: &str) -> Result<String, RagError> {
|
||
let file = std::fs::File::open(file_path)?;
|
||
let mut zip = ZipArchive::new(file).map_err(|e| RagError::ParseError(e.to_string()))?;
|
||
|
||
// Standard DOCX stores document text at word/document.xml
|
||
let mut doc_xml = match zip.by_name("word/document.xml") {
|
||
Ok(f) => f,
|
||
Err(_) => return Err(RagError::ParseError("document.xml not found".into())),
|
||
};
|
||
let mut xml_content = String::new();
|
||
doc_xml
|
||
.read_to_string(&mut xml_content)
|
||
.map_err(|e| RagError::ParseError(e.to_string()))?;
|
||
|
||
// Parse XML and extract text from w:t nodes; add newlines on w:p boundaries
|
||
let mut reader = Reader::from_str(&xml_content);
|
||
reader.trim_text(true);
|
||
let mut buf = Vec::new();
|
||
let mut result = String::new();
|
||
let mut in_text = false;
|
||
|
||
loop {
|
||
match reader.read_event_into(&mut buf) {
|
||
Ok(Event::Start(e)) => {
|
||
let name: String = reader
|
||
.decoder()
|
||
.decode(e.name().as_ref())
|
||
.unwrap_or(Cow::Borrowed(""))
|
||
.into_owned();
|
||
if name.ends_with(":t") || name == "w:t" || name == "t" {
|
||
in_text = true;
|
||
}
|
||
}
|
||
Ok(Event::End(e)) => {
|
||
let name: String = reader
|
||
.decoder()
|
||
.decode(e.name().as_ref())
|
||
.unwrap_or(Cow::Borrowed(""))
|
||
.into_owned();
|
||
if name.ends_with(":t") || name == "w:t" || name == "t" {
|
||
in_text = false;
|
||
result.push(' ');
|
||
}
|
||
if name.ends_with(":p") || name == "w:p" || name == "p" {
|
||
// Paragraph end – add newline
|
||
result.push_str("\n\n");
|
||
}
|
||
}
|
||
Ok(Event::Text(t)) => {
|
||
if in_text {
|
||
let text = t.unescape().unwrap_or_default();
|
||
result.push_str(&text);
|
||
}
|
||
}
|
||
Ok(Event::Eof) => break,
|
||
Err(e) => return Err(RagError::ParseError(e.to_string())),
|
||
_ => {}
|
||
}
|
||
}
|
||
|
||
// Normalize whitespace
|
||
let normalized = result
|
||
.lines()
|
||
.map(|l| l.trim())
|
||
.filter(|l| !l.is_empty())
|
||
.collect::<Vec<_>>()
|
||
.join("\n");
|
||
Ok(normalized)
|
||
}
|
||
|
||
fn parse_csv(file_path: &str) -> Result<String, RagError> {
|
||
let mut rdr = csv_crate::ReaderBuilder::new()
|
||
.has_headers(false)
|
||
.flexible(true)
|
||
.from_path(file_path)
|
||
.map_err(|e| RagError::ParseError(e.to_string()))?;
|
||
let mut out = String::new();
|
||
for rec in rdr.records() {
|
||
let rec = rec.map_err(|e| RagError::ParseError(e.to_string()))?;
|
||
out.push_str(&rec.iter().collect::<Vec<_>>().join(", "));
|
||
out.push('\n');
|
||
}
|
||
Ok(out)
|
||
}
|
||
|
||
fn parse_spreadsheet(file_path: &str) -> Result<String, RagError> {
|
||
let mut workbook = open_workbook_auto(file_path)
|
||
.map_err(|e| RagError::ParseError(e.to_string()))?;
|
||
let mut out = String::new();
|
||
for sheet_name in workbook.sheet_names().to_owned() {
|
||
if let Ok(range) = workbook.worksheet_range(&sheet_name) {
|
||
out.push_str(&format!("# Sheet: {}\n", sheet_name));
|
||
for row in range.rows() {
|
||
let cells = row
|
||
.iter()
|
||
.map(|c| match c {
|
||
DataType::Empty => "".to_string(),
|
||
DataType::String(s) => s.to_string(),
|
||
DataType::Float(f) => format!("{}", f),
|
||
DataType::Int(i) => i.to_string(),
|
||
DataType::Bool(b) => b.to_string(),
|
||
DataType::DateTime(f) => format!("{}", f),
|
||
other => other.to_string(),
|
||
})
|
||
.collect::<Vec<_>>()
|
||
.join("\t");
|
||
out.push_str(&cells);
|
||
out.push('\n');
|
||
}
|
||
out.push_str("\n");
|
||
}
|
||
}
|
||
Ok(out)
|
||
}
|
||
|
||
fn parse_pptx(file_path: &str) -> Result<String, RagError> {
|
||
let file = std::fs::File::open(file_path)?;
|
||
let mut zip = ZipArchive::new(file).map_err(|e| RagError::ParseError(e.to_string()))?;
|
||
|
||
// Collect slide files: ppt/slides/slide*.xml
|
||
let mut slides = Vec::new();
|
||
for i in 0..zip.len() {
|
||
let name = zip.by_index(i).map(|f| f.name().to_string()).unwrap_or_default();
|
||
if name.starts_with("ppt/slides/") && name.ends_with(".xml") {
|
||
slides.push(name);
|
||
}
|
||
}
|
||
slides.sort();
|
||
|
||
let mut output = String::new();
|
||
for slide_name in slides {
|
||
let mut file = zip.by_name(&slide_name).map_err(|e| RagError::ParseError(e.to_string()))?;
|
||
let mut xml = String::new();
|
||
file.read_to_string(&mut xml).map_err(|e| RagError::ParseError(e.to_string()))?;
|
||
output.push_str(&extract_pptx_text(&xml));
|
||
output.push_str("\n\n");
|
||
}
|
||
Ok(output)
|
||
}
|
||
|
||
fn extract_pptx_text(xml: &str) -> String {
|
||
let mut reader = Reader::from_str(xml);
|
||
reader.trim_text(true);
|
||
let mut buf = Vec::new();
|
||
let mut result = String::new();
|
||
let mut in_text = false;
|
||
loop {
|
||
match reader.read_event_into(&mut buf) {
|
||
Ok(Event::Start(e)) => {
|
||
let name: String = reader
|
||
.decoder()
|
||
.decode(e.name().as_ref())
|
||
.unwrap_or(Cow::Borrowed(""))
|
||
.into_owned();
|
||
if name.ends_with(":t") || name == "a:t" || name == "t" {
|
||
in_text = true;
|
||
}
|
||
}
|
||
Ok(Event::End(e)) => {
|
||
let name: String = reader
|
||
.decoder()
|
||
.decode(e.name().as_ref())
|
||
.unwrap_or(Cow::Borrowed(""))
|
||
.into_owned();
|
||
if name.ends_with(":t") || name == "a:t" || name == "t" {
|
||
in_text = false;
|
||
result.push(' ');
|
||
}
|
||
}
|
||
Ok(Event::Text(t)) => {
|
||
if in_text {
|
||
let text = t.unescape().unwrap_or_default();
|
||
result.push_str(&text);
|
||
}
|
||
}
|
||
Ok(Event::Eof) => break,
|
||
Err(_) => break,
|
||
_ => {}
|
||
}
|
||
}
|
||
result
|
||
}
|
||
|
||
fn parse_html(file_path: &str) -> Result<String, RagError> {
|
||
let html = read_text_auto(file_path)?;
|
||
// 80-column wrap default
|
||
Ok(html2text::from_read(Cursor::new(html), 80))
|
||
}
|
||
|
||
fn read_text_auto(file_path: &str) -> Result<String, RagError> {
|
||
let bytes = fs::read(file_path)?;
|
||
// Detect encoding
|
||
let mut detector = EncodingDetector::new();
|
||
detector.feed(&bytes, true);
|
||
let enc = detector.guess(None, true);
|
||
let (decoded, _, had_errors) = enc.decode(&bytes);
|
||
if had_errors {
|
||
// fallback to UTF-8 lossy
|
||
Ok(String::from_utf8_lossy(&bytes).to_string())
|
||
} else {
|
||
Ok(decoded.to_string())
|
||
}
|
||
}
|