2025-10-09 03:48:51 +07:00

275 lines
9.5 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use crate::RagError;
use std::fs;
use std::io::{Read, Cursor};
use zip::read::ZipArchive;
use quick_xml::events::Event;
use quick_xml::Reader;
use csv as csv_crate;
use calamine::{Reader as _, open_workbook_auto, DataType};
use html2text;
use chardetng::EncodingDetector;
use infer;
use std::borrow::Cow;
pub fn parse_pdf(file_path: &str) -> Result<String, RagError> {
let bytes = fs::read(file_path)?;
let text = pdf_extract::extract_text_from_mem(&bytes)
.map_err(|e| RagError::ParseError(format!("PDF parse error: {}", e)))?;
// Validate that the PDF has extractable text (not image-based/scanned)
// Count meaningful characters (excluding whitespace)
let meaningful_chars = text.chars()
.filter(|c| !c.is_whitespace())
.count();
// Require at least 50 non-whitespace characters to consider it a text PDF
// This threshold filters out PDFs that are purely images or scanned documents
if meaningful_chars < 50 {
return Err(RagError::ParseError(
"PDF appears to be image-based or scanned. OCR is not supported yet. Please use a text-based PDF.".to_string()
));
}
Ok(text)
}
/// Reads a plain-text file (used for `.txt` and Markdown inputs) and returns
/// its contents as a `String`.
///
/// This is a thin public wrapper: decoding, including encoding detection, is
/// delegated entirely to `read_text_auto`, and its errors are returned as-is.
pub fn parse_text(file_path: &str) -> Result<String, RagError> {
    read_text_auto(file_path)
}
/// Dispatches `file_path` to the parser matching `file_type`.
///
/// `file_type` may be a file extension (`"pdf"`, `"docx"`, ...) or a MIME type
/// (`"application/pdf"`, ...); matching is case-insensitive.
///
/// When `file_type` is not recognised, the file content is sniffed with
/// `infer` and, if that yields a *different* type string, dispatch is retried
/// once with the sniffed MIME type.
///
/// # Errors
///
/// * `RagError::UnsupportedFileType` when neither `file_type` nor the sniffed
///   MIME type maps to a parser. After a sniffing retry the error carries the
///   sniffed MIME type.
/// * Any error produced by the selected parser.
pub fn parse_document(file_path: &str, file_type: &str) -> Result<String, RagError> {
    match file_type.to_lowercase().as_str() {
        "pdf" | "application/pdf" => parse_pdf(file_path),
        "txt" | "text/plain" | "md" | "text/markdown" => parse_text(file_path),
        "csv" | "text/csv" => parse_csv(file_path),
        // Excel family via calamine
        "xlsx"
        | "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        | "xls"
        | "application/vnd.ms-excel"
        | "ods"
        | "application/vnd.oasis.opendocument.spreadsheet" => parse_spreadsheet(file_path),
        // PowerPoint
        "pptx"
        | "application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
            parse_pptx(file_path)
        }
        // HTML
        "html" | "htm" | "text/html" => parse_html(file_path),
        // Word
        "docx"
        | "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => {
            parse_docx(file_path)
        }
        other => {
            // Try MIME sniffing when the extension or MIME type is unknown.
            if let Ok(Some(k)) = infer::get_from_path(file_path) {
                let mime = k.mime_type();
                // Retry only when sniffing produced something new. Without this
                // guard a sniffed-but-unsupported type (e.g. "image/png") would
                // land in this arm again, sniff the same MIME type and recurse
                // until the stack overflows.
                if !mime.eq_ignore_ascii_case(other) {
                    return parse_document(file_path, mime);
                }
            }
            Err(RagError::UnsupportedFileType(other.to_string()))
        }
    }
}
/// Extracts the plain text of a DOCX file, one output line per paragraph.
///
/// The archive member `word/document.xml` is read and scanned event by event:
///
/// * character data inside `t` elements (any namespace prefix) is copied
///   verbatim, so a word that the document stores across several runs stays
///   in one piece and whitespace at run boundaries is kept;
/// * a self-closing `tab` element becomes a space, and `br` / `cr` become a
///   line break;
/// * the end of a `p` element ends the current line.
///
/// Finally every line is trimmed and blank lines are dropped.
///
/// # Errors
///
/// * The I/O error from opening the file, converted into `RagError` by `?`.
/// * `RagError::ParseError` when the file is not a readable ZIP archive, when
///   `word/document.xml` is missing or unreadable, or when the XML is malformed.
fn parse_docx(file_path: &str) -> Result<String, RagError> {
    /// True when the qualified name `qname` has local part `local`,
    /// with or without a namespace prefix.
    fn has_local_name(qname: &str, local: &str) -> bool {
        match qname.strip_suffix(local) {
            Some(prefix) => prefix.is_empty() || prefix.ends_with(':'),
            None => false,
        }
    }

    let file = std::fs::File::open(file_path)?;
    let mut zip = ZipArchive::new(file).map_err(|e| RagError::ParseError(e.to_string()))?;
    // Standard DOCX stores document text at word/document.xml
    let mut doc_xml = match zip.by_name("word/document.xml") {
        Ok(f) => f,
        Err(_) => return Err(RagError::ParseError("document.xml not found".into())),
    };
    let mut xml_content = String::new();
    doc_xml
        .read_to_string(&mut xml_content)
        .map_err(|e| RagError::ParseError(e.to_string()))?;

    // Text trimming is deliberately left off: whitespace inside a text element
    // is part of the document, and only text inside `t` elements is collected,
    // so inter-element whitespace never reaches the output.
    let mut reader = Reader::from_str(&xml_content);
    let mut buf = Vec::new();
    let mut result = String::new();
    let mut in_text = false;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                let name: String = reader
                    .decoder()
                    .decode(e.name().as_ref())
                    .unwrap_or(Cow::Borrowed(""))
                    .into_owned();
                if has_local_name(&name, "t") {
                    in_text = true;
                }
            }
            Ok(Event::End(e)) => {
                let name: String = reader
                    .decoder()
                    .decode(e.name().as_ref())
                    .unwrap_or(Cow::Borrowed(""))
                    .into_owned();
                if has_local_name(&name, "t") {
                    // No separator here: consecutive runs may belong to one word.
                    in_text = false;
                } else if has_local_name(&name, "p") {
                    // Paragraph end: start a new output line.
                    result.push('\n');
                }
            }
            Ok(Event::Empty(e)) => {
                let name: String = reader
                    .decoder()
                    .decode(e.name().as_ref())
                    .unwrap_or(Cow::Borrowed(""))
                    .into_owned();
                if has_local_name(&name, "tab") {
                    // Tab-stop definitions in paragraph properties use the same
                    // element name; a space emitted for those sits at the start
                    // of the line and is removed by the trim below.
                    result.push(' ');
                } else if has_local_name(&name, "br") || has_local_name(&name, "cr") {
                    result.push('\n');
                }
            }
            Ok(Event::Text(t)) => {
                if in_text {
                    let text = t.unescape().unwrap_or_default();
                    result.push_str(&text);
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => return Err(RagError::ParseError(e.to_string())),
            _ => {}
        }
        // Events borrow from `buf`; clearing it per iteration keeps it from
        // growing to the size of the whole document.
        buf.clear();
    }

    // Normalize whitespace: trim every line and drop the empty ones.
    let normalized = result
        .lines()
        .map(|l| l.trim())
        .filter(|l| !l.is_empty())
        .collect::<Vec<_>>()
        .join("\n");
    Ok(normalized)
}
/// Converts a CSV file into plain text: one line per record, with the fields
/// of a record joined by `", "`.
///
/// The first row is treated as data (headers are not skipped) and records may
/// have differing numbers of fields.
///
/// # Errors
///
/// Returns `RagError::ParseError` when the file cannot be opened by the CSV
/// reader or when a record fails to parse.
fn parse_csv(file_path: &str) -> Result<String, RagError> {
    let mut builder = csv_crate::ReaderBuilder::new();
    builder.has_headers(false).flexible(true);
    let mut reader = builder
        .from_path(file_path)
        .map_err(|err| RagError::ParseError(err.to_string()))?;

    let mut text = String::new();
    for row in reader.records() {
        let row = row.map_err(|err| RagError::ParseError(err.to_string()))?;
        for (idx, field) in row.iter().enumerate() {
            if idx > 0 {
                text.push_str(", ");
            }
            text.push_str(field);
        }
        text.push('\n');
    }
    Ok(text)
}
fn parse_spreadsheet(file_path: &str) -> Result<String, RagError> {
let mut workbook = open_workbook_auto(file_path)
.map_err(|e| RagError::ParseError(e.to_string()))?;
let mut out = String::new();
for sheet_name in workbook.sheet_names().to_owned() {
if let Ok(range) = workbook.worksheet_range(&sheet_name) {
out.push_str(&format!("# Sheet: {}\n", sheet_name));
for row in range.rows() {
let cells = row
.iter()
.map(|c| match c {
DataType::Empty => "".to_string(),
DataType::String(s) => s.to_string(),
DataType::Float(f) => format!("{}", f),
DataType::Int(i) => i.to_string(),
DataType::Bool(b) => b.to_string(),
DataType::DateTime(f) => format!("{}", f),
other => other.to_string(),
})
.collect::<Vec<_>>()
.join("\t");
out.push_str(&cells);
out.push('\n');
}
out.push_str("\n");
}
}
Ok(out)
}
/// Extracts the text of every slide in a PPTX file.
///
/// Slides are the archive entries under `ppt/slides/` ending in `.xml`. They
/// are processed in ascending order of the number at the end of the file name
/// (`slide2.xml` before `slide10.xml`); the text of each slide is followed by
/// a blank line.
///
/// # Errors
///
/// * The I/O error from opening the file, converted into `RagError` by `?`.
/// * `RagError::ParseError` when the file is not a readable ZIP archive or a
///   slide entry cannot be read as UTF-8 text.
fn parse_pptx(file_path: &str) -> Result<String, RagError> {
    /// Number at the end of an entry's file stem, e.g. `12` for
    /// `ppt/slides/slide12.xml`; `None` when the stem has no numeric suffix.
    fn slide_number(entry_name: &str) -> Option<u64> {
        let file_name = entry_name.rsplit('/').next()?;
        let stem = file_name.strip_suffix(".xml")?;
        let prefix_len = stem.trim_end_matches(|c: char| c.is_ascii_digit()).len();
        stem[prefix_len..].parse().ok()
    }

    let file = std::fs::File::open(file_path)?;
    let mut zip = ZipArchive::new(file).map_err(|e| RagError::ParseError(e.to_string()))?;

    // Collect slide files: ppt/slides/slide*.xml
    let mut slides = Vec::new();
    for i in 0..zip.len() {
        let name = zip.by_index(i).map(|f| f.name().to_string()).unwrap_or_default();
        if name.starts_with("ppt/slides/") && name.ends_with(".xml") {
            slides.push(name);
        }
    }

    // A plain string sort would put "slide10.xml" before "slide2.xml", so
    // order by the numeric suffix and use the full name only as a tie-breaker.
    slides.sort_by(|a, b| {
        slide_number(a)
            .cmp(&slide_number(b))
            .then_with(|| a.cmp(b))
    });

    let mut output = String::new();
    for slide_name in slides {
        let mut file = zip
            .by_name(&slide_name)
            .map_err(|e| RagError::ParseError(e.to_string()))?;
        let mut xml = String::new();
        file.read_to_string(&mut xml)
            .map_err(|e| RagError::ParseError(e.to_string()))?;
        output.push_str(&extract_pptx_text(&xml));
        output.push_str("\n\n");
    }
    Ok(output)
}
/// Collects the visible text of one slide's XML into a single line.
///
/// Character data inside `t` elements (any namespace prefix) is copied
/// verbatim, so a word stored across several runs stays in one piece. A single
/// space is inserted at the end of each `p` element and for each self-closing
/// `br` element, so paragraphs and line breaks remain separated.
///
/// Extraction is best-effort: on malformed XML the text gathered up to that
/// point is returned rather than an error.
fn extract_pptx_text(xml: &str) -> String {
    /// True when the qualified name `qname` has local part `local`,
    /// with or without a namespace prefix.
    fn has_local_name(qname: &str, local: &str) -> bool {
        match qname.strip_suffix(local) {
            Some(prefix) => prefix.is_empty() || prefix.ends_with(':'),
            None => false,
        }
    }

    /// Appends one space unless `out` is empty or already ends with one, so
    /// empty paragraphs do not pile up separators.
    fn push_separator(out: &mut String) {
        if !out.is_empty() && !out.ends_with(' ') {
            out.push(' ');
        }
    }

    // Text trimming is deliberately left off: whitespace inside a text element
    // is content, and only text inside `t` elements is collected.
    let mut reader = Reader::from_str(xml);
    let mut buf = Vec::new();
    let mut result = String::new();
    let mut in_text = false;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                let name: String = reader
                    .decoder()
                    .decode(e.name().as_ref())
                    .unwrap_or(Cow::Borrowed(""))
                    .into_owned();
                if has_local_name(&name, "t") {
                    in_text = true;
                }
            }
            Ok(Event::End(e)) => {
                let name: String = reader
                    .decoder()
                    .decode(e.name().as_ref())
                    .unwrap_or(Cow::Borrowed(""))
                    .into_owned();
                if has_local_name(&name, "t") {
                    // No separator here: consecutive runs may belong to one word.
                    in_text = false;
                } else if has_local_name(&name, "p") {
                    push_separator(&mut result);
                }
            }
            Ok(Event::Empty(e)) => {
                let name: String = reader
                    .decoder()
                    .decode(e.name().as_ref())
                    .unwrap_or(Cow::Borrowed(""))
                    .into_owned();
                if has_local_name(&name, "br") {
                    push_separator(&mut result);
                }
            }
            Ok(Event::Text(t)) => {
                if in_text {
                    let text = t.unescape().unwrap_or_default();
                    result.push_str(&text);
                }
            }
            Ok(Event::Eof) => break,
            // Best-effort: keep whatever was extracted before the bad markup.
            Err(_) => break,
            _ => {}
        }
        // Events borrow from `buf`; clearing it per iteration keeps it from
        // growing to the size of the whole slide.
        buf.clear();
    }
    result
}
/// Renders an HTML file as plain text wrapped at 80 columns.
///
/// The file is decoded by `read_text_auto` and the resulting markup is handed
/// to `html2text`.
///
/// # Errors
///
/// Returns whatever error `read_text_auto` produces while reading the file.
fn parse_html(file_path: &str) -> Result<String, RagError> {
    // Line width passed to the text renderer.
    const WRAP_COLUMNS: usize = 80;

    let markup = read_text_auto(file_path)?;
    let rendered = html2text::from_read(Cursor::new(markup), WRAP_COLUMNS);
    Ok(rendered)
}
/// Reads a file and decodes it to a `String` using a guessed character
/// encoding.
///
/// The whole file is fed to an `EncodingDetector` and decoded with the
/// encoding it guesses. If that decode reports malformed sequences, the bytes
/// are instead converted with lossy UTF-8 decoding.
///
/// # Errors
///
/// Returns the I/O error from reading the file, converted into `RagError`
/// by `?`.
fn read_text_auto(file_path: &str) -> Result<String, RagError> {
    let raw = fs::read(file_path)?;

    // Feed the complete content in one call, then ask for the best guess.
    let mut sniffer = EncodingDetector::new();
    sniffer.feed(&raw, true);
    let encoding = sniffer.guess(None, true);

    let (text, _, malformed) = encoding.decode(&raw);
    let output = if malformed {
        // The guess did not decode cleanly: fall back to UTF-8 lossy.
        String::from_utf8_lossy(&raw).into_owned()
    } else {
        text.into_owned()
    };
    Ok(output)
}