//! 文档处理管线 — PDF/DOCX/Excel 格式提取 //! //! 核心思想:每种格式输出统一的 NormalizedDocument,后面复用现有管线。 //! Excel 走独立的结构化通道(JSONB 行级存储),不走 RAG。 use calamine::{Reader, Data, Range}; // === 规范化文档 — 所有格式的统一中间表示 === /// 文档提取结果(用于 RAG 通道) pub struct NormalizedDocument { pub title: String, pub sections: Vec, pub metadata: DocumentMetadata, } pub struct DocumentSection { pub heading: Option, pub content: String, pub level: u8, pub page_number: Option, } pub struct DocumentMetadata { pub source_format: String, pub file_name: String, pub total_pages: Option, pub total_sections: u32, } // === 格式路由 === /// 根据文件扩展名判断处理通道 pub fn detect_format(file_name: &str) -> Option { let ext = file_name.rsplit('.').next().unwrap_or("").to_lowercase(); match ext.as_str() { "pdf" => Some(DocumentFormat::Pdf), "docx" | "doc" => Some(DocumentFormat::Docx), "xlsx" | "xls" => Some(DocumentFormat::Excel), "md" | "txt" | "markdown" => Some(DocumentFormat::Markdown), "csv" => Some(DocumentFormat::Csv), _ => None, } } #[derive(Debug, Clone, Copy, PartialEq)] pub enum DocumentFormat { Pdf, Docx, Excel, Csv, Markdown, } impl DocumentFormat { pub fn is_structured(&self) -> bool { matches!(self, Self::Excel | Self::Csv) } } // === 文件处理结果 === pub enum ProcessedFile { /// 文档通道(RAG)— PDF/DOCX/Markdown Document(NormalizedDocument), /// 结构化通道 — Excel/CSV 行数据 Structured { title: String, sheet_names: Vec, column_headers: Vec, rows: Vec<(Option, i32, Vec, serde_json::Value)>, }, } // === 提取错误 === #[derive(Debug)] pub struct ExtractError(pub String); impl std::fmt::Display for ExtractError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } impl std::error::Error for ExtractError {} impl From for crate::error::SaasError { fn from(e: ExtractError) -> Self { crate::error::SaasError::InvalidInput(e.0) } } // === PDF 提取 === pub fn extract_pdf(data: &[u8], file_name: &str) -> Result { let text = pdf_extract::extract_text_from_mem(data) .map_err(|e| ExtractError(format!("PDF 提取失败: {}", e)))?; let pages: Vec<&str> = text.split('\x0c').collect(); let page_count = pages.len() as u32; let mut sections = Vec::new(); let mut current_content = String::new(); for (i, page) in pages.iter().enumerate() { let page_text = page.trim(); if page_text.is_empty() { continue; } current_content.push_str(page_text); current_content.push('\n'); if current_content.len() > 2000 || i == pages.len() - 1 { let content = current_content.trim().to_string(); if !content.is_empty() { sections.push(DocumentSection { heading: Some(format!("第 {} 页", i + 1)), content, level: 2, page_number: Some((i + 1) as u32), }); } current_content.clear(); } } let title = extract_title(file_name, ".pdf"); let total_sections = sections.len() as u32; Ok(NormalizedDocument { title, sections, metadata: DocumentMetadata { source_format: "pdf".to_string(), file_name: file_name.to_string(), total_pages: Some(page_count), total_sections, }, }) } // === DOCX 提取 === pub fn extract_docx(data: &[u8], file_name: &str) -> Result { let reader = std::io::Cursor::new(data); let mut archive = zip::ZipArchive::new(reader) .map_err(|e| ExtractError(format!("DOCX 解压失败: {}", e)))?; let mut doc_xml = archive.by_name("word/document.xml") .map_err(|e| ExtractError(format!("DOCX 中未找到 document.xml: {}", e)))?; let mut xml_content = String::new(); use std::io::Read; doc_xml.read_to_string(&mut xml_content) .map_err(|e| ExtractError(format!("DOCX 读取失败: {}", e)))?; let mut sections = Vec::new(); let mut current_heading: Option = None; let mut current_content = String::new(); // 简单 XML 解析:提取 文本和 标题层级 let mut in_text = false; let mut paragraph_style = String::new(); let mut text_buf = String::new(); let mut reader = quick_xml::Reader::from_str(&xml_content); let mut buf = Vec::new(); loop { match reader.read_event_into(&mut buf) { Ok(quick_xml::events::Event::Start(e)) => { let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string(); match name.as_str() { "p" => paragraph_style.clear(), "t" => in_text = true, "pStyle" => { for attr in e.attributes().flatten() { if attr.key.local_name().as_ref() == b"val" { paragraph_style = String::from_utf8_lossy(&attr.value).to_string(); } } } _ => {} } } Ok(quick_xml::events::Event::Text(t)) => { if in_text { text_buf.push_str(&t.unescape().unwrap_or_default()); } } Ok(quick_xml::events::Event::End(e)) => { let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string(); match name.as_str() { "p" => { let text = text_buf.trim().to_string(); text_buf.clear(); if text.is_empty() { continue; } let is_heading = paragraph_style.starts_with("Heading") || paragraph_style.starts_with("heading") || paragraph_style == "Title"; if is_heading { if !current_content.is_empty() { sections.push(DocumentSection { heading: current_heading.take(), content: current_content.trim().to_string(), level: 2, page_number: None, }); current_content.clear(); } current_heading = Some(text); } else { current_content.push_str(&text); current_content.push('\n'); } } "t" => in_text = false, _ => {} } } Ok(quick_xml::events::Event::Eof) => break, Err(e) => { tracing::warn!("DOCX XML parse warning: {}", e); break; } _ => {} } buf.clear(); } if !current_content.is_empty() { sections.push(DocumentSection { heading: current_heading, content: current_content.trim().to_string(), level: 2, page_number: None, }); } let title = extract_title(file_name, ".docx"); let total_sections = sections.len() as u32; Ok(NormalizedDocument { title, sections, metadata: DocumentMetadata { source_format: "docx".to_string(), file_name: file_name.to_string(), total_pages: None, total_sections, }, }) } // === Excel 解析 === pub fn extract_excel(data: &[u8], file_name: &str) -> Result { let cursor = std::io::Cursor::new(data); let mut workbook: calamine::Xlsx<_> = calamine::open_workbook_from_rs(cursor) .map_err(|e| ExtractError(format!("Excel 解析失败: {}", e)))?; let sheet_names = workbook.sheet_names().to_vec(); let mut all_rows: Vec<(Option, i32, Vec, serde_json::Value)> = Vec::new(); let mut all_headers: Vec = Vec::new(); let mut global_row_index = 0i32; for sheet_name in &sheet_names { if let Ok(range) = workbook.worksheet_range(sheet_name) { let mut headers: Vec = Vec::new(); let mut first_row = true; for row in range_as_data_rows(&range) { if first_row { headers = row.iter().map(|cell| { cell.to_string().trim().to_string() }).collect(); headers.retain(|h| !h.is_empty()); if headers.is_empty() { first_row = false; continue; } for h in &headers { if !all_headers.contains(h) { all_headers.push(h.clone()); } } first_row = false; continue; } let mut row_map = serde_json::Map::new(); for (i, cell) in row.iter().enumerate() { if i >= headers.len() { break; } let value = match cell { Data::Empty => continue, Data::String(s) => serde_json::Value::String(s.clone()), Data::Float(f) => serde_json::json!(f), Data::Int(n) => serde_json::json!(n), Data::Bool(b) => serde_json::Value::Bool(*b), Data::DateTime(dt) => { serde_json::Value::String(dt.to_string()) } Data::DateTimeIso(s) => { serde_json::Value::String(s.clone()) } Data::DurationIso(s) => { serde_json::Value::String(s.clone()) } Data::Error(e) => { serde_json::Value::String(format!("{:?}", e)) } }; row_map.insert(headers[i].clone(), value); } if !row_map.is_empty() { all_rows.push(( Some(sheet_name.clone()), global_row_index, headers.clone(), serde_json::Value::Object(row_map), )); global_row_index += 1; } } } } let title = extract_title(file_name, ".xlsx"); Ok(ProcessedFile::Structured { title, sheet_names, column_headers: all_headers, rows: all_rows, }) } // === 工具函数 === /// 辅助:将 Range 转为行的 Vec,解决 calamine 类型推断问题 fn range_as_data_rows(range: &Range) -> Vec> { range.rows().map(|row| row.to_vec()).collect() } /// 从文件名提取标题 fn extract_title(file_name: &str, ext: &str) -> String { file_name .rsplit_once('/') .or_else(|| file_name.rsplit_once('\\')) .map(|(_, name)| name) .unwrap_or(file_name) .trim_end_matches(ext) .to_string() } /// 将 NormalizedDocument 转为单个 Markdown 内容字符串 pub fn normalized_to_markdown(doc: &NormalizedDocument) -> String { let mut md = String::new(); for section in &doc.sections { if let Some(ref heading) = section.heading { md.push_str(&format!("## {}\n\n", heading)); } md.push_str(§ion.content); md.push_str("\n\n"); } md.trim().to_string() }