Files
zclaw_openfang/crates/zclaw-saas/src/knowledge/extractors.rs
iven 60062a8097
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
feat(knowledge): Phase B+C 文档提取器 + multipart 文件上传
- PDF 提取 (pdf-extract) + DOCX 提取 (zip+quick-xml) + Excel 解析 (calamine)
- 统一格式路由 detect_format() → RAG 通道或结构化通道
- POST /api/v1/knowledge/upload multipart 文件上传
- PDF/DOCX/Markdown → RAG 管线,Excel → structured_rows JSONB
- 结构化数据源 CRUD API (GET/DELETE /api/v1/structured/sources)
- POST /api/v1/structured/query JSONB 关键词查询
- 修复 industry/service.rs SaasError::Database 类型不匹配
2026-04-12 19:25:24 +08:00

370 lines
12 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

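//! Document processing pipeline — PDF/DOCX/Excel extraction.
//!
//! Core idea: every format is converted into one unified `NormalizedDocument`, so the existing downstream pipeline can be reused.
//! Excel takes a separate structured channel (row-level JSONB storage) and does not go through RAG.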
use calamine::{Reader, Data, Range};
// === 规范化文档 — 所有格式的统一中间表示 ===
/// Format-independent extraction result, used by the RAG channel.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizedDocument {
    /// Document title.
    pub title: String,
    /// Extracted sections, in document order.
    pub sections: Vec<DocumentSection>,
    /// Information about the file the document was extracted from.
    pub metadata: DocumentMetadata,
}

/// One contiguous piece of extracted text, optionally introduced by a heading.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentSection {
    /// Heading text for this section, if any.
    pub heading: Option<String>,
    /// Body text of the section.
    pub content: String,
    /// Heading level of the section.
    pub level: u8,
    /// Page the section is associated with, when the source format has pages.
    pub page_number: Option<u32>,
}

/// Descriptive information about an extracted document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DocumentMetadata {
    /// Short identifier of the source format (for example `"pdf"`).
    pub source_format: String,
    /// File name exactly as supplied by the caller.
    pub file_name: String,
    /// Number of pages, when the source format has pages.
    pub total_pages: Option<u32>,
    /// Number of entries in the document's `sections`.
    pub total_sections: u32,
}
// === 格式路由 ===
/// Chooses the processing channel for a file from its extension (case-insensitive).
///
/// Returns `None` when the name has no extension or the extension is not supported.
// NOTE(review): "doc" and "xls" are routed to the DOCX / Excel variants; confirm that the
// extractors behind those variants can actually read the legacy binary formats.
pub fn detect_format(file_name: &str) -> Option<DocumentFormat> {
    // Only text after the last '.' is an extension. A name without any dot has none, so a
    // file that is literally called "pdf" must not be treated as a PDF.
    let (_, ext) = file_name.rsplit_once('.')?;
    match ext.to_lowercase().as_str() {
        "pdf" => Some(DocumentFormat::Pdf),
        "docx" | "doc" => Some(DocumentFormat::Docx),
        "xlsx" | "xls" => Some(DocumentFormat::Excel),
        "md" | "txt" | "markdown" => Some(DocumentFormat::Markdown),
        "csv" => Some(DocumentFormat::Csv),
        _ => None,
    }
}

/// File formats recognised by [`detect_format`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentFormat {
    Pdf,
    Docx,
    Excel,
    Csv,
    Markdown,
}

impl DocumentFormat {
    /// Returns `true` for the tabular formats (`Excel`, `Csv`) and `false` for all others.
    pub fn is_structured(&self) -> bool {
        matches!(self, Self::Excel | Self::Csv)
    }
}
// === 文件处理结果 ===
/// Result of processing a file; the variant identifies which channel the data belongs to.
pub enum ProcessedFile {
/// Document (RAG) channel — PDF/DOCX/Markdown.
Document(NormalizedDocument),
/// Structured channel — Excel/CSV row data.
Structured {
/// Title of the data source.
title: String,
/// Names of the sheets in the source workbook.
sheet_names: Vec<String>,
/// Distinct column headers of the data source.
column_headers: Vec<String>,
/// One tuple per data row. As filled by `extract_excel` in this file:
/// (sheet name, running row index, header names of that sheet, row as a JSON object).
rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)>,
},
}
// === 提取错误 ===
/// Extraction failure carrying a human-readable message.
#[derive(Debug)]
pub struct ExtractError(pub String);

impl std::fmt::Display for ExtractError {
    /// Writes the wrapped message verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(message) = self;
        f.write_str(message)
    }
}

impl std::error::Error for ExtractError {}
impl From<ExtractError> for crate::error::SaasError {
fn from(e: ExtractError) -> Self {
crate::error::SaasError::InvalidInput(e.0)
}
}
// === PDF 提取 ===
pub fn extract_pdf(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
let text = pdf_extract::extract_text_from_mem(data)
.map_err(|e| ExtractError(format!("PDF 提取失败: {}", e)))?;
let pages: Vec<&str> = text.split('\x0c').collect();
let page_count = pages.len() as u32;
let mut sections = Vec::new();
let mut current_content = String::new();
for (i, page) in pages.iter().enumerate() {
let page_text = page.trim();
if page_text.is_empty() {
continue;
}
current_content.push_str(page_text);
current_content.push('\n');
if current_content.len() > 2000 || i == pages.len() - 1 {
let content = current_content.trim().to_string();
if !content.is_empty() {
sections.push(DocumentSection {
heading: Some(format!("{}", i + 1)),
content,
level: 2,
page_number: Some((i + 1) as u32),
});
}
current_content.clear();
}
}
let title = extract_title(file_name, ".pdf");
let total_sections = sections.len() as u32;
Ok(NormalizedDocument {
title,
sections,
metadata: DocumentMetadata {
source_format: "pdf".to_string(),
file_name: file_name.to_string(),
total_pages: Some(page_count),
total_sections,
},
})
}
// === DOCX 提取 ===
/// Extracts the text of an in-memory DOCX file and groups it into heading-delimited sections.
///
/// Reads `word/document.xml` from the archive, concatenates the `<w:t>` runs of every
/// paragraph, and treats a paragraph as a heading when its `<w:pStyle>` value starts with
/// `Heading`/`heading` or equals `Title`. Every heading starts a new section; text before the
/// first heading forms a section without a heading.
///
/// XML parse errors are logged and end the scan; the text collected so far is still returned.
///
/// # Errors
/// Returns [`ExtractError`] when the archive cannot be opened, `word/document.xml` is missing,
/// or its content cannot be read as UTF-8 text.
pub fn extract_docx(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
    use std::io::Read;

    let mut archive = zip::ZipArchive::new(std::io::Cursor::new(data))
        .map_err(|e| ExtractError(format!("DOCX 解压失败: {}", e)))?;
    let mut doc_xml = archive
        .by_name("word/document.xml")
        .map_err(|e| ExtractError(format!("DOCX 中未找到 document.xml: {}", e)))?;
    let mut xml_content = String::new();
    doc_xml
        .read_to_string(&mut xml_content)
        .map_err(|e| ExtractError(format!("DOCX 读取失败: {}", e)))?;

    let mut sections = Vec::new();
    let mut current_heading: Option<String> = None;
    let mut current_content = String::new();

    // Simple XML scan: collect <w:t> text and the paragraph's <w:pStyle> value.
    let mut in_text = false;
    let mut paragraph_style = String::new();
    let mut text_buf = String::new();

    let mut reader = quick_xml::Reader::from_str(&xml_content);
    let mut buf = Vec::new();
    loop {
        let event = reader.read_event_into(&mut buf);
        // Self-closing tags arrive as `Empty` and are never followed by an `End` event.
        let self_closing = matches!(event, Ok(quick_xml::events::Event::Empty(_)));
        match event {
            Ok(quick_xml::events::Event::Start(e)) | Ok(quick_xml::events::Event::Empty(e)) => {
                let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
                match name.as_str() {
                    "p" => paragraph_style.clear(),
                    // A self-closing <w:t/> has no text and no closing tag to reset the flag.
                    "t" if !self_closing => in_text = true,
                    // <w:pStyle w:val="..."/> is written self-closing, so it must be handled
                    // for `Empty` events too; matching only `Start` never sees the style.
                    "pStyle" => {
                        for attr in e.attributes().flatten() {
                            if attr.key.local_name().as_ref() == b"val" {
                                paragraph_style = String::from_utf8_lossy(&attr.value).to_string();
                            }
                        }
                    }
                    _ => {}
                }
            }
            Ok(quick_xml::events::Event::Text(t)) => {
                if in_text {
                    text_buf.push_str(&t.unescape().unwrap_or_default());
                }
            }
            Ok(quick_xml::events::Event::End(e)) => {
                let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
                match name.as_str() {
                    "p" => {
                        let text = text_buf.trim().to_string();
                        text_buf.clear();
                        if !text.is_empty() {
                            let is_heading = paragraph_style.starts_with("Heading")
                                || paragraph_style.starts_with("heading")
                                || paragraph_style == "Title";
                            if is_heading {
                                // Also close a heading-only section, so that the text of
                                // consecutive headings is not overwritten and lost.
                                if current_heading.is_some() || !current_content.is_empty() {
                                    sections.push(DocumentSection {
                                        heading: current_heading.take(),
                                        content: current_content.trim().to_string(),
                                        level: 2,
                                        page_number: None,
                                    });
                                    current_content.clear();
                                }
                                current_heading = Some(text);
                            } else {
                                current_content.push_str(&text);
                                current_content.push('\n');
                            }
                        }
                    }
                    "t" => in_text = false,
                    _ => {}
                }
            }
            Ok(quick_xml::events::Event::Eof) => break,
            Err(e) => {
                // Best effort: keep what was parsed so far.
                tracing::warn!("DOCX XML parse warning: {}", e);
                break;
            }
            _ => {}
        }
        buf.clear();
    }

    // Close the last open section, including a trailing heading without body text.
    if current_heading.is_some() || !current_content.is_empty() {
        sections.push(DocumentSection {
            heading: current_heading,
            content: current_content.trim().to_string(),
            level: 2,
            page_number: None,
        });
    }

    let title = extract_title(file_name, ".docx");
    let total_sections = sections.len() as u32;
    Ok(NormalizedDocument {
        title,
        sections,
        metadata: DocumentMetadata {
            source_format: "docx".to_string(),
            file_name: file_name.to_string(),
            total_pages: None,
            total_sections,
        },
    })
}
// === Excel 解析 ===
pub fn extract_excel(data: &[u8], file_name: &str) -> Result<ProcessedFile, ExtractError> {
let cursor = std::io::Cursor::new(data);
let mut workbook: calamine::Xlsx<_> = calamine::open_workbook_from_rs(cursor)
.map_err(|e| ExtractError(format!("Excel 解析失败: {}", e)))?;
let sheet_names = workbook.sheet_names().to_vec();
let mut all_rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)> = Vec::new();
let mut all_headers: Vec<String> = Vec::new();
let mut global_row_index = 0i32;
for sheet_name in &sheet_names {
if let Ok(range) = workbook.worksheet_range(sheet_name) {
let mut headers: Vec<String> = Vec::new();
let mut first_row = true;
for row in range_as_data_rows(&range) {
if first_row {
headers = row.iter().map(|cell| {
cell.to_string().trim().to_string()
}).collect();
headers.retain(|h| !h.is_empty());
if headers.is_empty() { first_row = false; continue; }
for h in &headers {
if !all_headers.contains(h) {
all_headers.push(h.clone());
}
}
first_row = false;
continue;
}
let mut row_map = serde_json::Map::new();
for (i, cell) in row.iter().enumerate() {
if i >= headers.len() { break; }
let value = match cell {
Data::Empty => continue,
Data::String(s) => serde_json::Value::String(s.clone()),
Data::Float(f) => serde_json::json!(f),
Data::Int(n) => serde_json::json!(n),
Data::Bool(b) => serde_json::Value::Bool(*b),
Data::DateTime(dt) => {
serde_json::Value::String(dt.to_string())
}
Data::DateTimeIso(s) => {
serde_json::Value::String(s.clone())
}
Data::DurationIso(s) => {
serde_json::Value::String(s.clone())
}
Data::Error(e) => {
serde_json::Value::String(format!("{:?}", e))
}
};
row_map.insert(headers[i].clone(), value);
}
if !row_map.is_empty() {
all_rows.push((
Some(sheet_name.clone()),
global_row_index,
headers.clone(),
serde_json::Value::Object(row_map),
));
global_row_index += 1;
}
}
}
}
let title = extract_title(file_name, ".xlsx");
Ok(ProcessedFile::Structured {
title,
sheet_names,
column_headers: all_headers,
rows: all_rows,
})
}
// === 工具函数 ===
/// Helper: copies every row of a `Range<Data>` into an owned `Vec`
/// (kept as a separate function to work around a calamine type-inference problem).
fn range_as_data_rows(range: &Range<Data>) -> Vec<Vec<Data>> {
    let mut owned_rows = Vec::new();
    for cells in range.rows() {
        owned_rows.push(cells.to_vec());
    }
    owned_rows
}
/// Derives a document title from a file name.
///
/// Everything up to and including the last path separator (`/` or `\`) is dropped, then a
/// single trailing `ext` (for example `".pdf"`) is removed. The extension is compared
/// ASCII-case-insensitively, so `REPORT.PDF` yields `REPORT`. Names that do not end in `ext`
/// are returned unchanged apart from the removed directory part.
fn extract_title(file_name: &str, ext: &str) -> String {
    // Treat both separators alike so mixed paths such as `dir/sub\file.pdf` work.
    let base_name = file_name
        .rsplit(|c: char| c == '/' || c == '\\')
        .next()
        .unwrap_or(file_name);

    // Strip the extension once only: `a.pdf.pdf` keeps its inner `.pdf`.
    let stem = base_name
        .len()
        .checked_sub(ext.len())
        .filter(|&cut| {
            base_name.is_char_boundary(cut) && base_name[cut..].eq_ignore_ascii_case(ext)
        })
        .map_or(base_name, |cut| &base_name[..cut]);

    stem.to_string()
}
/// Renders a `NormalizedDocument` as a single Markdown string.
///
/// Every section heading becomes a `## ` line, followed by the section content; sections are
/// separated by blank lines and surrounding whitespace is trimmed from the result.
pub fn normalized_to_markdown(doc: &NormalizedDocument) -> String {
    let rendered = doc.sections.iter().fold(String::new(), |mut out, section| {
        if let Some(heading) = section.heading.as_deref() {
            out.push_str("## ");
            out.push_str(heading);
            out.push_str("\n\n");
        }
        out.push_str(&section.content);
        out.push_str("\n\n");
        out
    });
    rendered.trim().to_string()
}