feat(knowledge): Phase B+C 文档提取器 + multipart 文件上传
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
- PDF 提取 (pdf-extract) + DOCX 提取 (zip+quick-xml) + Excel 解析 (calamine) - 统一格式路由 detect_format() → RAG 通道或结构化通道 - POST /api/v1/knowledge/upload multipart 文件上传 - PDF/DOCX/Markdown → RAG 管线,Excel → structured_rows JSONB - 结构化数据源 CRUD API (GET/DELETE /api/v1/structured/sources) - POST /api/v1/structured/query JSONB 关键词查询 - 修复 industry/service.rs SaasError::Database 类型不匹配
This commit is contained in:
369
crates/zclaw-saas/src/knowledge/extractors.rs
Normal file
369
crates/zclaw-saas/src/knowledge/extractors.rs
Normal file
@@ -0,0 +1,369 @@
|
||||
//! 文档处理管线 — PDF/DOCX/Excel 格式提取
|
||||
//!
|
||||
//! 核心思想:每种格式输出统一的 NormalizedDocument,后面复用现有管线。
|
||||
//! Excel 走独立的结构化通道(JSONB 行级存储),不走 RAG。
|
||||
|
||||
use calamine::{Reader, Data, Range};
|
||||
|
||||
// === Normalized document — the unified intermediate representation for all formats ===

/// Extraction result for the document (RAG) channel.
///
/// Every extractor (PDF, DOCX, Markdown) normalizes into this shape so the
/// downstream pipeline is format-agnostic.
#[derive(Debug, Clone, PartialEq)]
pub struct NormalizedDocument {
    /// Display title, usually derived from the file name.
    pub title: String,
    /// Ordered content sections; chunking granularity depends on the extractor.
    pub sections: Vec<DocumentSection>,
    /// Source-format bookkeeping (format, file name, counts).
    pub metadata: DocumentMetadata,
}

/// One chunk of extracted content.
#[derive(Debug, Clone, PartialEq)]
pub struct DocumentSection {
    /// Optional heading (DOCX heading text, or a synthetic page label for PDF).
    pub heading: Option<String>,
    /// Plain-text body of the section.
    pub content: String,
    /// Heading level; extractors in this file always emit 2.
    pub level: u8,
    /// 1-based page number when the source format has pages (PDF only).
    pub page_number: Option<u32>,
}

/// Provenance and size information for a normalized document.
#[derive(Debug, Clone, PartialEq)]
pub struct DocumentMetadata {
    /// Lowercase source format tag, e.g. "pdf" or "docx".
    pub source_format: String,
    /// Original (possibly path-qualified) file name as uploaded.
    pub file_name: String,
    /// Total page count when known (PDF); `None` otherwise.
    pub total_pages: Option<u32>,
    /// Number of sections produced by extraction.
    pub total_sections: u32,
}
|
||||
|
||||
// === Format routing ===

/// Decide the processing channel from the file extension.
///
/// Returns `None` for unknown extensions and for names with no extension at
/// all. BUG FIX: the original used `file_name.rsplit('.').next()`, which for a
/// dot-less name returns the whole name — so a file literally named `"pdf"`
/// (no extension) was misrouted to the PDF extractor. `rsplit_once('.')`
/// requires an actual separator.
pub fn detect_format(file_name: &str) -> Option<DocumentFormat> {
    let ext = file_name.rsplit_once('.')?.1.to_lowercase();
    match ext.as_str() {
        "pdf" => Some(DocumentFormat::Pdf),
        // NOTE(review): legacy ".doc" is routed to the DOCX extractor, but a
        // .doc file is not a ZIP container, so extraction will fail at
        // runtime with a "DOCX 解压失败" error — confirm whether .doc should
        // be rejected here instead.
        "docx" | "doc" => Some(DocumentFormat::Docx),
        "xlsx" | "xls" => Some(DocumentFormat::Excel),
        "md" | "txt" | "markdown" => Some(DocumentFormat::Markdown),
        "csv" => Some(DocumentFormat::Csv),
        _ => None,
    }
}

/// Supported input formats.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentFormat {
    Pdf,
    Docx,
    Excel,
    Csv,
    Markdown,
}

impl DocumentFormat {
    /// Structured formats bypass the RAG pipeline and go to row-level
    /// (JSONB) storage instead.
    pub fn is_structured(&self) -> bool {
        matches!(self, Self::Excel | Self::Csv)
    }
}
|
||||
|
||||
// === File processing result ===

/// Outcome of processing one uploaded file; each variant feeds a different
/// downstream pipeline.
pub enum ProcessedFile {
    /// Document channel (RAG) — PDF/DOCX/Markdown.
    Document(NormalizedDocument),
    /// Structured channel — Excel/CSV row data.
    Structured {
        // Display title derived from the file name.
        title: String,
        // All worksheet names found in the workbook.
        sheet_names: Vec<String>,
        // Union of (non-empty) column headers across all sheets.
        column_headers: Vec<String>,
        // One entry per data row:
        // (sheet name, global 0-based row index, that sheet's headers,
        //  JSON object mapping header -> cell value) — see extract_excel.
        rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)>,
    },
}
|
||||
|
||||
// === Extraction errors ===

/// Error produced by the extractors in this module; wraps a single
/// human-readable message string.
#[derive(Debug)]
pub struct ExtractError(pub String);

impl std::fmt::Display for ExtractError {
    /// Display is just the wrapped message, verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

impl std::error::Error for ExtractError {}
|
||||
|
||||
/// Surface extraction failures to API callers as invalid-input errors,
/// preserving the extractor's message.
impl From<ExtractError> for crate::error::SaasError {
    fn from(e: ExtractError) -> Self {
        crate::error::SaasError::InvalidInput(e.0)
    }
}
|
||||
|
||||
// === PDF 提取 ===
|
||||
|
||||
pub fn extract_pdf(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
|
||||
let text = pdf_extract::extract_text_from_mem(data)
|
||||
.map_err(|e| ExtractError(format!("PDF 提取失败: {}", e)))?;
|
||||
|
||||
let pages: Vec<&str> = text.split('\x0c').collect();
|
||||
let page_count = pages.len() as u32;
|
||||
|
||||
let mut sections = Vec::new();
|
||||
let mut current_content = String::new();
|
||||
|
||||
for (i, page) in pages.iter().enumerate() {
|
||||
let page_text = page.trim();
|
||||
if page_text.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
current_content.push_str(page_text);
|
||||
current_content.push('\n');
|
||||
|
||||
if current_content.len() > 2000 || i == pages.len() - 1 {
|
||||
let content = current_content.trim().to_string();
|
||||
if !content.is_empty() {
|
||||
sections.push(DocumentSection {
|
||||
heading: Some(format!("第 {} 页", i + 1)),
|
||||
content,
|
||||
level: 2,
|
||||
page_number: Some((i + 1) as u32),
|
||||
});
|
||||
}
|
||||
current_content.clear();
|
||||
}
|
||||
}
|
||||
|
||||
let title = extract_title(file_name, ".pdf");
|
||||
let total_sections = sections.len() as u32;
|
||||
|
||||
Ok(NormalizedDocument {
|
||||
title,
|
||||
sections,
|
||||
metadata: DocumentMetadata {
|
||||
source_format: "pdf".to_string(),
|
||||
file_name: file_name.to_string(),
|
||||
total_pages: Some(page_count),
|
||||
total_sections,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// === DOCX 提取 ===
|
||||
|
||||
pub fn extract_docx(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
|
||||
let reader = std::io::Cursor::new(data);
|
||||
let mut archive = zip::ZipArchive::new(reader)
|
||||
.map_err(|e| ExtractError(format!("DOCX 解压失败: {}", e)))?;
|
||||
|
||||
let mut doc_xml = archive.by_name("word/document.xml")
|
||||
.map_err(|e| ExtractError(format!("DOCX 中未找到 document.xml: {}", e)))?;
|
||||
|
||||
let mut xml_content = String::new();
|
||||
use std::io::Read;
|
||||
doc_xml.read_to_string(&mut xml_content)
|
||||
.map_err(|e| ExtractError(format!("DOCX 读取失败: {}", e)))?;
|
||||
|
||||
let mut sections = Vec::new();
|
||||
let mut current_heading: Option<String> = None;
|
||||
let mut current_content = String::new();
|
||||
|
||||
// 简单 XML 解析:提取 <w:t> 文本和 <w:pStyle> 标题层级
|
||||
let mut in_text = false;
|
||||
let mut paragraph_style = String::new();
|
||||
let mut text_buf = String::new();
|
||||
|
||||
let mut reader = quick_xml::Reader::from_str(&xml_content);
|
||||
let mut buf = Vec::new();
|
||||
|
||||
loop {
|
||||
match reader.read_event_into(&mut buf) {
|
||||
Ok(quick_xml::events::Event::Start(e)) => {
|
||||
let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
|
||||
match name.as_str() {
|
||||
"p" => paragraph_style.clear(),
|
||||
"t" => in_text = true,
|
||||
"pStyle" => {
|
||||
for attr in e.attributes().flatten() {
|
||||
if attr.key.local_name().as_ref() == b"val" {
|
||||
paragraph_style = String::from_utf8_lossy(&attr.value).to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(quick_xml::events::Event::Text(t)) => {
|
||||
if in_text {
|
||||
text_buf.push_str(&t.unescape().unwrap_or_default());
|
||||
}
|
||||
}
|
||||
Ok(quick_xml::events::Event::End(e)) => {
|
||||
let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
|
||||
match name.as_str() {
|
||||
"p" => {
|
||||
let text = text_buf.trim().to_string();
|
||||
text_buf.clear();
|
||||
if text.is_empty() { continue; }
|
||||
|
||||
let is_heading = paragraph_style.starts_with("Heading")
|
||||
|| paragraph_style.starts_with("heading")
|
||||
|| paragraph_style == "Title";
|
||||
|
||||
if is_heading {
|
||||
if !current_content.is_empty() {
|
||||
sections.push(DocumentSection {
|
||||
heading: current_heading.take(),
|
||||
content: current_content.trim().to_string(),
|
||||
level: 2,
|
||||
page_number: None,
|
||||
});
|
||||
current_content.clear();
|
||||
}
|
||||
current_heading = Some(text);
|
||||
} else {
|
||||
current_content.push_str(&text);
|
||||
current_content.push('\n');
|
||||
}
|
||||
}
|
||||
"t" => in_text = false,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(quick_xml::events::Event::Eof) => break,
|
||||
Err(e) => {
|
||||
tracing::warn!("DOCX XML parse warning: {}", e);
|
||||
break;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
if !current_content.is_empty() {
|
||||
sections.push(DocumentSection {
|
||||
heading: current_heading,
|
||||
content: current_content.trim().to_string(),
|
||||
level: 2,
|
||||
page_number: None,
|
||||
});
|
||||
}
|
||||
|
||||
let title = extract_title(file_name, ".docx");
|
||||
let total_sections = sections.len() as u32;
|
||||
|
||||
Ok(NormalizedDocument {
|
||||
title,
|
||||
sections,
|
||||
metadata: DocumentMetadata {
|
||||
source_format: "docx".to_string(),
|
||||
file_name: file_name.to_string(),
|
||||
total_pages: None,
|
||||
total_sections,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// === Excel 解析 ===
|
||||
|
||||
pub fn extract_excel(data: &[u8], file_name: &str) -> Result<ProcessedFile, ExtractError> {
|
||||
let cursor = std::io::Cursor::new(data);
|
||||
let mut workbook: calamine::Xlsx<_> = calamine::open_workbook_from_rs(cursor)
|
||||
.map_err(|e| ExtractError(format!("Excel 解析失败: {}", e)))?;
|
||||
|
||||
let sheet_names = workbook.sheet_names().to_vec();
|
||||
let mut all_rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)> = Vec::new();
|
||||
let mut all_headers: Vec<String> = Vec::new();
|
||||
let mut global_row_index = 0i32;
|
||||
|
||||
for sheet_name in &sheet_names {
|
||||
if let Ok(range) = workbook.worksheet_range(sheet_name) {
|
||||
let mut headers: Vec<String> = Vec::new();
|
||||
let mut first_row = true;
|
||||
|
||||
for row in range_as_data_rows(&range) {
|
||||
if first_row {
|
||||
headers = row.iter().map(|cell| {
|
||||
cell.to_string().trim().to_string()
|
||||
}).collect();
|
||||
headers.retain(|h| !h.is_empty());
|
||||
if headers.is_empty() { first_row = false; continue; }
|
||||
for h in &headers {
|
||||
if !all_headers.contains(h) {
|
||||
all_headers.push(h.clone());
|
||||
}
|
||||
}
|
||||
first_row = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut row_map = serde_json::Map::new();
|
||||
for (i, cell) in row.iter().enumerate() {
|
||||
if i >= headers.len() { break; }
|
||||
let value = match cell {
|
||||
Data::Empty => continue,
|
||||
Data::String(s) => serde_json::Value::String(s.clone()),
|
||||
Data::Float(f) => serde_json::json!(f),
|
||||
Data::Int(n) => serde_json::json!(n),
|
||||
Data::Bool(b) => serde_json::Value::Bool(*b),
|
||||
Data::DateTime(dt) => {
|
||||
serde_json::Value::String(dt.to_string())
|
||||
}
|
||||
Data::DateTimeIso(s) => {
|
||||
serde_json::Value::String(s.clone())
|
||||
}
|
||||
Data::DurationIso(s) => {
|
||||
serde_json::Value::String(s.clone())
|
||||
}
|
||||
Data::Error(e) => {
|
||||
serde_json::Value::String(format!("{:?}", e))
|
||||
}
|
||||
};
|
||||
row_map.insert(headers[i].clone(), value);
|
||||
}
|
||||
|
||||
if !row_map.is_empty() {
|
||||
all_rows.push((
|
||||
Some(sheet_name.clone()),
|
||||
global_row_index,
|
||||
headers.clone(),
|
||||
serde_json::Value::Object(row_map),
|
||||
));
|
||||
global_row_index += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let title = extract_title(file_name, ".xlsx");
|
||||
|
||||
Ok(ProcessedFile::Structured {
|
||||
title,
|
||||
sheet_names,
|
||||
column_headers: all_headers,
|
||||
rows: all_rows,
|
||||
})
|
||||
}
|
||||
|
||||
// === 工具函数 ===
|
||||
|
||||
/// 辅助:将 Range<Data> 转为行的 Vec,解决 calamine 类型推断问题
|
||||
fn range_as_data_rows(range: &Range<Data>) -> Vec<Vec<Data>> {
|
||||
range.rows().map(|row| row.to_vec()).collect()
|
||||
}
|
||||
|
||||
/// Derive a display title from a file name: take the final path component
/// (handles both '/' and '\\' separators) and strip the expected extension.
///
/// BUG FIX: the original `trim_end_matches(ext)` was case-sensitive, so
/// "REPORT.PDF" kept its extension. The suffix match is now ASCII
/// case-insensitive while preserving the repeated-strip behavior of
/// `trim_end_matches` (e.g. "a.pdf.pdf" -> "a").
///
/// NOTE(review): callers pass only one variant per format (".xlsx", ".docx"),
/// so ".xls"/".doc" titles still keep their extension — consider stripping
/// the file's actual extension instead; confirm against callers.
fn extract_title(file_name: &str, ext: &str) -> String {
    // Keep only the final path component.
    let base = file_name
        .rsplit_once('/')
        .or_else(|| file_name.rsplit_once('\\'))
        .map(|(_, name)| name)
        .unwrap_or(file_name);

    // Strip the extension repeatedly, ignoring ASCII case. The
    // `is_char_boundary` guard keeps slicing safe for multi-byte names.
    let mut title = base;
    while let Some(cut) = title.len().checked_sub(ext.len()) {
        if cut < title.len()
            && title.is_char_boundary(cut)
            && title[cut..].eq_ignore_ascii_case(ext)
        {
            title = &title[..cut];
        } else {
            break;
        }
    }
    title.to_string()
}
|
||||
|
||||
/// 将 NormalizedDocument 转为单个 Markdown 内容字符串
|
||||
pub fn normalized_to_markdown(doc: &NormalizedDocument) -> String {
|
||||
let mut md = String::new();
|
||||
for section in &doc.sections {
|
||||
if let Some(ref heading) = section.heading {
|
||||
md.push_str(&format!("## {}\n\n", heading));
|
||||
}
|
||||
md.push_str(§ion.content);
|
||||
md.push_str("\n\n");
|
||||
}
|
||||
md.trim().to_string()
|
||||
}
|
||||
Reference in New Issue
Block a user