feat(knowledge): Phase B+C 文档提取器 + multipart 文件上传
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

- PDF 提取 (pdf-extract) + DOCX 提取 (zip+quick-xml) + Excel 解析 (calamine)
- 统一格式路由 detect_format() → RAG 通道或结构化通道
- POST /api/v1/knowledge/upload multipart 文件上传
- PDF/DOCX/Markdown → RAG 管线,Excel → structured_rows JSONB
- 结构化数据源 CRUD API (GET/DELETE /api/v1/structured/sources)
- POST /api/v1/structured/query JSONB 关键词查询
- 修复 industry/service.rs SaasError::Database 类型不匹配
This commit is contained in:
iven
2026-04-12 19:25:24 +08:00
parent 4800f89467
commit 60062a8097
7 changed files with 849 additions and 8 deletions

View File

@@ -53,5 +53,11 @@ bytes = { workspace = true }
async-stream = { workspace = true }
genpdf = "0.2"
# Document processing
pdf-extract = { workspace = true }
calamine = { workspace = true }
quick-xml = { workspace = true }
zip = { workspace = true }
[dev-dependencies]
tempfile = { workspace = true }

View File

@@ -196,14 +196,14 @@ pub async fn set_account_industries(
.bind(&ids)
.fetch_one(pool)
.await
.map_err(|e| SaasError::Database(e.to_string()))?;
.map_err(SaasError::Database)?;
if valid_count.0 != ids.len() as i64 {
return Err(SaasError::InvalidInput("部分行业不存在或已禁用".to_string()));
}
// 事务性 DELETE + INSERT
let mut tx = pool.begin().await.map_err(|e| SaasError::Database(e.to_string()))?;
let mut tx = pool.begin().await.map_err(SaasError::Database)?;
sqlx::query("DELETE FROM account_industries WHERE account_id = $1")
.bind(account_id)
@@ -223,7 +223,7 @@ pub async fn set_account_industries(
.await?;
}
tx.commit().await.map_err(|e| SaasError::Database(e.to_string()))?;
tx.commit().await.map_err(SaasError::Database)?;
list_account_industries(pool, account_id).await
}

View File

@@ -0,0 +1,369 @@
//! 文档处理管线 — PDF/DOCX/Excel 格式提取
//!
//! 核心思想:每种格式输出统一的 NormalizedDocument,后面复用现有管线。
//! Excel 走独立的结构化通道(JSONB 行级存储),不走 RAG。
use calamine::{Reader, Data, Range};
// === Normalized document — the shared intermediate representation for all formats ===

/// Extraction result for the RAG channel (PDF/DOCX/Markdown).
pub struct NormalizedDocument {
    /// Display title, derived from the source file name.
    pub title: String,
    /// Ordered content sections; chunking granularity depends on the extractor.
    pub sections: Vec<DocumentSection>,
    /// Provenance and size information about the source file.
    pub metadata: DocumentMetadata,
}

/// One logical chunk of an extracted document.
pub struct DocumentSection {
    /// Optional heading (a DOCX heading paragraph, or a page number for PDF).
    pub heading: Option<String>,
    /// Plain-text body of the section.
    pub content: String,
    /// Heading level; extractors currently emit 1 (whole file) or 2 (chunk).
    pub level: u8,
    /// 1-based page number when the format has pages (set by the PDF extractor only).
    pub page_number: Option<u32>,
}

/// Provenance metadata attached to a NormalizedDocument.
pub struct DocumentMetadata {
    /// Source format tag: "pdf", "docx" or "markdown".
    pub source_format: String,
    /// Original uploaded file name (may still include a path).
    pub file_name: String,
    /// Total page count when known (PDF only).
    pub total_pages: Option<u32>,
    /// Number of sections produced by extraction.
    pub total_sections: u32,
}
// === Format routing ===

/// Map a file extension to its processing channel.
///
/// Only formats the downstream extractors can actually parse are routed:
/// legacy `.doc` is an OLE compound file (not a zip), and `.xls` is BIFF
/// (not an XLSX archive) — both would fail inside `extract_docx` /
/// `extract_excel` with misleading "解压失败/解析失败" errors, so they are
/// rejected up front with the accurate "unsupported format" response.
/// `.csv` is likewise rejected until a CSV parser exists, since the
/// structured channel currently only understands XLSX workbooks.
///
/// Returns `None` for anything unsupported.
pub fn detect_format(file_name: &str) -> Option<DocumentFormat> {
    // rsplit('.') yields the whole name when there is no dot, which then
    // matches nothing below — extensionless files are rejected.
    let ext = file_name.rsplit('.').next().unwrap_or("").to_lowercase();
    match ext.as_str() {
        "pdf" => Some(DocumentFormat::Pdf),
        "docx" => Some(DocumentFormat::Docx),
        "xlsx" => Some(DocumentFormat::Excel),
        "md" | "txt" | "markdown" => Some(DocumentFormat::Markdown),
        _ => None,
    }
}

/// Processing channel for an uploaded file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentFormat {
    Pdf,
    Docx,
    Excel,
    /// Reserved: never produced by `detect_format` until a CSV parser exists.
    Csv,
    Markdown,
}

impl DocumentFormat {
    /// Structured formats bypass the RAG pipeline and land in JSONB row storage.
    pub fn is_structured(&self) -> bool {
        matches!(self, Self::Excel | Self::Csv)
    }
}
// === File-processing result ===

/// Outcome of processing one uploaded file, routed by format.
pub enum ProcessedFile {
    /// Document channel (RAG) — PDF/DOCX/Markdown.
    Document(NormalizedDocument),
    /// Structured channel — Excel row data destined for JSONB storage.
    Structured {
        /// Title derived from the file name.
        title: String,
        /// All worksheet names in the workbook.
        sheet_names: Vec<String>,
        /// Union of non-empty column headers across all sheets.
        column_headers: Vec<String>,
        /// One entry per data row:
        /// (sheet name, global row index, that sheet's headers, row as a JSON object).
        rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)>,
    },
}
// === Extraction errors ===

/// Error raised by the document extractors; wraps a human-readable message.
#[derive(Debug)]
pub struct ExtractError(pub String);

impl std::fmt::Display for ExtractError {
    /// Display is just the wrapped message, verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

impl std::error::Error for ExtractError {}
/// Convert an extraction failure into the crate-wide error type.
/// Extraction errors come from malformed uploads, so they surface as
/// `InvalidInput` (a client-side error) rather than a server failure.
impl From<ExtractError> for crate::error::SaasError {
    fn from(e: ExtractError) -> Self {
        crate::error::SaasError::InvalidInput(e.0)
    }
}
// === PDF 提取 ===
pub fn extract_pdf(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
let text = pdf_extract::extract_text_from_mem(data)
.map_err(|e| ExtractError(format!("PDF 提取失败: {}", e)))?;
let pages: Vec<&str> = text.split('\x0c').collect();
let page_count = pages.len() as u32;
let mut sections = Vec::new();
let mut current_content = String::new();
for (i, page) in pages.iter().enumerate() {
let page_text = page.trim();
if page_text.is_empty() {
continue;
}
current_content.push_str(page_text);
current_content.push('\n');
if current_content.len() > 2000 || i == pages.len() - 1 {
let content = current_content.trim().to_string();
if !content.is_empty() {
sections.push(DocumentSection {
heading: Some(format!("{}", i + 1)),
content,
level: 2,
page_number: Some((i + 1) as u32),
});
}
current_content.clear();
}
}
let title = extract_title(file_name, ".pdf");
let total_sections = sections.len() as u32;
Ok(NormalizedDocument {
title,
sections,
metadata: DocumentMetadata {
source_format: "pdf".to_string(),
file_name: file_name.to_string(),
total_pages: Some(page_count),
total_sections,
},
})
}
// === DOCX extraction ===

/// Extract text from a DOCX file (a zip archive containing OOXML).
///
/// Reads `word/document.xml` and walks it with a streaming XML reader:
/// `<w:t>` runs supply the text, and a `<w:pStyle w:val="Heading…">` (or
/// `"Title"`) marks a paragraph as a section heading. Body paragraphs
/// between headings are concatenated into one section.
///
/// # Errors
/// Returns `ExtractError` when the archive cannot be opened, lacks
/// `word/document.xml`, or its XML cannot be read as UTF-8. A mid-stream
/// XML parse error is logged and truncates extraction instead of failing it.
pub fn extract_docx(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
    let reader = std::io::Cursor::new(data);
    let mut archive = zip::ZipArchive::new(reader)
        .map_err(|e| ExtractError(format!("DOCX 解压失败: {}", e)))?;
    let mut doc_xml = archive.by_name("word/document.xml")
        .map_err(|e| ExtractError(format!("DOCX 中未找到 document.xml: {}", e)))?;
    let mut xml_content = String::new();
    use std::io::Read;
    doc_xml.read_to_string(&mut xml_content)
        .map_err(|e| ExtractError(format!("DOCX 读取失败: {}", e)))?;
    let mut sections = Vec::new();
    let mut current_heading: Option<String> = None;
    let mut current_content = String::new();
    // Minimal streaming parse: collect <w:t> text, read <w:pStyle> for heading detection.
    let mut in_text = false;                 // currently inside a <w:t> run
    let mut paragraph_style = String::new(); // pStyle value of the current paragraph
    let mut text_buf = String::new();        // accumulated text of the current paragraph
    let mut reader = quick_xml::Reader::from_str(&xml_content);
    let mut buf = Vec::new();
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(quick_xml::events::Event::Start(e)) => {
                // Compare on the local name so the `w:` namespace prefix is ignored.
                let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
                match name.as_str() {
                    "p" => paragraph_style.clear(), // new paragraph: forget the previous style
                    "t" => in_text = true,
                    "pStyle" => {
                        // e.g. <w:pStyle w:val="Heading1"/> — remember the style name.
                        for attr in e.attributes().flatten() {
                            if attr.key.local_name().as_ref() == b"val" {
                                paragraph_style = String::from_utf8_lossy(&attr.value).to_string();
                            }
                        }
                    }
                    _ => {}
                }
            }
            Ok(quick_xml::events::Event::Text(t)) => {
                // Only keep character data that sits inside a <w:t> run.
                if in_text {
                    text_buf.push_str(&t.unescape().unwrap_or_default());
                }
            }
            Ok(quick_xml::events::Event::End(e)) => {
                let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
                match name.as_str() {
                    "p" => {
                        // Paragraph finished: either open a new section (heading)
                        // or append to the current section body.
                        let text = text_buf.trim().to_string();
                        text_buf.clear();
                        if text.is_empty() { continue; }
                        let is_heading = paragraph_style.starts_with("Heading")
                            || paragraph_style.starts_with("heading")
                            || paragraph_style == "Title";
                        if is_heading {
                            // Close the previous section before starting the new one.
                            if !current_content.is_empty() {
                                sections.push(DocumentSection {
                                    heading: current_heading.take(),
                                    content: current_content.trim().to_string(),
                                    level: 2,
                                    page_number: None,
                                });
                                current_content.clear();
                            }
                            current_heading = Some(text);
                        } else {
                            current_content.push_str(&text);
                            current_content.push('\n');
                        }
                    }
                    "t" => in_text = false,
                    _ => {}
                }
            }
            Ok(quick_xml::events::Event::Eof) => break,
            Err(e) => {
                // Tolerate malformed XML: keep whatever was extracted so far.
                tracing::warn!("DOCX XML parse warning: {}", e);
                break;
            }
            _ => {}
        }
        buf.clear();
    }
    // Flush the trailing section.
    // NOTE(review): a final heading with no body after it is dropped here — confirm intended.
    if !current_content.is_empty() {
        sections.push(DocumentSection {
            heading: current_heading,
            content: current_content.trim().to_string(),
            level: 2,
            page_number: None,
        });
    }
    let title = extract_title(file_name, ".docx");
    let total_sections = sections.len() as u32;
    Ok(NormalizedDocument {
        title,
        sections,
        metadata: DocumentMetadata {
            source_format: "docx".to_string(),
            file_name: file_name.to_string(),
            total_pages: None,
            total_sections,
        },
    })
}
// === Excel 解析 ===
pub fn extract_excel(data: &[u8], file_name: &str) -> Result<ProcessedFile, ExtractError> {
let cursor = std::io::Cursor::new(data);
let mut workbook: calamine::Xlsx<_> = calamine::open_workbook_from_rs(cursor)
.map_err(|e| ExtractError(format!("Excel 解析失败: {}", e)))?;
let sheet_names = workbook.sheet_names().to_vec();
let mut all_rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)> = Vec::new();
let mut all_headers: Vec<String> = Vec::new();
let mut global_row_index = 0i32;
for sheet_name in &sheet_names {
if let Ok(range) = workbook.worksheet_range(sheet_name) {
let mut headers: Vec<String> = Vec::new();
let mut first_row = true;
for row in range_as_data_rows(&range) {
if first_row {
headers = row.iter().map(|cell| {
cell.to_string().trim().to_string()
}).collect();
headers.retain(|h| !h.is_empty());
if headers.is_empty() { first_row = false; continue; }
for h in &headers {
if !all_headers.contains(h) {
all_headers.push(h.clone());
}
}
first_row = false;
continue;
}
let mut row_map = serde_json::Map::new();
for (i, cell) in row.iter().enumerate() {
if i >= headers.len() { break; }
let value = match cell {
Data::Empty => continue,
Data::String(s) => serde_json::Value::String(s.clone()),
Data::Float(f) => serde_json::json!(f),
Data::Int(n) => serde_json::json!(n),
Data::Bool(b) => serde_json::Value::Bool(*b),
Data::DateTime(dt) => {
serde_json::Value::String(dt.to_string())
}
Data::DateTimeIso(s) => {
serde_json::Value::String(s.clone())
}
Data::DurationIso(s) => {
serde_json::Value::String(s.clone())
}
Data::Error(e) => {
serde_json::Value::String(format!("{:?}", e))
}
};
row_map.insert(headers[i].clone(), value);
}
if !row_map.is_empty() {
all_rows.push((
Some(sheet_name.clone()),
global_row_index,
headers.clone(),
serde_json::Value::Object(row_map),
));
global_row_index += 1;
}
}
}
}
let title = extract_title(file_name, ".xlsx");
Ok(ProcessedFile::Structured {
title,
sheet_names,
column_headers: all_headers,
rows: all_rows,
})
}
// === Utilities ===

/// Collect a calamine `Range<Data>` into owned per-row vectors; exists to
/// pin down concrete types that otherwise trip calamine's inference.
fn range_as_data_rows(range: &Range<Data>) -> Vec<Vec<Data>> {
    let mut rows = Vec::new();
    for row in range.rows() {
        rows.push(row.to_vec());
    }
    rows
}
/// Derive a display title from a file name: strip any leading path (both
/// '/' and '\\' separators — the old `rsplit_once('/')` fallback kept
/// backslash segments in mixed paths like `a/b\c.pdf`), then strip `ext`
/// once, case-insensitively (so `Report.PDF` with ext `.pdf` → `Report`).
fn extract_title(file_name: &str, ext: &str) -> String {
    // rsplit over both separators always yields at least one segment.
    let name = file_name.rsplit(['/', '\\']).next().unwrap_or(file_name);
    let bytes = name.as_bytes();
    if ext.is_empty() || bytes.len() < ext.len() {
        return name.to_string();
    }
    let cut = bytes.len() - ext.len();
    // Byte-wise ASCII-case-insensitive suffix check; a match implies `cut`
    // lands on a char boundary because `ext` is valid UTF-8 and only ASCII
    // bytes case-fold.
    if bytes[cut..].eq_ignore_ascii_case(ext.as_bytes()) {
        name[..cut].to_string()
    } else {
        name.to_string()
    }
}
/// Flatten a NormalizedDocument into a single Markdown string: each
/// optional heading becomes an `##` line, and parts are separated by
/// blank lines, with surrounding whitespace trimmed.
pub fn normalized_to_markdown(doc: &NormalizedDocument) -> String {
    let mut parts: Vec<String> = Vec::new();
    for section in &doc.sections {
        if let Some(heading) = &section.heading {
            parts.push(format!("## {}", heading));
        }
        parts.push(section.content.clone());
    }
    parts.join("\n\n").trim().to_string()
}

View File

@@ -1,7 +1,7 @@
//! 知识库 HTTP 处理器
use axum::{
extract::{Extension, Path, Query, State},
extract::{Extension, Multipart, Path, Query, State},
Json,
};
@@ -10,6 +10,7 @@ use crate::error::{SaasError, SaasResult};
use crate::state::AppState;
use super::service;
use super::types::*;
use super::extractors;
// === 分类管理 ===
@@ -685,3 +686,202 @@ pub async fn query_structured(
let results = service::query_structured(&state.db, &req, Some(&ctx.account_id)).await?;
Ok(Json(results))
}
// === 文件上传 ===
/// POST /api/v1/knowledge/upload — multipart 文件上传
///
/// 支持 PDF/DOCX → RAG 管线Excel → 结构化管线
pub async fn upload_file(
State(state): State<AppState>,
Extension(ctx): Extension<AuthContext>,
mut multipart: Multipart,
) -> SaasResult<Json<serde_json::Value>> {
check_permission(&ctx, "knowledge:write")?;
let is_admin = ctx.role == "admin" || ctx.role == "super_admin";
let mut results = Vec::new();
while let Some(field) = multipart.next_field().await.map_err(|e| {
SaasError::InvalidInput(format!("文件上传解析失败: {}", e))
})? {
let file_name = field.file_name().unwrap_or("unknown").to_string();
let data = field.bytes().await.map_err(|e| {
SaasError::InvalidInput(format!("文件读取失败: {}", e))
})?;
// 大小限制 20MB
if data.len() > 20 * 1024 * 1024 {
results.push(serde_json::json!({
"file": file_name,
"status": "error",
"error": "文件超过 20MB 限制"
}));
continue;
}
let format = match extractors::detect_format(&file_name) {
Some(f) => f,
None => {
results.push(serde_json::json!({
"file": file_name,
"status": "error",
"error": "不支持的文件格式"
}));
continue;
}
};
if format.is_structured() {
// Excel → 结构化通道
match handle_structured_upload(
&state, &ctx, is_admin, &data, &file_name,
).await {
Ok(result) => results.push(result),
Err(e) => results.push(serde_json::json!({
"file": file_name,
"status": "error",
"error": e.to_string()
})),
}
} else {
// PDF/DOCX/MD → 文档通道 (RAG)
match handle_document_upload(
&state, &ctx, is_admin, &data, &file_name, format,
).await {
Ok(result) => results.push(result),
Err(e) => results.push(serde_json::json!({
"file": file_name,
"status": "error",
"error": e.to_string()
})),
}
}
}
Ok(Json(serde_json::json!({
"results": results,
"count": results.len(),
})))
}
/// Handle a document-class upload (PDF/DOCX/Markdown → RAG pipeline).
///
/// Extracts the file into a `NormalizedDocument`, renders it to Markdown,
/// creates a knowledge item, and dispatches an embedding job. A failed
/// dispatch is logged but not fatal — the item still exists.
///
/// # Errors
/// `InvalidInput` on extraction failure, unsupported format, or empty
/// content; service errors propagate from item creation.
async fn handle_document_upload(
    state: &AppState,
    ctx: &AuthContext,
    is_admin: bool,
    data: &[u8],
    file_name: &str,
    format: extractors::DocumentFormat,
) -> SaasResult<serde_json::Value> {
    let doc = match format {
        extractors::DocumentFormat::Pdf => extractors::extract_pdf(data, file_name)?,
        extractors::DocumentFormat::Docx => extractors::extract_docx(data, file_name)?,
        extractors::DocumentFormat::Markdown => {
            // Markdown/plain text passes straight through as a single section.
            let text = String::from_utf8_lossy(data).to_string();
            // Use the file stem so any path component and any extension
            // (.md/.txt/.markdown) is stripped — the previous
            // trim_end_matches(".md")/(".txt") kept paths and ".markdown".
            let title = std::path::Path::new(file_name)
                .file_stem()
                .map(|stem| stem.to_string_lossy().into_owned())
                .unwrap_or_else(|| file_name.to_string());
            extractors::NormalizedDocument {
                title,
                sections: vec![extractors::DocumentSection {
                    heading: None,
                    content: text,
                    level: 1,
                    page_number: None,
                }],
                metadata: extractors::DocumentMetadata {
                    source_format: "markdown".to_string(),
                    file_name: file_name.to_string(),
                    total_pages: None,
                    total_sections: 1,
                },
            }
        }
        _ => return Err(SaasError::InvalidInput("不支持的文档格式".into())),
    };

    // Render to one Markdown string for the RAG pipeline.
    let content = extractors::normalized_to_markdown(&doc);
    if content.is_empty() {
        return Err(SaasError::InvalidInput("文件内容为空".into()));
    }

    // Create the knowledge item.
    let item_req = CreateItemRequest {
        category_id: "uploaded".to_string(), // TODO: take the category from an upload parameter
        title: doc.title.clone(),
        content,
        keywords: None,
        related_questions: None,
        priority: Some(5),
        tags: Some(vec![format!("source:{}", doc.metadata.source_format)]),
        visibility: None,
    };
    let item = service::create_item(&state.db, &ctx.account_id, &item_req, is_admin).await?;

    // Kick off chunking/embedding; best-effort by design.
    if let Err(e) = state.worker_dispatcher.dispatch(
        "generate_embedding",
        serde_json::json!({ "item_id": item.id }),
    ).await {
        tracing::warn!("Upload: failed to dispatch embedding for {}: {}", item.id, e);
    }

    Ok(serde_json::json!({
        "file": file_name,
        "status": "ok",
        "item_id": item.id,
        "sections": doc.metadata.total_sections,
        "format": doc.metadata.source_format,
    }))
}
/// Handle a structured-data upload (Excel → structured_rows).
///
/// Extracts the workbook, creates a structured data-source record, then
/// bulk-inserts the row data and reports import statistics to the client.
///
/// # Errors
/// `InvalidInput` on extraction failure or when the workbook has no data
/// rows; service errors propagate from source creation and row insertion.
async fn handle_structured_upload(
    state: &AppState,
    ctx: &AuthContext,
    is_admin: bool,
    data: &[u8],
    file_name: &str,
) -> SaasResult<serde_json::Value> {
    let processed = extractors::extract_excel(data, file_name)?;
    match processed {
        extractors::ProcessedFile::Structured { title, sheet_names, column_headers, rows } => {
            if rows.is_empty() {
                return Err(SaasError::InvalidInput("Excel 文件没有数据行".into()));
            }
            // Create the structured data-source record.
            let source_req = CreateStructuredSourceRequest {
                title,
                description: None,
                original_file_name: Some(file_name.to_string()),
                sheet_names: Some(sheet_names.clone()),
                column_headers: Some(column_headers.clone()),
                visibility: None,
                industry_id: None,
            };
            let source = service::create_structured_source(
                &state.db, &ctx.account_id, is_admin, &source_req,
            ).await?;
            // Bulk-insert the row data under the new source.
            let count = service::insert_structured_rows(
                &state.db, &source.id, &rows,
            ).await?;
            Ok(serde_json::json!({
                "file": file_name,
                "status": "ok",
                "source_id": source.id,
                "sheets": sheet_names,
                "rows_imported": count,
                "columns": column_headers.len(),
            }))
        }
        // extract_excel only returns Structured today; hitting this arm would be a logic error.
        _ => Err(SaasError::InvalidInput("意外的处理结果".into())),
    }
}

View File

@@ -3,6 +3,7 @@
pub mod types;
pub mod service;
pub mod handlers;
pub mod extractors;
use axum::routing::{delete, get, patch, post, put};
@@ -20,6 +21,7 @@ pub fn routes() -> axum::Router<crate::state::AppState> {
.route("/api/v1/knowledge/items", post(handlers::create_item))
.route("/api/v1/knowledge/items/batch", post(handlers::batch_create_items))
.route("/api/v1/knowledge/items/import", post(handlers::import_items))
.route("/api/v1/knowledge/upload", post(handlers::upload_file))
.route("/api/v1/knowledge/items/:id", get(handlers::get_item))
.route("/api/v1/knowledge/items/:id", put(handlers::update_item))
.route("/api/v1/knowledge/items/:id", delete(handlers::delete_item))