feat(knowledge): Phase B+C 文档提取器 + multipart 文件上传
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

- PDF 提取 (pdf-extract) + DOCX 提取 (zip+quick-xml) + Excel 解析 (calamine)
- 统一格式路由 detect_format() → RAG 通道或结构化通道
- POST /api/v1/knowledge/upload multipart 文件上传
- PDF/DOCX/Markdown → RAG 管线,Excel → structured_rows JSONB
- 结构化数据源 CRUD API (GET/DELETE /api/v1/structured/sources)
- POST /api/v1/structured/query JSONB 关键词查询
- 修复 industry/service.rs SaasError::Database 类型不匹配
This commit is contained in:
iven
2026-04-12 19:25:24 +08:00
parent 4800f89467
commit 60062a8097
7 changed files with 849 additions and 8 deletions

View File

@@ -53,5 +53,11 @@ bytes = { workspace = true }
async-stream = { workspace = true }
genpdf = "0.2"
# Document processing
pdf-extract = { workspace = true }
calamine = { workspace = true }
quick-xml = { workspace = true }
zip = { workspace = true }
[dev-dependencies]
tempfile = { workspace = true }

View File

@@ -196,14 +196,14 @@ pub async fn set_account_industries(
.bind(&ids)
.fetch_one(pool)
.await
.map_err(|e| SaasError::Database(e.to_string()))?;
.map_err(SaasError::Database)?;
if valid_count.0 != ids.len() as i64 {
return Err(SaasError::InvalidInput("部分行业不存在或已禁用".to_string()));
}
// 事务性 DELETE + INSERT
let mut tx = pool.begin().await.map_err(|e| SaasError::Database(e.to_string()))?;
let mut tx = pool.begin().await.map_err(SaasError::Database)?;
sqlx::query("DELETE FROM account_industries WHERE account_id = $1")
.bind(account_id)
@@ -223,7 +223,7 @@ pub async fn set_account_industries(
.await?;
}
tx.commit().await.map_err(|e| SaasError::Database(e.to_string()))?;
tx.commit().await.map_err(SaasError::Database)?;
list_account_industries(pool, account_id).await
}

View File

@@ -0,0 +1,369 @@
//! 文档处理管线 — PDF/DOCX/Excel 格式提取
//!
//! 核心思想:每种格式输出统一的 NormalizedDocument,后面复用现有管线。
//! Excel 走独立的结构化通道(JSONB 行级存储),不走 RAG。
use calamine::{Reader, Data, Range};
// === Normalized document — the shared intermediate representation for all formats ===

/// Extraction result for the RAG channel (PDF/DOCX/Markdown).
pub struct NormalizedDocument {
    /// Display title, derived from the source file name.
    pub title: String,
    /// Ordered content sections; chunking granularity depends on the extractor.
    pub sections: Vec<DocumentSection>,
    /// Provenance and size information about the source file.
    pub metadata: DocumentMetadata,
}

/// One logical chunk of an extracted document.
pub struct DocumentSection {
    /// Optional heading (a DOCX heading paragraph, or a page number for PDF).
    pub heading: Option<String>,
    /// Plain-text body of the section.
    pub content: String,
    /// Heading level; extractors currently emit 1 (whole file) or 2 (chunk).
    pub level: u8,
    /// 1-based page number when the format has pages (set by the PDF extractor only).
    pub page_number: Option<u32>,
}

/// Provenance metadata attached to a NormalizedDocument.
pub struct DocumentMetadata {
    /// Source format tag: "pdf", "docx" or "markdown".
    pub source_format: String,
    /// Original uploaded file name (may still include a path).
    pub file_name: String,
    /// Total page count when known (PDF only).
    pub total_pages: Option<u32>,
    /// Number of sections produced by extraction.
    pub total_sections: u32,
}
// === Format routing ===

/// Map a file extension to its processing channel.
///
/// Only formats the downstream extractors can actually parse are routed:
/// legacy `.doc` is an OLE compound file (not a zip), and `.xls` is BIFF
/// (not an XLSX archive) — both would fail inside `extract_docx` /
/// `extract_excel` with misleading "解压失败/解析失败" errors, so they are
/// rejected up front with the accurate "unsupported format" response.
/// `.csv` is likewise rejected until a CSV parser exists, since the
/// structured channel currently only understands XLSX workbooks.
///
/// Returns `None` for anything unsupported.
pub fn detect_format(file_name: &str) -> Option<DocumentFormat> {
    // rsplit('.') yields the whole name when there is no dot, which then
    // matches nothing below — extensionless files are rejected.
    let ext = file_name.rsplit('.').next().unwrap_or("").to_lowercase();
    match ext.as_str() {
        "pdf" => Some(DocumentFormat::Pdf),
        "docx" => Some(DocumentFormat::Docx),
        "xlsx" => Some(DocumentFormat::Excel),
        "md" | "txt" | "markdown" => Some(DocumentFormat::Markdown),
        _ => None,
    }
}

/// Processing channel for an uploaded file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentFormat {
    Pdf,
    Docx,
    Excel,
    /// Reserved: never produced by `detect_format` until a CSV parser exists.
    Csv,
    Markdown,
}

impl DocumentFormat {
    /// Structured formats bypass the RAG pipeline and land in JSONB row storage.
    pub fn is_structured(&self) -> bool {
        matches!(self, Self::Excel | Self::Csv)
    }
}
// === File-processing result ===

/// Outcome of processing one uploaded file, routed by format.
pub enum ProcessedFile {
    /// Document channel (RAG) — PDF/DOCX/Markdown.
    Document(NormalizedDocument),
    /// Structured channel — Excel row data destined for JSONB storage.
    Structured {
        /// Title derived from the file name.
        title: String,
        /// All worksheet names in the workbook.
        sheet_names: Vec<String>,
        /// Union of non-empty column headers across all sheets.
        column_headers: Vec<String>,
        /// One entry per data row:
        /// (sheet name, global row index, that sheet's headers, row as a JSON object).
        rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)>,
    },
}
// === Extraction errors ===

/// Error raised by the document extractors; wraps a human-readable message.
#[derive(Debug)]
pub struct ExtractError(pub String);

impl std::fmt::Display for ExtractError {
    /// Display is just the wrapped message, verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

impl std::error::Error for ExtractError {}
/// Convert an extraction failure into the crate-wide error type.
/// Extraction errors come from malformed uploads, so they surface as
/// `InvalidInput` (a client-side error) rather than a server failure.
impl From<ExtractError> for crate::error::SaasError {
    fn from(e: ExtractError) -> Self {
        crate::error::SaasError::InvalidInput(e.0)
    }
}
// === PDF 提取 ===
pub fn extract_pdf(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
let text = pdf_extract::extract_text_from_mem(data)
.map_err(|e| ExtractError(format!("PDF 提取失败: {}", e)))?;
let pages: Vec<&str> = text.split('\x0c').collect();
let page_count = pages.len() as u32;
let mut sections = Vec::new();
let mut current_content = String::new();
for (i, page) in pages.iter().enumerate() {
let page_text = page.trim();
if page_text.is_empty() {
continue;
}
current_content.push_str(page_text);
current_content.push('\n');
if current_content.len() > 2000 || i == pages.len() - 1 {
let content = current_content.trim().to_string();
if !content.is_empty() {
sections.push(DocumentSection {
heading: Some(format!("{}", i + 1)),
content,
level: 2,
page_number: Some((i + 1) as u32),
});
}
current_content.clear();
}
}
let title = extract_title(file_name, ".pdf");
let total_sections = sections.len() as u32;
Ok(NormalizedDocument {
title,
sections,
metadata: DocumentMetadata {
source_format: "pdf".to_string(),
file_name: file_name.to_string(),
total_pages: Some(page_count),
total_sections,
},
})
}
// === DOCX extraction ===

/// Extract text from a DOCX file (a zip archive containing OOXML).
///
/// Reads `word/document.xml` and walks it with a streaming XML reader:
/// `<w:t>` runs supply the text, and a `<w:pStyle w:val="Heading…">` (or
/// `"Title"`) marks a paragraph as a section heading. Body paragraphs
/// between headings are concatenated into one section.
///
/// # Errors
/// Returns `ExtractError` when the archive cannot be opened, lacks
/// `word/document.xml`, or its XML cannot be read as UTF-8. A mid-stream
/// XML parse error is logged and truncates extraction instead of failing it.
pub fn extract_docx(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
    let reader = std::io::Cursor::new(data);
    let mut archive = zip::ZipArchive::new(reader)
        .map_err(|e| ExtractError(format!("DOCX 解压失败: {}", e)))?;
    let mut doc_xml = archive.by_name("word/document.xml")
        .map_err(|e| ExtractError(format!("DOCX 中未找到 document.xml: {}", e)))?;
    let mut xml_content = String::new();
    use std::io::Read;
    doc_xml.read_to_string(&mut xml_content)
        .map_err(|e| ExtractError(format!("DOCX 读取失败: {}", e)))?;
    let mut sections = Vec::new();
    let mut current_heading: Option<String> = None;
    let mut current_content = String::new();
    // Minimal streaming parse: collect <w:t> text, read <w:pStyle> for heading detection.
    let mut in_text = false;                 // currently inside a <w:t> run
    let mut paragraph_style = String::new(); // pStyle value of the current paragraph
    let mut text_buf = String::new();        // accumulated text of the current paragraph
    let mut reader = quick_xml::Reader::from_str(&xml_content);
    let mut buf = Vec::new();
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(quick_xml::events::Event::Start(e)) => {
                // Compare on the local name so the `w:` namespace prefix is ignored.
                let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
                match name.as_str() {
                    "p" => paragraph_style.clear(), // new paragraph: forget the previous style
                    "t" => in_text = true,
                    "pStyle" => {
                        // e.g. <w:pStyle w:val="Heading1"/> — remember the style name.
                        for attr in e.attributes().flatten() {
                            if attr.key.local_name().as_ref() == b"val" {
                                paragraph_style = String::from_utf8_lossy(&attr.value).to_string();
                            }
                        }
                    }
                    _ => {}
                }
            }
            Ok(quick_xml::events::Event::Text(t)) => {
                // Only keep character data that sits inside a <w:t> run.
                if in_text {
                    text_buf.push_str(&t.unescape().unwrap_or_default());
                }
            }
            Ok(quick_xml::events::Event::End(e)) => {
                let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
                match name.as_str() {
                    "p" => {
                        // Paragraph finished: either open a new section (heading)
                        // or append to the current section body.
                        let text = text_buf.trim().to_string();
                        text_buf.clear();
                        if text.is_empty() { continue; }
                        let is_heading = paragraph_style.starts_with("Heading")
                            || paragraph_style.starts_with("heading")
                            || paragraph_style == "Title";
                        if is_heading {
                            // Close the previous section before starting the new one.
                            if !current_content.is_empty() {
                                sections.push(DocumentSection {
                                    heading: current_heading.take(),
                                    content: current_content.trim().to_string(),
                                    level: 2,
                                    page_number: None,
                                });
                                current_content.clear();
                            }
                            current_heading = Some(text);
                        } else {
                            current_content.push_str(&text);
                            current_content.push('\n');
                        }
                    }
                    "t" => in_text = false,
                    _ => {}
                }
            }
            Ok(quick_xml::events::Event::Eof) => break,
            Err(e) => {
                // Tolerate malformed XML: keep whatever was extracted so far.
                tracing::warn!("DOCX XML parse warning: {}", e);
                break;
            }
            _ => {}
        }
        buf.clear();
    }
    // Flush the trailing section.
    // NOTE(review): a final heading with no body after it is dropped here — confirm intended.
    if !current_content.is_empty() {
        sections.push(DocumentSection {
            heading: current_heading,
            content: current_content.trim().to_string(),
            level: 2,
            page_number: None,
        });
    }
    let title = extract_title(file_name, ".docx");
    let total_sections = sections.len() as u32;
    Ok(NormalizedDocument {
        title,
        sections,
        metadata: DocumentMetadata {
            source_format: "docx".to_string(),
            file_name: file_name.to_string(),
            total_pages: None,
            total_sections,
        },
    })
}
// === Excel 解析 ===
pub fn extract_excel(data: &[u8], file_name: &str) -> Result<ProcessedFile, ExtractError> {
let cursor = std::io::Cursor::new(data);
let mut workbook: calamine::Xlsx<_> = calamine::open_workbook_from_rs(cursor)
.map_err(|e| ExtractError(format!("Excel 解析失败: {}", e)))?;
let sheet_names = workbook.sheet_names().to_vec();
let mut all_rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)> = Vec::new();
let mut all_headers: Vec<String> = Vec::new();
let mut global_row_index = 0i32;
for sheet_name in &sheet_names {
if let Ok(range) = workbook.worksheet_range(sheet_name) {
let mut headers: Vec<String> = Vec::new();
let mut first_row = true;
for row in range_as_data_rows(&range) {
if first_row {
headers = row.iter().map(|cell| {
cell.to_string().trim().to_string()
}).collect();
headers.retain(|h| !h.is_empty());
if headers.is_empty() { first_row = false; continue; }
for h in &headers {
if !all_headers.contains(h) {
all_headers.push(h.clone());
}
}
first_row = false;
continue;
}
let mut row_map = serde_json::Map::new();
for (i, cell) in row.iter().enumerate() {
if i >= headers.len() { break; }
let value = match cell {
Data::Empty => continue,
Data::String(s) => serde_json::Value::String(s.clone()),
Data::Float(f) => serde_json::json!(f),
Data::Int(n) => serde_json::json!(n),
Data::Bool(b) => serde_json::Value::Bool(*b),
Data::DateTime(dt) => {
serde_json::Value::String(dt.to_string())
}
Data::DateTimeIso(s) => {
serde_json::Value::String(s.clone())
}
Data::DurationIso(s) => {
serde_json::Value::String(s.clone())
}
Data::Error(e) => {
serde_json::Value::String(format!("{:?}", e))
}
};
row_map.insert(headers[i].clone(), value);
}
if !row_map.is_empty() {
all_rows.push((
Some(sheet_name.clone()),
global_row_index,
headers.clone(),
serde_json::Value::Object(row_map),
));
global_row_index += 1;
}
}
}
}
let title = extract_title(file_name, ".xlsx");
Ok(ProcessedFile::Structured {
title,
sheet_names,
column_headers: all_headers,
rows: all_rows,
})
}
// === Utilities ===

/// Collect a calamine `Range<Data>` into owned per-row vectors; exists to
/// pin down concrete types that otherwise trip calamine's inference.
fn range_as_data_rows(range: &Range<Data>) -> Vec<Vec<Data>> {
    let mut rows = Vec::new();
    for row in range.rows() {
        rows.push(row.to_vec());
    }
    rows
}
/// Derive a display title from a file name: strip any leading path (both
/// '/' and '\\' separators — the old `rsplit_once('/')` fallback kept
/// backslash segments in mixed paths like `a/b\c.pdf`), then strip `ext`
/// once, case-insensitively (so `Report.PDF` with ext `.pdf` → `Report`).
fn extract_title(file_name: &str, ext: &str) -> String {
    // rsplit over both separators always yields at least one segment.
    let name = file_name.rsplit(['/', '\\']).next().unwrap_or(file_name);
    let bytes = name.as_bytes();
    if ext.is_empty() || bytes.len() < ext.len() {
        return name.to_string();
    }
    let cut = bytes.len() - ext.len();
    // Byte-wise ASCII-case-insensitive suffix check; a match implies `cut`
    // lands on a char boundary because `ext` is valid UTF-8 and only ASCII
    // bytes case-fold.
    if bytes[cut..].eq_ignore_ascii_case(ext.as_bytes()) {
        name[..cut].to_string()
    } else {
        name.to_string()
    }
}
/// Flatten a NormalizedDocument into a single Markdown string: each
/// optional heading becomes an `##` line, and parts are separated by
/// blank lines, with surrounding whitespace trimmed.
pub fn normalized_to_markdown(doc: &NormalizedDocument) -> String {
    let mut parts: Vec<String> = Vec::new();
    for section in &doc.sections {
        if let Some(heading) = &section.heading {
            parts.push(format!("## {}", heading));
        }
        parts.push(section.content.clone());
    }
    parts.join("\n\n").trim().to_string()
}

View File

@@ -1,7 +1,7 @@
//! 知识库 HTTP 处理器
use axum::{
extract::{Extension, Path, Query, State},
extract::{Extension, Multipart, Path, Query, State},
Json,
};
@@ -10,6 +10,7 @@ use crate::error::{SaasError, SaasResult};
use crate::state::AppState;
use super::service;
use super::types::*;
use super::extractors;
// === 分类管理 ===
@@ -685,3 +686,202 @@ pub async fn query_structured(
let results = service::query_structured(&state.db, &req, Some(&ctx.account_id)).await?;
Ok(Json(results))
}
// === 文件上传 ===
/// POST /api/v1/knowledge/upload — multipart 文件上传
///
/// 支持 PDF/DOCX → RAG 管线Excel → 结构化管线
pub async fn upload_file(
State(state): State<AppState>,
Extension(ctx): Extension<AuthContext>,
mut multipart: Multipart,
) -> SaasResult<Json<serde_json::Value>> {
check_permission(&ctx, "knowledge:write")?;
let is_admin = ctx.role == "admin" || ctx.role == "super_admin";
let mut results = Vec::new();
while let Some(field) = multipart.next_field().await.map_err(|e| {
SaasError::InvalidInput(format!("文件上传解析失败: {}", e))
})? {
let file_name = field.file_name().unwrap_or("unknown").to_string();
let data = field.bytes().await.map_err(|e| {
SaasError::InvalidInput(format!("文件读取失败: {}", e))
})?;
// 大小限制 20MB
if data.len() > 20 * 1024 * 1024 {
results.push(serde_json::json!({
"file": file_name,
"status": "error",
"error": "文件超过 20MB 限制"
}));
continue;
}
let format = match extractors::detect_format(&file_name) {
Some(f) => f,
None => {
results.push(serde_json::json!({
"file": file_name,
"status": "error",
"error": "不支持的文件格式"
}));
continue;
}
};
if format.is_structured() {
// Excel → 结构化通道
match handle_structured_upload(
&state, &ctx, is_admin, &data, &file_name,
).await {
Ok(result) => results.push(result),
Err(e) => results.push(serde_json::json!({
"file": file_name,
"status": "error",
"error": e.to_string()
})),
}
} else {
// PDF/DOCX/MD → 文档通道 (RAG)
match handle_document_upload(
&state, &ctx, is_admin, &data, &file_name, format,
).await {
Ok(result) => results.push(result),
Err(e) => results.push(serde_json::json!({
"file": file_name,
"status": "error",
"error": e.to_string()
})),
}
}
}
Ok(Json(serde_json::json!({
"results": results,
"count": results.len(),
})))
}
/// Handle a document-class upload (PDF/DOCX/Markdown → RAG pipeline).
///
/// Extracts the file into a `NormalizedDocument`, renders it to Markdown,
/// creates a knowledge item, and dispatches an embedding job. A failed
/// dispatch is logged but not fatal — the item still exists.
///
/// # Errors
/// `InvalidInput` on extraction failure, unsupported format, or empty
/// content; service errors propagate from item creation.
async fn handle_document_upload(
    state: &AppState,
    ctx: &AuthContext,
    is_admin: bool,
    data: &[u8],
    file_name: &str,
    format: extractors::DocumentFormat,
) -> SaasResult<serde_json::Value> {
    let doc = match format {
        extractors::DocumentFormat::Pdf => extractors::extract_pdf(data, file_name)?,
        extractors::DocumentFormat::Docx => extractors::extract_docx(data, file_name)?,
        extractors::DocumentFormat::Markdown => {
            // Markdown/plain text passes straight through as a single section.
            let text = String::from_utf8_lossy(data).to_string();
            // Use the file stem so any path component and any extension
            // (.md/.txt/.markdown) is stripped — the previous
            // trim_end_matches(".md")/(".txt") kept paths and ".markdown".
            let title = std::path::Path::new(file_name)
                .file_stem()
                .map(|stem| stem.to_string_lossy().into_owned())
                .unwrap_or_else(|| file_name.to_string());
            extractors::NormalizedDocument {
                title,
                sections: vec![extractors::DocumentSection {
                    heading: None,
                    content: text,
                    level: 1,
                    page_number: None,
                }],
                metadata: extractors::DocumentMetadata {
                    source_format: "markdown".to_string(),
                    file_name: file_name.to_string(),
                    total_pages: None,
                    total_sections: 1,
                },
            }
        }
        _ => return Err(SaasError::InvalidInput("不支持的文档格式".into())),
    };

    // Render to one Markdown string for the RAG pipeline.
    let content = extractors::normalized_to_markdown(&doc);
    if content.is_empty() {
        return Err(SaasError::InvalidInput("文件内容为空".into()));
    }

    // Create the knowledge item.
    let item_req = CreateItemRequest {
        category_id: "uploaded".to_string(), // TODO: take the category from an upload parameter
        title: doc.title.clone(),
        content,
        keywords: None,
        related_questions: None,
        priority: Some(5),
        tags: Some(vec![format!("source:{}", doc.metadata.source_format)]),
        visibility: None,
    };
    let item = service::create_item(&state.db, &ctx.account_id, &item_req, is_admin).await?;

    // Kick off chunking/embedding; best-effort by design.
    if let Err(e) = state.worker_dispatcher.dispatch(
        "generate_embedding",
        serde_json::json!({ "item_id": item.id }),
    ).await {
        tracing::warn!("Upload: failed to dispatch embedding for {}: {}", item.id, e);
    }

    Ok(serde_json::json!({
        "file": file_name,
        "status": "ok",
        "item_id": item.id,
        "sections": doc.metadata.total_sections,
        "format": doc.metadata.source_format,
    }))
}
/// Handle a structured-data upload (Excel → structured_rows).
///
/// Extracts the workbook, creates a structured data-source record, then
/// bulk-inserts the row data and reports import statistics to the client.
///
/// # Errors
/// `InvalidInput` on extraction failure or when the workbook has no data
/// rows; service errors propagate from source creation and row insertion.
async fn handle_structured_upload(
    state: &AppState,
    ctx: &AuthContext,
    is_admin: bool,
    data: &[u8],
    file_name: &str,
) -> SaasResult<serde_json::Value> {
    let processed = extractors::extract_excel(data, file_name)?;
    match processed {
        extractors::ProcessedFile::Structured { title, sheet_names, column_headers, rows } => {
            if rows.is_empty() {
                return Err(SaasError::InvalidInput("Excel 文件没有数据行".into()));
            }
            // Create the structured data-source record.
            let source_req = CreateStructuredSourceRequest {
                title,
                description: None,
                original_file_name: Some(file_name.to_string()),
                sheet_names: Some(sheet_names.clone()),
                column_headers: Some(column_headers.clone()),
                visibility: None,
                industry_id: None,
            };
            let source = service::create_structured_source(
                &state.db, &ctx.account_id, is_admin, &source_req,
            ).await?;
            // Bulk-insert the row data under the new source.
            let count = service::insert_structured_rows(
                &state.db, &source.id, &rows,
            ).await?;
            Ok(serde_json::json!({
                "file": file_name,
                "status": "ok",
                "source_id": source.id,
                "sheets": sheet_names,
                "rows_imported": count,
                "columns": column_headers.len(),
            }))
        }
        // extract_excel only returns Structured today; hitting this arm would be a logic error.
        _ => Err(SaasError::InvalidInput("意外的处理结果".into())),
    }
}

View File

@@ -3,6 +3,7 @@
pub mod types;
pub mod service;
pub mod handlers;
pub mod extractors;
use axum::routing::{delete, get, patch, post, put};
@@ -20,6 +21,7 @@ pub fn routes() -> axum::Router<crate::state::AppState> {
.route("/api/v1/knowledge/items", post(handlers::create_item))
.route("/api/v1/knowledge/items/batch", post(handlers::batch_create_items))
.route("/api/v1/knowledge/items/import", post(handlers::import_items))
.route("/api/v1/knowledge/upload", post(handlers::upload_file))
.route("/api/v1/knowledge/items/:id", get(handlers::get_item))
.route("/api/v1/knowledge/items/:id", put(handlers::update_item))
.route("/api/v1/knowledge/items/:id", delete(handlers::delete_item))