diff --git a/Cargo.lock b/Cargo.lock index 80ab2d3..7ce9334 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,15 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "adobe-cmap-parser" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3" +dependencies = [ + "pom 1.1.0", +] + [[package]] name = "aead" version = "0.5.2" @@ -381,6 +390,7 @@ dependencies = [ "matchit", "memchr", "mime", + "multer", "percent-encoding", "pin-project-lite", "rustversion", @@ -621,6 +631,25 @@ dependencies = [ "serde", ] +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cairo-rs" version = "0.18.5" @@ -646,6 +675,21 @@ dependencies = [ "system-deps", ] +[[package]] +name = "calamine" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1" +dependencies = [ + "byteorder", + "codepage", + "encoding_rs", + "log", + "quick-xml 0.31.0", + "serde", + "zip 2.4.2", +] + [[package]] name = "camino" version = "1.2.2" @@ -779,6 +823,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" dependencies = [ "find-msvc-tools", + "jobserver", + "libc", "shlex", ] @@ -906,6 +952,15 @@ dependencies = [ "thiserror 2.0.18", ] +[[package]] +name = "codepage" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4" +dependencies = [ + "encoding_rs", +] + [[package]] name = "color_quant" version = "1.1.0" @@ -1458,6 +1513,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "deflate64" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2" + [[package]] name = "der" version = "0.7.10" @@ -1904,6 +1965,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "euclid" +version = "0.20.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad" +dependencies = [ + "num-traits", +] + [[package]] name = "event-listener" version = "2.5.3" @@ -2371,7 +2441,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1c422344482708cb32db843cf3f55f27918cd24fec7b505bde895a1e8702c34" dependencies = [ "derive_more 0.99.20", - "lopdf", + "lopdf 0.26.0", "printpdf", "rusttype", ] @@ -3289,6 +3359,16 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + [[package]] name = "jpeg-decoder" version = "0.3.2" @@ -3537,16 +3617,55 @@ dependencies = [ "linked-hash-map", "log", "lzw", - "pom", + "pom 3.4.0", "time 0.2.27", ] +[[package]] +name = "lopdf" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff" +dependencies = [ + "encoding_rs", + "flate2", + "indexmap 2.13.0", + "itoa 1.0.18", + "log", + "md-5", + "nom", + "rangemap", + "time 0.3.47", + "weezl", +] + [[package]] name = "lru-slab" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" +[[package]] +name = "lzma-rs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" +dependencies = [ + "byteorder", + "crc", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "lzw" version = "0.10.0" @@ -4251,6 +4370,31 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" +[[package]] +name = "pbkdf2" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" +dependencies = [ + "digest", + "hmac", +] + +[[package]] +name = "pdf-extract" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575" +dependencies = [ + "adobe-cmap-parser", + "encoding_rs", + "euclid", + "lopdf 0.34.0", + "postscript", + "type1-encoding-parser", + "unicode-normalization", +] + [[package]] name = "pem" version = "3.0.6" @@ -4628,6 +4772,12 @@ dependencies = [ "universal-hash", ] +[[package]] +name = "pom" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" + [[package]] name = "pom" version = "3.4.0" @@ -4649,6 +4799,12 @@ dependencies = [ "serde", ] +[[package]] +name = "postscript" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306" + [[package]] name = "potential_utf" version = "0.1.4" @@ -4696,7 +4852,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a2472a184bcb128d0e3db65b59ebd11d010259a5e14fd9d048cba8f2c9302d4" dependencies = [ "js-sys", - "lopdf", + "lopdf 0.26.0", "rusttype", "time 0.2.27", ] @@ -4810,6 +4966,25 @@ dependencies = [ "memchr", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "encoding_rs", + "memchr", +] + +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", +] + [[package]] name = "quick-xml" version = "0.38.4" @@ -5005,6 +5180,12 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "rangemap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" + [[package]] name = "raw-window-handle" version = "0.6.2" @@ -7531,6 +7712,15 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "type1-encoding-parser" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749" +dependencies = [ + "pom 1.1.0", +] + [[package]] name = "typeid" version = "1.0.3" @@ -9335,6 +9525,15 @@ dependencies = [ "quick-xml 0.30.0", ] +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yoke" version = "0.8.1" @@ -9590,6 +9789,7 @@ dependencies = [ "axum-extra", "base64 0.22.1", "bytes", + "calamine", "chrono", "dashmap", "data-encoding", @@ -9597,7 +9797,9 @@ dependencies = [ "genpdf", "hex", "jsonwebtoken", + "pdf-extract", "pgvector", + "quick-xml 0.37.5", "rand 0.8.5", "regex", "reqwest 0.12.28", @@ -9623,6 +9825,7 @@ dependencies = [ "urlencoding", "uuid", "zclaw-types", + "zip 2.4.2", ] [[package]] @@ -9700,6 +9903,20 @@ name = "zeroize" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] [[package]] name = "zerotrie" @@ -9740,15 +9957,28 @@ version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" dependencies = [ + "aes", "arbitrary", + "bzip2", + "constant_time_eq", "crc32fast", "crossbeam-utils", + "deflate64", "displaydoc", "flate2", + "getrandom 0.3.4", + "hmac", "indexmap 2.13.0", + "lzma-rs", "memchr", + "pbkdf2", + "sha1 0.10.6", "thiserror 2.0.18", + "time 0.3.47", + "xz2", + "zeroize", "zopfli", + "zstd", ] [[package]] @@ -9781,6 +10011,34 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "zune-inflate" version = "0.2.54" diff --git a/Cargo.toml b/Cargo.toml index 8b80ec2..7f491fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -103,7 +103,7 @@ wasmtime-wasi = { version = "43" } tempfile = "3" # SaaS dependencies -axum = { version = "0.7", features = ["macros"] } +axum = { version = "0.7", features = ["macros", "multipart"] } axum-extra = { version = "0.9", features = ["typed-header", "cookie"] } tower = { version = "0.4", features = ["util"] } tower-http = { version = "0.5", features = ["cors", "trace", "limit", "timeout"] } @@ -112,6 +112,12 @@ argon2 = "0.5" totp-rs = "5" hex = "0.4" +# Document processing +pdf-extract = "0.7" +calamine = "0.26" +quick-xml = "0.37" +zip = "2" + # TCP socket configuration socket2 = { version = "0.5", features = ["all"] } diff --git a/crates/zclaw-saas/Cargo.toml b/crates/zclaw-saas/Cargo.toml index b5b2b8e..bb71b5f 100644 --- a/crates/zclaw-saas/Cargo.toml +++ b/crates/zclaw-saas/Cargo.toml @@ -53,5 +53,11 @@ bytes = { workspace = true } async-stream = { workspace = true } genpdf = "0.2" +# Document processing +pdf-extract = { workspace = true } +calamine = { workspace = true } +quick-xml = { workspace = true } +zip = { workspace = true } + [dev-dependencies] tempfile = { workspace = true } diff --git a/crates/zclaw-saas/src/industry/service.rs b/crates/zclaw-saas/src/industry/service.rs index b30bef4..dc1d2cc 100644 --- a/crates/zclaw-saas/src/industry/service.rs +++ b/crates/zclaw-saas/src/industry/service.rs @@ -196,14 +196,14 @@ pub async fn set_account_industries( .bind(&ids) .fetch_one(pool) .await - .map_err(|e| SaasError::Database(e.to_string()))?; + .map_err(SaasError::Database)?; if valid_count.0 != ids.len() as i64 { return Err(SaasError::InvalidInput("部分行业不存在或已禁用".to_string())); } // 事务性 DELETE + INSERT - let mut tx = pool.begin().await.map_err(|e| SaasError::Database(e.to_string()))?; + let mut tx = pool.begin().await.map_err(SaasError::Database)?; sqlx::query("DELETE FROM account_industries WHERE account_id = $1") .bind(account_id) @@ -223,7 +223,7 @@ pub async fn set_account_industries( .await?; } - tx.commit().await.map_err(|e| SaasError::Database(e.to_string()))?; + tx.commit().await.map_err(SaasError::Database)?; list_account_industries(pool, account_id).await } diff --git a/crates/zclaw-saas/src/knowledge/extractors.rs b/crates/zclaw-saas/src/knowledge/extractors.rs new file mode 100644 index 0000000..c87a2e1 --- /dev/null +++ b/crates/zclaw-saas/src/knowledge/extractors.rs @@ -0,0 +1,369 @@ +//! 文档处理管线 — PDF/DOCX/Excel 格式提取 +//! +//! 核心思想:每种格式输出统一的 NormalizedDocument,后面复用现有管线。 +//! Excel 走独立的结构化通道(JSONB 行级存储),不走 RAG。 + +use calamine::{Reader, Data, Range}; + +// === 规范化文档 — 所有格式的统一中间表示 === + +/// 文档提取结果(用于 RAG 通道) +pub struct NormalizedDocument { + pub title: String, + pub sections: Vec, + pub metadata: DocumentMetadata, +} + +pub struct DocumentSection { + pub heading: Option, + pub content: String, + pub level: u8, + pub page_number: Option, +} + +pub struct DocumentMetadata { + pub source_format: String, + pub file_name: String, + pub total_pages: Option, + pub total_sections: u32, +} + +// === 格式路由 === + +/// 根据文件扩展名判断处理通道 +pub fn detect_format(file_name: &str) -> Option { + let ext = file_name.rsplit('.').next().unwrap_or("").to_lowercase(); + match ext.as_str() { + "pdf" => Some(DocumentFormat::Pdf), + "docx" | "doc" => Some(DocumentFormat::Docx), + "xlsx" | "xls" => Some(DocumentFormat::Excel), + "md" | "txt" | "markdown" => Some(DocumentFormat::Markdown), + "csv" => Some(DocumentFormat::Csv), + _ => None, + } +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum DocumentFormat { + Pdf, + Docx, + Excel, + Csv, + Markdown, +} + +impl DocumentFormat { + pub fn is_structured(&self) -> bool { + matches!(self, Self::Excel | Self::Csv) + } +} + +// === 文件处理结果 === + +pub enum ProcessedFile { + /// 文档通道(RAG)— PDF/DOCX/Markdown + Document(NormalizedDocument), + /// 结构化通道 — Excel/CSV 行数据 + Structured { + title: String, + sheet_names: Vec, + column_headers: Vec, + rows: Vec<(Option, i32, Vec, serde_json::Value)>, + }, +} + +// === 提取错误 === + +#[derive(Debug)] +pub struct ExtractError(pub String); + +impl std::fmt::Display for ExtractError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::error::Error for ExtractError {} + +impl From for crate::error::SaasError { + fn from(e: ExtractError) -> Self { + crate::error::SaasError::InvalidInput(e.0) + } +} + +// === PDF 提取 === + +pub fn extract_pdf(data: &[u8], file_name: &str) -> Result { + let text = pdf_extract::extract_text_from_mem(data) + .map_err(|e| ExtractError(format!("PDF 提取失败: {}", e)))?; + + let pages: Vec<&str> = text.split('\x0c').collect(); + let page_count = pages.len() as u32; + + let mut sections = Vec::new(); + let mut current_content = String::new(); + + for (i, page) in pages.iter().enumerate() { + let page_text = page.trim(); + if page_text.is_empty() { + continue; + } + + current_content.push_str(page_text); + current_content.push('\n'); + + if current_content.len() > 2000 || i == pages.len() - 1 { + let content = current_content.trim().to_string(); + if !content.is_empty() { + sections.push(DocumentSection { + heading: Some(format!("第 {} 页", i + 1)), + content, + level: 2, + page_number: Some((i + 1) as u32), + }); + } + current_content.clear(); + } + } + + let title = extract_title(file_name, ".pdf"); + let total_sections = sections.len() as u32; + + Ok(NormalizedDocument { + title, + sections, + metadata: DocumentMetadata { + source_format: "pdf".to_string(), + file_name: file_name.to_string(), + total_pages: Some(page_count), + total_sections, + }, + }) +} + +// === DOCX 提取 === + +pub fn extract_docx(data: &[u8], file_name: &str) -> Result { + let reader = std::io::Cursor::new(data); + let mut archive = zip::ZipArchive::new(reader) + .map_err(|e| ExtractError(format!("DOCX 解压失败: {}", e)))?; + + let mut doc_xml = archive.by_name("word/document.xml") + .map_err(|e| ExtractError(format!("DOCX 中未找到 document.xml: {}", e)))?; + + let mut xml_content = String::new(); + use std::io::Read; + doc_xml.read_to_string(&mut xml_content) + .map_err(|e| ExtractError(format!("DOCX 读取失败: {}", e)))?; + + let mut sections = Vec::new(); + let mut current_heading: Option = None; + let mut current_content = String::new(); + + // 简单 XML 解析:提取 文本和 标题层级 + let mut in_text = false; + let mut paragraph_style = String::new(); + let mut text_buf = String::new(); + + let mut reader = quick_xml::Reader::from_str(&xml_content); + let mut buf = Vec::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(quick_xml::events::Event::Start(e)) => { + let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string(); + match name.as_str() { + "p" => paragraph_style.clear(), + "t" => in_text = true, + "pStyle" => { + for attr in e.attributes().flatten() { + if attr.key.local_name().as_ref() == b"val" { + paragraph_style = String::from_utf8_lossy(&attr.value).to_string(); + } + } + } + _ => {} + } + } + Ok(quick_xml::events::Event::Text(t)) => { + if in_text { + text_buf.push_str(&t.unescape().unwrap_or_default()); + } + } + Ok(quick_xml::events::Event::End(e)) => { + let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string(); + match name.as_str() { + "p" => { + let text = text_buf.trim().to_string(); + text_buf.clear(); + if text.is_empty() { continue; } + + let is_heading = paragraph_style.starts_with("Heading") + || paragraph_style.starts_with("heading") + || paragraph_style == "Title"; + + if is_heading { + if !current_content.is_empty() { + sections.push(DocumentSection { + heading: current_heading.take(), + content: current_content.trim().to_string(), + level: 2, + page_number: None, + }); + current_content.clear(); + } + current_heading = Some(text); + } else { + current_content.push_str(&text); + current_content.push('\n'); + } + } + "t" => in_text = false, + _ => {} + } + } + Ok(quick_xml::events::Event::Eof) => break, + Err(e) => { + tracing::warn!("DOCX XML parse warning: {}", e); + break; + } + _ => {} + } + buf.clear(); + } + + if !current_content.is_empty() { + sections.push(DocumentSection { + heading: current_heading, + content: current_content.trim().to_string(), + level: 2, + page_number: None, + }); + } + + let title = extract_title(file_name, ".docx"); + let total_sections = sections.len() as u32; + + Ok(NormalizedDocument { + title, + sections, + metadata: DocumentMetadata { + source_format: "docx".to_string(), + file_name: file_name.to_string(), + total_pages: None, + total_sections, + }, + }) +} + +// === Excel 解析 === + +pub fn extract_excel(data: &[u8], file_name: &str) -> Result { + let cursor = std::io::Cursor::new(data); + let mut workbook: calamine::Xlsx<_> = calamine::open_workbook_from_rs(cursor) + .map_err(|e| ExtractError(format!("Excel 解析失败: {}", e)))?; + + let sheet_names = workbook.sheet_names().to_vec(); + let mut all_rows: Vec<(Option, i32, Vec, serde_json::Value)> = Vec::new(); + let mut all_headers: Vec = Vec::new(); + let mut global_row_index = 0i32; + + for sheet_name in &sheet_names { + if let Ok(range) = workbook.worksheet_range(sheet_name) { + let mut headers: Vec = Vec::new(); + let mut first_row = true; + + for row in range_as_data_rows(&range) { + if first_row { + headers = row.iter().map(|cell| { + cell.to_string().trim().to_string() + }).collect(); + headers.retain(|h| !h.is_empty()); + if headers.is_empty() { first_row = false; continue; } + for h in &headers { + if !all_headers.contains(h) { + all_headers.push(h.clone()); + } + } + first_row = false; + continue; + } + + let mut row_map = serde_json::Map::new(); + for (i, cell) in row.iter().enumerate() { + if i >= headers.len() { break; } + let value = match cell { + Data::Empty => continue, + Data::String(s) => serde_json::Value::String(s.clone()), + Data::Float(f) => serde_json::json!(f), + Data::Int(n) => serde_json::json!(n), + Data::Bool(b) => serde_json::Value::Bool(*b), + Data::DateTime(dt) => { + serde_json::Value::String(dt.to_string()) + } + Data::DateTimeIso(s) => { + serde_json::Value::String(s.clone()) + } + Data::DurationIso(s) => { + serde_json::Value::String(s.clone()) + } + Data::Error(e) => { + serde_json::Value::String(format!("{:?}", e)) + } + }; + row_map.insert(headers[i].clone(), value); + } + + if !row_map.is_empty() { + all_rows.push(( + Some(sheet_name.clone()), + global_row_index, + headers.clone(), + serde_json::Value::Object(row_map), + )); + global_row_index += 1; + } + } + } + } + + let title = extract_title(file_name, ".xlsx"); + + Ok(ProcessedFile::Structured { + title, + sheet_names, + column_headers: all_headers, + rows: all_rows, + }) +} + +// === 工具函数 === + +/// 辅助:将 Range 转为行的 Vec,解决 calamine 类型推断问题 +fn range_as_data_rows(range: &Range) -> Vec> { + range.rows().map(|row| row.to_vec()).collect() +} + +/// 从文件名提取标题 +fn extract_title(file_name: &str, ext: &str) -> String { + file_name + .rsplit_once('/') + .or_else(|| file_name.rsplit_once('\\')) + .map(|(_, name)| name) + .unwrap_or(file_name) + .trim_end_matches(ext) + .to_string() +} + +/// 将 NormalizedDocument 转为单个 Markdown 内容字符串 +pub fn normalized_to_markdown(doc: &NormalizedDocument) -> String { + let mut md = String::new(); + for section in &doc.sections { + if let Some(ref heading) = section.heading { + md.push_str(&format!("## {}\n\n", heading)); + } + md.push_str(§ion.content); + md.push_str("\n\n"); + } + md.trim().to_string() +} diff --git a/crates/zclaw-saas/src/knowledge/handlers.rs b/crates/zclaw-saas/src/knowledge/handlers.rs index 735cc3b..d985c70 100644 --- a/crates/zclaw-saas/src/knowledge/handlers.rs +++ b/crates/zclaw-saas/src/knowledge/handlers.rs @@ -1,7 +1,7 @@ //! 知识库 HTTP 处理器 use axum::{ - extract::{Extension, Path, Query, State}, + extract::{Extension, Multipart, Path, Query, State}, Json, }; @@ -10,6 +10,7 @@ use crate::error::{SaasError, SaasResult}; use crate::state::AppState; use super::service; use super::types::*; +use super::extractors; // === 分类管理 === @@ -685,3 +686,202 @@ pub async fn query_structured( let results = service::query_structured(&state.db, &req, Some(&ctx.account_id)).await?; Ok(Json(results)) } + +// === 文件上传 === + +/// POST /api/v1/knowledge/upload — multipart 文件上传 +/// +/// 支持 PDF/DOCX → RAG 管线,Excel → 结构化管线 +pub async fn upload_file( + State(state): State, + Extension(ctx): Extension, + mut multipart: Multipart, +) -> SaasResult> { + check_permission(&ctx, "knowledge:write")?; + let is_admin = ctx.role == "admin" || ctx.role == "super_admin"; + + let mut results = Vec::new(); + + while let Some(field) = multipart.next_field().await.map_err(|e| { + SaasError::InvalidInput(format!("文件上传解析失败: {}", e)) + })? { + let file_name = field.file_name().unwrap_or("unknown").to_string(); + let data = field.bytes().await.map_err(|e| { + SaasError::InvalidInput(format!("文件读取失败: {}", e)) + })?; + + // 大小限制 20MB + if data.len() > 20 * 1024 * 1024 { + results.push(serde_json::json!({ + "file": file_name, + "status": "error", + "error": "文件超过 20MB 限制" + })); + continue; + } + + let format = match extractors::detect_format(&file_name) { + Some(f) => f, + None => { + results.push(serde_json::json!({ + "file": file_name, + "status": "error", + "error": "不支持的文件格式" + })); + continue; + } + }; + + if format.is_structured() { + // Excel → 结构化通道 + match handle_structured_upload( + &state, &ctx, is_admin, &data, &file_name, + ).await { + Ok(result) => results.push(result), + Err(e) => results.push(serde_json::json!({ + "file": file_name, + "status": "error", + "error": e.to_string() + })), + } + } else { + // PDF/DOCX/MD → 文档通道 (RAG) + match handle_document_upload( + &state, &ctx, is_admin, &data, &file_name, format, + ).await { + Ok(result) => results.push(result), + Err(e) => results.push(serde_json::json!({ + "file": file_name, + "status": "error", + "error": e.to_string() + })), + } + } + } + + Ok(Json(serde_json::json!({ + "results": results, + "count": results.len(), + }))) +} + +/// 处理文档类上传(PDF/DOCX/MD → RAG 管线) +async fn handle_document_upload( + state: &AppState, + ctx: &AuthContext, + is_admin: bool, + data: &[u8], + file_name: &str, + format: extractors::DocumentFormat, +) -> SaasResult { + let doc = match format { + extractors::DocumentFormat::Pdf => extractors::extract_pdf(data, file_name)?, + extractors::DocumentFormat::Docx => extractors::extract_docx(data, file_name)?, + extractors::DocumentFormat::Markdown => { + // Markdown 直通 + let text = String::from_utf8_lossy(data).to_string(); + let title = file_name.trim_end_matches(".md").trim_end_matches(".txt").to_string(); + extractors::NormalizedDocument { + title, + sections: vec![extractors::DocumentSection { + heading: None, + content: text, + level: 1, + page_number: None, + }], + metadata: extractors::DocumentMetadata { + source_format: "markdown".to_string(), + file_name: file_name.to_string(), + total_pages: None, + total_sections: 1, + }, + } + } + _ => return Err(SaasError::InvalidInput("不支持的文档格式".into())), + }; + + // 转为 Markdown 内容 + let content = extractors::normalized_to_markdown(&doc); + if content.is_empty() { + return Err(SaasError::InvalidInput("文件内容为空".into())); + } + + // 创建知识条目 + let item_req = CreateItemRequest { + category_id: "uploaded".to_string(), // TODO: 从上传参数获取 + title: doc.title.clone(), + content, + keywords: None, + related_questions: None, + priority: Some(5), + tags: Some(vec![format!("source:{}", doc.metadata.source_format)]), + visibility: None, + }; + + let item = service::create_item(&state.db, &ctx.account_id, &item_req, is_admin).await?; + + // 触发分块 + if let Err(e) = state.worker_dispatcher.dispatch( + "generate_embedding", + serde_json::json!({ "item_id": item.id }), + ).await { + tracing::warn!("Upload: failed to dispatch embedding for {}: {}", item.id, e); + } + + Ok(serde_json::json!({ + "file": file_name, + "status": "ok", + "item_id": item.id, + "sections": doc.metadata.total_sections, + "format": doc.metadata.source_format, + })) +} + +/// 处理结构化数据上传(Excel → structured_rows) +async fn handle_structured_upload( + state: &AppState, + ctx: &AuthContext, + is_admin: bool, + data: &[u8], + file_name: &str, +) -> SaasResult { + let processed = extractors::extract_excel(data, file_name)?; + + match processed { + extractors::ProcessedFile::Structured { title, sheet_names, column_headers, rows } => { + if rows.is_empty() { + return Err(SaasError::InvalidInput("Excel 文件没有数据行".into())); + } + + // 创建结构化数据源 + let source_req = CreateStructuredSourceRequest { + title, + description: None, + original_file_name: Some(file_name.to_string()), + sheet_names: Some(sheet_names.clone()), + column_headers: Some(column_headers.clone()), + visibility: None, + industry_id: None, + }; + + let source = service::create_structured_source( + &state.db, &ctx.account_id, is_admin, &source_req, + ).await?; + + // 批量写入行数据 + let count = service::insert_structured_rows( + &state.db, &source.id, &rows, + ).await?; + + Ok(serde_json::json!({ + "file": file_name, + "status": "ok", + "source_id": source.id, + "sheets": sheet_names, + "rows_imported": count, + "columns": column_headers.len(), + })) + } + _ => Err(SaasError::InvalidInput("意外的处理结果".into())), + } +} diff --git a/crates/zclaw-saas/src/knowledge/mod.rs b/crates/zclaw-saas/src/knowledge/mod.rs index 65fc4d0..658a8bd 100644 --- a/crates/zclaw-saas/src/knowledge/mod.rs +++ b/crates/zclaw-saas/src/knowledge/mod.rs @@ -3,6 +3,7 @@ pub mod types; pub mod service; pub mod handlers; +pub mod extractors; use axum::routing::{delete, get, patch, post, put}; @@ -20,6 +21,7 @@ pub fn routes() -> axum::Router { .route("/api/v1/knowledge/items", post(handlers::create_item)) .route("/api/v1/knowledge/items/batch", post(handlers::batch_create_items)) .route("/api/v1/knowledge/items/import", post(handlers::import_items)) + .route("/api/v1/knowledge/upload", post(handlers::upload_file)) .route("/api/v1/knowledge/items/:id", get(handlers::get_item)) .route("/api/v1/knowledge/items/:id", put(handlers::update_item)) .route("/api/v1/knowledge/items/:id", delete(handlers::delete_item))