feat(knowledge): Phase B+C 文档提取器 + multipart 文件上传
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
- PDF 提取 (pdf-extract) + DOCX 提取 (zip+quick-xml) + Excel 解析 (calamine) - 统一格式路由 detect_format() → RAG 通道或结构化通道 - POST /api/v1/knowledge/upload multipart 文件上传 - PDF/DOCX/Markdown → RAG 管线,Excel → structured_rows JSONB - 结构化数据源 CRUD API (GET/DELETE /api/v1/structured/sources) - POST /api/v1/structured/query JSONB 关键词查询 - 修复 industry/service.rs SaasError::Database 类型不匹配
This commit is contained in:
264
Cargo.lock
generated
264
Cargo.lock
generated
@@ -17,6 +17,15 @@ version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
|
||||
|
||||
[[package]]
|
||||
name = "adobe-cmap-parser"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae8abfa9a4688de8fc9f42b3f013b6fffec18ed8a554f5f113577e0b9b3212a3"
|
||||
dependencies = [
|
||||
"pom 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aead"
|
||||
version = "0.5.2"
|
||||
@@ -381,6 +390,7 @@ dependencies = [
|
||||
"matchit",
|
||||
"memchr",
|
||||
"mime",
|
||||
"multer",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustversion",
|
||||
@@ -621,6 +631,25 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bzip2"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47"
|
||||
dependencies = [
|
||||
"bzip2-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bzip2-sys"
|
||||
version = "0.1.13+1.0.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cairo-rs"
|
||||
version = "0.18.5"
|
||||
@@ -646,6 +675,21 @@ dependencies = [
|
||||
"system-deps",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "calamine"
|
||||
version = "0.26.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"codepage",
|
||||
"encoding_rs",
|
||||
"log",
|
||||
"quick-xml 0.31.0",
|
||||
"serde",
|
||||
"zip 2.4.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "camino"
|
||||
version = "1.2.2"
|
||||
@@ -779,6 +823,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423"
|
||||
dependencies = [
|
||||
"find-msvc-tools",
|
||||
"jobserver",
|
||||
"libc",
|
||||
"shlex",
|
||||
]
|
||||
|
||||
@@ -906,6 +952,15 @@ dependencies = [
|
||||
"thiserror 2.0.18",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "codepage"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "color_quant"
|
||||
version = "1.1.0"
|
||||
@@ -1458,6 +1513,12 @@ dependencies = [
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deflate64"
|
||||
version = "0.1.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac6b926516df9c60bfa16e107b21086399f8285a44ca9711344b9e553c5146e2"
|
||||
|
||||
[[package]]
|
||||
name = "der"
|
||||
version = "0.7.10"
|
||||
@@ -1904,6 +1965,15 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "euclid"
|
||||
version = "0.20.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2bb7ef65b3777a325d1eeefefab5b6d4959da54747e33bd6258e789640f307ad"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "2.5.3"
|
||||
@@ -2371,7 +2441,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1c422344482708cb32db843cf3f55f27918cd24fec7b505bde895a1e8702c34"
|
||||
dependencies = [
|
||||
"derive_more 0.99.20",
|
||||
"lopdf",
|
||||
"lopdf 0.26.0",
|
||||
"printpdf",
|
||||
"rusttype",
|
||||
]
|
||||
@@ -3289,6 +3359,16 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.34"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
|
||||
dependencies = [
|
||||
"getrandom 0.3.4",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jpeg-decoder"
|
||||
version = "0.3.2"
|
||||
@@ -3537,16 +3617,55 @@ dependencies = [
|
||||
"linked-hash-map",
|
||||
"log",
|
||||
"lzw",
|
||||
"pom",
|
||||
"pom 3.4.0",
|
||||
"time 0.2.27",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lopdf"
|
||||
version = "0.34.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
"flate2",
|
||||
"indexmap 2.13.0",
|
||||
"itoa 1.0.18",
|
||||
"log",
|
||||
"md-5",
|
||||
"nom",
|
||||
"rangemap",
|
||||
"time 0.3.47",
|
||||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru-slab"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
|
||||
|
||||
[[package]]
|
||||
name = "lzma-rs"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"crc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lzma-sys"
|
||||
version = "0.1.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lzw"
|
||||
version = "0.10.0"
|
||||
@@ -4251,6 +4370,31 @@ version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
|
||||
|
||||
[[package]]
|
||||
name = "pbkdf2"
|
||||
version = "0.12.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2"
|
||||
dependencies = [
|
||||
"digest",
|
||||
"hmac",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pdf-extract"
|
||||
version = "0.7.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbb3a5387b94b9053c1e69d8abfd4dd6dae7afda65a5c5279bc1f42ab39df575"
|
||||
dependencies = [
|
||||
"adobe-cmap-parser",
|
||||
"encoding_rs",
|
||||
"euclid",
|
||||
"lopdf 0.34.0",
|
||||
"postscript",
|
||||
"type1-encoding-parser",
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pem"
|
||||
version = "3.0.6"
|
||||
@@ -4628,6 +4772,12 @@ dependencies = [
|
||||
"universal-hash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pom"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6"
|
||||
|
||||
[[package]]
|
||||
name = "pom"
|
||||
version = "3.4.0"
|
||||
@@ -4649,6 +4799,12 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postscript"
|
||||
version = "0.14.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78451badbdaebaf17f053fd9152b3ffb33b516104eacb45e7864aaa9c712f306"
|
||||
|
||||
[[package]]
|
||||
name = "potential_utf"
|
||||
version = "0.1.4"
|
||||
@@ -4696,7 +4852,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a2472a184bcb128d0e3db65b59ebd11d010259a5e14fd9d048cba8f2c9302d4"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"lopdf",
|
||||
"lopdf 0.26.0",
|
||||
"rusttype",
|
||||
"time 0.2.27",
|
||||
]
|
||||
@@ -4810,6 +4966,25 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.31.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.37.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.38.4"
|
||||
@@ -5005,6 +5180,12 @@ dependencies = [
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rangemap"
|
||||
version = "1.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68"
|
||||
|
||||
[[package]]
|
||||
name = "raw-window-handle"
|
||||
version = "0.6.2"
|
||||
@@ -7531,6 +7712,15 @@ version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
||||
|
||||
[[package]]
|
||||
name = "type1-encoding-parser"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa10c302f5a53b7ad27fd42a3996e23d096ba39b5b8dd6d9e683a05b01bee749"
|
||||
dependencies = [
|
||||
"pom 1.1.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typeid"
|
||||
version = "1.0.3"
|
||||
@@ -9335,6 +9525,15 @@ dependencies = [
|
||||
"quick-xml 0.30.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xz2"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2"
|
||||
dependencies = [
|
||||
"lzma-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yoke"
|
||||
version = "0.8.1"
|
||||
@@ -9590,6 +9789,7 @@ dependencies = [
|
||||
"axum-extra",
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"calamine",
|
||||
"chrono",
|
||||
"dashmap",
|
||||
"data-encoding",
|
||||
@@ -9597,7 +9797,9 @@ dependencies = [
|
||||
"genpdf",
|
||||
"hex",
|
||||
"jsonwebtoken",
|
||||
"pdf-extract",
|
||||
"pgvector",
|
||||
"quick-xml 0.37.5",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"reqwest 0.12.28",
|
||||
@@ -9623,6 +9825,7 @@ dependencies = [
|
||||
"urlencoding",
|
||||
"uuid",
|
||||
"zclaw-types",
|
||||
"zip 2.4.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -9700,6 +9903,20 @@ name = "zeroize"
|
||||
version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
|
||||
dependencies = [
|
||||
"zeroize_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zeroize_derive"
|
||||
version = "1.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerotrie"
|
||||
@@ -9740,15 +9957,28 @@ version = "2.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
|
||||
dependencies = [
|
||||
"aes",
|
||||
"arbitrary",
|
||||
"bzip2",
|
||||
"constant_time_eq",
|
||||
"crc32fast",
|
||||
"crossbeam-utils",
|
||||
"deflate64",
|
||||
"displaydoc",
|
||||
"flate2",
|
||||
"getrandom 0.3.4",
|
||||
"hmac",
|
||||
"indexmap 2.13.0",
|
||||
"lzma-rs",
|
||||
"memchr",
|
||||
"pbkdf2",
|
||||
"sha1 0.10.6",
|
||||
"thiserror 2.0.18",
|
||||
"time 0.3.47",
|
||||
"xz2",
|
||||
"zeroize",
|
||||
"zopfli",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -9781,6 +10011,34 @@ dependencies = [
|
||||
"simd-adler32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.13.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "7.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
|
||||
dependencies = [
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "2.0.16+zstd.1.5.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zune-inflate"
|
||||
version = "0.2.54"
|
||||
|
||||
@@ -103,7 +103,7 @@ wasmtime-wasi = { version = "43" }
|
||||
tempfile = "3"
|
||||
|
||||
# SaaS dependencies
|
||||
axum = { version = "0.7", features = ["macros"] }
|
||||
axum = { version = "0.7", features = ["macros", "multipart"] }
|
||||
axum-extra = { version = "0.9", features = ["typed-header", "cookie"] }
|
||||
tower = { version = "0.4", features = ["util"] }
|
||||
tower-http = { version = "0.5", features = ["cors", "trace", "limit", "timeout"] }
|
||||
@@ -112,6 +112,12 @@ argon2 = "0.5"
|
||||
totp-rs = "5"
|
||||
hex = "0.4"
|
||||
|
||||
# Document processing
|
||||
pdf-extract = "0.7"
|
||||
calamine = "0.26"
|
||||
quick-xml = "0.37"
|
||||
zip = "2"
|
||||
|
||||
# TCP socket configuration
|
||||
socket2 = { version = "0.5", features = ["all"] }
|
||||
|
||||
|
||||
@@ -53,5 +53,11 @@ bytes = { workspace = true }
|
||||
async-stream = { workspace = true }
|
||||
genpdf = "0.2"
|
||||
|
||||
# Document processing
|
||||
pdf-extract = { workspace = true }
|
||||
calamine = { workspace = true }
|
||||
quick-xml = { workspace = true }
|
||||
zip = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
|
||||
@@ -196,14 +196,14 @@ pub async fn set_account_industries(
|
||||
.bind(&ids)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.map_err(|e| SaasError::Database(e.to_string()))?;
|
||||
.map_err(SaasError::Database)?;
|
||||
|
||||
if valid_count.0 != ids.len() as i64 {
|
||||
return Err(SaasError::InvalidInput("部分行业不存在或已禁用".to_string()));
|
||||
}
|
||||
|
||||
// 事务性 DELETE + INSERT
|
||||
let mut tx = pool.begin().await.map_err(|e| SaasError::Database(e.to_string()))?;
|
||||
let mut tx = pool.begin().await.map_err(SaasError::Database)?;
|
||||
|
||||
sqlx::query("DELETE FROM account_industries WHERE account_id = $1")
|
||||
.bind(account_id)
|
||||
@@ -223,7 +223,7 @@ pub async fn set_account_industries(
|
||||
.await?;
|
||||
}
|
||||
|
||||
tx.commit().await.map_err(|e| SaasError::Database(e.to_string()))?;
|
||||
tx.commit().await.map_err(SaasError::Database)?;
|
||||
|
||||
list_account_industries(pool, account_id).await
|
||||
}
|
||||
|
||||
369
crates/zclaw-saas/src/knowledge/extractors.rs
Normal file
369
crates/zclaw-saas/src/knowledge/extractors.rs
Normal file
@@ -0,0 +1,369 @@
|
||||
//! 文档处理管线 — PDF/DOCX/Excel 格式提取
|
||||
//!
|
||||
//! 核心思想:每种格式输出统一的 NormalizedDocument,后面复用现有管线。
|
||||
//! Excel 走独立的结构化通道(JSONB 行级存储),不走 RAG。
|
||||
|
||||
use calamine::{Reader, Data, Range};
|
||||
|
||||
// === 规范化文档 — 所有格式的统一中间表示 ===
|
||||
|
||||
/// Extraction result fed into the RAG channel — the unified intermediate
/// representation every format-specific extractor produces.
#[derive(Debug, Clone)]
pub struct NormalizedDocument {
    /// Display title, derived from the uploaded file name.
    pub title: String,
    /// Ordered content sections (the chunking granularity for embedding).
    pub sections: Vec<DocumentSection>,
    /// Provenance/size information about the source file.
    pub metadata: DocumentMetadata,
}

/// One logical section of an extracted document.
#[derive(Debug, Clone)]
pub struct DocumentSection {
    /// Optional heading (a DOCX heading paragraph or a PDF page label).
    pub heading: Option<String>,
    /// Plain-text body of the section.
    pub content: String,
    /// Heading level (1 = whole document, 2 = sub-section).
    pub level: u8,
    /// 1-based source page, when the format has pages (PDF only).
    pub page_number: Option<u32>,
}

/// Provenance and size information for an extracted document.
#[derive(Debug, Clone)]
pub struct DocumentMetadata {
    /// Source format tag: "pdf", "docx" or "markdown".
    pub source_format: String,
    /// Original upload file name (may include a path prefix).
    pub file_name: String,
    /// Total page count, when the format has pages.
    pub total_pages: Option<u32>,
    /// Number of entries in `sections`.
    pub total_sections: u32,
}
|
||||
|
||||
// === 格式路由 ===
|
||||
|
||||
/// Decide the processing channel from the file extension.
///
/// Returns `None` for unknown extensions *and* for names without any `.`:
/// the earlier `rsplit('.').next()` fallback returned the whole name when no
/// dot was present, so an extension-less file literally named "pdf" was
/// misdetected as a PDF.
pub fn detect_format(file_name: &str) -> Option<DocumentFormat> {
    let (_, ext) = file_name.rsplit_once('.')?;
    match ext.to_lowercase().as_str() {
        "pdf" => Some(DocumentFormat::Pdf),
        "docx" | "doc" => Some(DocumentFormat::Docx),
        "xlsx" | "xls" => Some(DocumentFormat::Excel),
        "md" | "txt" | "markdown" => Some(DocumentFormat::Markdown),
        "csv" => Some(DocumentFormat::Csv),
        _ => None,
    }
}

/// Supported upload formats, grouped into two processing channels
/// (see [`DocumentFormat::is_structured`]).
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DocumentFormat {
    Pdf,
    Docx,
    Excel,
    Csv,
    Markdown,
}

impl DocumentFormat {
    /// `true` for formats that go to the structured (JSONB row) channel
    /// instead of the RAG document pipeline.
    pub fn is_structured(&self) -> bool {
        matches!(self, Self::Excel | Self::Csv)
    }
}
|
||||
|
||||
// === 文件处理结果 ===
|
||||
|
||||
pub enum ProcessedFile {
|
||||
/// 文档通道(RAG)— PDF/DOCX/Markdown
|
||||
Document(NormalizedDocument),
|
||||
/// 结构化通道 — Excel/CSV 行数据
|
||||
Structured {
|
||||
title: String,
|
||||
sheet_names: Vec<String>,
|
||||
column_headers: Vec<String>,
|
||||
rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)>,
|
||||
},
|
||||
}
|
||||
|
||||
// === 提取错误 ===
|
||||
|
||||
/// Error raised by the document extractors; carries a human-readable message.
#[derive(Debug)]
pub struct ExtractError(pub String);

impl std::fmt::Display for ExtractError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}

impl std::error::Error for ExtractError {}
|
||||
|
||||
impl From<ExtractError> for crate::error::SaasError {
|
||||
fn from(e: ExtractError) -> Self {
|
||||
crate::error::SaasError::InvalidInput(e.0)
|
||||
}
|
||||
}
|
||||
|
||||
// === PDF 提取 ===
|
||||
|
||||
pub fn extract_pdf(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
|
||||
let text = pdf_extract::extract_text_from_mem(data)
|
||||
.map_err(|e| ExtractError(format!("PDF 提取失败: {}", e)))?;
|
||||
|
||||
let pages: Vec<&str> = text.split('\x0c').collect();
|
||||
let page_count = pages.len() as u32;
|
||||
|
||||
let mut sections = Vec::new();
|
||||
let mut current_content = String::new();
|
||||
|
||||
for (i, page) in pages.iter().enumerate() {
|
||||
let page_text = page.trim();
|
||||
if page_text.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
current_content.push_str(page_text);
|
||||
current_content.push('\n');
|
||||
|
||||
if current_content.len() > 2000 || i == pages.len() - 1 {
|
||||
let content = current_content.trim().to_string();
|
||||
if !content.is_empty() {
|
||||
sections.push(DocumentSection {
|
||||
heading: Some(format!("第 {} 页", i + 1)),
|
||||
content,
|
||||
level: 2,
|
||||
page_number: Some((i + 1) as u32),
|
||||
});
|
||||
}
|
||||
current_content.clear();
|
||||
}
|
||||
}
|
||||
|
||||
let title = extract_title(file_name, ".pdf");
|
||||
let total_sections = sections.len() as u32;
|
||||
|
||||
Ok(NormalizedDocument {
|
||||
title,
|
||||
sections,
|
||||
metadata: DocumentMetadata {
|
||||
source_format: "pdf".to_string(),
|
||||
file_name: file_name.to_string(),
|
||||
total_pages: Some(page_count),
|
||||
total_sections,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// === DOCX 提取 ===
|
||||
|
||||
pub fn extract_docx(data: &[u8], file_name: &str) -> Result<NormalizedDocument, ExtractError> {
|
||||
let reader = std::io::Cursor::new(data);
|
||||
let mut archive = zip::ZipArchive::new(reader)
|
||||
.map_err(|e| ExtractError(format!("DOCX 解压失败: {}", e)))?;
|
||||
|
||||
let mut doc_xml = archive.by_name("word/document.xml")
|
||||
.map_err(|e| ExtractError(format!("DOCX 中未找到 document.xml: {}", e)))?;
|
||||
|
||||
let mut xml_content = String::new();
|
||||
use std::io::Read;
|
||||
doc_xml.read_to_string(&mut xml_content)
|
||||
.map_err(|e| ExtractError(format!("DOCX 读取失败: {}", e)))?;
|
||||
|
||||
let mut sections = Vec::new();
|
||||
let mut current_heading: Option<String> = None;
|
||||
let mut current_content = String::new();
|
||||
|
||||
// 简单 XML 解析:提取 <w:t> 文本和 <w:pStyle> 标题层级
|
||||
let mut in_text = false;
|
||||
let mut paragraph_style = String::new();
|
||||
let mut text_buf = String::new();
|
||||
|
||||
let mut reader = quick_xml::Reader::from_str(&xml_content);
|
||||
let mut buf = Vec::new();
|
||||
|
||||
loop {
|
||||
match reader.read_event_into(&mut buf) {
|
||||
Ok(quick_xml::events::Event::Start(e)) => {
|
||||
let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
|
||||
match name.as_str() {
|
||||
"p" => paragraph_style.clear(),
|
||||
"t" => in_text = true,
|
||||
"pStyle" => {
|
||||
for attr in e.attributes().flatten() {
|
||||
if attr.key.local_name().as_ref() == b"val" {
|
||||
paragraph_style = String::from_utf8_lossy(&attr.value).to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(quick_xml::events::Event::Text(t)) => {
|
||||
if in_text {
|
||||
text_buf.push_str(&t.unescape().unwrap_or_default());
|
||||
}
|
||||
}
|
||||
Ok(quick_xml::events::Event::End(e)) => {
|
||||
let name = String::from_utf8_lossy(e.local_name().as_ref()).to_string();
|
||||
match name.as_str() {
|
||||
"p" => {
|
||||
let text = text_buf.trim().to_string();
|
||||
text_buf.clear();
|
||||
if text.is_empty() { continue; }
|
||||
|
||||
let is_heading = paragraph_style.starts_with("Heading")
|
||||
|| paragraph_style.starts_with("heading")
|
||||
|| paragraph_style == "Title";
|
||||
|
||||
if is_heading {
|
||||
if !current_content.is_empty() {
|
||||
sections.push(DocumentSection {
|
||||
heading: current_heading.take(),
|
||||
content: current_content.trim().to_string(),
|
||||
level: 2,
|
||||
page_number: None,
|
||||
});
|
||||
current_content.clear();
|
||||
}
|
||||
current_heading = Some(text);
|
||||
} else {
|
||||
current_content.push_str(&text);
|
||||
current_content.push('\n');
|
||||
}
|
||||
}
|
||||
"t" => in_text = false,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok(quick_xml::events::Event::Eof) => break,
|
||||
Err(e) => {
|
||||
tracing::warn!("DOCX XML parse warning: {}", e);
|
||||
break;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
if !current_content.is_empty() {
|
||||
sections.push(DocumentSection {
|
||||
heading: current_heading,
|
||||
content: current_content.trim().to_string(),
|
||||
level: 2,
|
||||
page_number: None,
|
||||
});
|
||||
}
|
||||
|
||||
let title = extract_title(file_name, ".docx");
|
||||
let total_sections = sections.len() as u32;
|
||||
|
||||
Ok(NormalizedDocument {
|
||||
title,
|
||||
sections,
|
||||
metadata: DocumentMetadata {
|
||||
source_format: "docx".to_string(),
|
||||
file_name: file_name.to_string(),
|
||||
total_pages: None,
|
||||
total_sections,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
// === Excel 解析 ===
|
||||
|
||||
pub fn extract_excel(data: &[u8], file_name: &str) -> Result<ProcessedFile, ExtractError> {
|
||||
let cursor = std::io::Cursor::new(data);
|
||||
let mut workbook: calamine::Xlsx<_> = calamine::open_workbook_from_rs(cursor)
|
||||
.map_err(|e| ExtractError(format!("Excel 解析失败: {}", e)))?;
|
||||
|
||||
let sheet_names = workbook.sheet_names().to_vec();
|
||||
let mut all_rows: Vec<(Option<String>, i32, Vec<String>, serde_json::Value)> = Vec::new();
|
||||
let mut all_headers: Vec<String> = Vec::new();
|
||||
let mut global_row_index = 0i32;
|
||||
|
||||
for sheet_name in &sheet_names {
|
||||
if let Ok(range) = workbook.worksheet_range(sheet_name) {
|
||||
let mut headers: Vec<String> = Vec::new();
|
||||
let mut first_row = true;
|
||||
|
||||
for row in range_as_data_rows(&range) {
|
||||
if first_row {
|
||||
headers = row.iter().map(|cell| {
|
||||
cell.to_string().trim().to_string()
|
||||
}).collect();
|
||||
headers.retain(|h| !h.is_empty());
|
||||
if headers.is_empty() { first_row = false; continue; }
|
||||
for h in &headers {
|
||||
if !all_headers.contains(h) {
|
||||
all_headers.push(h.clone());
|
||||
}
|
||||
}
|
||||
first_row = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut row_map = serde_json::Map::new();
|
||||
for (i, cell) in row.iter().enumerate() {
|
||||
if i >= headers.len() { break; }
|
||||
let value = match cell {
|
||||
Data::Empty => continue,
|
||||
Data::String(s) => serde_json::Value::String(s.clone()),
|
||||
Data::Float(f) => serde_json::json!(f),
|
||||
Data::Int(n) => serde_json::json!(n),
|
||||
Data::Bool(b) => serde_json::Value::Bool(*b),
|
||||
Data::DateTime(dt) => {
|
||||
serde_json::Value::String(dt.to_string())
|
||||
}
|
||||
Data::DateTimeIso(s) => {
|
||||
serde_json::Value::String(s.clone())
|
||||
}
|
||||
Data::DurationIso(s) => {
|
||||
serde_json::Value::String(s.clone())
|
||||
}
|
||||
Data::Error(e) => {
|
||||
serde_json::Value::String(format!("{:?}", e))
|
||||
}
|
||||
};
|
||||
row_map.insert(headers[i].clone(), value);
|
||||
}
|
||||
|
||||
if !row_map.is_empty() {
|
||||
all_rows.push((
|
||||
Some(sheet_name.clone()),
|
||||
global_row_index,
|
||||
headers.clone(),
|
||||
serde_json::Value::Object(row_map),
|
||||
));
|
||||
global_row_index += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let title = extract_title(file_name, ".xlsx");
|
||||
|
||||
Ok(ProcessedFile::Structured {
|
||||
title,
|
||||
sheet_names,
|
||||
column_headers: all_headers,
|
||||
rows: all_rows,
|
||||
})
|
||||
}
|
||||
|
||||
// === 工具函数 ===
|
||||
|
||||
/// 辅助:将 Range<Data> 转为行的 Vec,解决 calamine 类型推断问题
|
||||
fn range_as_data_rows(range: &Range<Data>) -> Vec<Vec<Data>> {
|
||||
range.rows().map(|row| row.to_vec()).collect()
|
||||
}
|
||||
|
||||
/// Derive a display title from a file name: strip any directory prefix
/// ('/' or '\\' separated), then remove `ext` (e.g. ".pdf") from the end.
///
/// Fixes vs. the previous version: the suffix match is case-insensitive
/// ("Report.PDF" → "Report"), and the extension is removed only once —
/// `trim_end_matches` stripped repeatedly, so "a.pdf.pdf" became "a".
fn extract_title(file_name: &str, ext: &str) -> String {
    let base = file_name
        .rsplit_once('/')
        .or_else(|| file_name.rsplit_once('\\'))
        .map(|(_, name)| name)
        .unwrap_or(file_name);

    if base.to_ascii_lowercase().ends_with(&ext.to_ascii_lowercase()) {
        // `ext` is pure ASCII (".pdf" etc.), so this byte offset lands on a
        // char boundary even for CJK file names.
        base[..base.len() - ext.len()].to_string()
    } else {
        base.to_string()
    }
}
|
||||
|
||||
/// 将 NormalizedDocument 转为单个 Markdown 内容字符串
|
||||
pub fn normalized_to_markdown(doc: &NormalizedDocument) -> String {
|
||||
let mut md = String::new();
|
||||
for section in &doc.sections {
|
||||
if let Some(ref heading) = section.heading {
|
||||
md.push_str(&format!("## {}\n\n", heading));
|
||||
}
|
||||
md.push_str(§ion.content);
|
||||
md.push_str("\n\n");
|
||||
}
|
||||
md.trim().to_string()
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
//! 知识库 HTTP 处理器
|
||||
|
||||
use axum::{
|
||||
extract::{Extension, Path, Query, State},
|
||||
extract::{Extension, Multipart, Path, Query, State},
|
||||
Json,
|
||||
};
|
||||
|
||||
@@ -10,6 +10,7 @@ use crate::error::{SaasError, SaasResult};
|
||||
use crate::state::AppState;
|
||||
use super::service;
|
||||
use super::types::*;
|
||||
use super::extractors;
|
||||
|
||||
// === 分类管理 ===
|
||||
|
||||
@@ -685,3 +686,202 @@ pub async fn query_structured(
|
||||
let results = service::query_structured(&state.db, &req, Some(&ctx.account_id)).await?;
|
||||
Ok(Json(results))
|
||||
}
|
||||
|
||||
// === 文件上传 ===
|
||||
|
||||
/// POST /api/v1/knowledge/upload — multipart 文件上传
|
||||
///
|
||||
/// 支持 PDF/DOCX → RAG 管线,Excel → 结构化管线
|
||||
pub async fn upload_file(
|
||||
State(state): State<AppState>,
|
||||
Extension(ctx): Extension<AuthContext>,
|
||||
mut multipart: Multipart,
|
||||
) -> SaasResult<Json<serde_json::Value>> {
|
||||
check_permission(&ctx, "knowledge:write")?;
|
||||
let is_admin = ctx.role == "admin" || ctx.role == "super_admin";
|
||||
|
||||
let mut results = Vec::new();
|
||||
|
||||
while let Some(field) = multipart.next_field().await.map_err(|e| {
|
||||
SaasError::InvalidInput(format!("文件上传解析失败: {}", e))
|
||||
})? {
|
||||
let file_name = field.file_name().unwrap_or("unknown").to_string();
|
||||
let data = field.bytes().await.map_err(|e| {
|
||||
SaasError::InvalidInput(format!("文件读取失败: {}", e))
|
||||
})?;
|
||||
|
||||
// 大小限制 20MB
|
||||
if data.len() > 20 * 1024 * 1024 {
|
||||
results.push(serde_json::json!({
|
||||
"file": file_name,
|
||||
"status": "error",
|
||||
"error": "文件超过 20MB 限制"
|
||||
}));
|
||||
continue;
|
||||
}
|
||||
|
||||
let format = match extractors::detect_format(&file_name) {
|
||||
Some(f) => f,
|
||||
None => {
|
||||
results.push(serde_json::json!({
|
||||
"file": file_name,
|
||||
"status": "error",
|
||||
"error": "不支持的文件格式"
|
||||
}));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if format.is_structured() {
|
||||
// Excel → 结构化通道
|
||||
match handle_structured_upload(
|
||||
&state, &ctx, is_admin, &data, &file_name,
|
||||
).await {
|
||||
Ok(result) => results.push(result),
|
||||
Err(e) => results.push(serde_json::json!({
|
||||
"file": file_name,
|
||||
"status": "error",
|
||||
"error": e.to_string()
|
||||
})),
|
||||
}
|
||||
} else {
|
||||
// PDF/DOCX/MD → 文档通道 (RAG)
|
||||
match handle_document_upload(
|
||||
&state, &ctx, is_admin, &data, &file_name, format,
|
||||
).await {
|
||||
Ok(result) => results.push(result),
|
||||
Err(e) => results.push(serde_json::json!({
|
||||
"file": file_name,
|
||||
"status": "error",
|
||||
"error": e.to_string()
|
||||
})),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Json(serde_json::json!({
|
||||
"results": results,
|
||||
"count": results.len(),
|
||||
})))
|
||||
}
|
||||
|
||||
/// 处理文档类上传(PDF/DOCX/MD → RAG 管线)
|
||||
async fn handle_document_upload(
|
||||
state: &AppState,
|
||||
ctx: &AuthContext,
|
||||
is_admin: bool,
|
||||
data: &[u8],
|
||||
file_name: &str,
|
||||
format: extractors::DocumentFormat,
|
||||
) -> SaasResult<serde_json::Value> {
|
||||
let doc = match format {
|
||||
extractors::DocumentFormat::Pdf => extractors::extract_pdf(data, file_name)?,
|
||||
extractors::DocumentFormat::Docx => extractors::extract_docx(data, file_name)?,
|
||||
extractors::DocumentFormat::Markdown => {
|
||||
// Markdown 直通
|
||||
let text = String::from_utf8_lossy(data).to_string();
|
||||
let title = file_name.trim_end_matches(".md").trim_end_matches(".txt").to_string();
|
||||
extractors::NormalizedDocument {
|
||||
title,
|
||||
sections: vec![extractors::DocumentSection {
|
||||
heading: None,
|
||||
content: text,
|
||||
level: 1,
|
||||
page_number: None,
|
||||
}],
|
||||
metadata: extractors::DocumentMetadata {
|
||||
source_format: "markdown".to_string(),
|
||||
file_name: file_name.to_string(),
|
||||
total_pages: None,
|
||||
total_sections: 1,
|
||||
},
|
||||
}
|
||||
}
|
||||
_ => return Err(SaasError::InvalidInput("不支持的文档格式".into())),
|
||||
};
|
||||
|
||||
// 转为 Markdown 内容
|
||||
let content = extractors::normalized_to_markdown(&doc);
|
||||
if content.is_empty() {
|
||||
return Err(SaasError::InvalidInput("文件内容为空".into()));
|
||||
}
|
||||
|
||||
// 创建知识条目
|
||||
let item_req = CreateItemRequest {
|
||||
category_id: "uploaded".to_string(), // TODO: 从上传参数获取
|
||||
title: doc.title.clone(),
|
||||
content,
|
||||
keywords: None,
|
||||
related_questions: None,
|
||||
priority: Some(5),
|
||||
tags: Some(vec![format!("source:{}", doc.metadata.source_format)]),
|
||||
visibility: None,
|
||||
};
|
||||
|
||||
let item = service::create_item(&state.db, &ctx.account_id, &item_req, is_admin).await?;
|
||||
|
||||
// 触发分块
|
||||
if let Err(e) = state.worker_dispatcher.dispatch(
|
||||
"generate_embedding",
|
||||
serde_json::json!({ "item_id": item.id }),
|
||||
).await {
|
||||
tracing::warn!("Upload: failed to dispatch embedding for {}: {}", item.id, e);
|
||||
}
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"file": file_name,
|
||||
"status": "ok",
|
||||
"item_id": item.id,
|
||||
"sections": doc.metadata.total_sections,
|
||||
"format": doc.metadata.source_format,
|
||||
}))
|
||||
}
|
||||
|
||||
/// 处理结构化数据上传(Excel → structured_rows)
|
||||
async fn handle_structured_upload(
|
||||
state: &AppState,
|
||||
ctx: &AuthContext,
|
||||
is_admin: bool,
|
||||
data: &[u8],
|
||||
file_name: &str,
|
||||
) -> SaasResult<serde_json::Value> {
|
||||
let processed = extractors::extract_excel(data, file_name)?;
|
||||
|
||||
match processed {
|
||||
extractors::ProcessedFile::Structured { title, sheet_names, column_headers, rows } => {
|
||||
if rows.is_empty() {
|
||||
return Err(SaasError::InvalidInput("Excel 文件没有数据行".into()));
|
||||
}
|
||||
|
||||
// 创建结构化数据源
|
||||
let source_req = CreateStructuredSourceRequest {
|
||||
title,
|
||||
description: None,
|
||||
original_file_name: Some(file_name.to_string()),
|
||||
sheet_names: Some(sheet_names.clone()),
|
||||
column_headers: Some(column_headers.clone()),
|
||||
visibility: None,
|
||||
industry_id: None,
|
||||
};
|
||||
|
||||
let source = service::create_structured_source(
|
||||
&state.db, &ctx.account_id, is_admin, &source_req,
|
||||
).await?;
|
||||
|
||||
// 批量写入行数据
|
||||
let count = service::insert_structured_rows(
|
||||
&state.db, &source.id, &rows,
|
||||
).await?;
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"file": file_name,
|
||||
"status": "ok",
|
||||
"source_id": source.id,
|
||||
"sheets": sheet_names,
|
||||
"rows_imported": count,
|
||||
"columns": column_headers.len(),
|
||||
}))
|
||||
}
|
||||
_ => Err(SaasError::InvalidInput("意外的处理结果".into())),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
pub mod types;
|
||||
pub mod service;
|
||||
pub mod handlers;
|
||||
pub mod extractors;
|
||||
|
||||
use axum::routing::{delete, get, patch, post, put};
|
||||
|
||||
@@ -20,6 +21,7 @@ pub fn routes() -> axum::Router<crate::state::AppState> {
|
||||
.route("/api/v1/knowledge/items", post(handlers::create_item))
|
||||
.route("/api/v1/knowledge/items/batch", post(handlers::batch_create_items))
|
||||
.route("/api/v1/knowledge/items/import", post(handlers::import_items))
|
||||
.route("/api/v1/knowledge/upload", post(handlers::upload_file))
|
||||
.route("/api/v1/knowledge/items/:id", get(handlers::get_item))
|
||||
.route("/api/v1/knowledge/items/:id", put(handlers::update_item))
|
||||
.route("/api/v1/knowledge/items/:id", delete(handlers::delete_item))
|
||||
|
||||
Reference in New Issue
Block a user