docs(ai): 知识库 V2 设计规格 — review 修复
修复 3 CRITICAL + 4 HIGH 问题: - C1: 明确禁止 format! 拼接 SQL,所有 pgvector 查询参数化 - C2: 迁移 SQL 改用临时映射表精确关联,防数据重复/丢失 - C3: SSRF 防护细化(DNS rebinding + 超时 + 重定向校验) - H1: document_count/chunk_count 改为原子 SQL 增量 - H2: embedding 部分失败:NULL embedding 写入 + embedded_count 统计 - H3: chunks 表补充 updated_at/updated_by 审计字段 - H4: 新增 validator crate 依赖说明 - M1: 空知识库边界返回空 context 不报错 - M2: 向量索引改用 HNSW(无需预热)
This commit is contained in:
@@ -303,12 +303,14 @@ CREATE TABLE ai_knowledge_chunks (
|
||||
metadata JSONB DEFAULT '{}', -- 元数据: page, section, source_line
|
||||
hit_count INT NOT NULL DEFAULT 0, -- 命中次数
|
||||
last_hit_at TIMESTAMPTZ, -- 最近命中时间
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
updated_by UUID
|
||||
);
|
||||
|
||||
-- 向量检索核心索引(IVFFlat,适合 < 100 万切片)
|
||||
-- 向量检索核心索引(HNSW,无需预热数据,pgvector >= 0.5.0)
|
||||
CREATE INDEX idx_chunk_embedding ON ai_knowledge_chunks
|
||||
USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
|
||||
USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64);
|
||||
|
||||
-- 按知识库查询
|
||||
CREATE INDEX idx_chunk_kb ON ai_knowledge_chunks(knowledge_base_id);
|
||||
@@ -433,20 +435,36 @@ fn chunk_text(text: &str, strategy: &ChunkStrategy) -> Vec<Chunk> {
|
||||
|
||||
```rust
|
||||
async fn process_chunks(chunks: &[NewChunk]) -> Result<()> {
|
||||
let texts: Vec<&str> = chunks.iter().map(|c| c.content.as_str()).collect();
|
||||
let embeddings = embedding_service.embed_batch(&texts).await?;
|
||||
// 分批 embedding,单批失败不阻塞其他批次
|
||||
let mut embedded_count = 0;
|
||||
for batch in chunks.chunks(EMBEDDING_BATCH_SIZE) {
|
||||
match embedding_service.embed_batch(&batch_texts).await {
|
||||
Ok(embeddings) => {
|
||||
batch_insert_chunks(batch, &embeddings).await?;
|
||||
embedded_count += batch.len();
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Embedding batch failed: {}, storing {} chunks with NULL embedding", e, batch.len());
|
||||
// 切片仍然写入,embedding = NULL,后续可重新 embedding
|
||||
batch_insert_chunks_without_embedding(batch).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 批量写入 ai_knowledge_chunks(含 vector)
|
||||
batch_insert_chunks(chunks, &embeddings).await?;
|
||||
|
||||
// 更新 document 状态和计数
|
||||
update_document_status(doc_id, "completed", chunks.len()).await?;
|
||||
update_kb_counts(kb_id).await?;
|
||||
// 更新文档状态和计数(原子增量,防并发计数不一致)
|
||||
let doc_id = chunks.first().unwrap().document_id;
|
||||
let kb_id = chunks.first().unwrap().knowledge_base_id;
|
||||
atomic_update_document_status(doc_id, "completed", chunks.len(), embedded_count).await?;
|
||||
// 使用 SQL: UPDATE ... SET document_count = document_count + 1, chunk_count = chunk_count + N
|
||||
atomic_increment_kb_counts(kb_id, 1, chunks.len()).await?;
|
||||
}
|
||||
```
|
||||
|
||||
**错误容忍:**
|
||||
- embedding API 不可用时:切片写入但 `embedding = NULL`,标记文档为 `completed`(降级为纯文本匹配)
|
||||
- 单批 embedding 失败:该批次内的切片全部以 `embedding = NULL` 写入,其他批次正常处理(失败粒度为批次,与上方代码的 `EMBEDDING_BATCH_SIZE` 分批一致)
|
||||
- 文档完成后记录 `embedded_count / total_count`,前端展示"42/50 切片已向量化"
|
||||
- 所有 `embedding = NULL` 的切片可后续通过"重新 embedding"按钮批量处理
|
||||
- 单个切片 embedding 失败:跳过该切片,不影响其他切片
|
||||
- 整体失败:文档状态设为 `failed` + `error_message`
|
||||
|
||||
@@ -535,7 +553,7 @@ fn keyword_match(query: &str, all_kb_ids: &[KnowledgeBase]) -> Vec<Uuid> {
|
||||
复用现有 pgvector 基础设施,改造 `KnowledgeSearchRepository` 适配新表:
|
||||
|
||||
```sql
|
||||
-- 在指定知识库范围内检索
|
||||
-- 在指定知识库范围内检索(全部参数化绑定,禁止 format! 拼接)
|
||||
SELECT c.id, c.content, c.metadata, c.document_id, c.knowledge_base_id,
|
||||
kb.name AS kb_name, doc.title AS doc_title,
|
||||
1 - (c.embedding <=> $1::vector) AS similarity
|
||||
@@ -549,6 +567,8 @@ ORDER BY c.embedding <=> $1::vector
|
||||
LIMIT $4;
|
||||
```
|
||||
|
||||
> **安全约束:** 所有 pgvector 查询必须使用参数化绑定(`$N`),禁止 `format!` 拼接任何用户输入。现有 `vector_search.rs` 中的 `format!("AND analysis_type = '{}'", ...)` 模式必须在此次重构中消除。如需按 `kb_type` 过滤,通过 JOIN `ai_knowledge_bases` 实现,不直接拼接到 WHERE 子句。
|
||||
|
||||
**检索参数:**
|
||||
|
||||
| 参数 | 值 | 说明 |
|
||||
@@ -776,8 +796,19 @@ pub struct HybridKnowledgeRouter {
|
||||
#[async_trait]
|
||||
impl KnowledgeSource for HybridKnowledgeRouter {
|
||||
async fn get_context(&self, query: &KnowledgeQuery) -> Result<KnowledgeContext> {
|
||||
// Layer 0: 边界情况 — 无知识库或全部禁用时,返回空 context(不报错)
|
||||
let all_kbs = self.load_enabled_kbs(&query.tenant_id).await?;
|
||||
if all_kbs.is_empty() {
|
||||
return Ok(KnowledgeContext {
|
||||
source: "hybrid_router".into(),
|
||||
context_text: String::new(),
|
||||
references: vec![],
|
||||
confidence: 0.0,
|
||||
});
|
||||
}
|
||||
|
||||
// Layer 1: 关键词粗筛
|
||||
let candidate_kb_ids = self.keyword_match(&query.query_text).await?;
|
||||
let candidate_kb_ids = self.keyword_match(&query.query_text, &all_kbs).await?;
|
||||
|
||||
// Layer 2: 向量检索
|
||||
let results = self.vector_search(&query.query_text, &candidate_kb_ids).await?;
|
||||
@@ -964,8 +995,18 @@ Phase 3: 标记旧表 deprecated(不删除,保留备份)
|
||||
|
||||
### 8.3 迁移 SQL 示例
|
||||
|
||||
> **关键约束:** 使用临时映射表确保旧数据精确关联到新知识库,避免 JOIN 模糊匹配导致数据重复或丢失。
|
||||
|
||||
```sql
|
||||
-- Step 1: 按 analysis_type 创建默认知识库
|
||||
-- Step 0: 创建临时映射表(tenant_id + analysis_type → new_kb_id)
|
||||
CREATE TEMP TABLE kb_migration_map (
|
||||
tenant_id UUID,
|
||||
analysis_type VARCHAR(50),
|
||||
old_table VARCHAR(20), -- 'references' / 'guides' / 'rules'
|
||||
new_kb_id UUID
|
||||
);
|
||||
|
||||
-- Step 1: 按 tenant_id + analysis_type 创建知识库,同时记录映射
|
||||
INSERT INTO ai_knowledge_bases (id, tenant_id, name, kb_type, description, chunk_strategy, intent_keywords)
|
||||
SELECT
|
||||
gen_random_uuid(),
|
||||
@@ -977,37 +1018,60 @@ SELECT
|
||||
'[]'::jsonb
|
||||
FROM ai_knowledge_references
|
||||
WHERE deleted_at IS NULL
|
||||
GROUP BY tenant_id, analysis_type;
|
||||
GROUP BY tenant_id, analysis_type
|
||||
RETURNING id, tenant_id, name; -- PostgreSQL RETURNING 支持精确关联
|
||||
|
||||
-- Step 2: 为每个 reference 创建文档 + 切片
|
||||
-- Step 1b: 填充映射表(使用 name 匹配:Step 1 的 RETURNING 只把插入行返回给调用方,不会自动写入 temp table,且返回列中不含 analysis_type,因此按 tenant_id + name + kb_type 回查)
|
||||
INSERT INTO kb_migration_map (tenant_id, analysis_type, old_table, new_kb_id)
|
||||
SELECT
|
||||
r.tenant_id,
|
||||
r.analysis_type,
|
||||
'references',
|
||||
kb.id
|
||||
FROM (SELECT DISTINCT tenant_id, analysis_type FROM ai_knowledge_references WHERE deleted_at IS NULL) r
|
||||
JOIN ai_knowledge_bases kb ON kb.tenant_id = r.tenant_id
|
||||
AND kb.name = '参考资料 - ' || r.analysis_type
|
||||
AND kb.kb_type = 'reference';
|
||||
|
||||
-- Step 2: 为每个 reference 创建文档(通过映射表精确 JOIN)
|
||||
INSERT INTO ai_knowledge_documents (id, tenant_id, knowledge_base_id, title, source_type, raw_content, status, chunk_count)
|
||||
SELECT
|
||||
r.id,
|
||||
r.tenant_id,
|
||||
kb.id,
|
||||
m.new_kb_id,
|
||||
r.title,
|
||||
'manual',
|
||||
r.content_summary,
|
||||
'completed',
|
||||
1
|
||||
FROM ai_knowledge_references r
|
||||
JOIN ai_knowledge_bases kb ON kb.tenant_id = r.tenant_id AND kb.kb_type = 'reference'
|
||||
JOIN kb_migration_map m ON m.tenant_id = r.tenant_id
|
||||
AND m.analysis_type = r.analysis_type
|
||||
AND m.old_table = 'references'
|
||||
WHERE r.deleted_at IS NULL;
|
||||
|
||||
-- Step 3: 迁移切片 + 向量
|
||||
-- Step 3: 迁移切片 + 向量(通过映射表精确 JOIN)
|
||||
INSERT INTO ai_knowledge_chunks (id, tenant_id, document_id, knowledge_base_id, chunk_index, content, embedding, metadata)
|
||||
SELECT
|
||||
gen_random_uuid(),
|
||||
r.tenant_id,
|
||||
r.id,
|
||||
kb.id,
|
||||
m.new_kb_id,
|
||||
0,
|
||||
r.content_summary,
|
||||
r.embedding::vector, -- 需确保维度匹配
|
||||
jsonb_build_object('source_name', r.source_name, 'tags', r.tags)
|
||||
r.embedding::vector,
|
||||
jsonb_build_object('source_name', r.source_name, 'tags', r.tags, 'migrated_from', 'ai_knowledge_references')
|
||||
FROM ai_knowledge_references r
|
||||
JOIN ai_knowledge_bases kb ON kb.tenant_id = r.tenant_id AND kb.kb_type = 'reference'
|
||||
JOIN kb_migration_map m ON m.tenant_id = r.tenant_id
|
||||
AND m.analysis_type = r.analysis_type
|
||||
AND m.old_table = 'references'
|
||||
WHERE r.deleted_at IS NULL AND r.embedding IS NOT NULL;
|
||||
|
||||
-- Step 4-6: 同理迁移 ai_knowledge_guides 和 ai_knowledge_rules
|
||||
-- (guides 的 kb_type='clinical_guide',rules 的 kb_type='rule')
|
||||
|
||||
-- Step 7: 清理临时表
|
||||
DROP TABLE kb_migration_map;
|
||||
```
|
||||
|
||||
### 8.4 旧表处理
|
||||
@@ -1092,6 +1156,10 @@ pub struct ImportUrlReq {
|
||||
}
|
||||
```
|
||||
|
||||
> **SSRF 防护要求(§9.6 安全约束):**
|
||||
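> - `validate_no_ssrf` 必须同时验证:① 仅允许 `http`/`https` 协议 ② 禁止 `localhost`/`127.0.0.1`/`10.x`/`172.16-31.x`/`192.168.x`/`0.0.0.0`,以及链路本地地址 `169.254.x`(含云厂商元数据地址 `169.254.169.254`)和 IPv6 回环/内网地址(`::1`、`fc00::/7`、`fe80::/10`)③ DNS 解析后检查实际 IP,并使用已校验的 IP 发起连接(防 DNS rebinding)④ 请求超时 10 秒 ⑤ 禁止重定向到内网地址(跟随重定向时重新校验)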
|
||||
> - 文件上传安全:MIME 类型白名单(`application/pdf`, `application/vnd.openxmlformats-officedocument.*`, `text/*`)+ 大小上限 50MB + 文件名 sanitize(防路径穿越)
|
||||
|
||||
**CreateManualReq:**
|
||||
```rust
|
||||
pub struct CreateManualReq {
|
||||
@@ -1120,6 +1188,7 @@ pub struct HitTestReq {
|
||||
| `docx-rs` | Word 文档解析 | 小 | 纯 Rust |
|
||||
| `calamine` | Excel 文件读取 | 小 | 纯 Rust,支持 xlsx/xls/csv |
|
||||
| `scraper` | HTML DOM 解析(URL 导入) | 小 | 已在项目间接依赖中 |
|
||||
| `validator` | DTO 输入校验 | 小 | workspace 已有,需在 erp-ai 的 Cargo.toml 中启用 derive feature |
|
||||
|
||||
**不新增系统级依赖** — 所有解析库均为纯 Rust 实现,无需安装系统库。
|
||||
|
||||
|
||||
Reference in New Issue
Block a user