From af2484e63be6c0bd998c6a199225674910e9d9cd Mon Sep 17 00:00:00 2001 From: iven Date: Tue, 26 May 2026 22:18:28 +0800 Subject: [PATCH] =?UTF-8?q?docs(ai):=20=E7=9F=A5=E8=AF=86=E5=BA=93=20V2=20?= =?UTF-8?q?=E8=AE=BE=E8=AE=A1=E8=A7=84=E6=A0=BC=20=E2=80=94=20review=20?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复 3 CRITICAL + 4 HIGH 问题: - C1: 明确禁止 format! 拼接 SQL,所有 pgvector 查询参数化 - C2: 迁移 SQL 改用临时映射表精确关联,防数据重复/丢失 - C3: SSRF 防护细化(DNS rebinding + 超时 + 重定向校验) - H1: document_count/chunk_count 改为原子 SQL 增量 - H2: embedding 部分失败:NULL embedding 写入 + embedded_count 统计 - H3: chunks 表补充 updated_at/updated_by 审计字段 - H4: 新增 validator crate 依赖说明 - M1: 空知识库边界返回空 context 不报错 - M2: 向量索引改用 HNSW(无需预热) --- .../2026-05-26-ai-knowledge-base-v2-design.md | 115 ++++++++++++++---- 1 file changed, 92 insertions(+), 23 deletions(-) diff --git a/docs/superpowers/specs/2026-05-26-ai-knowledge-base-v2-design.md b/docs/superpowers/specs/2026-05-26-ai-knowledge-base-v2-design.md index a852d3e..c0116c6 100644 --- a/docs/superpowers/specs/2026-05-26-ai-knowledge-base-v2-design.md +++ b/docs/superpowers/specs/2026-05-26-ai-knowledge-base-v2-design.md @@ -303,12 +303,14 @@ CREATE TABLE ai_knowledge_chunks ( metadata JSONB DEFAULT '{}', -- 元数据: page, section, source_line hit_count INT NOT NULL DEFAULT 0, -- 命中次数 last_hit_at TIMESTAMPTZ, -- 最近命中时间 - created_at TIMESTAMPTZ NOT NULL DEFAULT now() + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_by UUID ); --- 向量检索核心索引(IVFFlat,适合 < 100 万切片) +-- 向量检索核心索引(HNSW,无需预热数据,pgvector >= 0.5.0) CREATE INDEX idx_chunk_embedding ON ai_knowledge_chunks - USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100); + USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64); -- 按知识库查询 CREATE INDEX idx_chunk_kb ON ai_knowledge_chunks(knowledge_base_id); @@ -433,20 +435,36 @@ fn chunk_text(text: &str, strategy: 
&ChunkStrategy) -> Vec { ```rust async fn process_chunks(chunks: &[NewChunk]) -> Result<()> { - let texts: Vec<&str> = chunks.iter().map(|c| c.content.as_str()).collect(); - let embeddings = embedding_service.embed_batch(&texts).await?; + // 分批 embedding,单批失败不阻塞其他批次 + let mut embedded_count = 0; + for batch in chunks.chunks(EMBEDDING_BATCH_SIZE) { + match embedding_service.embed_batch(&batch_texts).await { + Ok(embeddings) => { + batch_insert_chunks(batch, &embeddings).await?; + embedded_count += batch.len(); + } + Err(e) => { + tracing::warn!("Embedding batch failed: {}, storing {} chunks with NULL embedding", e, batch.len()); + // 切片仍然写入,embedding = NULL,后续可重新 embedding + batch_insert_chunks_without_embedding(batch).await?; + } + } + } - // 批量写入 ai_knowledge_chunks(含 vector) - batch_insert_chunks(chunks, &embeddings).await?; - - // 更新 document 状态和计数 - update_document_status(doc_id, "completed", chunks.len()).await?; - update_kb_counts(kb_id).await?; + // 更新文档状态和计数(原子增量,防并发计数不一致) + let doc_id = chunks.first().unwrap().document_id; + let kb_id = chunks.first().unwrap().knowledge_base_id; + atomic_update_document_status(doc_id, "completed", chunks.len(), embedded_count).await?; + // 使用 SQL: UPDATE ... SET document_count = document_count + 1, chunk_count = chunk_count + N + atomic_increment_kb_counts(kb_id, 1, chunks.len()).await?; } ``` **错误容忍:** - embedding API 不可用时:切片写入但 `embedding = NULL`,标记文档为 `completed`(降级为纯文本匹配) +- 单个切片 embedding 失败:该切片以 `embedding = NULL` 写入,其他切片正常处理 +- 文档完成后记录 `embedded_count / total_count`,前端展示"42/50 切片已向量化" +- 所有 `embedding = NULL` 的切片可后续通过"重新 embedding"按钮批量处理 - 单个切片 embedding 失败:跳过该切片,不影响其他切片 - 整体失败:文档状态设为 `failed` + `error_message` @@ -535,7 +553,7 @@ fn keyword_match(query: &str, all_kb_ids: &[KnowledgeBase]) -> Vec { 复用现有 pgvector 基础设施,改造 `KnowledgeSearchRepository` 适配新表: ```sql --- 在指定知识库范围内检索 +-- 在指定知识库范围内检索(全部参数化绑定,禁止 format! 
拼接) SELECT c.id, c.content, c.metadata, c.document_id, c.knowledge_base_id, kb.name AS kb_name, doc.title AS doc_title, 1 - (c.embedding <=> $1::vector) AS similarity @@ -549,6 +567,8 @@ ORDER BY c.embedding <=> $1::vector LIMIT $4; ``` +> **安全约束:** 所有 pgvector 查询必须使用参数化绑定(`$N`),禁止 `format!` 拼接任何用户输入。现有 `vector_search.rs` 中的 `format!("AND analysis_type = '{}'", ...)` 模式必须在此次重构中消除。如需按 `kb_type` 过滤,通过 JOIN `ai_knowledge_bases` 实现,不直接拼接到 WHERE 子句。 + **检索参数:** | 参数 | 值 | 说明 | @@ -776,8 +796,19 @@ pub struct HybridKnowledgeRouter { #[async_trait] impl KnowledgeSource for HybridKnowledgeRouter { async fn get_context(&self, query: &KnowledgeQuery) -> Result { + // Layer 0: 边界情况 — 无知识库或全部禁用时,返回空 context(不报错) + let all_kbs = self.load_enabled_kbs(&query.tenant_id).await?; + if all_kbs.is_empty() { + return Ok(KnowledgeContext { + source: "hybrid_router".into(), + context_text: String::new(), + references: vec![], + confidence: 0.0, + }); + } + // Layer 1: 关键词粗筛 - let candidate_kb_ids = self.keyword_match(&query.query_text).await?; + let candidate_kb_ids = self.keyword_match(&query.query_text, &all_kbs).await?; // Layer 2: 向量检索 let results = self.vector_search(&query.query_text, &candidate_kb_ids).await?; @@ -964,8 +995,18 @@ Phase 3: 标记旧表 deprecated(不删除,保留备份) ### 8.3 迁移 SQL 示例 +> **关键约束:** 使用临时映射表确保旧数据精确关联到新知识库,避免 JOIN 模糊匹配导致数据重复或丢失。 + ```sql --- Step 1: 按 analysis_type 创建默认知识库 +-- Step 0: 创建临时映射表(tenant_id + analysis_type → new_kb_id) +CREATE TEMP TABLE kb_migration_map ( + tenant_id UUID, + analysis_type VARCHAR(50), + old_table VARCHAR(20), -- 'references' / 'guides' / 'rules' + new_kb_id UUID +); + +-- Step 1: 按 tenant_id + analysis_type 创建知识库,同时记录映射 INSERT INTO ai_knowledge_bases (id, tenant_id, name, kb_type, description, chunk_strategy, intent_keywords) SELECT gen_random_uuid(), @@ -977,37 +1018,60 @@ SELECT '[]'::jsonb FROM ai_knowledge_references WHERE deleted_at IS NULL -GROUP BY tenant_id, analysis_type; +GROUP BY tenant_id, analysis_type +RETURNING id, 
tenant_id, name; -- PostgreSQL RETURNING 支持精确关联 --- Step 2: 为每个 reference 创建文档 + 切片 +-- Step 1b: 填充映射表(按 name 回查:Step 1 的 RETURNING 结果不含 analysis_type,且 RETURNING 需包装为 data-modifying CTE 才能写入其他表;要求 tenant_id + name + kb_type 唯一) +INSERT INTO kb_migration_map (tenant_id, analysis_type, old_table, new_kb_id) +SELECT + r.tenant_id, + r.analysis_type, + 'references', + kb.id +FROM (SELECT DISTINCT tenant_id, analysis_type FROM ai_knowledge_references WHERE deleted_at IS NULL) r +JOIN ai_knowledge_bases kb ON kb.tenant_id = r.tenant_id + AND kb.name = '参考资料 - ' || r.analysis_type + AND kb.kb_type = 'reference'; + +-- Step 2: 为每个 reference 创建文档(通过映射表精确 JOIN) INSERT INTO ai_knowledge_documents (id, tenant_id, knowledge_base_id, title, source_type, raw_content, status, chunk_count) SELECT r.id, r.tenant_id, - kb.id, + m.new_kb_id, r.title, 'manual', r.content_summary, 'completed', 1 FROM ai_knowledge_references r -JOIN ai_knowledge_bases kb ON kb.tenant_id = r.tenant_id AND kb.kb_type = 'reference' +JOIN kb_migration_map m ON m.tenant_id = r.tenant_id + AND m.analysis_type = r.analysis_type + AND m.old_table = 'references' WHERE r.deleted_at IS NULL; --- Step 3: 迁移切片 + 向量 +-- Step 3: 迁移切片 + 向量(通过映射表精确 JOIN) INSERT INTO ai_knowledge_chunks (id, tenant_id, document_id, knowledge_base_id, chunk_index, content, embedding, metadata) SELECT gen_random_uuid(), r.tenant_id, r.id, - kb.id, + m.new_kb_id, 0, r.content_summary, - r.embedding::vector, -- 需确保维度匹配 - jsonb_build_object('source_name', r.source_name, 'tags', r.tags) + r.embedding::vector, + jsonb_build_object('source_name', r.source_name, 'tags', r.tags, 'migrated_from', 'ai_knowledge_references') FROM ai_knowledge_references r -JOIN ai_knowledge_bases kb ON kb.tenant_id = r.tenant_id AND kb.kb_type = 'reference' +JOIN kb_migration_map m ON m.tenant_id = r.tenant_id + AND m.analysis_type = r.analysis_type + AND m.old_table = 'references' WHERE r.deleted_at IS NULL AND r.embedding IS NOT NULL; + +-- Step 4-6: 同理迁移 ai_knowledge_guides 和 ai_knowledge_rules +-- (guides 的 kb_type='clinical_guide',rules 的 
kb_type='rule') + +-- Step 7: 清理临时表 +DROP TABLE kb_migration_map; ``` ### 8.4 旧表处理 @@ -1092,6 +1156,10 @@ pub struct ImportUrlReq { } ``` +> **SSRF 防护要求(§9.6 安全约束):** +> - `validate_no_ssrf` 必须同时验证:① 仅允许 `http`/`https` 协议 ② 禁止 `localhost`/`127.0.0.1`/`10.x`/`172.16-31.x`/`192.168.x`/`0.0.0.0`,以及链路本地 `169.254.x`(含云元数据地址 `169.254.169.254`)和 IPv6 的 `::1`/`fc00::/7`/`fe80::/10` ③ DNS 解析后检查实际 IP,并使用已校验的 IP 发起连接(防 DNS rebinding)④ 请求超时 10 秒 ⑤ 禁止重定向到内网地址(跟随重定向时重新校验) +> - 文件上传安全:MIME 类型白名单(`application/pdf`, `application/vnd.openxmlformats-officedocument.*`, `text/*`)+ 大小上限 50MB + 文件名 sanitize(防路径穿越) + **CreateManualReq:** ```rust pub struct CreateManualReq { @@ -1120,6 +1188,7 @@ pub struct HitTestReq { | `docx-rs` | Word 文档解析 | 小 | 纯 Rust | | `calamine` | Excel 文件读取 | 小 | 纯 Rust,支持 xlsx/xls/csv | | `scraper` | HTML DOM 解析(URL 导入) | 小 | 已在项目间接依赖中 | +| `validator` | DTO 输入校验 | 小 | workspace 已有,需在 erp-ai 的 Cargo.toml 中启用 derive feature | **不新增的系统级依赖** — 所有解析库均为纯 Rust 实现,无需安装系统库。