feat(ai): 向量搜索 + hit test API

- KnowledgeV2Service.vector_search: pgvector 余弦相似度搜索
- SearchHit DTO: chunk_id/document_id/similarity/metadata
- hit_test handler: POST /ai/documents/hit-test (embed query → 搜索)
- AiState 添加 embedding 字段,共享 EmbeddingService 实例
- top_k 限制最大 20

Phase 2 Task 11
This commit is contained in:
iven
2026-05-27 00:24:34 +08:00
parent e94f5bc00c
commit 7d1b1f9c7c
5 changed files with 154 additions and 6 deletions

View File

@@ -255,4 +255,82 @@ impl KnowledgeV2Service {
.map_err(|e| AiError::DbError(e.to_string()))?;
Ok(())
}
/// Vector similarity search: returns the `top_k` chunks of one knowledge base
/// that are closest to `query_embedding`.
///
/// The query reads `ai_knowledge_chunks` joined to `ai_knowledge_documents`
/// and only considers rows that belong to `tenant_id` and `kb_id`, whose chunk
/// and parent document both have `deleted_at IS NULL`, and whose `embedding`
/// is not NULL. Rows are ordered by the pgvector `<=>` distance (ascending)
/// and each hit reports `similarity = 1 - distance`.
///
/// # Arguments
/// * `tenant_id` - tenant whose chunks may be returned.
/// * `kb_id` - knowledge base to search in.
/// * `query_embedding` - embedding of the query text; it is rendered to text
///   by `format_vector`, bound as a string and cast to `vector` in SQL.
/// * `top_k` - maximum number of hits. No upper bound is applied here, so
///   callers are expected to cap it. A value `<= 0` yields an empty result
///   without touching the database.
///
/// # Errors
/// Returns `AiError::DbError` carrying the driver's message when the query
/// fails or a row cannot be decoded.
pub async fn vector_search(
    &self,
    tenant_id: Uuid,
    kb_id: Uuid,
    query_embedding: &[f32],
    top_k: i64,
) -> AiResult<Vec<SearchHit>> {
    // PostgreSQL rejects a negative LIMIT and LIMIT 0 returns no rows, so a
    // non-positive `top_k` can be answered locally instead of surfacing as a
    // database error after a wasted round trip.
    if top_k <= 0 {
        return Ok(Vec::new());
    }

    let vector_str = crate::service::embedding::format_vector(query_embedding);
    let sql = r#"
SELECT c.id, c.document_id, c.chunk_index, c.content, c.metadata,
d.title AS doc_title,
1 - (c.embedding <=> $3::vector) AS similarity
FROM ai_knowledge_chunks c
JOIN ai_knowledge_documents d ON d.id = c.document_id
WHERE c.tenant_id = $1
AND c.knowledge_base_id = $2
AND c.deleted_at IS NULL
AND d.deleted_at IS NULL
AND c.embedding IS NOT NULL
ORDER BY c.embedding <=> $3::vector
LIMIT $4
"#;
    // All inputs are passed as bind parameters ($1..$4); nothing is
    // interpolated into the SQL text.
    let stmt = sea_orm::Statement::from_sql_and_values(
        sea_orm::DatabaseBackend::Postgres,
        sql,
        [
            sea_orm::Value::from(tenant_id),
            sea_orm::Value::from(kb_id),
            sea_orm::Value::String(Some(Box::new(vector_str))),
            sea_orm::Value::from(top_k),
        ],
    );
    let rows: Vec<SearchHitRow> = sea_orm::FromQueryResult::find_by_statement(stmt)
        .all(&self.db)
        .await
        .map_err(|e| AiError::DbError(e.to_string()))?;
    Ok(rows.into_iter().map(SearchHit::from).collect())
}
}
/// Raw row decoded from the `vector_search` query.
///
/// Field names match the column names / aliases in that query's SELECT list,
/// which is how the derived `FromQueryResult` maps columns to fields.
#[derive(Debug, sea_orm::FromQueryResult)]
struct SearchHitRow {
    /// `c.id`: primary key of the matched chunk.
    id: Uuid,
    /// `c.document_id`: document the chunk belongs to.
    document_id: Uuid,
    /// `c.chunk_index`: index of the chunk as stored on the chunk row.
    chunk_index: i32,
    /// `c.content`: text content of the chunk.
    content: String,
    /// `c.metadata`: JSON metadata stored with the chunk.
    // NOTE(review): typed as non-optional; presumably the column is NOT NULL,
    // otherwise decoding a NULL here would fail. Confirm against the schema.
    metadata: serde_json::Value,
    /// `d.title AS doc_title`: title of the parent document.
    doc_title: String,
    /// `1 - (c.embedding <=> $3::vector)`: similarity to the query embedding.
    similarity: f64,
}
/// One result of a vector search, in the serializable form returned to callers.
///
/// Built from a `SearchHitRow` via `From`; the row's `id` is exposed as
/// `chunk_id`, every other field is carried over unchanged.
#[derive(Debug, serde::Serialize)]
pub struct SearchHit {
    /// Id of the matched chunk.
    pub chunk_id: Uuid,
    /// Id of the document the chunk belongs to.
    pub document_id: Uuid,
    /// Index of the chunk as stored on the chunk row.
    pub chunk_index: i32,
    /// Text content of the chunk.
    pub content: String,
    /// Title of the parent document.
    pub doc_title: String,
    /// Similarity to the query embedding, computed by the search query as
    /// `1 - distance`; higher means closer.
    pub similarity: f64,
    /// JSON metadata stored with the chunk.
    pub metadata: serde_json::Value,
}
impl From<SearchHitRow> for SearchHit {
fn from(row: SearchHitRow) -> Self {
Self {
chunk_id: row.id,
document_id: row.document_id,
chunk_index: row.chunk_index,
content: row.content,
doc_title: row.doc_title,
similarity: row.similarity,
metadata: row.metadata,
}
}
}