feat(knowledge): Phase A 知识库可见性隔离 + 结构化数据源 + 蒸馏Worker
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

- knowledge_items 增加 visibility(public/private) + account_id 字段
- 新建 structured_sources + structured_rows 表 (Excel JSONB 行级存储)
- 结构化数据源 CRUD API (5 路由: list/get/rows/delete/query)
- 安全查询: JSONB GIN 索引 + 可见性过滤 + 行数限制
- 蒸馏 Worker: 复用 Provider Key Pool 调 DeepSeek/Qwen API
- L0 质量过滤: 长度/隐私检测
- create_item 增加 is_admin 参数控制可见性默认值
- generate_embedding: extract_keywords_from_text 改为 pub 复用

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
iven
2026-04-12 18:36:05 +08:00
parent b8fb76375c
commit c3593d3438
10 changed files with 846 additions and 20 deletions

View File

@@ -276,6 +276,7 @@ pub async fn create_item(
pool: &PgPool,
account_id: &str,
req: &CreateItemRequest,
is_admin: bool,
) -> SaasResult<KnowledgeItem> {
let id = uuid::Uuid::new_v4().to_string();
let keywords = req.keywords.as_deref().unwrap_or(&[]);
@@ -283,6 +284,16 @@ pub async fn create_item(
let priority = req.priority.unwrap_or(0);
let tags = req.tags.as_deref().unwrap_or(&[]);
// visibility: Admin 默认 public,普通用户默认 private
let visibility = req.visibility.as_deref().unwrap_or_else(|| {
if is_admin { "public" } else { "private" }
});
if !is_admin && visibility == "public" {
return Err(crate::error::SaasError::InvalidInput(
"普通用户只能创建私有知识条目".into(),
));
}
// 验证 category_id 存在性
let cat_exists: bool = sqlx::query_scalar(
"SELECT EXISTS(SELECT 1 FROM knowledge_categories WHERE id = $1)"
@@ -299,10 +310,12 @@ pub async fn create_item(
// 使用事务保证 item + version 原子性
let mut tx = pool.begin().await?;
let item_account_id: Option<&str> = if visibility == "public" { None } else { Some(account_id) };
let item = sqlx::query_as::<_, KnowledgeItem>(
"INSERT INTO knowledge_items \
(id, category_id, title, content, keywords, related_questions, priority, tags, created_by) \
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) \
(id, category_id, title, content, keywords, related_questions, priority, tags, created_by, visibility, account_id) \
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11) \
RETURNING *"
)
.bind(&id)
@@ -314,6 +327,8 @@ pub async fn create_item(
.bind(priority)
.bind(tags)
.bind(account_id)
.bind(visibility)
.bind(item_account_id)
.fetch_one(&mut *tx)
.await?;
@@ -781,3 +796,257 @@ pub async fn analytics_gaps(pool: &PgPool) -> SaasResult<serde_json::Value> {
"gaps": gaps.into_iter().map(|(v,)| v).collect::<Vec<_>>()
}))
}
// === 结构化数据源 CRUD ===
/// 创建结构化数据源
pub async fn create_structured_source(
pool: &PgPool,
account_id: &str,
is_admin: bool,
req: &CreateStructuredSourceRequest,
) -> SaasResult<StructuredSource> {
let id = uuid::Uuid::new_v4().to_string();
let visibility = req.visibility.as_deref().unwrap_or_else(|| {
if is_admin { "public" } else { "private" }
});
let source_account_id: Option<&str> = if visibility == "public" { None } else { Some(account_id) };
let source = sqlx::query_as::<_, StructuredSource>(
"INSERT INTO structured_sources \
(id, account_id, title, description, original_file_name, sheet_names, column_headers, \
visibility, industry_id, created_by) \
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) \
RETURNING *"
)
.bind(&id)
.bind(source_account_id)
.bind(&req.title)
.bind(&req.description)
.bind(&req.original_file_name)
.bind(req.sheet_names.as_deref().unwrap_or(&vec![]))
.bind(req.column_headers.as_deref().unwrap_or(&vec![]))
.bind(visibility)
.bind(&req.industry_id)
.bind(account_id)
.fetch_one(pool)
.await?;
Ok(source)
}
/// Inserts a batch of rows for one structured source inside a single transaction.
///
/// Each element of `rows` is `(sheet_name, row_index, headers, row_data)`, and
/// every inserted row receives a freshly generated UUID as its id. After the
/// inserts, the source's `row_count` is recomputed with a `COUNT(*)` over
/// `structured_rows` and its `updated_at` is set to `NOW()`; the transaction is
/// then committed.
///
/// Returns the number of rows inserted by this call. Any database error
/// returns early via `?`, before the commit is reached.
pub async fn insert_structured_rows(
    pool: &PgPool,
    source_id: &str,
    rows: &[(Option<String>, i32, Vec<String>, serde_json::Value)],
) -> SaasResult<i64> {
    let mut tx = pool.begin().await?;

    let mut inserted: i64 = 0;
    for row in rows {
        let (sheet, index, header_names, payload) = row;
        let new_row_id = uuid::Uuid::new_v4().to_string();

        sqlx::query(
            "INSERT INTO structured_rows (id, source_id, sheet_name, row_index, headers, row_data) \
             VALUES ($1, $2, $3, $4, $5, $6)"
        )
        .bind(&new_row_id)
        .bind(source_id)
        .bind(sheet)
        .bind(*index)
        .bind(header_names)
        .bind(payload)
        .execute(&mut *tx)
        .await?;

        inserted += 1;
    }

    // Refresh the cached row count from the table itself, so it reflects all
    // rows of the source and not just the ones added by this batch.
    sqlx::query(
        "UPDATE structured_sources SET row_count = (SELECT COUNT(*) FROM structured_rows WHERE source_id = $1), \
         updated_at = NOW() WHERE id = $1"
    )
    .bind(source_id)
    .execute(&mut *tx)
    .await?;

    tx.commit().await?;
    Ok(inserted)
}
/// Lists structured data sources with pagination and visibility filtering.
///
/// A source is included when it is `public` or when its `account_id` equals
/// `viewer_account_id`. With `viewer_account_id = None` the bound value is NULL,
/// so the ownership comparison cannot match and only public sources are listed.
/// `industry_id` and `status` are optional exact-match filters; `None` disables
/// the corresponding filter.
///
/// `page` is 1-based. Values below 1 are treated as page 1 and a negative
/// `page_size` is treated as 0, so out-of-range input can no longer produce a
/// negative `OFFSET`/`LIMIT` (which the database rejects).
///
/// Returns the requested page, ordered by `updated_at` descending, together
/// with the total number of sources matching the same filters.
pub async fn list_structured_sources(
    pool: &PgPool,
    viewer_account_id: Option<&str>,
    industry_id: Option<&str>,
    status: Option<&str>,
    page: i64,
    page_size: i64,
) -> SaasResult<(Vec<StructuredSource>, i64)> {
    let page = page.max(1);
    let page_size = page_size.max(0);
    // Saturating multiply: absurdly large inputs must not overflow the offset.
    let offset = (page - 1).saturating_mul(page_size);

    let items: Vec<StructuredSource> = sqlx::query_as(
        "SELECT * FROM structured_sources \
         WHERE (visibility = 'public' OR account_id = $1) \
         AND ($2::text IS NULL OR industry_id = $2) \
         AND ($3::text IS NULL OR status = $3) \
         ORDER BY updated_at DESC \
         LIMIT $4 OFFSET $5"
    )
    .bind(viewer_account_id)
    .bind(industry_id)
    .bind(status)
    .bind(page_size)
    .bind(offset)
    .fetch_all(pool)
    .await?;

    // Same WHERE clause as above, without paging, to get the overall total.
    let total: (i64,) = sqlx::query_as(
        "SELECT COUNT(*) FROM structured_sources \
         WHERE (visibility = 'public' OR account_id = $1) \
         AND ($2::text IS NULL OR industry_id = $2) \
         AND ($3::text IS NULL OR status = $3)"
    )
    .bind(viewer_account_id)
    .bind(industry_id)
    .bind(status)
    .fetch_one(pool)
    .await?;

    Ok((items, total.0))
}
/// Fetches a single structured data source by id, subject to visibility.
///
/// The row is returned only when it is `public` or its `account_id` equals
/// `viewer_account_id`. `Ok(None)` therefore covers both "no such id" and
/// "exists but not visible to this viewer".
pub async fn get_structured_source(
    pool: &PgPool,
    source_id: &str,
    viewer_account_id: Option<&str>,
) -> SaasResult<Option<StructuredSource>> {
    const LOOKUP_SQL: &str = "SELECT * FROM structured_sources WHERE id = $1 \
         AND (visibility = 'public' OR account_id = $2)";

    let lookup = sqlx::query_as::<_, StructuredSource>(LOOKUP_SQL)
        .bind(source_id)
        .bind(viewer_account_id);

    Ok(lookup.fetch_optional(pool).await?)
}
/// Lists the rows of one structured data source with pagination.
///
/// The source is first looked up through `get_structured_source`, so a source
/// that does not exist or is not visible to `viewer_account_id` yields
/// `SaasError::NotFound`. `sheet_name` optionally restricts the rows to a single
/// sheet; `None` returns rows from every sheet.
///
/// `page` is 1-based. Values below 1 are treated as page 1 and a negative
/// `page_size` is treated as 0, so out-of-range input can no longer produce a
/// negative `OFFSET`/`LIMIT` (which the database rejects).
///
/// Returns the requested page ordered by `row_index`, together with the total
/// number of rows matching the same filters.
pub async fn list_structured_rows(
    pool: &PgPool,
    source_id: &str,
    viewer_account_id: Option<&str>,
    sheet_name: Option<&str>,
    page: i64,
    page_size: i64,
) -> SaasResult<(Vec<StructuredRow>, i64)> {
    // Visibility gate: rows are only readable when the parent source is.
    if get_structured_source(pool, source_id, viewer_account_id)
        .await?
        .is_none()
    {
        return Err(crate::error::SaasError::NotFound("数据源不存在或无权限".into()));
    }

    let page = page.max(1);
    let page_size = page_size.max(0);
    // Saturating multiply: absurdly large inputs must not overflow the offset.
    let offset = (page - 1).saturating_mul(page_size);

    let rows: Vec<StructuredRow> = sqlx::query_as(
        "SELECT * FROM structured_rows \
         WHERE source_id = $1 \
         AND ($2::text IS NULL OR sheet_name = $2) \
         ORDER BY row_index \
         LIMIT $3 OFFSET $4"
    )
    .bind(source_id)
    .bind(sheet_name)
    .bind(page_size)
    .bind(offset)
    .fetch_all(pool)
    .await?;

    // Same WHERE clause as above, without paging, to get the overall total.
    let total: (i64,) = sqlx::query_as(
        "SELECT COUNT(*) FROM structured_rows \
         WHERE source_id = $1 \
         AND ($2::text IS NULL OR sheet_name = $2)"
    )
    .bind(source_id)
    .bind(sheet_name)
    .fetch_one(pool)
    .await?;

    Ok((rows, total.0))
}
/// 删除结构化数据源(级联删除行)
pub async fn delete_structured_source(pool: &PgPool, source_id: &str) -> SaasResult<()> {
let result = sqlx::query("DELETE FROM structured_sources WHERE id = $1")
.bind(source_id)
.execute(pool)
.await?;
if result.rows_affected() == 0 {
return Err(crate::error::SaasError::NotFound("数据源不存在".into()));
}
Ok(())
}
/// 安全的结构化查询(关键词匹配 + 可见性过滤)
pub async fn query_structured(
pool: &PgPool,
request: &StructuredQueryRequest,
viewer_account_id: Option<&str>,
) -> SaasResult<Vec<StructuredQueryResult>> {
let limit = request.limit.unwrap_or(20).min(50);
let pattern = format!("%{}%",
request.query.replace('\\', "\\\\").replace('%', "\\%").replace('_', "\\_")
);
let source_filter = if let Some(ref sid) = request.source_id {
format!("AND ss.id = '{}'", sid.replace('\'', "''"))
} else {
String::new()
};
let industry_filter = if let Some(ref iid) = request.industry_id {
format!("AND ss.industry_id = '{}'", iid.replace('\'', "''"))
} else {
String::new()
};
let rows: Vec<(String, String, Vec<String>, serde_json::Value)> = sqlx::query_as(
&format!(
"SELECT sr.source_id, ss.title, sr.headers, sr.row_data \
FROM structured_rows sr \
JOIN structured_sources ss ON sr.source_id = ss.id \
WHERE (ss.visibility = 'public' OR ss.account_id = $1) \
AND ss.status = 'active' \
{} {} \
AND (sr.row_data::text ILIKE $2 \
OR array_to_string(sr.headers, ' ') ILIKE $2) \
ORDER BY ss.title, sr.row_index \
LIMIT {}",
source_filter, industry_filter, limit
)
)
.bind(viewer_account_id)
.bind(&pattern)
.fetch_all(pool)
.await?;
let mut results_map: std::collections::HashMap<String, StructuredQueryResult> =
std::collections::HashMap::new();
for (source_id, source_title, headers, row_data) in rows {
let entry = results_map.entry(source_id.clone())
.or_insert_with(|| StructuredQueryResult {
source_id: source_id.clone(),
source_title: source_title.clone(),
headers: headers.clone(),
rows: Vec::new(),
total_matched: 0,
generated_sql: None,
});
if let Ok(map) = serde_json::from_value::<std::collections::HashMap<String, serde_json::Value>>(row_data) {
entry.rows.push(map);
}
entry.total_matched += 1;
}
Ok(results_map.into_values().collect())
}