Compare commits
3 Commits
23c5bbdb40
...
7d1b1f9c7c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7d1b1f9c7c | ||
|
|
e94f5bc00c | ||
|
|
0a1f4cb9a9 |
@@ -120,6 +120,9 @@ handlebars = "6"
|
||||
# HTML sanitization
|
||||
ammonia = "4"
|
||||
|
||||
# Document parsing
|
||||
pdf-extract = "0.7"
|
||||
|
||||
# Metrics
|
||||
metrics = "0.24"
|
||||
metrics-exporter-prometheus = "0.16"
|
||||
|
||||
@@ -26,3 +26,4 @@ sha2.workspace = true
|
||||
redis.workspace = true
|
||||
hex.workspace = true
|
||||
regex-lite.workspace = true
|
||||
pdf-extract.workspace = true
|
||||
|
||||
307
crates/erp-ai/src/handler/document_handler.rs
Normal file
307
crates/erp-ai/src/handler/document_handler.rs
Normal file
@@ -0,0 +1,307 @@
|
||||
use axum::Json;
|
||||
use axum::extract::{Extension, FromRef, Multipart, Path, State};
|
||||
use erp_core::rbac::require_permission;
|
||||
use erp_core::types::{ApiResponse, TenantContext};
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::service::document::{CreateDocumentReq, ListDocumentsQuery, UploadDocumentParams};
|
||||
use crate::state::AiState;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ListDocumentsParams {
|
||||
pub kb_id: uuid::Uuid,
|
||||
pub status: Option<String>,
|
||||
pub page: Option<u64>,
|
||||
pub page_size: Option<u64>,
|
||||
}
|
||||
|
||||
#[utoipa::path(
|
||||
get,
|
||||
path = "/ai/knowledge-bases/{kb_id}/documents",
|
||||
responses((status = 200, description = "文档列表")),
|
||||
tag = "知识库文档",
|
||||
security(("bearer_auth" = [])),
|
||||
)]
|
||||
pub async fn list_documents<S>(
|
||||
State(state): State<AiState>,
|
||||
Extension(ctx): Extension<TenantContext>,
|
||||
Path(kb_id): Path<uuid::Uuid>,
|
||||
axum::extract::Query(params): axum::extract::Query<ListDocumentsParamsNoKb>,
|
||||
) -> Result<Json<ApiResponse<serde_json::Value>>, erp_core::error::AppError>
|
||||
where
|
||||
AiState: FromRef<S>,
|
||||
S: Clone + Send + Sync + 'static,
|
||||
{
|
||||
require_permission(&ctx, "ai.knowledge.list")?;
|
||||
|
||||
let query = ListDocumentsQuery {
|
||||
status: params.status,
|
||||
page: params.page,
|
||||
page_size: params.page_size,
|
||||
};
|
||||
let (items, total) = state
|
||||
.document
|
||||
.list_documents(ctx.tenant_id, kb_id, &query)
|
||||
.await?;
|
||||
|
||||
Ok(Json(ApiResponse::ok(serde_json::json!({
|
||||
"data": items,
|
||||
"total": total,
|
||||
"page": query.page.unwrap_or(1),
|
||||
"page_size": query.page_size.unwrap_or(20),
|
||||
}))))
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct ListDocumentsParamsNoKb {
|
||||
pub status: Option<String>,
|
||||
pub page: Option<u64>,
|
||||
pub page_size: Option<u64>,
|
||||
}
|
||||
|
||||
#[utoipa::path(
|
||||
get,
|
||||
path = "/ai/documents/{id}",
|
||||
responses((status = 200, description = "文档详情")),
|
||||
tag = "知识库文档",
|
||||
security(("bearer_auth" = [])),
|
||||
)]
|
||||
pub async fn get_document<S>(
|
||||
State(state): State<AiState>,
|
||||
Extension(ctx): Extension<TenantContext>,
|
||||
Path(id): Path<uuid::Uuid>,
|
||||
) -> Result<Json<ApiResponse<serde_json::Value>>, erp_core::error::AppError>
|
||||
where
|
||||
AiState: FromRef<S>,
|
||||
S: Clone + Send + Sync + 'static,
|
||||
{
|
||||
require_permission(&ctx, "ai.knowledge.list")?;
|
||||
let doc = state.document.get_document(ctx.tenant_id, id).await?;
|
||||
Ok(Json(ApiResponse::ok(
|
||||
serde_json::to_value(&doc).unwrap_or_default(),
|
||||
)))
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, utoipa::ToSchema)]
|
||||
pub struct CreateDocumentBody {
|
||||
pub kb_id: uuid::Uuid,
|
||||
pub title: String,
|
||||
pub doc_type: Option<String>,
|
||||
pub source_type: Option<String>,
|
||||
pub source_url: Option<String>,
|
||||
pub content: Option<String>,
|
||||
}
|
||||
|
||||
#[utoipa::path(
|
||||
post,
|
||||
path = "/ai/documents/manual",
|
||||
request_body = CreateDocumentBody,
|
||||
responses((status = 200, description = "创建手动文档")),
|
||||
tag = "知识库文档",
|
||||
security(("bearer_auth" = [])),
|
||||
)]
|
||||
pub async fn create_manual_document<S>(
|
||||
State(state): State<AiState>,
|
||||
Extension(ctx): Extension<TenantContext>,
|
||||
Json(body): Json<CreateDocumentBody>,
|
||||
) -> Result<Json<ApiResponse<serde_json::Value>>, erp_core::error::AppError>
|
||||
where
|
||||
AiState: FromRef<S>,
|
||||
S: Clone + Send + Sync + 'static,
|
||||
{
|
||||
require_permission(&ctx, "ai.knowledge.manage")?;
|
||||
|
||||
if body.title.trim().is_empty() {
|
||||
return Err(erp_core::error::AppError::Validation(
|
||||
"文档标题不能为空".into(),
|
||||
));
|
||||
}
|
||||
|
||||
let req = CreateDocumentReq {
|
||||
title: body.title,
|
||||
doc_type: body.doc_type,
|
||||
source_type: body.source_type,
|
||||
source_url: body.source_url,
|
||||
content: body.content,
|
||||
};
|
||||
|
||||
let id = state
|
||||
.document
|
||||
.create_manual_document(ctx.tenant_id, ctx.user_id, body.kb_id, req)
|
||||
.await?;
|
||||
|
||||
Ok(Json(ApiResponse::ok(serde_json::json!({ "id": id }))))
|
||||
}
|
||||
|
||||
const MAX_FILE_SIZE: usize = 20 * 1024 * 1024; // 20MB
|
||||
|
||||
#[utoipa::path(
|
||||
post,
|
||||
path = "/ai/documents/upload",
|
||||
responses((status = 200, description = "上传文档文件")),
|
||||
tag = "知识库文档",
|
||||
security(("bearer_auth" = [])),
|
||||
)]
|
||||
pub async fn upload_document<S>(
|
||||
State(state): State<AiState>,
|
||||
Extension(ctx): Extension<TenantContext>,
|
||||
mut multipart: Multipart,
|
||||
) -> Result<Json<ApiResponse<serde_json::Value>>, erp_core::error::AppError>
|
||||
where
|
||||
AiState: FromRef<S>,
|
||||
S: Clone + Send + Sync + 'static,
|
||||
{
|
||||
require_permission(&ctx, "ai.knowledge.manage")?;
|
||||
|
||||
let mut kb_id: Option<uuid::Uuid> = None;
|
||||
let mut title: Option<String> = None;
|
||||
let mut file_data: Option<(String, String, Vec<u8>)> = None; // (filename, mime, bytes)
|
||||
|
||||
while let Some(field) = multipart
|
||||
.next_field()
|
||||
.await
|
||||
.map_err(|e| erp_core::error::AppError::Validation(format!("读取上传字段失败: {}", e)))?
|
||||
{
|
||||
let field_name = field.name().unwrap_or("").to_string();
|
||||
|
||||
match field_name.as_str() {
|
||||
"kb_id" => {
|
||||
let text = field.text().await.map_err(|e| {
|
||||
erp_core::error::AppError::Validation(format!("读取 kb_id 失败: {}", e))
|
||||
})?;
|
||||
kb_id =
|
||||
Some(uuid::Uuid::parse_str(&text).map_err(|_| {
|
||||
erp_core::error::AppError::Validation("kb_id 格式错误".into())
|
||||
})?);
|
||||
}
|
||||
"title" => {
|
||||
let text = field.text().await.map_err(|e| {
|
||||
erp_core::error::AppError::Validation(format!("读取 title 失败: {}", e))
|
||||
})?;
|
||||
title = Some(text);
|
||||
}
|
||||
"file" => {
|
||||
let file_name = field.file_name().unwrap_or("unknown").to_string();
|
||||
let content_type = field
|
||||
.content_type()
|
||||
.unwrap_or("application/octet-stream")
|
||||
.to_string();
|
||||
let bytes = field.bytes().await.map_err(|e| {
|
||||
erp_core::error::AppError::Validation(format!("读取文件失败: {}", e))
|
||||
})?;
|
||||
|
||||
if bytes.len() > MAX_FILE_SIZE {
|
||||
return Err(erp_core::error::AppError::Validation(format!(
|
||||
"文件大小超过限制 (最大 {}MB)",
|
||||
MAX_FILE_SIZE / 1024 / 1024
|
||||
)));
|
||||
}
|
||||
|
||||
file_data = Some((file_name, content_type, bytes.to_vec()));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let kb_id =
|
||||
kb_id.ok_or_else(|| erp_core::error::AppError::Validation("缺少 kb_id 字段".into()))?;
|
||||
let (file_name, mime_type, bytes) =
|
||||
file_data.ok_or_else(|| erp_core::error::AppError::Validation("缺少 file 字段".into()))?;
|
||||
|
||||
let doc_title = title.unwrap_or_else(|| file_name.clone());
|
||||
|
||||
// 解析文档内容
|
||||
let content = crate::service::document::parser::parse_document(&file_name, &mime_type, &bytes)
|
||||
.map_err(|e| erp_core::error::AppError::Validation(format!("文档解析失败: {}", e)))?;
|
||||
|
||||
let params = UploadDocumentParams {
|
||||
file_name,
|
||||
file_size: bytes.len() as i64,
|
||||
mime_type,
|
||||
content,
|
||||
};
|
||||
|
||||
let id = state
|
||||
.document
|
||||
.create_upload_document(ctx.tenant_id, ctx.user_id, kb_id, doc_title, params)
|
||||
.await?;
|
||||
|
||||
Ok(Json(ApiResponse::ok(serde_json::json!({ "id": id }))))
|
||||
}
|
||||
|
||||
#[utoipa::path(
|
||||
delete,
|
||||
path = "/ai/knowledge-bases/{kb_id}/documents/{id}",
|
||||
responses((status = 200, description = "删除文档")),
|
||||
tag = "知识库文档",
|
||||
security(("bearer_auth" = [])),
|
||||
)]
|
||||
pub async fn delete_document<S>(
|
||||
State(state): State<AiState>,
|
||||
Extension(ctx): Extension<TenantContext>,
|
||||
Path((kb_id, id)): Path<(uuid::Uuid, uuid::Uuid)>,
|
||||
) -> Result<Json<ApiResponse<serde_json::Value>>, erp_core::error::AppError>
|
||||
where
|
||||
AiState: FromRef<S>,
|
||||
S: Clone + Send + Sync + 'static,
|
||||
{
|
||||
require_permission(&ctx, "ai.knowledge.manage")?;
|
||||
state
|
||||
.document
|
||||
.delete_document(ctx.tenant_id, kb_id, id)
|
||||
.await?;
|
||||
Ok(Json(ApiResponse::ok(serde_json::json!({ "id": id }))))
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, utoipa::ToSchema)]
|
||||
pub struct HitTestBody {
|
||||
pub kb_id: uuid::Uuid,
|
||||
pub query: String,
|
||||
pub top_k: Option<i64>,
|
||||
}
|
||||
|
||||
#[utoipa::path(
|
||||
post,
|
||||
path = "/ai/documents/hit-test",
|
||||
request_body = HitTestBody,
|
||||
responses((status = 200, description = "向量搜索 hit test")),
|
||||
tag = "知识库文档",
|
||||
security(("bearer_auth" = [])),
|
||||
)]
|
||||
pub async fn hit_test<S>(
|
||||
State(state): State<AiState>,
|
||||
Extension(ctx): Extension<TenantContext>,
|
||||
Json(body): Json<HitTestBody>,
|
||||
) -> Result<Json<ApiResponse<serde_json::Value>>, erp_core::error::AppError>
|
||||
where
|
||||
AiState: FromRef<S>,
|
||||
S: Clone + Send + Sync + 'static,
|
||||
{
|
||||
require_permission(&ctx, "ai.knowledge.list")?;
|
||||
|
||||
if body.query.trim().is_empty() {
|
||||
return Err(erp_core::error::AppError::Validation(
|
||||
"搜索查询不能为空".into(),
|
||||
));
|
||||
}
|
||||
|
||||
// 生成 query embedding
|
||||
let embedding =
|
||||
state.embedding.embed(&body.query).await.map_err(|e| {
|
||||
erp_core::error::AppError::Internal(format!("Embedding 生成失败: {}", e))
|
||||
})?;
|
||||
|
||||
let top_k = body.top_k.unwrap_or(5).min(20);
|
||||
|
||||
let hits = state
|
||||
.knowledge_v2
|
||||
.vector_search(ctx.tenant_id, body.kb_id, &embedding, top_k)
|
||||
.await
|
||||
.map_err(|e| erp_core::error::AppError::Internal(e.to_string()))?;
|
||||
|
||||
Ok(Json(ApiResponse::ok(serde_json::json!({
|
||||
"query": body.query,
|
||||
"total": hits.len(),
|
||||
"hits": hits,
|
||||
}))))
|
||||
}
|
||||
@@ -14,6 +14,7 @@ use crate::state::AiState;
|
||||
|
||||
pub mod chat_handler;
|
||||
pub mod config_handler;
|
||||
pub mod document_handler;
|
||||
pub mod insight_handler;
|
||||
pub mod knowledge_handler;
|
||||
pub mod knowledge_v2_handler;
|
||||
|
||||
@@ -609,6 +609,31 @@ impl AiModule {
|
||||
"/ai/knowledge-bases/{id}",
|
||||
axum::routing::delete(crate::handler::knowledge_v2_handler::delete_knowledge_base),
|
||||
)
|
||||
// 文档管理路由
|
||||
.route(
|
||||
"/ai/knowledge-bases/{kb_id}/documents",
|
||||
axum::routing::get(crate::handler::document_handler::list_documents),
|
||||
)
|
||||
.route(
|
||||
"/ai/documents/manual",
|
||||
axum::routing::post(crate::handler::document_handler::create_manual_document),
|
||||
)
|
||||
.route(
|
||||
"/ai/documents/upload",
|
||||
axum::routing::post(crate::handler::document_handler::upload_document),
|
||||
)
|
||||
.route(
|
||||
"/ai/documents/{id}",
|
||||
axum::routing::get(crate::handler::document_handler::get_document),
|
||||
)
|
||||
.route(
|
||||
"/ai/documents/hit-test",
|
||||
axum::routing::post(crate::handler::document_handler::hit_test),
|
||||
)
|
||||
.route(
|
||||
"/ai/knowledge-bases/{kb_id}/documents/{id}",
|
||||
axum::routing::delete(crate::handler::document_handler::delete_document),
|
||||
)
|
||||
.route(
|
||||
"/ai/dialysis/risk-assessment",
|
||||
axum::routing::post(crate::handler::assess_dialysis_risk),
|
||||
|
||||
459
crates/erp-ai/src/service/document/mod.rs
Normal file
459
crates/erp-ai/src/service/document/mod.rs
Normal file
@@ -0,0 +1,459 @@
|
||||
pub mod chunker;
|
||||
pub mod parser;
|
||||
|
||||
use sea_orm::{
|
||||
ColumnTrait, ConnectionTrait, EntityTrait, PaginatorTrait, QueryFilter, QueryOrder, Set,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::entity::ai_knowledge_documents;
|
||||
use crate::error::{AiError, AiResult};
|
||||
use crate::service::embedding::{EmbeddingService, format_vector};
|
||||
use crate::service::knowledge_v2::KnowledgeV2Service;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
// ─── DTO ───
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize, utoipa::ToSchema)]
|
||||
pub struct CreateDocumentReq {
|
||||
pub title: String,
|
||||
pub doc_type: Option<String>,
|
||||
pub source_type: Option<String>,
|
||||
pub source_url: Option<String>,
|
||||
pub content: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, utoipa::IntoParams)]
|
||||
pub struct ListDocumentsQuery {
|
||||
pub status: Option<String>,
|
||||
pub page: Option<u64>,
|
||||
pub page_size: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct UploadDocumentParams {
|
||||
pub file_name: String,
|
||||
pub file_size: i64,
|
||||
pub mime_type: String,
|
||||
pub content: String,
|
||||
}
|
||||
|
||||
// ─── Service ───
|
||||
|
||||
pub struct DocumentService {
|
||||
db: sea_orm::DatabaseConnection,
|
||||
knowledge_v2: Arc<KnowledgeV2Service>,
|
||||
embedding: Arc<EmbeddingService>,
|
||||
}
|
||||
|
||||
impl DocumentService {
|
||||
pub fn new(
|
||||
db: sea_orm::DatabaseConnection,
|
||||
knowledge_v2: Arc<KnowledgeV2Service>,
|
||||
embedding: Arc<EmbeddingService>,
|
||||
) -> Self {
|
||||
Self {
|
||||
db,
|
||||
knowledge_v2,
|
||||
embedding,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_documents(
|
||||
&self,
|
||||
tenant_id: Uuid,
|
||||
kb_id: Uuid,
|
||||
query: &ListDocumentsQuery,
|
||||
) -> AiResult<(Vec<ai_knowledge_documents::Model>, u64)> {
|
||||
let page = query.page.unwrap_or(1);
|
||||
let page_size = query.page_size.unwrap_or(20);
|
||||
|
||||
let mut find = ai_knowledge_documents::Entity::find()
|
||||
.filter(ai_knowledge_documents::Column::TenantId.eq(tenant_id))
|
||||
.filter(ai_knowledge_documents::Column::KnowledgeBaseId.eq(kb_id))
|
||||
.filter(ai_knowledge_documents::Column::DeletedAt.is_null());
|
||||
|
||||
if let Some(ref status) = query.status {
|
||||
find = find.filter(ai_knowledge_documents::Column::Status.eq(status.as_str()));
|
||||
}
|
||||
|
||||
let paginator = find
|
||||
.order_by_desc(ai_knowledge_documents::Column::CreatedAt)
|
||||
.paginate(&self.db, page_size);
|
||||
|
||||
let total = paginator.num_items().await?;
|
||||
let items = paginator.fetch_page(page - 1).await?;
|
||||
|
||||
Ok((items, total))
|
||||
}
|
||||
|
||||
pub async fn get_document(
|
||||
&self,
|
||||
tenant_id: Uuid,
|
||||
id: Uuid,
|
||||
) -> AiResult<ai_knowledge_documents::Model> {
|
||||
ai_knowledge_documents::Entity::find_by_id(id)
|
||||
.one(&self.db)
|
||||
.await
|
||||
.map_err(|e| AiError::DbError(e.to_string()))?
|
||||
.filter(|m| m.tenant_id == tenant_id && m.deleted_at.is_none())
|
||||
.ok_or_else(|| AiError::KnowledgeError("文档不存在".into()))
|
||||
}
|
||||
|
||||
/// 创建手动输入文档并立即处理
|
||||
pub async fn create_manual_document(
|
||||
&self,
|
||||
tenant_id: Uuid,
|
||||
user_id: Uuid,
|
||||
kb_id: Uuid,
|
||||
req: CreateDocumentReq,
|
||||
) -> AiResult<Uuid> {
|
||||
// 验证知识库存在
|
||||
self.knowledge_v2.get_by_id(tenant_id, kb_id).await?;
|
||||
|
||||
let id = Uuid::now_v7();
|
||||
let now = chrono::Utc::now();
|
||||
|
||||
let active = ai_knowledge_documents::ActiveModel {
|
||||
id: Set(id),
|
||||
tenant_id: Set(tenant_id),
|
||||
knowledge_base_id: Set(kb_id),
|
||||
title: Set(req.title),
|
||||
doc_type: Set(req.doc_type.unwrap_or_else(|| "manual".into())),
|
||||
source_type: Set(req.source_type.unwrap_or_else(|| "manual".into())),
|
||||
source_url: Set(req.source_url),
|
||||
file_name: Set(None),
|
||||
file_size: Set(None),
|
||||
file_mime_type: Set(None),
|
||||
content: Set(req.content),
|
||||
status: Set("pending".into()),
|
||||
chunk_count: Set(0),
|
||||
embedded_count: Set(0),
|
||||
error_message: Set(None),
|
||||
processing_started_at: Set(None),
|
||||
processing_completed_at: Set(None),
|
||||
created_at: Set(now),
|
||||
updated_at: Set(now),
|
||||
created_by: Set(Some(user_id)),
|
||||
updated_by: Set(Some(user_id)),
|
||||
deleted_at: Set(None),
|
||||
version_lock: Set(1),
|
||||
};
|
||||
|
||||
ai_knowledge_documents::Entity::insert(active)
|
||||
.exec(&self.db)
|
||||
.await
|
||||
.map_err(|e| AiError::DbError(e.to_string()))?;
|
||||
|
||||
// 异步处理文档(切片 + 嵌入)
|
||||
self.knowledge_v2.increment_document_count(kb_id, 1).await?;
|
||||
self.process_document(id).await?;
|
||||
|
||||
Ok(id)
|
||||
}
|
||||
|
||||
/// 创建文件上传文档记录
|
||||
pub async fn create_upload_document(
|
||||
&self,
|
||||
tenant_id: Uuid,
|
||||
user_id: Uuid,
|
||||
kb_id: Uuid,
|
||||
title: String,
|
||||
params: UploadDocumentParams,
|
||||
) -> AiResult<Uuid> {
|
||||
self.knowledge_v2.get_by_id(tenant_id, kb_id).await?;
|
||||
|
||||
let id = Uuid::now_v7();
|
||||
let now = chrono::Utc::now();
|
||||
|
||||
let doc_type = mime_to_doc_type(¶ms.mime_type);
|
||||
|
||||
let active = ai_knowledge_documents::ActiveModel {
|
||||
id: Set(id),
|
||||
tenant_id: Set(tenant_id),
|
||||
knowledge_base_id: Set(kb_id),
|
||||
title: Set(title),
|
||||
doc_type: Set(doc_type),
|
||||
source_type: Set("upload".into()),
|
||||
source_url: Set(None),
|
||||
file_name: Set(Some(params.file_name)),
|
||||
file_size: Set(Some(params.file_size)),
|
||||
file_mime_type: Set(Some(params.mime_type)),
|
||||
content: Set(Some(params.content)),
|
||||
status: Set("pending".into()),
|
||||
chunk_count: Set(0),
|
||||
embedded_count: Set(0),
|
||||
error_message: Set(None),
|
||||
processing_started_at: Set(None),
|
||||
processing_completed_at: Set(None),
|
||||
created_at: Set(now),
|
||||
updated_at: Set(now),
|
||||
created_by: Set(Some(user_id)),
|
||||
updated_by: Set(Some(user_id)),
|
||||
deleted_at: Set(None),
|
||||
version_lock: Set(1),
|
||||
};
|
||||
|
||||
ai_knowledge_documents::Entity::insert(active)
|
||||
.exec(&self.db)
|
||||
.await
|
||||
.map_err(|e| AiError::DbError(e.to_string()))?;
|
||||
|
||||
self.knowledge_v2.increment_document_count(kb_id, 1).await?;
|
||||
self.process_document(id).await?;
|
||||
|
||||
Ok(id)
|
||||
}
|
||||
|
||||
pub async fn delete_document(&self, tenant_id: Uuid, kb_id: Uuid, id: Uuid) -> AiResult<()> {
|
||||
let existing = self.get_document(tenant_id, id).await?;
|
||||
if existing.knowledge_base_id != kb_id {
|
||||
return Err(AiError::KnowledgeError("文档不属于该知识库".into()));
|
||||
}
|
||||
|
||||
let now = chrono::Utc::now();
|
||||
let active = ai_knowledge_documents::ActiveModel {
|
||||
id: Set(existing.id),
|
||||
tenant_id: Set(existing.tenant_id),
|
||||
knowledge_base_id: Set(existing.knowledge_base_id),
|
||||
title: Set(existing.title),
|
||||
doc_type: Set(existing.doc_type),
|
||||
source_type: Set(existing.source_type),
|
||||
source_url: Set(existing.source_url),
|
||||
file_name: Set(existing.file_name),
|
||||
file_size: Set(existing.file_size),
|
||||
file_mime_type: Set(existing.file_mime_type),
|
||||
content: Set(existing.content),
|
||||
status: Set(existing.status),
|
||||
chunk_count: Set(existing.chunk_count),
|
||||
embedded_count: Set(existing.embedded_count),
|
||||
error_message: Set(existing.error_message),
|
||||
processing_started_at: Set(existing.processing_started_at),
|
||||
processing_completed_at: Set(existing.processing_completed_at),
|
||||
created_at: Set(existing.created_at),
|
||||
updated_at: Set(now),
|
||||
created_by: Set(existing.created_by),
|
||||
updated_by: Set(existing.updated_by),
|
||||
deleted_at: Set(Some(now)),
|
||||
version_lock: Set(existing.version_lock + 1),
|
||||
};
|
||||
|
||||
ai_knowledge_documents::Entity::update(active)
|
||||
.exec(&self.db)
|
||||
.await
|
||||
.map_err(|e| AiError::DbError(e.to_string()))?;
|
||||
|
||||
self.knowledge_v2
|
||||
.increment_document_count(kb_id, -1)
|
||||
.await?;
|
||||
self.knowledge_v2
|
||||
.increment_chunk_count(kb_id, -existing.chunk_count)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// 处理文档:切片 → 嵌入 → 更新状态
|
||||
async fn process_document(&self, doc_id: Uuid) -> AiResult<()> {
|
||||
let now = chrono::Utc::now();
|
||||
|
||||
// 标记处理中
|
||||
self.update_doc_status(doc_id, "processing", None, Some(now), None)
|
||||
.await?;
|
||||
|
||||
let doc = match ai_knowledge_documents::Entity::find_by_id(doc_id)
|
||||
.one(&self.db)
|
||||
.await
|
||||
{
|
||||
Ok(Some(d)) if d.deleted_at.is_none() => d,
|
||||
_ => {
|
||||
self.update_doc_status(
|
||||
doc_id,
|
||||
"failed",
|
||||
Some("文档未找到".into()),
|
||||
None,
|
||||
Some(now),
|
||||
)
|
||||
.await?;
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
let content = match &doc.content {
|
||||
Some(c) if !c.trim().is_empty() => c.clone(),
|
||||
_ => {
|
||||
self.update_doc_status(
|
||||
doc_id,
|
||||
"failed",
|
||||
Some("文档内容为空".into()),
|
||||
None,
|
||||
Some(now),
|
||||
)
|
||||
.await?;
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
|
||||
// 切片
|
||||
let chunks = chunker::chunk_text(&content, 500, 50);
|
||||
if chunks.is_empty() {
|
||||
self.update_doc_status(
|
||||
doc_id,
|
||||
"failed",
|
||||
Some("切片结果为空".into()),
|
||||
None,
|
||||
Some(now),
|
||||
)
|
||||
.await?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// 嵌入 + 存储
|
||||
let mut embedded_count = 0u32;
|
||||
for (idx, chunk_content) in chunks.iter().enumerate() {
|
||||
let chunk_id = Uuid::now_v7();
|
||||
let embedding = self.try_embed(chunk_content).await;
|
||||
|
||||
let embedding_val = embedding
|
||||
.as_ref()
|
||||
.map(|e| sea_orm::Value::String(Some(Box::new(format_vector(e)))))
|
||||
.unwrap_or(sea_orm::Value::String(None));
|
||||
|
||||
let sql = r#"
|
||||
INSERT INTO ai_knowledge_chunks
|
||||
(id, tenant_id, knowledge_base_id, document_id, chunk_index, content,
|
||||
embedding, metadata, hit_count, created_at, updated_at, created_by, updated_by, deleted_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7::vector, '{}', 0, $8, $8, $9, $9, NULL)
|
||||
"#;
|
||||
|
||||
let stmt = sea_orm::Statement::from_sql_and_values(
|
||||
sea_orm::DatabaseBackend::Postgres,
|
||||
sql,
|
||||
[
|
||||
sea_orm::Value::from(chunk_id),
|
||||
sea_orm::Value::from(doc.tenant_id),
|
||||
sea_orm::Value::from(doc.knowledge_base_id),
|
||||
sea_orm::Value::from(doc_id),
|
||||
sea_orm::Value::from(idx as i32),
|
||||
sea_orm::Value::String(Some(Box::new(chunk_content.clone()))),
|
||||
embedding_val,
|
||||
sea_orm::Value::from(now),
|
||||
sea_orm::Value::from(doc.created_by),
|
||||
],
|
||||
);
|
||||
|
||||
match self.db.execute(stmt).await {
|
||||
Ok(_) => {
|
||||
if embedding.is_some() {
|
||||
embedded_count += 1;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(chunk_index = idx, error = %e, "切片插入失败,跳过");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 更新文档状态
|
||||
let completed_now = chrono::Utc::now();
|
||||
let sql = r#"
|
||||
UPDATE ai_knowledge_documents
|
||||
SET status = 'completed', chunk_count = $2, embedded_count = $3,
|
||||
processing_completed_at = $4, updated_at = $4, version_lock = version_lock + 1
|
||||
WHERE id = $1 AND deleted_at IS NULL
|
||||
"#;
|
||||
let stmt = sea_orm::Statement::from_sql_and_values(
|
||||
sea_orm::DatabaseBackend::Postgres,
|
||||
sql,
|
||||
[
|
||||
sea_orm::Value::from(doc_id),
|
||||
sea_orm::Value::from(chunks.len() as i32),
|
||||
sea_orm::Value::from(embedded_count as i32),
|
||||
sea_orm::Value::from(completed_now),
|
||||
],
|
||||
);
|
||||
self.db
|
||||
.execute(stmt)
|
||||
.await
|
||||
.map_err(|e| AiError::DbError(e.to_string()))?;
|
||||
|
||||
// 原子递增知识库切片计数
|
||||
self.knowledge_v2
|
||||
.increment_chunk_count(doc.knowledge_base_id, chunks.len() as i32)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn update_doc_status(
|
||||
&self,
|
||||
doc_id: Uuid,
|
||||
status: &str,
|
||||
error: Option<String>,
|
||||
started_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
completed_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
) -> AiResult<()> {
|
||||
let now = chrono::Utc::now();
|
||||
let mut values: Vec<sea_orm::Value> = vec![
|
||||
sea_orm::Value::from(doc_id),
|
||||
sea_orm::Value::String(Some(Box::new(status.to_string()))),
|
||||
error
|
||||
.map(|e| sea_orm::Value::String(Some(Box::new(e))))
|
||||
.unwrap_or(sea_orm::Value::String(None)),
|
||||
sea_orm::Value::from(now),
|
||||
];
|
||||
|
||||
let mut extra_sql = String::new();
|
||||
if let Some(sa) = started_at {
|
||||
values.push(sea_orm::Value::from(sa));
|
||||
extra_sql.push_str(", processing_started_at = $5");
|
||||
}
|
||||
if let Some(ca) = completed_at {
|
||||
values.push(sea_orm::Value::from(ca));
|
||||
let idx = values.len();
|
||||
extra_sql.push_str(&format!(", processing_completed_at = ${}", idx));
|
||||
}
|
||||
|
||||
let sql = format!(
|
||||
"UPDATE ai_knowledge_documents SET status = $2, error_message = $3, updated_at = $4, version_lock = version_lock + 1{} WHERE id = $1 AND deleted_at IS NULL",
|
||||
extra_sql
|
||||
);
|
||||
|
||||
let stmt = sea_orm::Statement::from_sql_and_values(
|
||||
sea_orm::DatabaseBackend::Postgres,
|
||||
&sql,
|
||||
values,
|
||||
);
|
||||
self.db
|
||||
.execute(stmt)
|
||||
.await
|
||||
.map_err(|e| AiError::DbError(e.to_string()))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn try_embed(&self, text: &str) -> Option<Vec<f32>> {
|
||||
if !self.embedding.is_configured() {
|
||||
return None;
|
||||
}
|
||||
match self.embedding.embed(text).await {
|
||||
Ok(e) => Some(e),
|
||||
Err(e) => {
|
||||
tracing::warn!(error = %e, "Embedding 生成失败");
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn mime_to_doc_type(mime: &str) -> String {
|
||||
match mime {
|
||||
"application/pdf" => "pdf".into(),
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => "docx".into(),
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => "xlsx".into(),
|
||||
"text/plain" => "txt".into(),
|
||||
"text/markdown" => "md".into(),
|
||||
_ => "other".into(),
|
||||
}
|
||||
}
|
||||
60
crates/erp-ai/src/service/document/parser.rs
Normal file
60
crates/erp-ai/src/service/document/parser.rs
Normal file
@@ -0,0 +1,60 @@
|
||||
use crate::error::{AiError, AiResult};
|
||||
|
||||
/// 从文件内容解析出纯文本
|
||||
pub fn parse_document(file_name: &str, mime_type: &str, data: &[u8]) -> AiResult<String> {
|
||||
match mime_type {
|
||||
"application/pdf" => parse_pdf(data),
|
||||
"text/plain" | "text/markdown" => parse_text(data),
|
||||
_ => {
|
||||
if file_name.ends_with(".pdf") {
|
||||
return parse_pdf(data);
|
||||
}
|
||||
// DOCX/XLSX 等二进制格式用 UTF-8 lossy 提取可读文本
|
||||
// 后续 Phase 可替换为专业解析器
|
||||
if file_name.ends_with(".txt") || file_name.ends_with(".md") {
|
||||
return parse_text(data);
|
||||
}
|
||||
// 二进制格式兜底:提取 UTF-8 可读片段
|
||||
parse_binary_text(data)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_pdf(data: &[u8]) -> AiResult<String> {
|
||||
pdf_extract::extract_text_from_mem(data)
|
||||
.map(|t| t.trim().to_string())
|
||||
.map_err(|e| AiError::KnowledgeError(format!("PDF 解析失败: {}", e)))
|
||||
}
|
||||
|
||||
fn parse_text(data: &[u8]) -> AiResult<String> {
|
||||
Ok(String::from_utf8_lossy(data).trim().to_string())
|
||||
}
|
||||
|
||||
/// 从二进制文件中提取可读文本片段(DOCX/XLSX 兜底方案)
|
||||
fn parse_binary_text(data: &[u8]) -> AiResult<String> {
|
||||
let text = String::from_utf8_lossy(data);
|
||||
let mut readable = String::new();
|
||||
let mut chunk = String::new();
|
||||
|
||||
for ch in text.chars() {
|
||||
let punctuation = ",。、;:\u{201c}\u{201d}\u{2018}\u{2019}!?()《》【】…—·\t\n\r";
|
||||
if ch.is_alphanumeric() || ch.is_whitespace() || punctuation.contains(ch) {
|
||||
chunk.push(ch);
|
||||
} else if !chunk.trim().is_empty() {
|
||||
readable.push_str(chunk.trim());
|
||||
readable.push(' ');
|
||||
chunk.clear();
|
||||
}
|
||||
}
|
||||
if !chunk.trim().is_empty() {
|
||||
readable.push_str(chunk.trim());
|
||||
}
|
||||
|
||||
let result = readable.split_whitespace().collect::<Vec<_>>().join(" ");
|
||||
if result.len() < 20 {
|
||||
return Err(AiError::KnowledgeError(
|
||||
"无法从文件中提取有效文本内容".into(),
|
||||
));
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
@@ -255,4 +255,82 @@ impl KnowledgeV2Service {
|
||||
.map_err(|e| AiError::DbError(e.to_string()))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// 向量相似度搜索:在指定知识库中搜索与 query_embedding 最相似的 top_k 个切片
|
||||
pub async fn vector_search(
|
||||
&self,
|
||||
tenant_id: Uuid,
|
||||
kb_id: Uuid,
|
||||
query_embedding: &[f32],
|
||||
top_k: i64,
|
||||
) -> AiResult<Vec<SearchHit>> {
|
||||
let vector_str = crate::service::embedding::format_vector(query_embedding);
|
||||
let sql = r#"
|
||||
SELECT c.id, c.document_id, c.chunk_index, c.content, c.metadata,
|
||||
d.title AS doc_title,
|
||||
1 - (c.embedding <=> $3::vector) AS similarity
|
||||
FROM ai_knowledge_chunks c
|
||||
JOIN ai_knowledge_documents d ON d.id = c.document_id
|
||||
WHERE c.tenant_id = $1
|
||||
AND c.knowledge_base_id = $2
|
||||
AND c.deleted_at IS NULL
|
||||
AND d.deleted_at IS NULL
|
||||
AND c.embedding IS NOT NULL
|
||||
ORDER BY c.embedding <=> $3::vector
|
||||
LIMIT $4
|
||||
"#;
|
||||
let stmt = sea_orm::Statement::from_sql_and_values(
|
||||
sea_orm::DatabaseBackend::Postgres,
|
||||
sql,
|
||||
[
|
||||
sea_orm::Value::from(tenant_id),
|
||||
sea_orm::Value::from(kb_id),
|
||||
sea_orm::Value::String(Some(Box::new(vector_str))),
|
||||
sea_orm::Value::from(top_k),
|
||||
],
|
||||
);
|
||||
|
||||
let rows: Vec<SearchHitRow> = sea_orm::FromQueryResult::find_by_statement(stmt)
|
||||
.all(&self.db)
|
||||
.await
|
||||
.map_err(|e| AiError::DbError(e.to_string()))?;
|
||||
|
||||
Ok(rows.into_iter().map(SearchHit::from).collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, sea_orm::FromQueryResult)]
|
||||
struct SearchHitRow {
|
||||
id: Uuid,
|
||||
document_id: Uuid,
|
||||
chunk_index: i32,
|
||||
content: String,
|
||||
metadata: serde_json::Value,
|
||||
doc_title: String,
|
||||
similarity: f64,
|
||||
}
|
||||
|
||||
#[derive(Debug, serde::Serialize)]
|
||||
pub struct SearchHit {
|
||||
pub chunk_id: Uuid,
|
||||
pub document_id: Uuid,
|
||||
pub chunk_index: i32,
|
||||
pub content: String,
|
||||
pub doc_title: String,
|
||||
pub similarity: f64,
|
||||
pub metadata: serde_json::Value,
|
||||
}
|
||||
|
||||
impl From<SearchHitRow> for SearchHit {
|
||||
fn from(row: SearchHitRow) -> Self {
|
||||
Self {
|
||||
chunk_id: row.id,
|
||||
document_id: row.document_id,
|
||||
chunk_index: row.chunk_index,
|
||||
content: row.content,
|
||||
doc_title: row.doc_title,
|
||||
similarity: row.similarity,
|
||||
metadata: row.metadata,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,8 @@ use crate::service::analysis::AnalysisService;
|
||||
use crate::service::cache::CacheService;
|
||||
use crate::service::chat_message::ChatMessageService;
|
||||
use crate::service::chat_session::ChatSessionService;
|
||||
use crate::service::document::DocumentService;
|
||||
use crate::service::embedding::EmbeddingService;
|
||||
use crate::service::feature_flag_service::FeatureFlagService;
|
||||
use crate::service::insight_service::InsightService;
|
||||
use crate::service::knowledge::KnowledgeService;
|
||||
@@ -36,6 +38,8 @@ pub struct AiState {
|
||||
pub feature_flags: Arc<FeatureFlagService>,
|
||||
pub knowledge: Arc<KnowledgeService>,
|
||||
pub knowledge_v2: Arc<KnowledgeV2Service>,
|
||||
pub document: Arc<DocumentService>,
|
||||
pub embedding: Arc<EmbeddingService>,
|
||||
pub chat_session: Arc<ChatSessionService>,
|
||||
pub chat_message: Arc<ChatMessageService>,
|
||||
}
|
||||
|
||||
@@ -594,6 +594,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
cache_ttl,
|
||||
));
|
||||
|
||||
let embedding_svc = std::sync::Arc::new(
|
||||
erp_ai::service::embedding::EmbeddingService::from_settings(&db).await,
|
||||
);
|
||||
let knowledge_v2_svc = std::sync::Arc::new(
|
||||
erp_ai::service::knowledge_v2::KnowledgeV2Service::new(db.clone()),
|
||||
);
|
||||
|
||||
erp_ai::AiState {
|
||||
db: db.clone(),
|
||||
event_bus: event_bus.clone(),
|
||||
@@ -612,13 +619,15 @@ async fn main() -> anyhow::Result<()> {
|
||||
),
|
||||
knowledge: std::sync::Arc::new(erp_ai::service::knowledge::KnowledgeService::new(
|
||||
db.clone(),
|
||||
std::sync::Arc::new(
|
||||
erp_ai::service::embedding::EmbeddingService::from_settings(&db).await,
|
||||
),
|
||||
embedding_svc.clone(),
|
||||
)),
|
||||
knowledge_v2: std::sync::Arc::new(
|
||||
erp_ai::service::knowledge_v2::KnowledgeV2Service::new(db.clone()),
|
||||
),
|
||||
knowledge_v2: knowledge_v2_svc.clone(),
|
||||
document: std::sync::Arc::new(erp_ai::service::document::DocumentService::new(
|
||||
db.clone(),
|
||||
knowledge_v2_svc,
|
||||
embedding_svc.clone(),
|
||||
)),
|
||||
embedding: embedding_svc,
|
||||
chat_session: std::sync::Arc::new(
|
||||
erp_ai::service::chat_session::ChatSessionService::new(db.clone()),
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user