//! 知识条目分块 + Embedding 生成 Worker //! //! 当知识条目创建/更新时触发: //! 1. 按 Markdown 标题 + 固定长度分块 //! 2. 提取关键词(从 item 的 keywords 字段继承 + 内容提取) //! 3. 写入 knowledge_chunks 表 //! 4. 如果配置了 embedding provider,生成向量 embedding(Phase 2) use async_trait::async_trait; use sqlx::PgPool; use serde::{Serialize, Deserialize}; use crate::error::SaasResult; use super::Worker; #[derive(Debug, Serialize, Deserialize)] pub struct GenerateEmbeddingArgs { pub item_id: String, } pub struct GenerateEmbeddingWorker; #[async_trait] impl Worker for GenerateEmbeddingWorker { type Args = GenerateEmbeddingArgs; fn name(&self) -> &str { "generate_embedding" } async fn perform(&self, db: &PgPool, args: Self::Args) -> SaasResult<()> { // 1. 加载条目 let item: Option<(String, String, Vec)> = sqlx::query_as( "SELECT content, title, keywords FROM knowledge_items WHERE id = $1" ) .bind(&args.item_id) .fetch_optional(db) .await?; let (content, title, keywords) = match item { Some(row) => row, None => { tracing::warn!("GenerateEmbedding: item {} not found, skipping", args.item_id); return Ok(()); } }; // 2. 分块 let chunks = crate::knowledge::service::chunk_content(&content, 512, 64); if chunks.is_empty() { tracing::debug!("GenerateEmbedding: item {} has no content to chunk", args.item_id); return Ok(()); } // 3. 在事务中删除旧分块 + 插入新分块(防止并发竞争条件) let mut tx = db.begin().await?; // 锁定条目行防止并发 worker 同时处理同一条目 let locked: Option<(String,)> = sqlx::query_as( "SELECT id FROM knowledge_items WHERE id = $1 FOR UPDATE" ) .bind(&args.item_id) .fetch_optional(&mut *tx) .await?; if locked.is_none() { tx.rollback().await?; tracing::warn!("GenerateEmbedding: item {} was deleted during processing", args.item_id); return Ok(()); } sqlx::query("DELETE FROM knowledge_chunks WHERE item_id = $1") .bind(&args.item_id) .execute(&mut *tx) .await?; for (idx, chunk) in chunks.iter().enumerate() { let chunk_id = uuid::Uuid::new_v4().to_string(); let mut chunk_keywords = keywords.clone(); extract_keywords_from_text(chunk, &mut chunk_keywords); sqlx::query( "INSERT INTO knowledge_chunks (id, item_id, chunk_index, content, keywords, created_at) VALUES ($1, $2, $3, $4, $5, NOW())" ) .bind(&chunk_id) .bind(&args.item_id) .bind(idx as i32) .bind(chunk) .bind(&chunk_keywords) .execute(&mut *tx) .await?; } tx.commit().await?; tracing::info!( "GenerateEmbedding: item '{}' → {} chunks (keywords: {})", title, chunks.len(), keywords.len(), ); // NOTE (Phase 2 - deferred): Embedding vector generation is not yet implemented. // When pgvector is configured with an embedding provider, uncomment and implement: // generate_vectors(db, &args.item_id, &chunks).await // This will call the configured embedding API and update knowledge_chunks.embedding column. // See: docs/memory_pipeline_embedding_fix.md for the embedding configuration steps. Ok(()) } } /// 从 chunk 内容中提取高频中文词组作为补充关键词(公开,供 distill_knowledge worker 复用) pub fn extract_keywords_from_text(content: &str, keywords: &mut Vec) { let chars: Vec = content.chars().collect(); let mut i = 0; while i < chars.len() { // 寻找连续中文字符段 if is_cjk(chars[i]) { let start = i; while i < chars.len() && is_cjk(chars[i]) { i += 1; } let segment: String = chars[start..i].iter().collect(); // 提取 2-4 字的子串 let seg_chars: Vec = segment.chars().collect(); if seg_chars.len() >= 2 { // 只取前 2-4 字的短语(避免过长无意义词组) for len in 2..=4.min(seg_chars.len()) { let phrase: String = seg_chars[..len].iter().collect(); // 过滤常见停用词(简单版) if !is_stop_word(&phrase) && !keywords.contains(&phrase) { keywords.push(phrase); } } } } else { i += 1; } } // 限制关键词总数 keywords.truncate(50); } /// 判断是否为 CJK 字符 fn is_cjk(c: char) -> bool { matches!(c, '\u{4E00}'..='\u{9FFF}' | // CJK Unified Ideographs '\u{3400}'..='\u{4DBF}' | // CJK Unified Ideographs Extension A '\u{F900}'..='\u{FAFF}' // CJK Compatibility Ideographs ) } /// 简单停用词表 fn is_stop_word(s: &str) -> bool { matches!(s, "的" | "了" | "是" | "在" | "我" | "有" | "和" | "就" | "不" | "人" | "都" | "一" | "一个" | "上" | "也" | "很" | "到" | "说" | "要" | "去" | "你" | "会" | "着" | "没有" | "看" | "好" | "自己" | "这" | "他" | "她" | "它" | "们" | "那" | "些" | "什么" | "为" | "所以" | "但是" | "因为" | "如果" | "可以" | "能够" | "需要" | "应该" | "已经" | "还是" | "或者" ) }