fix(安全): 修复HTML导出中的XSS漏洞并清理调试日志
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled
refactor(日志): 替换console.log为tracing日志系统 style(代码): 移除未使用的代码和依赖项 feat(测试): 添加端到端测试文档和CI工作流 docs(变更日志): 更新CHANGELOG.md记录0.1.0版本变更 perf(构建): 更新依赖版本并优化CI流程
This commit is contained in:
@@ -3,11 +3,35 @@
|
||||
//! Provides TF-IDF based semantic similarity computation for memory retrieval.
|
||||
//! This is a lightweight, dependency-free implementation suitable for
|
||||
//! medium-scale memory systems.
|
||||
//!
|
||||
//! Supports optional embedding API integration for improved semantic search.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::Arc;
|
||||
use crate::types::MemoryEntry;
|
||||
|
||||
/// Semantic similarity scorer using TF-IDF
|
||||
/// Embedding client trait for API integration
|
||||
#[async_trait::async_trait]
|
||||
pub trait EmbeddingClient: Send + Sync {
|
||||
async fn embed(&self, text: &str) -> Result<Vec<f32>, String>;
|
||||
fn is_available(&self) -> bool;
|
||||
}
|
||||
|
||||
/// No-op embedding client (uses TF-IDF only)
|
||||
pub struct NoOpEmbeddingClient;
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl EmbeddingClient for NoOpEmbeddingClient {
|
||||
async fn embed(&self, _text: &str) -> Result<Vec<f32>, String> {
|
||||
Err("Embedding not configured".to_string())
|
||||
}
|
||||
|
||||
fn is_available(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Semantic similarity scorer using TF-IDF with optional embedding support
|
||||
pub struct SemanticScorer {
|
||||
/// Document frequency for IDF computation
|
||||
document_frequencies: HashMap<String, usize>,
|
||||
@@ -15,8 +39,14 @@ pub struct SemanticScorer {
|
||||
total_documents: usize,
|
||||
/// Precomputed TF-IDF vectors for entries
|
||||
entry_vectors: HashMap<String, HashMap<String, f32>>,
|
||||
/// Precomputed embedding vectors for entries
|
||||
entry_embeddings: HashMap<String, Vec<f32>>,
|
||||
/// Stop words to ignore
|
||||
stop_words: HashSet<String>,
|
||||
/// Optional embedding client
|
||||
embedding_client: Arc<dyn EmbeddingClient>,
|
||||
/// Whether to use embedding for similarity
|
||||
use_embedding: bool,
|
||||
}
|
||||
|
||||
impl SemanticScorer {
|
||||
@@ -26,10 +56,41 @@ impl SemanticScorer {
|
||||
document_frequencies: HashMap::new(),
|
||||
total_documents: 0,
|
||||
entry_vectors: HashMap::new(),
|
||||
entry_embeddings: HashMap::new(),
|
||||
stop_words: Self::default_stop_words(),
|
||||
embedding_client: Arc::new(NoOpEmbeddingClient),
|
||||
use_embedding: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new semantic scorer with embedding client
|
||||
pub fn with_embedding(client: Arc<dyn EmbeddingClient>) -> Self {
|
||||
Self {
|
||||
document_frequencies: HashMap::new(),
|
||||
total_documents: 0,
|
||||
entry_vectors: HashMap::new(),
|
||||
entry_embeddings: HashMap::new(),
|
||||
stop_words: Self::default_stop_words(),
|
||||
embedding_client: client,
|
||||
use_embedding: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set whether to use embedding for similarity
|
||||
pub fn set_use_embedding(&mut self, use_embedding: bool) {
|
||||
self.use_embedding = use_embedding && self.embedding_client.is_available();
|
||||
}
|
||||
|
||||
/// Check if embedding is available
|
||||
pub fn is_embedding_available(&self) -> bool {
|
||||
self.embedding_client.is_available()
|
||||
}
|
||||
|
||||
/// Get the embedding client
|
||||
pub fn get_embedding_client(&self) -> Arc<dyn EmbeddingClient> {
|
||||
self.embedding_client.clone()
|
||||
}
|
||||
|
||||
/// Get default stop words
|
||||
fn default_stop_words() -> HashSet<String> {
|
||||
[
|
||||
@@ -132,9 +193,34 @@ impl SemanticScorer {
|
||||
self.entry_vectors.insert(entry.uri.clone(), tfidf);
|
||||
}
|
||||
|
||||
/// Index an entry with embedding (async)
|
||||
pub async fn index_entry_with_embedding(&mut self, entry: &MemoryEntry) {
|
||||
// First do TF-IDF indexing
|
||||
self.index_entry(entry);
|
||||
|
||||
// Then compute embedding if available
|
||||
if self.use_embedding && self.embedding_client.is_available() {
|
||||
let text_to_embed = if !entry.keywords.is_empty() {
|
||||
format!("{} {}", entry.content, entry.keywords.join(" "))
|
||||
} else {
|
||||
entry.content.clone()
|
||||
};
|
||||
|
||||
match self.embedding_client.embed(&text_to_embed).await {
|
||||
Ok(embedding) => {
|
||||
self.entry_embeddings.insert(entry.uri.clone(), embedding);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("[SemanticScorer] Failed to compute embedding for {}: {}", entry.uri, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove an entry from the index
|
||||
pub fn remove_entry(&mut self, uri: &str) {
|
||||
self.entry_vectors.remove(uri);
|
||||
self.entry_embeddings.remove(uri);
|
||||
}
|
||||
|
||||
/// Compute cosine similarity between two vectors
|
||||
@@ -167,6 +253,57 @@ impl SemanticScorer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute cosine similarity between two embedding vectors.
///
/// Returns a value in `[0.0, 1.0]`; negative cosines are clamped to `0.0`.
///
/// Returns `0.0` when either vector is empty, when the lengths differ, when
/// either vector has zero norm, or when the result is not finite (for
/// example because an input contains `NaN` or an infinity).
fn cosine_similarity_embedding(v1: &[f32], v2: &[f32]) -> f32 {
    // Equal lengths plus a non-empty `v1` imply a non-empty `v2`.
    if v1.is_empty() || v1.len() != v2.len() {
        return 0.0;
    }

    // Accumulate in f64: squaring f32 components (and multiplying the two
    // squared norms) can underflow to zero or overflow to infinity in f32,
    // which would report 0.0 or NaN for perfectly valid vectors.
    let (dot_product, norm1, norm2) = v1.iter().zip(v2).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(dot, n1, n2), (&a, &b)| {
            let (a, b) = (f64::from(a), f64::from(b));
            (dot + a * b, n1 + a * a, n2 + b * b)
        },
    );

    // Multiply the roots rather than taking the root of the product so the
    // intermediate value stays in range.
    let denom = norm1.sqrt() * norm2.sqrt();
    if denom == 0.0 || !denom.is_finite() || !dot_product.is_finite() {
        return 0.0;
    }

    // The ratio lies in [-1, 1] up to rounding, so narrowing to f32 is safe.
    ((dot_product / denom) as f32).clamp(0.0, 1.0)
}
|
||||
|
||||
/// Score similarity between query and entry using embedding (async)
|
||||
pub async fn score_similarity_with_embedding(&self, query: &str, entry: &MemoryEntry) -> f32 {
|
||||
// If we have precomputed embedding for this entry and embedding is enabled
|
||||
if self.use_embedding && self.embedding_client.is_available() {
|
||||
if let Some(entry_embedding) = self.entry_embeddings.get(&entry.uri) {
|
||||
// Compute query embedding
|
||||
match self.embedding_client.embed(query).await {
|
||||
Ok(query_embedding) => {
|
||||
let embedding_score = Self::cosine_similarity_embedding(&query_embedding, entry_embedding);
|
||||
|
||||
// Also compute TF-IDF score for hybrid approach
|
||||
let tfidf_score = self.score_similarity(query, entry);
|
||||
|
||||
// Weighted combination: 70% embedding, 30% TF-IDF
|
||||
return embedding_score * 0.7 + tfidf_score * 0.3;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::debug!("[SemanticScorer] Failed to embed query: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to TF-IDF
|
||||
self.score_similarity(query, entry)
|
||||
}
|
||||
|
||||
/// Score similarity between query and entry
|
||||
pub fn score_similarity(&self, query: &str, entry: &MemoryEntry) -> f32 {
|
||||
// Tokenize query
|
||||
@@ -246,6 +383,7 @@ impl SemanticScorer {
|
||||
self.document_frequencies.clear();
|
||||
self.total_documents = 0;
|
||||
self.entry_vectors.clear();
|
||||
self.entry_embeddings.clear();
|
||||
}
|
||||
|
||||
/// Get statistics about the index
|
||||
@@ -254,6 +392,8 @@ impl SemanticScorer {
|
||||
total_documents: self.total_documents,
|
||||
unique_terms: self.document_frequencies.len(),
|
||||
indexed_entries: self.entry_vectors.len(),
|
||||
embedding_entries: self.entry_embeddings.len(),
|
||||
use_embedding: self.use_embedding && self.embedding_client.is_available(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -270,6 +410,8 @@ pub struct IndexStats {
|
||||
pub total_documents: usize,
|
||||
pub unique_terms: usize,
|
||||
pub indexed_entries: usize,
|
||||
pub embedding_entries: usize,
|
||||
pub use_embedding: bool,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
Reference in New Issue
Block a user