fix(安全): 修复HTML导出中的XSS漏洞并清理调试日志
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

refactor(日志): 替换console.log为tracing日志系统
style(代码): 移除未使用的代码和依赖项

feat(测试): 添加端到端测试文档和CI工作流
docs(变更日志): 更新CHANGELOG.md记录0.1.0版本变更

perf(构建): 更新依赖版本并优化CI流程
This commit is contained in:
iven
2026-03-26 19:49:03 +08:00
parent b8d565a9eb
commit 978dc5cdd8
79 changed files with 3953 additions and 5724 deletions

View File

@@ -3,11 +3,35 @@
//! Provides TF-IDF based semantic similarity computation for memory retrieval.
//! This is a lightweight, dependency-free implementation suitable for
//! medium-scale memory systems.
//!
//! Supports optional embedding API integration for improved semantic search.
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use crate::types::MemoryEntry;
/// Semantic similarity scorer using TF-IDF
/// Embedding client trait for API integration
///
/// Implementations wrap an embedding backend and turn text into a
/// dense `f32` vector. The trait is used as `Arc<dyn EmbeddingClient>`
/// by the scorer, hence the `Send + Sync` bounds.
#[async_trait::async_trait]
pub trait EmbeddingClient: Send + Sync {
/// Compute an embedding vector for `text`; an error string describes failure.
async fn embed(&self, text: &str) -> Result<Vec<f32>, String>;
/// Whether the backing embedding service is configured and usable.
fn is_available(&self) -> bool;
}
/// Fallback client used when no embedding backend is configured.
///
/// `embed` always fails, which keeps callers on the pure TF-IDF path.
pub struct NoOpEmbeddingClient;

#[async_trait::async_trait]
impl EmbeddingClient for NoOpEmbeddingClient {
    /// Always errors: there is no backend to produce vectors.
    async fn embed(&self, _text: &str) -> Result<Vec<f32>, String> {
        Err(String::from("Embedding not configured"))
    }

    /// This client never reports itself available.
    fn is_available(&self) -> bool {
        false
    }
}
/// Semantic similarity scorer using TF-IDF with optional embedding support
pub struct SemanticScorer {
/// Document frequency for IDF computation
document_frequencies: HashMap<String, usize>,
@@ -15,8 +39,14 @@ pub struct SemanticScorer {
total_documents: usize,
/// Precomputed TF-IDF vectors for entries
entry_vectors: HashMap<String, HashMap<String, f32>>,
/// Precomputed embedding vectors for entries
entry_embeddings: HashMap<String, Vec<f32>>,
/// Stop words to ignore
stop_words: HashSet<String>,
/// Optional embedding client
embedding_client: Arc<dyn EmbeddingClient>,
/// Whether to use embedding for similarity
use_embedding: bool,
}
impl SemanticScorer {
@@ -26,10 +56,41 @@ impl SemanticScorer {
document_frequencies: HashMap::new(),
total_documents: 0,
entry_vectors: HashMap::new(),
entry_embeddings: HashMap::new(),
stop_words: Self::default_stop_words(),
embedding_client: Arc::new(NoOpEmbeddingClient),
use_embedding: false,
}
}
/// Create a scorer wired to an embedding backend.
///
/// Embedding-based scoring starts enabled; all TF-IDF state begins
/// empty, mirroring the plain constructor.
pub fn with_embedding(client: Arc<dyn EmbeddingClient>) -> Self {
    Self {
        embedding_client: client,
        use_embedding: true,
        document_frequencies: HashMap::new(),
        total_documents: 0,
        entry_vectors: HashMap::new(),
        entry_embeddings: HashMap::new(),
        stop_words: Self::default_stop_words(),
    }
}
/// Enable or disable embedding-based scoring.
///
/// The flag only sticks when the configured client reports itself
/// available; otherwise the scorer remains on pure TF-IDF.
pub fn set_use_embedding(&mut self, use_embedding: bool) {
    self.use_embedding = if use_embedding {
        self.embedding_client.is_available()
    } else {
        false
    };
}
/// Report whether the configured embedding client can serve requests.
pub fn is_embedding_available(&self) -> bool {
    let client = self.embedding_client.as_ref();
    client.is_available()
}
/// Hand out a shared handle to the embedding client.
pub fn get_embedding_client(&self) -> Arc<dyn EmbeddingClient> {
    // Explicit Arc::clone makes the cheap refcount bump obvious.
    Arc::clone(&self.embedding_client)
}
/// Get default stop words
fn default_stop_words() -> HashSet<String> {
[
@@ -132,9 +193,34 @@ impl SemanticScorer {
self.entry_vectors.insert(entry.uri.clone(), tfidf);
}
/// Index an entry for retrieval, adding an embedding vector when possible (async).
///
/// TF-IDF indexing always happens first; the embedding is computed only
/// when embedding scoring is enabled and the client is available.
/// Embedding failures are logged and leave the TF-IDF index intact.
pub async fn index_entry_with_embedding(&mut self, entry: &MemoryEntry) {
    // Unconditional TF-IDF indexing keeps the fallback path working.
    self.index_entry(entry);

    if !self.use_embedding || !self.embedding_client.is_available() {
        return;
    }

    // Append keywords (when present) so they contribute to the embedding.
    let text_to_embed = if entry.keywords.is_empty() {
        entry.content.clone()
    } else {
        format!("{} {}", entry.content, entry.keywords.join(" "))
    };

    match self.embedding_client.embed(&text_to_embed).await {
        Ok(embedding) => {
            self.entry_embeddings.insert(entry.uri.clone(), embedding);
        }
        Err(e) => {
            tracing::warn!("[SemanticScorer] Failed to compute embedding for {}: {}", entry.uri, e);
        }
    }
}
/// Drop all cached state for `uri` from the index.
///
/// Both caches are keyed by URI; removing a missing key is a no-op.
pub fn remove_entry(&mut self, uri: &str) {
    self.entry_embeddings.remove(uri);
    self.entry_vectors.remove(uri);
}
/// Compute cosine similarity between two vectors
@@ -167,6 +253,57 @@ impl SemanticScorer {
}
}
/// Cosine similarity between two dense embedding vectors.
///
/// Returns 0.0 for empty or length-mismatched inputs and for zero-norm
/// vectors, and clamps the result into [0.0, 1.0] so anti-correlated
/// vectors score as 0 rather than negative.
fn cosine_similarity_embedding(v1: &[f32], v2: &[f32]) -> f32 {
    if v1.is_empty() || v2.is_empty() || v1.len() != v2.len() {
        return 0.0;
    }
    let mut dot = 0.0f32;
    let mut norm_sq1 = 0.0f32;
    let mut norm_sq2 = 0.0f32;
    for (a, b) in v1.iter().zip(v2.iter()) {
        dot += a * b;
        norm_sq1 += a * a;
        norm_sq2 += b * b;
    }
    let denom = (norm_sq1 * norm_sq2).sqrt();
    if denom == 0.0 {
        return 0.0;
    }
    (dot / denom).clamp(0.0, 1.0)
}
/// Score query/entry similarity, preferring a hybrid embedding score (async).
///
/// When embedding scoring is enabled, the entry has a precomputed
/// embedding, and the query embeds successfully, the result is a
/// weighted blend (70% embedding cosine, 30% TF-IDF). Any missing
/// piece falls back to the pure TF-IDF score.
pub async fn score_similarity_with_embedding(&self, query: &str, entry: &MemoryEntry) -> f32 {
    let embedding_ready = self.use_embedding && self.embedding_client.is_available();
    let precomputed = if embedding_ready {
        self.entry_embeddings.get(&entry.uri)
    } else {
        None
    };

    if let Some(entry_embedding) = precomputed {
        match self.embedding_client.embed(query).await {
            Ok(query_embedding) => {
                let embedding_score =
                    Self::cosine_similarity_embedding(&query_embedding, entry_embedding);
                let tfidf_score = self.score_similarity(query, entry);
                // Weighted blend: embedding dominates, TF-IDF stabilizes.
                return embedding_score * 0.7 + tfidf_score * 0.3;
            }
            Err(e) => {
                tracing::debug!("[SemanticScorer] Failed to embed query: {}", e);
            }
        }
    }

    // Fallback: pure TF-IDF similarity.
    self.score_similarity(query, entry)
}
/// Score similarity between query and entry
pub fn score_similarity(&self, query: &str, entry: &MemoryEntry) -> f32 {
// Tokenize query
@@ -246,6 +383,7 @@ impl SemanticScorer {
self.document_frequencies.clear();
self.total_documents = 0;
self.entry_vectors.clear();
self.entry_embeddings.clear();
}
/// Get statistics about the index
@@ -254,6 +392,8 @@ impl SemanticScorer {
total_documents: self.total_documents,
unique_terms: self.document_frequencies.len(),
indexed_entries: self.entry_vectors.len(),
embedding_entries: self.entry_embeddings.len(),
use_embedding: self.use_embedding && self.embedding_client.is_available(),
}
}
}
@@ -270,6 +410,8 @@ pub struct IndexStats {
pub total_documents: usize,
pub unique_terms: usize,
pub indexed_entries: usize,
pub embedding_entries: usize,
pub use_embedding: bool,
}
#[cfg(test)]