//! Structured fact extraction and storage. //! //! Inspired by DeerFlow's LLM-driven fact extraction with deduplication //! and confidence scoring. Facts are natural language statements extracted //! from conversations, categorized and scored for retrieval quality. use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{SystemTime, UNIX_EPOCH}; /// Global counter for generating unique fact IDs without uuid dependency overhead. static FACT_COUNTER: AtomicU64 = AtomicU64::new(0); fn now_secs() -> u64 { SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap_or_default() .as_secs() } fn next_fact_id() -> String { let ts = now_secs(); let seq = FACT_COUNTER.fetch_add(1, Ordering::Relaxed); format!("fact-{}-{}", ts, seq) } /// A structured fact extracted from conversation. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Fact { /// Unique identifier pub id: String, /// The fact content (natural language) pub content: String, /// Category of the fact pub category: FactCategory, /// Confidence score (0.0 - 1.0) pub confidence: f64, /// When this fact was extracted (unix timestamp in seconds) pub created_at: u64, /// Source session ID pub source: Option, } /// Categories for structured facts. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum FactCategory { /// User preference (language, style, format) Preference, /// Domain knowledge or context Knowledge, /// Behavioral pattern or habit Behavior, /// Task-specific context TaskContext, /// General information General, } impl Fact { /// Create a new fact with auto-generated ID and timestamp. pub fn new(content: impl Into, category: FactCategory, confidence: f64) -> Self { Self { id: next_fact_id(), content: content.into(), category, confidence: confidence.clamp(0.0, 1.0), created_at: now_secs(), source: None, } } /// Attach a source session ID (builder pattern). pub fn with_source(mut self, source: impl Into) -> Self { self.source = Some(source.into()); self } } /// Result of a fact extraction batch. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ExtractedFactBatch { pub facts: Vec, pub agent_id: String, pub session_id: String, } impl ExtractedFactBatch { /// Deduplicate facts by trimmed, lowercased content comparison. /// When duplicates are found, keep the one with higher confidence. pub fn deduplicate(mut self) -> Self { let mut best_index: HashMap = HashMap::new(); let mut to_remove: Vec = Vec::new(); for (i, fact) in self.facts.iter().enumerate() { let key = fact.content.trim().to_lowercase(); if let Some(&prev_idx) = best_index.get(&key) { // Keep the one with higher confidence if self.facts[prev_idx].confidence >= fact.confidence { to_remove.push(i); } else { to_remove.push(prev_idx); best_index.insert(key, i); } } else { best_index.insert(key, i); } } // Remove in reverse order to maintain valid indices for idx in to_remove.into_iter().rev() { self.facts.remove(idx); } self } /// Filter facts below the given confidence threshold. pub fn filter_by_confidence(mut self, min_confidence: f64) -> Self { self.facts.retain(|f| f.confidence >= min_confidence); self } /// Returns true if there are no facts in the batch. pub fn is_empty(&self) -> bool { self.facts.is_empty() } /// Returns the number of facts in the batch. pub fn len(&self) -> usize { self.facts.len() } } #[cfg(test)] mod tests { use super::*; #[test] fn test_fact_new_clamps_confidence() { let f = Fact::new("hello", FactCategory::General, 1.5); assert!((f.confidence - 1.0).abs() < f64::EPSILON); } #[test] fn test_fact_with_source() { let f = Fact::new("prefers dark mode", FactCategory::Preference, 0.9) .with_source("sess-123"); assert_eq!(f.source.as_deref(), Some("sess-123")); } #[test] fn test_deduplicate_keeps_higher_confidence() { let batch = ExtractedFactBatch { facts: vec![ Fact::new("likes Python", FactCategory::Preference, 0.8), Fact::new("Likes Python", FactCategory::Preference, 0.95), Fact::new("uses VSCode", FactCategory::Behavior, 0.7), ], agent_id: "agent-1".into(), session_id: "sess-1".into(), }; let deduped = batch.deduplicate(); assert_eq!(deduped.facts.len(), 2); // The "likes Python" fact with 0.95 confidence should survive let python_fact = deduped .facts .iter() .find(|f| f.content.contains("Python")) .unwrap(); assert!((python_fact.confidence - 0.95).abs() < f64::EPSILON); } #[test] fn test_filter_by_confidence() { let batch = ExtractedFactBatch { facts: vec![ Fact::new("high", FactCategory::General, 0.9), Fact::new("medium", FactCategory::General, 0.75), Fact::new("low", FactCategory::General, 0.3), ], agent_id: "agent-1".into(), session_id: "sess-1".into(), }; let filtered = batch.filter_by_confidence(0.7); assert_eq!(filtered.facts.len(), 2); } #[test] fn test_is_empty_and_len() { let batch = ExtractedFactBatch { facts: vec![], agent_id: "agent-1".into(), session_id: "sess-1".into(), }; assert!(batch.is_empty()); assert_eq!(batch.len(), 0); } }