refactor(middleware): 移除数据脱敏中间件及相关代码
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

移除不再使用的数据脱敏功能,包括:
1. 删除data_masking模块
2. 清理loop_runner中的unmask逻辑
3. 移除前端saas-relay-client.ts中的mask/unmask实现
4. 更新中间件层数从15层降为14层
5. 同步更新相关文档(CLAUDE.md、TRUTH.md、wiki等)

此次变更简化了系统架构,移除了不再需要的敏感数据处理逻辑。所有相关测试证据和截图已归档。
This commit is contained in:
iven
2026-04-22 19:19:07 +08:00
parent 14f2f497b6
commit fa5ab4e161
68 changed files with 8049 additions and 3684 deletions

View File

@@ -12,7 +12,6 @@ use crate::tool::builtin::PathValidator;
use crate::growth::GrowthIntegration;
use crate::compaction::{self, CompactionConfig};
use crate::middleware::{self, MiddlewareChain};
use crate::middleware::data_masking::DataMasker;
use crate::prompt::{PromptBuilder, PromptContext};
use zclaw_memory::MemoryStore;
@@ -40,8 +39,6 @@ pub struct AgentLoop {
/// Middleware chain — cross-cutting concerns are delegated to the chain.
/// An empty chain (Default) is a no-op: all `run_*` methods return Continue/Allow.
middleware_chain: MiddlewareChain,
/// Data masker for unmasking LLM responses (entity tokens → original text).
data_masker: Option<Arc<DataMasker>>,
/// Chat mode: extended thinking enabled
thinking_enabled: bool,
/// Chat mode: reasoning effort level
@@ -74,7 +71,6 @@ impl AgentLoop {
compaction_threshold: 0,
compaction_config: CompactionConfig::default(),
middleware_chain: MiddlewareChain::default(),
data_masker: None,
thinking_enabled: false,
reasoning_effort: None,
plan_mode: false,
@@ -181,23 +177,6 @@ impl AgentLoop {
self
}
/// Inject data masker for unmasking entity tokens in LLM responses.
///
/// Builder-style setter: consumes the loop, stores `masker` in the `data_masker`
/// field (overwriting any previous value) and returns the loop again so calls can
/// be chained. Passing `None` leaves the loop without a masker.
pub fn with_data_masker(mut self, masker: Option<Arc<DataMasker>>) -> Self {
self.data_masker = masker;
self
}
/// Unmask entity tokens in text, restoring original values.
fn unmask_text(&self, text: &str) -> String {
if let Some(ref masker) = self.data_masker {
match masker.unmask(text) {
Ok(unmasked) => return unmasked,
Err(e) => tracing::warn!("[AgentLoop] Failed to unmask text: {}", e),
}
}
text.to_string()
}
/// Get growth integration reference
pub fn growth(&self) -> Option<&GrowthIntegration> {
self.growth.as_ref()
@@ -363,19 +342,16 @@ impl AgentLoop {
// If no tool calls, we have the final response
if tool_calls.is_empty() {
// Unmask entity tokens in final response
let unmasked_text = self.unmask_text(&text_content);
// Save final assistant message with thinking
let msg = if let Some(thinking) = &thinking_content {
Message::assistant_with_thinking(&unmasked_text, thinking)
Message::assistant_with_thinking(&text_content, thinking)
} else {
Message::assistant(&unmasked_text)
Message::assistant(&text_content)
};
self.memory.append_message(&session_id, &msg).await?;
break AgentLoopResult {
response: unmasked_text,
response: text_content,
input_tokens: total_input_tokens,
output_tokens: total_output_tokens,
iterations,
@@ -629,7 +605,6 @@ impl AgentLoop {
let thinking_enabled = self.thinking_enabled;
let reasoning_effort = self.reasoning_effort.clone();
let plan_mode = self.plan_mode;
let data_masker = self.data_masker.clone();
tokio::spawn(async move {
let mut messages = messages;
@@ -695,17 +670,8 @@ impl AgentLoop {
StreamChunk::TextDelta { delta } => {
text_delta_count += 1;
tracing::debug!("[AgentLoop] TextDelta #{}: {} chars", text_delta_count, delta.len());
// Unmask entity tokens before sending to user
let unmasked = if let Some(ref masker) = data_masker {
match masker.unmask(delta) {
Ok(t) => t,
Err(e) => { tracing::warn!("[AgentLoop] Delta unmask failed: {}", e); delta.clone() }
}
} else {
delta.clone()
};
iteration_text.push_str(&unmasked);
if let Err(e) = tx.send(LoopEvent::Delta(unmasked)).await {
iteration_text.push_str(delta);
if let Err(e) = tx.send(LoopEvent::Delta(delta.clone())).await {
tracing::warn!("[AgentLoop] Failed to send Delta event: {}", e);
}
}
@@ -795,18 +761,10 @@ impl AgentLoop {
if iteration_text.is_empty() && !reasoning_text.is_empty() {
tracing::info!("[AgentLoop] Model generated {} chars of reasoning but no text — using reasoning as response",
reasoning_text.len());
let unmasked_reasoning = if let Some(ref masker) = data_masker {
match masker.unmask(&reasoning_text) {
Ok(t) => t,
Err(e) => { tracing::warn!("[AgentLoop] Reasoning unmask failed: {}", e); reasoning_text.clone() }
}
} else {
reasoning_text.clone()
};
if let Err(e) = tx.send(LoopEvent::Delta(unmasked_reasoning.clone())).await {
if let Err(e) = tx.send(LoopEvent::Delta(reasoning_text.clone())).await {
tracing::warn!("[AgentLoop] Failed to send Delta event: {}", e);
}
iteration_text = unmasked_reasoning;
iteration_text = reasoning_text.clone();
} else if iteration_text.is_empty() {
tracing::warn!("[AgentLoop] No text content after {} chunks (thinking_delta={})",
chunk_count, thinking_delta_count);

View File

@@ -268,7 +268,6 @@ impl Default for MiddlewareChain {
pub mod butler_router;
pub mod compaction;
pub mod dangling_tool;
pub mod data_masking;
pub mod guardrail;
pub mod loop_guard;
pub mod memory;

View File

@@ -3,7 +3,7 @@
//! Intercepts user messages before LLM processing, uses SemanticSkillRouter
//! to classify intent, and injects routing context into the system prompt.
//!
//! Priority: 80 (runs before data_masking at 90, so it sees raw user input).
//! Priority: 80 (runs before compaction and other post-routing middleware).
//!
//! Supports two modes:
//! 1. **Static mode** (default): Uses built-in `KeywordClassifier` with 4 healthcare domains.

View File

@@ -1,366 +0,0 @@
//! Data Masking Middleware — protect sensitive business data from leaving the user's machine.
//!
//! Before LLM calls, replaces detected entities (company names, amounts, phone numbers)
//! with deterministic tokens. After responses, the caller can restore the original entities.
//!
//! Priority: 90 (runs before Compaction@100 and Memory@150)
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, LazyLock, RwLock};
use async_trait::async_trait;
use regex::Regex;
use zclaw_types::{Message, Result};
use super::{AgentMiddleware, MiddlewareContext, MiddlewareDecision};
// ---------------------------------------------------------------------------
// Pre-compiled regex patterns (compiled once, reused across all calls)
// ---------------------------------------------------------------------------
/// Company-like names: 1–20 characters that are neither whitespace nor one of the
/// excluded characters, immediately followed by one of the suffixes
/// 公司 / 厂 / 集团 / 工作室 / 商行 / 有限 / 股份.
///
/// Excluded prefix chars: structural words that commonly precede 公司/集团 in
/// non-name contexts (e.g. "有一家公司", "去了公司", "这是集团").
static RE_COMPANY: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"[^\s有一家几了的在这是那些各去到从向被把让给对为和与而但又也还都已正将会能可要想需应该得]{1,20}(?:公司|厂|集团|工作室|商行|有限|股份)").expect("static regex is valid")
});
/// Money amounts, in one of two shapes:
/// 1. a currency sign, optional whitespace, a run of digits/commas/dots, then an
///    optional 万/亿 and an optional 元;
/// 2. a run of digits/commas/dots followed by 万 or 亿 and then 元.
///
/// A bare amount without a currency sign and without 万/亿 (only digits followed by 元)
/// is matched by neither alternative.
static RE_MONEY: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"[¥¥$]\s*[\d,.]+[万亿]?元?|[\d,.]+[万亿]元").expect("static regex is valid")
});
/// Phone numbers: 11 digits starting with `1` whose second digit is 3–9, with an
/// optional hyphen after the 3rd and after the 7th digit.
///
/// The pattern has no boundary anchors, so it can also match inside a longer run of digits.
static RE_PHONE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"1[3-9]\d-?\d{4}-?\d{4}").expect("static regex is valid")
});
/// Email addresses: local part, `@`, domain, and a final label of at least two ASCII letters.
static RE_EMAIL: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").expect("static regex is valid")
});
/// ID card numbers: 17 digits followed by a digit or `X`/`x`, delimited by `\b` on both sides.
///
/// NOTE(review): `\b` in the `regex` crate is presumably Unicode-aware, in which case a
/// number written directly next to CJK text (no space) would not match — confirm.
static RE_ID_CARD: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"\b\d{17}[\dXx]\b").expect("static regex is valid")
});
// ---------------------------------------------------------------------------
// DataMasker — entity detection and token mapping
// ---------------------------------------------------------------------------
/// Source of the numeric part of newly created tokens (`__ENTITY_<n>__`).
///
/// This is a single static counter starting at 1: it is shared by every `DataMasker`
/// instance and by every entity kind, not kept per type.
static ENTITY_COUNTER: AtomicU64 = AtomicU64::new(1);
/// Detects sensitive entities in text and replaces them with `__ENTITY_<n>__` tokens,
/// keeping the mappings needed to translate in both directions.
///
/// The same entity text always maps to the same token within one `DataMasker`.
pub struct DataMasker {
/// entity text → token mapping.
/// NOTE(review): previously described as "persistent across conversations"; in this
/// file it is only an in-memory map — confirm whether persistence exists elsewhere.
forward: Arc<RwLock<HashMap<String, String>>>,
/// token → entity text reverse mapping (in-memory only).
reverse: Arc<RwLock<HashMap<String, String>>>,
}
impl DataMasker {
pub fn new() -> Self {
Self {
forward: Arc::new(RwLock::new(HashMap::new())),
reverse: Arc::new(RwLock::new(HashMap::new())),
}
}
/// Mask all detected entities in `text`, replacing them with tokens.
pub fn mask(&self, text: &str) -> Result<String> {
let entities = self.detect_entities(text);
if entities.is_empty() {
return Ok(text.to_string());
}
let mut result = text.to_string();
for entity in entities {
let token = self.get_or_create_token(&entity);
// Replace all occurrences (longest entities first to avoid partial matches)
result = result.replace(&entity, &token);
}
Ok(result)
}
/// Restore all tokens in `text` back to their original entities.
pub fn unmask(&self, text: &str) -> Result<String> {
let reverse = self.reverse.read().map_err(|e| zclaw_types::ZclawError::IoError(std::io::Error::other(e.to_string())))?;
if reverse.is_empty() {
return Ok(text.to_string());
}
let mut result = text.to_string();
for (token, entity) in reverse.iter() {
result = result.replace(token, entity);
}
Ok(result)
}
/// Detect sensitive entities in text using regex patterns.
fn detect_entities(&self, text: &str) -> Vec<String> {
let mut entities = Vec::new();
// Company names: X公司、XX集团、XX工作室 (1-20 char prefix + suffix)
for cap in RE_COMPANY.find_iter(text) {
entities.push(cap.as_str().to_string());
}
// Money amounts: ¥50万、¥100元、$200、50万元
for cap in RE_MONEY.find_iter(text) {
entities.push(cap.as_str().to_string());
}
// Phone numbers: 1XX-XXXX-XXXX or 1XXXXXXXXXX
for cap in RE_PHONE.find_iter(text) {
entities.push(cap.as_str().to_string());
}
// Email addresses
for cap in RE_EMAIL.find_iter(text) {
entities.push(cap.as_str().to_string());
}
// ID card numbers (simplified): 18 digits
for cap in RE_ID_CARD.find_iter(text) {
entities.push(cap.as_str().to_string());
}
// Sort by length descending to replace longest entities first
entities.sort_by(|a, b| b.len().cmp(&a.len()));
entities.dedup();
entities
}
/// Get existing token for entity or create a new one.
fn get_or_create_token(&self, entity: &str) -> String {
/// Recover from a poisoned RwLock by taking the inner value and re-wrapping.
/// A poisoned lock only means a panic occurred while holding it — the data is still valid.
fn recover_read<T>(lock: &RwLock<T>) -> std::sync::LockResult<std::sync::RwLockReadGuard<'_, T>> {
match lock.read() {
Ok(guard) => Ok(guard),
Err(_e) => {
tracing::warn!("[DataMasker] RwLock poisoned during read, recovering");
// Poison error still gives us access to the inner guard
lock.read()
}
}
}
fn recover_write<T>(lock: &RwLock<T>) -> std::sync::LockResult<std::sync::RwLockWriteGuard<'_, T>> {
match lock.write() {
Ok(guard) => Ok(guard),
Err(_e) => {
tracing::warn!("[DataMasker] RwLock poisoned during write, recovering");
lock.write()
}
}
}
// Check if already mapped
{
if let Ok(forward) = recover_read(&self.forward) {
if let Some(token) = forward.get(entity) {
return token.clone();
}
}
}
// Create new token
let counter = ENTITY_COUNTER.fetch_add(1, Ordering::Relaxed);
let token = format!("__ENTITY_{}__", counter);
// Store in both mappings
if let Ok(mut forward) = recover_write(&self.forward) {
forward.insert(entity.to_string(), token.clone());
}
if let Ok(mut reverse) = recover_write(&self.reverse) {
reverse.insert(token.clone(), entity.to_string());
}
token
}
}
/// `Default` is the same as [`DataMasker::new`]: a masker with no stored mappings.
impl Default for DataMasker {
fn default() -> Self {
Self::new()
}
}
// ---------------------------------------------------------------------------
// DataMaskingMiddleware — masks user messages before LLM completion
// ---------------------------------------------------------------------------
/// Middleware that masks sensitive entities in user messages before completion.
pub struct DataMaskingMiddleware {
/// Masker performing the replacement; held in an `Arc` so it can be shared with
/// code outside the middleware (see `masker()`).
masker: Arc<DataMasker>,
}
impl DataMaskingMiddleware {
/// Build the middleware around an existing, possibly shared, masker.
pub fn new(masker: Arc<DataMasker>) -> Self {
Self { masker }
}
/// Get a reference to the masker for unmasking responses externally.
pub fn masker(&self) -> &Arc<DataMasker> {
&self.masker
}
}
#[async_trait]
impl AgentMiddleware for DataMaskingMiddleware {
    /// Identifier of this middleware.
    fn name(&self) -> &str {
        "data_masking"
    }

    /// Priority value of this middleware within the chain.
    fn priority(&self) -> i32 {
        90
    }

    /// Replace sensitive entities with tokens in every user message and in
    /// `ctx.user_input`, then let the chain continue.
    ///
    /// Messages of other kinds are left untouched. The first masking error aborts the
    /// call and is returned to the caller.
    async fn before_completion(&self, ctx: &mut MiddlewareContext) -> Result<MiddlewareDecision> {
        for message in &mut ctx.messages {
            // Only user-authored messages are masked.
            let Message::User { content } = message else {
                continue;
            };
            let replacement = self.masker.mask(content)?;
            *content = replacement;
        }

        // The raw input field is masked as well, unless there is nothing in it.
        if ctx.user_input.is_empty() {
            return Ok(MiddlewareDecision::Continue);
        }
        ctx.user_input = self.masker.mask(&ctx.user_input)?;

        Ok(MiddlewareDecision::Continue)
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
// Unit tests for `DataMasker`: each entity kind is masked and restored by `unmask`,
// and generic uses of "公司" are left alone.
#[cfg(test)]
mod tests {
use super::*;
// A single-letter company name is replaced by a token and restored by `unmask`.
#[test]
fn test_mask_company_name() {
let masker = DataMasker::new();
let input = "A公司的订单被退了";
let masked = masker.mask(input).unwrap();
assert!(!masked.contains("A公司"), "Company name should be masked: {}", masked);
assert!(masked.contains("__ENTITY_"), "Should contain token: {}", masked);
let unmasked = masker.unmask(&masked).unwrap();
assert_eq!(unmasked, input, "Unmask should restore original");
}
// Masking the same entity twice with one masker yields the same token.
#[test]
fn test_mask_consistency() {
let masker = DataMasker::new();
let masked1 = masker.mask("A公司").unwrap();
let masked2 = masker.mask("A公司").unwrap();
assert_eq!(masked1, masked2, "Same entity should always get same token");
}
// A currency-sign amount with a 万 unit is masked and restored.
#[test]
fn test_mask_money() {
let masker = DataMasker::new();
let input = "成本是¥50万";
let masked = masker.mask(input).unwrap();
assert!(!masked.contains("¥50万"), "Money should be masked: {}", masked);
let unmasked = masker.unmask(&masked).unwrap();
assert_eq!(unmasked, input);
}
// An 11-digit mobile number directly after CJK text is masked and restored.
#[test]
fn test_mask_phone() {
let masker = DataMasker::new();
let input = "联系13812345678";
let masked = masker.mask(input).unwrap();
assert!(!masked.contains("13812345678"), "Phone should be masked: {}", masked);
let unmasked = masker.unmask(&masked).unwrap();
assert_eq!(unmasked, input);
}
// An email address surrounded by spaces is masked and restored.
#[test]
fn test_mask_email() {
let masker = DataMasker::new();
let input = "发到 test@example.com 吧";
let masked = masker.mask(input).unwrap();
assert!(!masked.contains("test@example.com"), "Email should be masked: {}", masked);
let unmasked = masker.unmask(&masked).unwrap();
assert_eq!(unmasked, input);
}
// Text without any entity passes through `mask` unchanged.
#[test]
fn test_mask_no_entities() {
let masker = DataMasker::new();
let input = "今天天气不错";
let masked = masker.mask(input).unwrap();
assert_eq!(masked, input, "Text without entities should pass through unchanged");
}
// Company, money and phone entities in one string are all masked and all restored.
#[test]
fn test_mask_multiple_entities() {
let masker = DataMasker::new();
let input = "A公司的订单花了¥50万联系13812345678";
let masked = masker.mask(input).unwrap();
assert!(!masked.contains("A公司"));
assert!(!masked.contains("¥50万"));
assert!(!masked.contains("13812345678"));
let unmasked = masker.unmask(&masked).unwrap();
assert_eq!(unmasked, input);
}
// `unmask` on a masker with no stored mappings returns the text unchanged.
#[test]
fn test_unmask_empty() {
let masker = DataMasker::new();
let result = masker.unmask("hello world").unwrap();
assert_eq!(result, "hello world");
}
// An 18-digit ID number separated from the preceding text by a space is masked and restored.
#[test]
fn test_mask_id_card() {
let masker = DataMasker::new();
let input = "身份证号 110101199001011234";
let masked = masker.mask(input).unwrap();
assert!(!masked.contains("110101199001011234"), "ID card should be masked: {}", masked);
let unmasked = masker.unmask(&masked).unwrap();
assert_eq!(unmasked, input);
}
// "公司" preceded only by excluded structural characters must not be treated as a name.
#[test]
fn test_no_mask_generic_company() {
let masker = DataMasker::new();
// "有一家公司" is NOT a company name — "公司" is used as a generic noun
let input = "我有一家公司需要运营";
let masked = masker.mask(input).unwrap();
assert_eq!(masked, input, "Generic '有一家公司' should not be masked: {}", masked);
}
// "去了公司" ("went to the office") must not be treated as a company name either.
#[test]
fn test_no_mask_went_to_company() {
let masker = DataMasker::new();
let input = "我去了公司上班";
let masked = masker.mask(input).unwrap();
assert_eq!(masked, input, "去了公司 should not be masked: {}", masked);
}
// The prefix exclusions must not stop a real company name from being masked.
#[test]
fn test_still_mask_real_company() {
let masker = DataMasker::new();
let input = "腾讯公司的员工";
let masked = masker.mask(input).unwrap();
assert!(!masked.contains("腾讯公司"), "Real company name should be masked: {}", masked);
assert!(masked.contains("__ENTITY_"), "Should contain token: {}", masked);
let unmasked = masker.unmask(&masked).unwrap();
assert_eq!(unmasked, input);
}
// The shortest allowed prefix (one character) is still enough to form a company name.
#[test]
fn test_still_mask_short_company() {
let masker = DataMasker::new();
// Single-letter company name "A公司" should still be masked
let input = "A公司的订单";
let masked = masker.mask(input).unwrap();
assert!(!masked.contains("A公司"), "A公司 should be masked: {}", masked);
let unmasked = masker.unmask(&masked).unwrap();
assert_eq!(unmasked, input);
}
}