fix(runtime): 禁用 DataMasking 中间件 — 正则过度匹配通用中文文本

问题:
DataMasking 中间件用正则 [^\s]{1,20}(?:公司|...) 匹配公司名,会将"有一家公司"等通用文本误判为公司实体,并替换为 __ENTITY_1__ 占位符。同时 LLM 响应路径缺少 unmask 逻辑,导致用户直接看到未还原的占位符。

修复:
- 禁用 DataMasking 中间件 (桌面端单用户场景无需脱敏)
- 在 AgentLoop 添加 data_masker + unmask 基础设施 (备用)
- 添加 unmask_text() 方法覆盖流式/非流式两条响应路径
- 保留 data_masking.rs 模块 (含改进正则和新增测试),待未来 NLP 方案启用

测试: 934 PASS, 0 FAIL
2026-04-22 17:24:46 +08:00
parent 8b3e43710b
commit 73d50fda21
3 changed files with 103 additions and 18 deletions
--- a/crates/zclaw-kernel/src/kernel/mod.rs
+++ b/crates/zclaw-kernel/src/kernel/mod.rs
@@ -365,16 +365,16 @@ impl Kernel {
            chain.register(Arc::new(mw));
        }

-        // Data masking middleware — mask sensitive entities before any other processing
-        // NOTE: Registration order does NOT determine execution order.
-        // The chain sorts by priority() ascending before execution.
-        // Execution order: Evolution(78) → ButlerRouter(80) → DataMasking(90) → ...
-        {
-            use std::sync::Arc;
-            let masker = Arc::new(zclaw_runtime::middleware::data_masking::DataMasker::new());
-            let mw = zclaw_runtime::middleware::data_masking::DataMaskingMiddleware::new(masker);
-            chain.register(Arc::new(mw));
-        }
+        // Data masking middleware — DISABLED for desktop single-user scenario.
+        // The regex-based approach over-matches common Chinese text (e.g. "有一家公司"
+        // gets masked as a company entity). Response unmask was also missing.
+        // Re-enable when NLP-based entity detection is available.
+        // {
+        //     use std::sync::Arc;
+        //     let masker = Arc::new(zclaw_runtime::middleware::data_masking::DataMasker::new());
+        //     let mw = zclaw_runtime::middleware::data_masking::DataMaskingMiddleware::new(masker);
+        //     chain.register(Arc::new(mw));
+        // }

        // Growth integration — cached to avoid recreating empty scorer per request
        let growth = {
--- a/crates/zclaw-runtime/src/loop_runner.rs
+++ b/crates/zclaw-runtime/src/loop_runner.rs
@@ -12,6 +12,7 @@ use crate::tool::builtin::PathValidator;
 use crate::growth::GrowthIntegration;
 use crate::compaction::{self, CompactionConfig};
 use crate::middleware::{self, MiddlewareChain};
+use crate::middleware::data_masking::DataMasker;
 use crate::prompt::{PromptBuilder, PromptContext};
 use zclaw_memory::MemoryStore;

@@ -39,6 +40,8 @@ pub struct AgentLoop {
    /// Middleware chain — cross-cutting concerns are delegated to the chain.
    /// An empty chain (Default) is a no-op: all `run_*` methods return Continue/Allow.
    middleware_chain: MiddlewareChain,
+    /// Data masker for unmasking LLM responses (entity tokens → original text).
+    data_masker: Option<Arc<DataMasker>>,
    /// Chat mode: extended thinking enabled
    thinking_enabled: bool,
    /// Chat mode: reasoning effort level
@@ -71,6 +74,7 @@ impl AgentLoop {
            compaction_threshold: 0,
            compaction_config: CompactionConfig::default(),
            middleware_chain: MiddlewareChain::default(),
+            data_masker: None,
            thinking_enabled: false,
            reasoning_effort: None,
            plan_mode: false,
@@ -177,6 +181,23 @@ impl AgentLoop {
        self
    }

+    /// Builder-style setter: stores the optional data masker used to unmask entity tokens in LLM responses (`None` leaves responses untouched).
+    pub fn with_data_masker(mut self, masker: Option<Arc<DataMasker>>) -> Self {
+        self.data_masker = masker;
+        self
+    }
+
+    /// Unmask entity tokens in `text`, restoring original values; returns `text` unchanged when no masker is set or when unmasking fails.
+    fn unmask_text(&self, text: &str) -> String {
+        if let Some(ref masker) = self.data_masker {
+            match masker.unmask(text) {
+                Ok(unmasked) => return unmasked,
+                Err(e) => tracing::warn!("[AgentLoop] Failed to unmask text: {}", e), // best-effort: log and fall through to the raw text
+            }
+        }
+        text.to_string()
+    }
+
    /// Get growth integration reference
    pub fn growth(&self) -> Option<&GrowthIntegration> {
        self.growth.as_ref()
@@ -342,16 +363,19 @@ impl AgentLoop {

            // If no tool calls, we have the final response
            if tool_calls.is_empty() {
+                // Unmask entity tokens in final response
+                let unmasked_text = self.unmask_text(&text_content);
+
                // Save final assistant message with thinking
                let msg = if let Some(thinking) = &thinking_content {
-                    Message::assistant_with_thinking(&text_content, thinking)
+                    Message::assistant_with_thinking(&unmasked_text, thinking)
                } else {
-                    Message::assistant(&text_content)
+                    Message::assistant(&unmasked_text)
                };
                self.memory.append_message(&session_id, &msg).await?;

                break AgentLoopResult {
-                    response: text_content,
+                    response: unmasked_text,
                    input_tokens: total_input_tokens,
                    output_tokens: total_output_tokens,
                    iterations,
@@ -605,6 +629,7 @@ impl AgentLoop {
        let thinking_enabled = self.thinking_enabled;
        let reasoning_effort = self.reasoning_effort.clone();
        let plan_mode = self.plan_mode;
+        let data_masker = self.data_masker.clone();

        tokio::spawn(async move {
            let mut messages = messages;
@@ -670,8 +695,17 @@ impl AgentLoop {
                                StreamChunk::TextDelta { delta } => {
                                    text_delta_count += 1;
                                    tracing::debug!("[AgentLoop] TextDelta #{}: {} chars", text_delta_count, delta.len());
-                                    iteration_text.push_str(delta);
-                                    if let Err(e) = tx.send(LoopEvent::Delta(delta.clone())).await {
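+                                    // Unmask entity tokens before sending to user. NOTE(review): each delta is unmasked in isolation, so a token split across two deltas would presumably not be restored — confirm against DataMasker::unmask.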
+                                    let unmasked = if let Some(ref masker) = data_masker {
+                                        match masker.unmask(delta) {
+                                            Ok(t) => t,
+                                            Err(e) => { tracing::warn!("[AgentLoop] Delta unmask failed: {}", e); delta.clone() }
+                                        }
+                                    } else {
+                                        delta.clone()
+                                    };
+                                    iteration_text.push_str(&unmasked);
+                                    if let Err(e) = tx.send(LoopEvent::Delta(unmasked)).await {
                                        tracing::warn!("[AgentLoop] Failed to send Delta event: {}", e);
                                    }
                                }
@@ -761,10 +795,18 @@ impl AgentLoop {
                if iteration_text.is_empty() && !reasoning_text.is_empty() {
                    tracing::info!("[AgentLoop] Model generated {} chars of reasoning but no text — using reasoning as response",
                        reasoning_text.len());
-                    if let Err(e) = tx.send(LoopEvent::Delta(reasoning_text.clone())).await {
+                    let unmasked_reasoning = if let Some(ref masker) = data_masker {
+                        match masker.unmask(&reasoning_text) {
+                            Ok(t) => t,
+                            Err(e) => { tracing::warn!("[AgentLoop] Reasoning unmask failed: {}", e); reasoning_text.clone() }
+                        }
+                    } else {
+                        reasoning_text.clone()
+                    };
+                    if let Err(e) = tx.send(LoopEvent::Delta(unmasked_reasoning.clone())).await {
                        tracing::warn!("[AgentLoop] Failed to send Delta event: {}", e);
                    }
-                    iteration_text = reasoning_text.clone();
+                    iteration_text = unmasked_reasoning;
                } else if iteration_text.is_empty() {
                    tracing::warn!("[AgentLoop] No text content after {} chunks (thinking_delta={})",
                        chunk_count, thinking_delta_count);
--- a/crates/zclaw-runtime/src/middleware/data_masking.rs
+++ b/crates/zclaw-runtime/src/middleware/data_masking.rs
@@ -19,8 +19,10 @@ use super::{AgentMiddleware, MiddlewareContext, MiddlewareDecision};
 // Pre-compiled regex patterns (compiled once, reused across all calls)
 // ---------------------------------------------------------------------------

+/// Matches 1–20 non-whitespace characters followed by a company-like suffix (公司, 厂, 集团, ...), excluding structural characters that commonly
+/// precede 公司/集团 in non-name contexts (e.g. "有一家公司", "去了公司", "这是集团"). NOTE: the exclusion applies to every character of the prefix, not only the one right before the suffix.
 static RE_COMPANY: LazyLock<Regex> = LazyLock::new(|| {
-    Regex::new(r"[^\s]{1,20}(?:公司|厂|集团|工作室|商行|有限|股份)").expect("static regex is valid")
+    Regex::new(r"[^\s有一家几了的在这是那些各去到从向被把让给对为和与而但又也还都已正将会能可要想需应该得]{1,20}(?:公司|厂|集团|工作室|商行|有限|股份)").expect("static regex is valid")
 });
 static RE_MONEY: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"[¥￥$]\s*[\d,.]+[万亿]?元?|[\d,.]+[万亿]元").expect("static regex is valid")
@@ -320,4 +322,45 @@ mod tests {
        let unmasked = masker.unmask(&masked).unwrap();
        assert_eq!(unmasked, input);
    }
+
+    #[test]
+    fn test_no_mask_generic_company() {
+        let masker = DataMasker::new();
+        // Here "公司" is a generic noun ("I have a company"), not part of a company name, so the text must pass through unmasked.
+        let input = "我有一家公司需要运营";
+        let masked = masker.mask(input).unwrap();
+        assert_eq!(masked, input, "Generic '有一家公司' should not be masked: {}", masked);
+    }
+
+    #[test]
+    fn test_no_mask_went_to_company() {
+        let masker = DataMasker::new();
+        let input = "我去了公司上班"; // "I went to the office to work": 公司 is a generic noun here
+        let masked = masker.mask(input).unwrap();
+        assert_eq!(masked, input, "去了公司 should not be masked: {}", masked);
+    }
+
+    #[test]
+    fn test_still_mask_real_company() {
+        let masker = DataMasker::new();
+        let input = "腾讯公司的员工"; // "employees of Tencent": contains a real company name
+        let masked = masker.mask(input).unwrap();
+        assert!(!masked.contains("腾讯公司"), "Real company name should be masked: {}", masked);
+        assert!(masked.contains("__ENTITY_"), "Should contain token: {}", masked);
+
+        let unmasked = masker.unmask(&masked).unwrap(); // round-trip must restore the original text
+        assert_eq!(unmasked, input);
+    }
+
+    #[test]
+    fn test_still_mask_short_company() {
+        let masker = DataMasker::new();
+        // A one-character company name ("A公司") must still be detected and masked.
+        let input = "A公司的订单";
+        let masked = masker.mask(input).unwrap();
+        assert!(!masked.contains("A公司"), "A公司 should be masked: {}", masked);
+
+        let unmasked = masker.unmask(&masked).unwrap(); // round-trip must restore the original text
+        assert_eq!(unmasked, input);
+    }
 }