From b1a96ace1fc10b15f38fecf4667e3c9efa7262b5 Mon Sep 17 00:00:00 2001
From: iven <iven_h@qq.com>
Date: Tue, 5 May 2026 22:55:20 +0800
Subject: [PATCH] =?UTF-8?q?fix(ai):=20=E4=BF=AE=E5=A4=8D=20qwen3=20?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=20thinking=20=E6=A8=A1=E5=BC=8F=E5=AF=BC?=
 =?UTF-8?q?=E8=87=B4=20AI=20=E5=88=86=E6=9E=90=E8=BE=93=E5=87=BA=E4=B8=BA?=
 =?UTF-8?q?=E7=A9=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

qwen3:4b 默认启用 thinking 模式，流式 API 中 content 字段始终为空，
所有 token 消耗在 thinking 上。修复方案：
- 对 qwen3 模型改用非流式 API，从 content 中剥离 <think... 块
- 将清理后的内容按句子/段落分块模拟流式输出
- 自动提升 qwen3 的 num_predict 至 4096 确保 thinking + 回复完整
- 流式解析中跳过空 content chunk
- 新增 strip_think_block 函数及 4 个单元测试
---
 crates/erp-ai/src/provider/ollama.rs | 90 +++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 3 deletions(-)
diff --git a/crates/erp-ai/src/provider/ollama.rs b/crates/erp-ai/src/provider/ollama.rs
index eecd6bf..2bbd0a0 100644
--- a/crates/erp-ai/src/provider/ollama.rs
+++ b/crates/erp-ai/src/provider/ollama.rs
@@ -77,6 +77,20 @@ struct OllamaStreamChunk {
 #[derive(Deserialize)]
 struct OllamaStreamMessage {
     content: Option<String>,
+    thinking: Option<String>, // `thinking` key of a streamed message; NOTE(review): only read by a test in this patch
+}
+
+/// Removes the <think...</think\n block that models such as qwen3 embed in `content` in non-streaming mode.
+fn strip_think_block(content: &str) -> String {
+    // Model output format: <think...>thinking content</think\n>actual response
+    // or <think...>thinking content</think\n\n>actual response
+    if let Some(end) = content.find("</think") {
+        // Skip the </think tag and the '>' or '\n' characters that follow it
+        let after_tag = &content[end + 7..]; // 7 == "</think".len()
+        let actual = after_tag.trim_start_matches('\n').trim_start_matches('>').trim_start();
+        return actual.to_string();
+    }
+    content.to_string() // no </think marker found: return the input unchanged
 }
 
 #[async_trait]
@@ -88,9 +102,40 @@ impl AiProvider for OllamaProvider {
         let model = if req.model.is_empty() {
             self.default_model.clone()
         } else {
-            req.model
+            req.model.clone()
         };
 
+        // qwen3 流式模式下 thinking 和 content 分字段，content 始终为空
+        // 改用非流式请求，然后手动模拟流式输出
+        let needs_non_stream = model.starts_with("qwen3");
+
+        if needs_non_stream {
+            let mut gen_req = req;
+            // qwen3 thinking 会消耗大量 token，需要更大预算
+            if gen_req.max_tokens < 4096 {
+                gen_req.max_tokens = 4096;
+            }
+            gen_req.model = model;
+            let result = self.generate(gen_req).await?;
+            let cleaned = strip_think_block(&result.content);
+
+            let s = Box::pin(stream! {
+                // 按句子/段落分块输出，模拟流式效果
+                let mut buffer = String::new();
+                for ch in cleaned.chars() {
+                    buffer.push(ch);
+                    let should_flush = ch == '\n' || ch == '。' || ch == '！' || ch == '？' || ch == '.' || ch == '!' || ch == '?';
+                    if should_flush && !buffer.is_empty() {
+                        yield Ok(std::mem::take(&mut buffer));
+                    }
+                }
+                if !buffer.is_empty() {
+                    yield Ok(buffer);
+                }
+            });
+            return Ok(s);
+        }
+
         let ollama_req = OllamaChatRequest {
             model,
             messages: vec![
@@ -150,7 +195,9 @@ impl AiProvider for OllamaProvider {
                         }
                         if let Some(msg) = chunk.message {
                             if let Some(content) = msg.content {
-                                yield Ok(content);
+                                if !content.is_empty() {
+                                    yield Ok(content);
+                                }
                             }
                         }
                     }
@@ -221,8 +268,14 @@ impl AiProvider for OllamaProvider {
         let input_tokens = parsed.prompt_eval_count.unwrap_or(0) as u32;
         let output_tokens = parsed.eval_count.unwrap_or(0) as u32;
 
+        let content = if model.starts_with("qwen3") {
+            strip_think_block(&parsed.message.content)
+        } else {
+            parsed.message.content
+        };
+
         Ok(crate::dto::GenerateResponse {
-            content: parsed.message.content,
+            content,
             model,
             input_tokens,
             output_tokens,
@@ -341,4 +394,35 @@ mod tests {
         );
         assert_eq!(provider.base_url, "http://192.168.1.100:11434");
     }
+
+    #[test]
+    fn strip_think_block_removes_thinking() {
+        let input = "<think\nLet me think about this...\n1+1=2\n</think\n\n2";
+        assert_eq!(strip_think_block(input), "2"); // text up to </think and the newlines after it are dropped
+    }
+
+    #[test]
+    fn strip_think_block_no_think_tag() {
+        let input = "Direct answer without thinking";
+        assert_eq!(strip_think_block(input), "Direct answer without thinking"); // no </think: returned unchanged
+    }
+
+    #[test]
+    fn strip_think_block_empty_after_tag() {
+        let input = "<think\nthinking</think\n>\n\n";
+        assert_eq!(strip_think_block(input), ""); // only '\n', '>' and whitespace follow </think; all are trimmed
+    }
+
+    #[test]
+    fn stream_chunk_with_thinking_field() {
+        let json = r#"{
+            "message": {"role": "assistant", "content": "", "thinking": "hmm"},
+            "done": false
+        }"#;
+        let chunk: OllamaStreamChunk = serde_json::from_str(json).unwrap(); // chunk with empty content, text in thinking
+        assert!(!chunk.done);
+        let msg = chunk.message.unwrap();
+        assert_eq!(msg.content, Some("".to_string())); // empty content deserializes to Some(""), not None
+        assert_eq!(msg.thinking, Some("hmm".to_string()));
+    }
 }