From b1a96ace1fc10b15f38fecf4667e3c9efa7262b5 Mon Sep 17 00:00:00 2001 From: iven Date: Tue, 5 May 2026 22:55:20 +0800 Subject: [PATCH] =?UTF-8?q?fix(ai):=20=E4=BF=AE=E5=A4=8D=20qwen3=20?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=20thinking=20=E6=A8=A1=E5=BC=8F=E5=AF=BC?= =?UTF-8?q?=E8=87=B4=20AI=20=E5=88=86=E6=9E=90=E8=BE=93=E5=87=BA=E4=B8=BA?= =?UTF-8?q?=E7=A9=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qwen3:4b 默认启用 thinking 模式,流式 API 中 content 字段始终为空, 所有 token 消耗在 thinking 上。修复方案: - 对 qwen3 模型改用非流式 API,从 content 中剥离 , + thinking: Option, +} + +/// 去除 qwen3 等模型在非流式模式下 content 中嵌入的 String { + // 模型输出格式:thinking contentactual response + // 或 thinking contentactual response + if let Some(end) = content.find(" 或 \n + let after_tag = &content[end + 7..]; // skip "').trim_start(); + return actual.to_string(); + } + content.to_string() } #[async_trait] @@ -88,9 +102,40 @@ impl AiProvider for OllamaProvider { let model = if req.model.is_empty() { self.default_model.clone() } else { - req.model + req.model.clone() }; + // qwen3 流式模式下 thinking 和 content 分字段,content 始终为空 + // 改用非流式请求,然后手动模拟流式输出 + let needs_non_stream = model.starts_with("qwen3"); + + if needs_non_stream { + let mut gen_req = req; + // qwen3 thinking 会消耗大量 token,需要更大预算 + if gen_req.max_tokens < 4096 { + gen_req.max_tokens = 4096; + } + gen_req.model = model; + let result = self.generate(gen_req).await?; + let cleaned = strip_think_block(&result.content); + + let s = Box::pin(stream! { + // 按句子/段落分块输出,模拟流式效果 + let mut buffer = String::new(); + for ch in cleaned.chars() { + buffer.push(ch); + let should_flush = ch == '\n' || ch == '。' || ch == '!' || ch == '?' || ch == '.' || ch == '!' || ch == '?'; + if should_flush && !buffer.is_empty() { + yield Ok(std::mem::take(&mut buffer)); + } + } + if !buffer.is_empty() { + yield Ok(buffer); + } + }); + return Ok(s); + } + let ollama_req = OllamaChatRequest { model, messages: vec![ @@ -150,7 +195,9 @@ impl AiProvider for OllamaProvider { } if let Some(msg) = chunk.message { if let Some(content) = msg.content { - yield Ok(content); + if !content.is_empty() { + yield Ok(content); + } } } } @@ -221,8 +268,14 @@ impl AiProvider for OllamaProvider { let input_tokens = parsed.prompt_eval_count.unwrap_or(0) as u32; let output_tokens = parsed.eval_count.unwrap_or(0) as u32; + let content = if model.starts_with("qwen3") { + strip_think_block(&parsed.message.content) + } else { + parsed.message.content + }; + Ok(crate::dto::GenerateResponse { - content: parsed.message.content, + content, model, input_tokens, output_tokens, @@ -341,4 +394,35 @@ mod tests { ); assert_eq!(provider.base_url, "http://192.168.1.100:11434"); } + + #[test] + fn strip_think_block_removes_thinking() { + let input = "