fix(ai): 修复 qwen3 模型 thinking 模式导致 AI 分析输出为空
Some checks failed
CI / security-audit (push) Has been cancelled
CI / frontend-build (push) Has been cancelled
CI / rust-check (push) Has been cancelled
CI / rust-test (push) Has been cancelled

qwen3:4b 默认启用 thinking 模式,流式 API 中 content 字段始终为空,
所有 token 消耗在 thinking 上。修复方案:
- 对 qwen3 模型改用非流式 API,从 content 中剥离 <think>...</think> 块
- 将清理后的内容按句子/段落分块模拟流式输出
- 自动提升 qwen3 的 num_predict 至 4096 确保 thinking + 回复完整
- 流式解析中跳过空 content chunk
- 新增 strip_think_block 函数及 5 个单元测试
This commit is contained in:
iven
2026-05-05 22:55:20 +08:00
parent e9cfbd108a
commit b1a96ace1f

View File

@@ -77,6 +77,20 @@ struct OllamaStreamChunk {
/// Message payload carried by a single streamed chat chunk.
#[derive(Deserialize)]
struct OllamaStreamMessage {
    /// Answer text for this chunk; may be present but empty (e.g. qwen3 while it is thinking).
    content: Option<String>,
    /// Reasoning text delivered in its own field, separate from `content`.
    thinking: Option<String>,
}
/// Removes the reasoning ("think") block that models such as qwen3 embed at the
/// start of `content` in non-streaming mode.
///
/// Everything up to and including the first `</think` closing tag is discarded,
/// together with the tag's optional closing `>` and any whitespace around it.
/// The remainder is returned as the actual response. When no closing tag is
/// present the input is returned unchanged.
fn strip_think_block(content: &str) -> String {
    const CLOSE_TAG: &str = "</think";

    match content.find(CLOSE_TAG) {
        Some(tag_start) => {
            // The tag is pure ASCII, so this offset always lands on a char boundary.
            let rest = content[tag_start + CLOSE_TAG.len()..].trim_start();
            // Drop at most one '>' (the tag's own bracket). Stripping every leading
            // '>' would also eat characters that belong to the answer itself.
            let rest = rest.strip_prefix('>').unwrap_or(rest);
            rest.trim_start().to_string()
        }
        None => content.to_string(),
    }
}
#[async_trait]
@@ -88,9 +102,40 @@ impl AiProvider for OllamaProvider {
let model = if req.model.is_empty() {
self.default_model.clone()
} else {
req.model
req.model.clone()
};
// qwen3 流式模式下 thinking 和 content 分字段,content 始终为空
// 改用非流式请求,然后手动模拟流式输出
let needs_non_stream = model.starts_with("qwen3");
if needs_non_stream {
let mut gen_req = req;
// qwen3 thinking 会消耗大量 token,需要更大预算
if gen_req.max_tokens < 4096 {
gen_req.max_tokens = 4096;
}
gen_req.model = model;
let result = self.generate(gen_req).await?;
let cleaned = strip_think_block(&result.content);
let s = Box::pin(stream! {
// 按句子/段落分块输出,模拟流式效果
let mut buffer = String::new();
for ch in cleaned.chars() {
buffer.push(ch);
let should_flush = ch == '\n' || ch == '。' || ch == '' || ch == '' || ch == '.' || ch == '!' || ch == '?';
if should_flush && !buffer.is_empty() {
yield Ok(std::mem::take(&mut buffer));
}
}
if !buffer.is_empty() {
yield Ok(buffer);
}
});
return Ok(s);
}
let ollama_req = OllamaChatRequest {
model,
messages: vec![
@@ -150,7 +195,9 @@ impl AiProvider for OllamaProvider {
}
if let Some(msg) = chunk.message {
if let Some(content) = msg.content {
yield Ok(content);
if !content.is_empty() {
yield Ok(content);
}
}
}
}
@@ -221,8 +268,14 @@ impl AiProvider for OllamaProvider {
let input_tokens = parsed.prompt_eval_count.unwrap_or(0) as u32;
let output_tokens = parsed.eval_count.unwrap_or(0) as u32;
let content = if model.starts_with("qwen3") {
strip_think_block(&parsed.message.content)
} else {
parsed.message.content
};
Ok(crate::dto::GenerateResponse {
content: parsed.message.content,
content,
model,
input_tokens,
output_tokens,
@@ -341,4 +394,35 @@ mod tests {
);
assert_eq!(provider.base_url, "http://192.168.1.100:11434");
}
#[test]
fn strip_think_block_removes_thinking() {
    // A reasoning block followed by blank lines must collapse to just the answer.
    let raw = "<think\nLet me think about this...\n1+1=2\n</think\n\n2";
    let stripped = strip_think_block(raw);
    assert_eq!(stripped, "2");
}
#[test]
fn strip_think_block_no_think_tag() {
    // Without a closing think tag the text must pass through untouched.
    let plain = "Direct answer without thinking";
    let stripped = strip_think_block(plain);
    assert_eq!(stripped, "Direct answer without thinking");
}
#[test]
fn strip_think_block_empty_after_tag() {
    // Only whitespace follows the closing tag, so nothing should remain.
    let raw = "<think\nthinking</think\n>\n\n";
    let stripped = strip_think_block(raw);
    assert_eq!(stripped, "");
}
#[test]
fn stream_chunk_with_thinking_field() {
    // A chunk whose tokens all went to `thinking` still deserializes, with an
    // empty `content` string alongside the reasoning text.
    let payload = r#"{
"message": {"role": "assistant", "content": "", "thinking": "hmm"},
"done": false
}"#;
    let parsed: OllamaStreamChunk = serde_json::from_str(payload).unwrap();
    assert!(!parsed.done);
    let message = parsed.message.unwrap();
    assert_eq!(message.content.as_deref(), Some(""));
    assert_eq!(message.thinking.as_deref(), Some("hmm"));
}
}