fix(ai): 修复 qwen3 模型 thinking 模式导致 AI 分析输出为空
qwen3:4b 默认启用 thinking 模式,流式 API 中 content 字段始终为空,所有 token 都消耗在 thinking 上。

修复方案:
- 对 qwen3 模型改用非流式 API,从 content 中剥离 <think>...</think> 块
- 将清理后的内容按句子/段落分块,模拟流式输出
- 自动将 qwen3 的 num_predict 提升至 4096,确保 thinking 与回复都能完整输出
- 流式解析中跳过空的 content chunk
- 新增 strip_think_block 函数及 5 个单元测试
This commit is contained in:
@@ -77,6 +77,20 @@ struct OllamaStreamChunk {
|
||||
#[derive(Deserialize)]
|
||||
struct OllamaStreamMessage {
|
||||
content: Option<String>,
|
||||
thinking: Option<String>,
|
||||
}
|
||||
|
||||
/// Strips the reasoning ("thinking") preamble that qwen3-style models embed in
/// `content` when they are queried through the non-streaming API.
///
/// Everything up to and including the first `</think` closing marker is
/// discarded. The marker may be terminated as `</think>`, as `</think\n>`, or
/// not at all (a bare `</think` followed by newlines). Any whitespace around
/// the optional closing `>` and in front of the real answer is trimmed.
///
/// Exactly one `>` is consumed as the tag terminator, so an answer that itself
/// begins with `>` (for example a markdown blockquote) keeps its markers.
///
/// # Returns
///
/// The text following the closing marker, or an unchanged copy of `content`
/// when no `</think` marker is present.
fn strip_think_block(content: &str) -> String {
    /// Closing marker without its `>`, because the terminator may be separated
    /// from the tag name by whitespace or be missing entirely.
    const CLOSE_TAG: &str = "</think";

    match content.split_once(CLOSE_TAG) {
        Some((_reasoning, rest)) => {
            // Skip whitespace between the tag name and its optional `>`.
            let rest = rest.trim_start();
            // Consume at most one `>`: it terminates the tag; any further `>`
            // characters belong to the model's answer.
            let answer = rest.strip_prefix('>').unwrap_or(rest);
            answer.trim_start().to_string()
        }
        None => content.to_string(),
    }
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -88,9 +102,40 @@ impl AiProvider for OllamaProvider {
|
||||
let model = if req.model.is_empty() {
|
||||
self.default_model.clone()
|
||||
} else {
|
||||
req.model
|
||||
req.model.clone()
|
||||
};
|
||||
|
||||
// qwen3 流式模式下 thinking 和 content 分字段,content 始终为空
|
||||
// 改用非流式请求,然后手动模拟流式输出
|
||||
let needs_non_stream = model.starts_with("qwen3");
|
||||
|
||||
if needs_non_stream {
|
||||
let mut gen_req = req;
|
||||
// qwen3 thinking 会消耗大量 token,需要更大预算
|
||||
if gen_req.max_tokens < 4096 {
|
||||
gen_req.max_tokens = 4096;
|
||||
}
|
||||
gen_req.model = model;
|
||||
let result = self.generate(gen_req).await?;
|
||||
let cleaned = strip_think_block(&result.content);
|
||||
|
||||
let s = Box::pin(stream! {
|
||||
// 按句子/段落分块输出,模拟流式效果
|
||||
let mut buffer = String::new();
|
||||
for ch in cleaned.chars() {
|
||||
buffer.push(ch);
|
||||
let should_flush = ch == '\n' || ch == '。' || ch == '!' || ch == '?' || ch == '.' || ch == '!' || ch == '?';
|
||||
if should_flush && !buffer.is_empty() {
|
||||
yield Ok(std::mem::take(&mut buffer));
|
||||
}
|
||||
}
|
||||
if !buffer.is_empty() {
|
||||
yield Ok(buffer);
|
||||
}
|
||||
});
|
||||
return Ok(s);
|
||||
}
|
||||
|
||||
let ollama_req = OllamaChatRequest {
|
||||
model,
|
||||
messages: vec![
|
||||
@@ -150,7 +195,9 @@ impl AiProvider for OllamaProvider {
|
||||
}
|
||||
if let Some(msg) = chunk.message {
|
||||
if let Some(content) = msg.content {
|
||||
yield Ok(content);
|
||||
if !content.is_empty() {
|
||||
yield Ok(content);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -221,8 +268,14 @@ impl AiProvider for OllamaProvider {
|
||||
let input_tokens = parsed.prompt_eval_count.unwrap_or(0) as u32;
|
||||
let output_tokens = parsed.eval_count.unwrap_or(0) as u32;
|
||||
|
||||
let content = if model.starts_with("qwen3") {
|
||||
strip_think_block(&parsed.message.content)
|
||||
} else {
|
||||
parsed.message.content
|
||||
};
|
||||
|
||||
Ok(crate::dto::GenerateResponse {
|
||||
content: parsed.message.content,
|
||||
content,
|
||||
model,
|
||||
input_tokens,
|
||||
output_tokens,
|
||||
@@ -341,4 +394,35 @@ mod tests {
|
||||
);
|
||||
assert_eq!(provider.base_url, "http://192.168.1.100:11434");
|
||||
}
|
||||
|
||||
#[test]
fn strip_think_block_removes_thinking() {
    // A reasoning section followed by blank lines and then the real answer:
    // only the answer may survive.
    let raw = "<think\nLet me think about this...\n1+1=2\n</think\n\n2";

    let answer = strip_think_block(raw);

    assert_eq!(answer, "2");
}
|
||||
|
||||
#[test]
fn strip_think_block_no_think_tag() {
    // Without any closing think marker the text must come back untouched.
    let plain = "Direct answer without thinking";

    let answer = strip_think_block(plain);

    assert_eq!(answer, plain);
}
|
||||
|
||||
#[test]
fn strip_think_block_empty_after_tag() {
    // Only reasoning plus trailing whitespace: nothing is left once the
    // block and the whitespace are removed.
    let only_reasoning = "<think\nthinking</think\n>\n\n";

    let remainder = strip_think_block(only_reasoning);

    assert_eq!(remainder, "");
}
|
||||
|
||||
#[test]
fn stream_chunk_with_thinking_field() {
    // A streamed chunk whose visible content is empty while the reasoning
    // text arrives under the separate `thinking` key.
    let payload = r#"{
        "message": {"role": "assistant", "content": "", "thinking": "hmm"},
        "done": false
    }"#;

    let parsed: OllamaStreamChunk = serde_json::from_str(payload).unwrap();
    assert!(!parsed.done);

    // Both keys must be captured, including the empty `content` string.
    let message = parsed.message.unwrap();
    assert_eq!(message.content.as_deref(), Some(""));
    assert_eq!(message.thinking.as_deref(), Some("hmm"));
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user