perf(chat): 回复效率 + 建议生成并行化优化
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

- identity prompt 缓存: LazyLock<RwLock<HashMap>> 缓存已构建的 identity prompt,
  soul.md 更新时自动失效, 省去每次请求的 mutex + 磁盘 I/O (~0.5-1s)
- pre-conversation hook 并行化: tokio::join! 并行执行 identity build 和
  continuity context 查询, 不再串行等待 (~1-2s)
- suggestion context 预取: 流式回复期间提前启动 fetchSuggestionContext,
  回复结束时 context 已就绪 (~0.5-1s)
- 建议生成与 memory extraction 解耦: generateLLMSuggestions 不再等待
  memory extraction LLM 调用完成, 独立启动 (~3-8s)
- Path B (agent stream) 补全 context: lifecycle:end 路径使用预取 context,
  修复零个性化问题
- 上下文窗口扩展: slice(-6) → slice(-20), 每条截断 200 字符
- suggestion prompt 重写: 1 深入追问 + 1 实用行动 + 1 管家关怀,
  明确角色定位, 禁止空泛建议
This commit is contained in:
iven
2026-04-23 23:13:20 +08:00
parent 10497362bb
commit 5cf7adff69
3 changed files with 141 additions and 53 deletions

View File

@@ -646,22 +646,24 @@ const HARDCODED_PROMPTS: Record<string, { system: string; user: (arg: string) =>
},
suggestions: {
system: `你是对话分析助手和智能管家。根据对话内容和用户画像信息,生成 3 个个性化建议。
system: `你是 ZCLAW 的管家助手,需要站在用户角度思考他们真正需要什么,生成 3 个个性化建议。
## 生成规则
1. 2 条对话续问(深入当前话题,帮助用户继续探索）
2. 1 条管家关怀(基于用户消息中提供的痛点、经验或技能信息）
1. 第 1 条 — 深入追问:基于当前话题,提出一个有洞察力的追问,帮助用户深入探索
2. 第 2 条 — 实用行动:建议一个具体的、可操作的下一步(调用技能、执行工具、查看数据等）
3. 第 3 条 — 管家关怀:
- 如果有未解决痛点 → 回访建议,如"上次你提到X后来解决了吗"
- 如果有相关经验 → 引导复用,如"上次用X方法解决了类似问题要再试试吗"
- 如果有匹配技能 → 推荐使用,如"你可以试试 [技能名] 来处理这个"
- 如果没有提供痛点/经验/技能信息 → 全部生成对话续问
3. 每个不超过 30 个中文字符
4. 不要重复对话中已讨论过的内容
5. 使用与用户相同的语言
- 如果没有提供痛点/经验/技能信息 → 给出一个启发性的思考角度
4. 每个不超过 30 个中文字符
5. 不要重复对话中已讨论过的内容
6. 不要生成空泛的建议(如"继续分析"、"换个角度"）
7. 使用与用户相同的语言
只输出 JSON 数组,包含恰好 3 个字符串。不要输出任何其他内容。
示例:["科室绩效分析可以按哪些维度拆解?", "上次的 researcher 技能能用在查房数据整理上吗?", "自动生成合规检查报告的模板有哪些"]`,
user: (context: string) => `以下是对话中最近的消息:\n\n${context}\n\n请生成 3 个后续问题`,
示例:["科室绩效分析可以按哪些维度拆解?", " researcher 技能帮你查一下相关文献?", "上次你提到排班冲突的问题,需要我再帮你想解决方案吗"]`,
user: (context: string) => `以下是对话中最近的消息:\n\n${context}\n\n请生成 3 个后续建议（1 深入追问 + 1 实用行动 + 1 管家关怀）`,
},
};

View File

@@ -40,6 +40,10 @@ import { fetchSuggestionContext, type SuggestionContext } from '../../lib/sugges
const log = createLogger('StreamStore');
// Module-level prefetch for suggestion context — started during streaming,
// consumed on stream completion. Saves ~0.5-1s vs fetching after stream ends.
let _activeSuggestionContextPrefetch: Promise<SuggestionContext> | null = null;
// ---------------------------------------------------------------------------
// Error formatting — convert raw LLM/API errors to user-friendly messages
// ---------------------------------------------------------------------------
@@ -400,44 +404,50 @@ function createCompleteHandler(
}
}
// Parallel: memory extraction + intelligence context fetch
// Decoupled: suggestion generation runs immediately with prefetched context,
// memory extraction + reflection run independently in background.
const filtered = msgs
.filter(m => m.role === 'user' || m.role === 'assistant')
.map(m => ({ role: m.role, content: m.content }));
const convId = useConversationStore.getState().currentConversationId;
const lastUserContent = typeof lastContent === 'string' ? lastContent : '';
const suggestionContextPromise = fetchSuggestionContext(agentId, lastUserContent);
// Build conversation messages for suggestions
const latestMsgs = chat.getMessages() || [];
const conversationMessages = latestMsgs
.filter(m => m.role === 'user' || m.role === 'assistant')
.filter(m => !m.streaming)
.map(m => ({ role: m.role, content: m.content }));
// Fire-and-forget background tasks
Promise.all([
getMemoryExtractor().extractFromConversation(filtered, agentId, convId ?? undefined)
.catch(err => log.warn('Memory extraction failed:', err)),
intelligenceClient.reflection.recordConversation()
.catch(err => log.warn('Recording conversation failed:', err)),
suggestionContextPromise,
]).then(([, , context]) => {
// Conditional reflection (after context is ready)
intelligenceClient.reflection.shouldReflect().then(shouldReflect => {
// Consume prefetched context (started in sendMessage during streaming)
const prefetchPromise = _activeSuggestionContextPrefetch;
_activeSuggestionContextPrefetch = null;
// Fire suggestion generation immediately — don't wait for memory extraction
const fireSuggestions = (ctx?: SuggestionContext) => {
generateLLMSuggestions(conversationMessages, set, ctx).catch(err => {
log.warn('Suggestion generation error:', err);
set({ suggestionsLoading: false });
});
};
if (prefetchPromise) {
prefetchPromise.then(fireSuggestions).catch(() => fireSuggestions());
} else {
fireSuggestions();
}
// Background tasks run independently — never block suggestions
getMemoryExtractor().extractFromConversation(filtered, agentId, convId ?? undefined)
.catch(err => log.warn('Memory extraction failed:', err));
intelligenceClient.reflection.recordConversation()
.catch(err => log.warn('Recording conversation failed:', err))
.then(() => intelligenceClient.reflection.shouldReflect())
.then(shouldReflect => {
if (shouldReflect) {
intelligenceClient.reflection.reflect(agentId, []).catch(err => {
log.warn('Reflection failed:', err);
});
}
});
// Follow-up suggestions with enriched context
const latestMsgs = chat.getMessages() || [];
const conversationMessages = latestMsgs
.filter(m => m.role === 'user' || m.role === 'assistant')
.filter(m => !m.streaming)
.map(m => ({ role: m.role, content: m.content }));
generateLLMSuggestions(conversationMessages, set, context).catch(err => {
log.warn('Suggestion generation error:', err);
set({ suggestionsLoading: false });
});
});
}).catch(() => {});
};
}
@@ -573,9 +583,9 @@ async function generateLLMSuggestions(
set({ suggestionsLoading: true });
try {
const recentMessages = messages.slice(-6);
const recentMessages = messages.slice(-20);
const conversationContext = recentMessages
.map(m => `${m.role === 'user' ? '用户' : '助手'}: ${m.content}`)
.map(m => `${m.role === 'user' ? '用户' : '助手'}: ${m.content.slice(0, 200)}`)
.join('\n\n');
// Build dynamic user message with intelligence context
@@ -799,6 +809,9 @@ export const useStreamStore = create<StreamState>()(
});
set({ isStreaming: true, activeRunId: null });
// Prefetch suggestion context during streaming — saves ~0.5-1s post-stream
_activeSuggestionContextPrefetch = fetchSuggestionContext(agentId, content);
// Delta buffer — batches updates at ~60fps
const buffer = new DeltaBuffer(assistantId, _chat);
@@ -1072,10 +1085,20 @@ export const useStreamStore = create<StreamState>()(
.filter(m => !m.streaming)
.map(m => ({ role: m.role, content: m.content }));
generateLLMSuggestions(conversationMessages, set).catch(err => {
log.warn('Suggestion generation error:', err);
set({ suggestionsLoading: false });
});
// Path B: use prefetched context for agent stream — fixes zero-personalization
const prefetchPromise = _activeSuggestionContextPrefetch;
_activeSuggestionContextPrefetch = null;
const fireSuggestions = (ctx?: SuggestionContext) => {
generateLLMSuggestions(conversationMessages, set, ctx).catch(err => {
log.warn('Suggestion generation error:', err);
set({ suggestionsLoading: false });
});
};
if (prefetchPromise) {
prefetchPromise.then(fireSuggestions).catch(() => fireSuggestions());
} else {
fireSuggestions();
}
}
}
} else if (delta.stream === 'hand') {