perf(runtime): Hermes Phases 1–3 — prompt caching + parallel tools + smart retry

Phase 1: Anthropic prompt caching
- Add cache_control ephemeral on system prompt blocks
- Track cache_creation/cache_read tokens in CompletionResponse + StreamChunk

Phase 2A: Parallel tool execution
- Add ToolConcurrency enum (ReadOnly/Exclusive/Interactive)
- JoinSet + Semaphore(3) for bounded parallel tool calls
- 7 tools annotated with their correct concurrency levels
- AtomicU32 for lock-free failure tracking in ToolErrorMiddleware

Phase 2B: Tool output pruning
- prune_tool_outputs() truncates old ToolResult outputs longer than 2000 chars down to 500 chars
- Integrated into CompactionMiddleware before token estimation

Phase 3: Error classification + smart retry
- LlmErrorKind + ClassifiedLlmError for structured error mapping
- RetryDriver decorator with jittered exponential backoff
- Kernel wraps all LLM calls with RetryDriver
- CONTEXT_OVERFLOW recovery triggers emergency compaction in loop_runner
This commit is contained in:
iven
2026-04-24 08:39:56 +08:00
parent 6d6673bf5b
commit 9060935401
25 changed files with 672 additions and 129 deletions

View File

@@ -31,6 +31,8 @@ async fn seam_hand_tool_routing() {
input_tokens: 10,
output_tokens: 20,
stop_reason: "tool_use".to_string(),
cache_creation_input_tokens: None,
cache_read_input_tokens: None,
},
])
// Second stream: final text after tool executes
@@ -40,6 +42,8 @@ async fn seam_hand_tool_routing() {
input_tokens: 10,
output_tokens: 5,
stop_reason: "end_turn".to_string(),
cache_creation_input_tokens: None,
cache_read_input_tokens: None,
},
]);
@@ -105,6 +109,8 @@ async fn seam_hand_execution_callback() {
input_tokens: 10,
output_tokens: 5,
stop_reason: "tool_use".to_string(),
cache_creation_input_tokens: None,
cache_read_input_tokens: None,
},
])
.with_stream_chunks(vec![
@@ -113,6 +119,8 @@ async fn seam_hand_execution_callback() {
input_tokens: 5,
output_tokens: 1,
stop_reason: "end_turn".to_string(),
cache_creation_input_tokens: None,
cache_read_input_tokens: None,
},
]);
@@ -173,6 +181,8 @@ async fn seam_generic_tool_routing() {
input_tokens: 10,
output_tokens: 5,
stop_reason: "tool_use".to_string(),
cache_creation_input_tokens: None,
cache_read_input_tokens: None,
},
])
.with_stream_chunks(vec![
@@ -181,6 +191,8 @@ async fn seam_generic_tool_routing() {
input_tokens: 5,
output_tokens: 3,
stop_reason: "end_turn".to_string(),
cache_creation_input_tokens: None,
cache_read_input_tokens: None,
},
]);

View File

@@ -27,6 +27,8 @@ async fn smoke_hands_full_lifecycle() {
input_tokens: 15,
output_tokens: 10,
stop_reason: "tool_use".to_string(),
cache_creation_input_tokens: None,
cache_read_input_tokens: None,
},
])
// After hand_quiz returns, LLM generates final response
@@ -36,6 +38,8 @@ async fn smoke_hands_full_lifecycle() {
input_tokens: 20,
output_tokens: 5,
stop_reason: "end_turn".to_string(),
cache_creation_input_tokens: None,
cache_read_input_tokens: None,
},
]);