perf(runtime): Hermes Phase 1-3 — prompt caching + parallel tools + smart retry
Phase 1: Anthropic prompt caching - Add cache_control ephemeral on system prompt blocks - Track cache_creation/cache_read tokens in CompletionResponse + StreamChunk Phase 2A: Parallel tool execution - Add ToolConcurrency enum (ReadOnly/Exclusive/Interactive) - JoinSet + Semaphore(3) for bounded parallel tool calls - 7 tools annotated with correct concurrency level - AtomicU32 for lock-free failure tracking in ToolErrorMiddleware Phase 2B: Tool output pruning - prune_tool_outputs() trims old ToolResult > 2000 chars to 500 chars - Integrated into CompactionMiddleware before token estimation Phase 3: Error classification + smart retry - LlmErrorKind + ClassifiedLlmError for structured error mapping - RetryDriver decorator with jittered exponential backoff - Kernel wraps all LLM calls with RetryDriver - CONTEXT_OVERFLOW recovery triggers emergency compaction in loop_runner
This commit is contained in:
@@ -15,11 +15,14 @@ mod anthropic;
|
||||
mod openai;
|
||||
mod gemini;
|
||||
mod local;
|
||||
mod error_classifier;
|
||||
mod retry_driver;
|
||||
|
||||
pub use anthropic::AnthropicDriver;
|
||||
pub use openai::OpenAiDriver;
|
||||
pub use gemini::GeminiDriver;
|
||||
pub use local::LocalDriver;
|
||||
pub use retry_driver::{RetryDriver, RetryConfig};
|
||||
|
||||
/// LLM Driver trait - unified interface for all providers
|
||||
#[async_trait]
|
||||
@@ -106,6 +109,12 @@ pub struct CompletionResponse {
|
||||
pub output_tokens: u32,
|
||||
/// Stop reason
|
||||
pub stop_reason: StopReason,
|
||||
/// Cache creation input tokens (Anthropic prompt caching)
|
||||
#[serde(default)]
|
||||
pub cache_creation_input_tokens: Option<u32>,
|
||||
/// Cache read input tokens (Anthropic prompt caching)
|
||||
#[serde(default)]
|
||||
pub cache_read_input_tokens: Option<u32>,
|
||||
}
|
||||
|
||||
/// LLM driver response content block (subset of canonical zclaw_types::ContentBlock).
|
||||
|
||||
Reference in New Issue
Block a user