// NOTE(review): changelog text was pasted above the module header — kept as a
// plain comment so the file compiles; move it to the commit message/CHANGELOG.
//
// Phase 1: Anthropic prompt caching
//   - Add cache_control ephemeral on system prompt blocks
//   - Track cache_creation/cache_read tokens in CompletionResponse + StreamChunk
// Phase 2A: Parallel tool execution
//   - Add ToolConcurrency enum (ReadOnly/Exclusive/Interactive)
//   - JoinSet + Semaphore(3) for bounded parallel tool calls
//   - 7 tools annotated with correct concurrency level
//   - AtomicU32 for lock-free failure tracking in ToolErrorMiddleware
// Phase 2B: Tool output pruning
//   - prune_tool_outputs() trims old ToolResult > 2000 chars to 500 chars
//   - Integrated into CompactionMiddleware before token estimation
// Phase 3: Error classification + smart retry
//   - LlmErrorKind + ClassifiedLlmError for structured error mapping
//   - RetryDriver decorator with jittered exponential backoff
//   - Kernel wraps all LLM calls with RetryDriver
//   - CONTEXT_OVERFLOW recovery triggers emergency compaction in loop_runner
//! LLM Driver trait and implementations
//!
//! This module provides a unified interface for multiple LLM providers.
|
use async_trait::async_trait;
use futures::Stream;
use secrecy::SecretString;
use serde::{Deserialize, Serialize};
use std::pin::Pin;
use zclaw_types::Result;

use crate::stream::StreamChunk;

mod anthropic;
mod error_classifier;
mod gemini;
mod local;
mod openai;
mod retry_driver;

pub use anthropic::AnthropicDriver;
pub use gemini::GeminiDriver;
pub use local::LocalDriver;
pub use openai::OpenAiDriver;
pub use retry_driver::{RetryConfig, RetryDriver};
|
/// LLM Driver trait - unified interface for all providers.
///
/// Implemented by [`AnthropicDriver`], [`OpenAiDriver`], [`GeminiDriver`] and
/// [`LocalDriver`]; [`RetryDriver`] wraps another driver to add retry behavior.
/// `Send + Sync` so a driver can be shared across async tasks.
#[async_trait]
pub trait LlmDriver: Send + Sync {
    /// Get the provider name (short stable identifier, e.g. for logging/routing).
    fn provider(&self) -> &str;

    /// Send a completion request and wait for the full response.
    ///
    /// # Errors
    /// Returns an error on transport, authentication, or provider failures.
    async fn complete(&self, request: CompletionRequest) -> Result<CompletionResponse>;

    /// Send a streaming completion request.
    ///
    /// Returns a stream of chunks; failures surface as `Err` items within
    /// the stream rather than failing the call up front.
    fn stream(
        &self,
        request: CompletionRequest,
    ) -> Pin<Box<dyn Stream<Item = Result<StreamChunk>> + Send + '_>>;

    /// Check if the driver is properly configured (e.g. has the credentials
    /// it needs). Synchronous — intended as a cheap readiness probe.
    fn is_configured(&self) -> bool;
}
|
|
|
|
/// A provider-agnostic completion request.
///
/// Built once by the caller and translated by each [`LlmDriver`]
/// implementation into the provider's native wire format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompletionRequest {
    /// Model identifier (provider-specific string).
    pub model: String,
    /// System prompt; `None` sends no system message.
    pub system: Option<String>,
    /// Conversation messages.
    pub messages: Vec<zclaw_types::Message>,
    /// Available tools the model may call; empty disables tool use.
    pub tools: Vec<ToolDefinition>,
    /// Maximum tokens to generate; `None` defers to the driver/provider default.
    pub max_tokens: Option<u32>,
    /// Sampling temperature (0.0 - 1.0); `None` defers to the provider default.
    pub temperature: Option<f32>,
    /// Stop sequences that end generation early.
    pub stop: Vec<String>,
    /// Enable streaming responses.
    pub stream: bool,
    /// Enable extended thinking/reasoning (deserializes to `false` when absent).
    #[serde(default)]
    pub thinking_enabled: bool,
    /// Reasoning effort level (for providers that support it);
    /// semantics are provider-defined.
    #[serde(default)]
    pub reasoning_effort: Option<String>,
    /// Enable plan mode (deserializes to `false` when absent).
    #[serde(default)]
    pub plan_mode: bool,
}
|
|
|
|
impl Default for CompletionRequest {
|
|
fn default() -> Self {
|
|
Self {
|
|
model: String::new(),
|
|
system: None,
|
|
messages: Vec::new(),
|
|
tools: Vec::new(),
|
|
max_tokens: Some(4096),
|
|
temperature: Some(0.7),
|
|
stop: Vec::new(),
|
|
stream: false,
|
|
thinking_enabled: false,
|
|
reasoning_effort: None,
|
|
plan_mode: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Tool definition for LLM function calling.
/// Re-exported from `zclaw_types::tool::ToolDefinition` (canonical definition)
/// so driver code and callers share one type.
pub use zclaw_types::tool::ToolDefinition;
|
|
|
|
/// A provider-agnostic completion response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompletionResponse {
    /// Generated content blocks (text, thinking, tool calls, tool results).
    pub content: Vec<ContentBlock>,
    /// Model that served the request.
    pub model: String,
    /// Input (prompt) tokens consumed.
    pub input_tokens: u32,
    /// Output (completion) tokens generated.
    pub output_tokens: u32,
    /// Why generation stopped.
    pub stop_reason: StopReason,
    /// Cache creation input tokens (Anthropic prompt caching);
    /// `None` for providers without caching support.
    #[serde(default)]
    pub cache_creation_input_tokens: Option<u32>,
    /// Cache read input tokens (Anthropic prompt caching);
    /// `None` for providers without caching support.
    #[serde(default)]
    pub cache_read_input_tokens: Option<u32>,
}
|
|
|
|
/// LLM driver response content block (subset of canonical zclaw_types::ContentBlock).
|
|
/// Used internally by Anthropic/OpenAI/Gemini/Local drivers for API response parsing.
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
#[serde(tag = "type", rename_all = "snake_case")]
|
|
pub enum ContentBlock {
|
|
Text { text: String },
|
|
Thinking { thinking: String },
|
|
ToolUse { id: String, name: String, input: serde_json::Value },
|
|
/// Anthropic API tool result — must be sent as `role: "user"` with this content block.
|
|
ToolResult {
|
|
tool_use_id: String,
|
|
content: String,
|
|
#[serde(skip_serializing_if = "std::ops::Not::not")]
|
|
is_error: bool,
|
|
},
|
|
}
|
|
|
|
/// Stop reason
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
#[serde(rename_all = "snake_case")]
|
|
pub enum StopReason {
|
|
EndTurn,
|
|
MaxTokens,
|
|
StopSequence,
|
|
ToolUse,
|
|
Error,
|
|
}
|
|
|
|
/// Driver configuration — one variant per supported provider backend.
///
/// API keys are held as [`SecretString`] so they are redacted from
/// `Debug` output rather than printed in logs.
#[derive(Debug, Clone)]
pub enum DriverConfig {
    /// Anthropic (Claude) API.
    Anthropic { api_key: SecretString },
    /// OpenAI API; `base_url` overrides the default endpoint for
    /// OpenAI-compatible services.
    OpenAi { api_key: SecretString, base_url: Option<String> },
    /// Google Gemini API.
    Gemini { api_key: SecretString },
    /// Local server (e.g. Ollama); no API key required.
    Local { base_url: String },
}
|
|
|
|
impl DriverConfig {
|
|
pub fn anthropic(api_key: impl Into<String>) -> Self {
|
|
Self::Anthropic {
|
|
api_key: SecretString::new(api_key.into()),
|
|
}
|
|
}
|
|
|
|
pub fn openai(api_key: impl Into<String>) -> Self {
|
|
Self::OpenAi {
|
|
api_key: SecretString::new(api_key.into()),
|
|
base_url: None,
|
|
}
|
|
}
|
|
|
|
pub fn openai_with_base(api_key: impl Into<String>, base_url: impl Into<String>) -> Self {
|
|
Self::OpenAi {
|
|
api_key: SecretString::new(api_key.into()),
|
|
base_url: Some(base_url.into()),
|
|
}
|
|
}
|
|
|
|
pub fn gemini(api_key: impl Into<String>) -> Self {
|
|
Self::Gemini {
|
|
api_key: SecretString::new(api_key.into()),
|
|
}
|
|
}
|
|
|
|
pub fn ollama() -> Self {
|
|
Self::Local {
|
|
base_url: "http://localhost:11434".to_string(),
|
|
}
|
|
}
|
|
|
|
pub fn local(base_url: impl Into<String>) -> Self {
|
|
Self::Local {
|
|
base_url: base_url.into(),
|
|
}
|
|
}
|
|
}
|