feat(saas): add model groups for cross-provider failover

Model Groups provide logical model names that map to multiple physical models
across providers, with automatic failover when one provider's key pool is
exhausted.

Backend:
- New model_groups + model_group_members tables with FK constraints
- Full CRUD API (7 endpoints) with admin-only write permissions
- Cache layer: DashMap-backed CachedModelGroup with load_from_db
- Relay integration: ModelResolution enum for Direct/Group routing
- Cross-provider failover: sort_candidates_by_quota + OnceLock cache
- Relay failure path: record failure usage + relay_dequeue (fixes queue
  counter leak that caused connection pool exhaustion)
- add_group_member: validate model_id exists before insert

Frontend:
- saas-relay-client: accept getModel() callback for dynamic model selection
- connectionStore: prefer conversationStore.currentModel over first available

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-04 09:56:21 +08:00
parent 9af7b0dd46
commit be0a78a523
11 changed files with 849 additions and 64 deletions
--- a/crates/zclaw-saas/src/cache.rs
+++ b/crates/zclaw-saas/src/cache.rs
@@ -37,15 +37,39 @@ pub struct CachedProvider {
    pub enabled: bool,
 }

+// ============ Model Group cache (cross-provider failover) ============
+
+#[derive(Debug, Clone)]
+pub struct CachedModelGroup { // in-memory copy of one `model_groups` row together with its member rows
+    pub id: String, // `model_groups.id`; member rows are matched to the group through this value
+    pub name: String, // logical model name; used as the key of `AppCache::model_groups`
+    pub display_name: String,
+    pub description: String, // loaded as '' when the column is NULL
+    pub enabled: bool, // `get_model_group` hides the group when this is false
+    pub failover_strategy: String, // loaded as 'quota_aware' when the column is NULL
+    pub members: Vec<CachedGroupMember>, // this group's member rows, kept in the loader's `priority ASC` order
+}
+
+#[derive(Debug, Clone)]
+pub struct CachedGroupMember { // in-memory copy of one `model_group_members` row
+    pub id: String, // `model_group_members.id`
+    pub provider_id: String, // presumably a key of `AppCache::providers` — TODO confirm
+    pub model_id: String, // presumably refers to `models.model_id`, the key of `AppCache::models` — TODO confirm
+    pub priority: i32, // the loader orders member rows by this column, ascending
+    pub enabled: bool, // NOTE(review): not filtered when the cache is built; consumers appear responsible for checking it
+}
+
 // ============ 聚合缓存结构 ============

-/// 全局缓存，持有 Model / Provider / 队列计数器
+/// Global cache holding Models / Providers / Model Groups / queue counters
 #[derive(Debug, Clone)]
 pub struct AppCache {
    /// model_id → CachedModel (key 是 models.model_id，不是 id)
    pub models: Arc<DashMap<String, CachedModel>>,
    /// provider id → CachedProvider
    pub providers: Arc<DashMap<String, CachedProvider>>,
+    /// model group name → CachedModelGroup (maps a logical model name to its list of candidates)
+    pub model_groups: Arc<DashMap<String, CachedModelGroup>>,
    /// account_id → 当前排队/处理中的任务数
    pub relay_queue_counts: Arc<DashMap<String, Arc<AtomicI64>>>,
 }
@@ -55,6 +79,7 @@ impl AppCache {
        Self {
            models: Arc::new(DashMap::new()),
            providers: Arc::new(DashMap::new()),
+            model_groups: Arc::new(DashMap::new()),
            relay_queue_counts: Arc::new(DashMap::new()),
        }
    }
@@ -104,10 +129,44 @@ impl AppCache {
            });
        }

+        // Load model groups with members
+        let group_rows: Vec<(String, String, String, String, bool, String)> = sqlx::query_as(
+            "SELECT id, name, display_name, COALESCE(description, ''), enabled, COALESCE(failover_strategy, 'quota_aware') FROM model_groups"
+        ).fetch_all(db).await?;
+
+        let member_rows: Vec<(String, String, String, String, i32, bool)> = sqlx::query_as(
+            "SELECT id, group_id, provider_id, model_id, priority, enabled \
+             FROM model_group_members ORDER BY priority ASC"
+        ).fetch_all(db).await?;
+
+        self.model_groups.clear();
+        for (id, name, display_name, description, enabled, failover_strategy) in &group_rows {
+            let members: Vec<CachedGroupMember> = member_rows.iter()
+                .filter(|(_, gid, _, _, _, _)| gid == id)
+                .map(|(mid, _, pid, mid2, pri, en)| CachedGroupMember {
+                    id: mid.clone(),
+                    provider_id: pid.clone(),
+                    model_id: mid2.clone(),
+                    priority: *pri,
+                    enabled: *en,
+                })
+                .collect();
+            self.model_groups.insert(name.clone(), CachedModelGroup {
+                id: id.clone(),
+                name: name.clone(),
+                display_name: display_name.clone(),
+                description: description.clone(),
+                enabled: *enabled,
+                failover_strategy: failover_strategy.clone(),
+                members,
+            });
+        }
+
        tracing::info!(
-            "Cache loaded: {} providers, {} models",
+            "Cache loaded: {} providers, {} models, {} model groups",
            self.providers.len(),
-            self.models.len()
+            self.models.len(),
+            self.model_groups.len()
        );
        Ok(())
    }
@@ -183,6 +242,13 @@ impl AppCache {
            .map(|r| r.value().clone())
    }

+    /// Looks up an enabled model group by its logical model name. O(1) DashMap lookup.
+    pub fn get_model_group(&self, name: &str) -> Option<CachedModelGroup> {
+        self.model_groups.get(name)
+            .filter(|g| g.enabled) // a disabled group is reported as `None`, the same as a missing one
+            .map(|r| r.value().clone()) // return an owned clone of the cached entry, members included
+    }
+
    // ============ 缓存失效 ============

    /// 清除 model 缓存中的指定条目（Admin CRUD 后调用）
@@ -204,4 +270,9 @@ impl AppCache {
    pub fn invalidate_all_providers(&self) {
        self.providers.clear();
    }
+
+    /// Clears the entire model group cache.
+    pub fn invalidate_all_model_groups(&self) {
+        self.model_groups.clear(); // until entries are inserted again, `get_model_group` returns `None` for every name
+    }
 }