fix(saas): harden model group failover + relay reliability
Some checks failed
CI / Lint & TypeCheck (push) Has been cancelled
CI / Unit Tests (push) Has been cancelled
CI / Build Frontend (push) Has been cancelled
CI / Rust Check (push) Has been cancelled
CI / Security Scan (push) Has been cancelled
CI / E2E Tests (push) Has been cancelled

- cache: insert-then-retain pattern avoids empty-window race during refresh
- relay: manage_task_status flag for proper failover state transitions
- relay: retry_task re-resolves model groups instead of blind provider reuse
- relay: filter empty-member groups from available models list
- relay: clean up stale quota cache entries (expiry at 5x TTL)
- error: from_sqlx_unique helper for 409 vs 500 distinction
- model_config: unique constraint handling, duplicate member check
- model_config: failover_strategy whitelist, model_id vs group name conflict check
- model_config: group-scoped member removal with group_id validation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
iven
2026-04-04 12:26:55 +08:00
parent 894c0d7b15
commit 5c48d62f7e
6 changed files with 221 additions and 64 deletions

View File

@@ -84,52 +84,60 @@ impl AppCache {
}
}
/// 从 DB 全量加载 models + providers
/// 从 DB 全量加载 models + providers + model_groups
///
/// 使用 insert-then-retain 模式避免 clear+repopulate 竞态窗口:
/// 先插入所有新数据(覆盖旧值),再删除不在新数据中的陈旧条目。
/// 这确保缓存从不出现空窗期。
pub async fn load_from_db(&self, db: &PgPool) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
// Load providers
use std::collections::HashSet;
// Load providers — insert-then-retain 避免空窗
let provider_rows: Vec<(String, String, String, String, String, bool)> = sqlx::query_as(
"SELECT id, name, display_name, base_url, api_protocol, enabled FROM providers"
).fetch_all(db).await?;
self.providers.clear();
for (id, name, display_name, base_url, api_protocol, enabled) in provider_rows {
let provider_keys: HashSet<String> = provider_rows.iter().map(|(id, ..)| id.clone()).collect();
for (id, name, display_name, base_url, api_protocol, enabled) in &provider_rows {
self.providers.insert(id.clone(), CachedProvider {
id,
name,
display_name,
base_url,
api_protocol,
enabled,
id: id.clone(),
name: name.clone(),
display_name: display_name.clone(),
base_url: base_url.clone(),
api_protocol: api_protocol.clone(),
enabled: *enabled,
});
}
self.providers.retain(|k, _| provider_keys.contains(k));
// Load models (key = model_id for relay lookup)
// Load models (key = model_id for relay lookup) — insert-then-retain
let model_rows: Vec<(String, String, String, String, i64, i64, bool, bool, bool, f64, f64)> = sqlx::query_as(
"SELECT id, provider_id, model_id, alias, context_window, max_output_tokens,
supports_streaming, supports_vision, enabled, pricing_input, pricing_output
FROM models"
).fetch_all(db).await?;
self.models.clear();
let model_keys: HashSet<String> = model_rows.iter().map(|(_, _, mid, ..)| mid.clone()).collect();
for (id, provider_id, model_id, alias, context_window, max_output_tokens,
supports_streaming, supports_vision, enabled, pricing_input, pricing_output) in model_rows
supports_streaming, supports_vision, enabled, pricing_input, pricing_output) in &model_rows
{
self.models.insert(model_id.clone(), CachedModel {
id,
provider_id,
id: id.clone(),
provider_id: provider_id.clone(),
model_id: model_id.clone(),
alias,
context_window,
max_output_tokens,
supports_streaming,
supports_vision,
enabled,
pricing_input,
pricing_output,
alias: alias.clone(),
context_window: *context_window,
max_output_tokens: *max_output_tokens,
supports_streaming: *supports_streaming,
supports_vision: *supports_vision,
enabled: *enabled,
pricing_input: *pricing_input,
pricing_output: *pricing_output,
});
}
self.models.retain(|k, _| model_keys.contains(k));
// Load model groups with members
// Load model groups with members — insert-then-retain
let group_rows: Vec<(String, String, String, String, bool, String)> = sqlx::query_as(
"SELECT id, name, display_name, COALESCE(description, ''), enabled, COALESCE(failover_strategy, 'quota_aware') FROM model_groups"
).fetch_all(db).await?;
@@ -139,7 +147,7 @@ impl AppCache {
FROM model_group_members ORDER BY priority ASC"
).fetch_all(db).await?;
self.model_groups.clear();
let group_keys: HashSet<String> = group_rows.iter().map(|(_, name, ..)| name.clone()).collect();
for (id, name, display_name, description, enabled, failover_strategy) in &group_rows {
let members: Vec<CachedGroupMember> = member_rows.iter()
.filter(|(_, gid, _, _, _, _)| gid == id)
@@ -161,6 +169,7 @@ impl AppCache {
members,
});
}
self.model_groups.retain(|k, _| group_keys.contains(k));
tracing::info!(
"Cache loaded: {} providers, {} models, {} model groups",