release(v0.2.0): streaming, MCP protocol, Browser Hand, security enhancements

## Major Features

### Streaming Response System
- Implement LlmDriver trait with `stream()` method returning async Stream
- Add SSE parsing for Anthropic and OpenAI API streaming
- Integrate Tauri event system for frontend streaming (`stream:chunk` events)
- Add StreamChunk types: Delta, ToolStart, ToolEnd, Complete, Error

### MCP Protocol Implementation
- Add MCP JSON-RPC 2.0 types (mcp_types.rs)
- Implement stdio-based MCP transport (mcp_transport.rs)
- Support tool discovery, execution, and resource operations

### Browser Hand Implementation
- Complete browser automation with Playwright-style actions
- Support Navigate, Click, Type, Scrape, Screenshot, Wait actions
- Add educational Hands: Whiteboard, Slideshow, Speech, Quiz

### Security Enhancements
- Implement command whitelist/blacklist for shell_exec tool
- Add SSRF protection with private IP blocking
- Create security.toml configuration file

## Test Improvements
- Fix test import paths (security-utils, setup)
- Fix vi.mock hoisting issues with vi.hoisted()
- Update test expectations for validateUrl and sanitizeFilename
- Add getUnsupportedLocalGatewayStatus mock

## Documentation Updates
- Update architecture documentation
- Improve configuration reference
- Add quick-start guide updates

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 03:24:24 +08:00
parent e49ba4460b
commit 3ff08faa56
78 changed files with 29575 additions and 1682 deletions
--- a/crates/zclaw-hands/src/hands/speech.rs
+++ b/crates/zclaw-hands/src/hands/speech.rs
@@ -0,0 +1,425 @@
+//! Speech Hand - Text-to-Speech synthesis capabilities
+//!
+//! Provides speech synthesis for teaching:
+//! - speak: Convert text to speech
+//! - speak_ssml: Advanced speech with SSML markup
+//! - pause/resume/stop: Playback control
+//! - list_voices: Get available voices
+//! - set_voice: Configure voice settings
+//! - set_provider: Select the TTS provider
+
+use async_trait::async_trait;
+use serde::{Deserialize, Serialize};
+use serde_json::Value;
+use std::sync::Arc;
+use tokio::sync::RwLock;
+use zclaw_types::Result;
+
+use crate::{Hand, HandConfig, HandContext, HandResult, HandStatus};
+
+/// Identifies which text-to-speech backend a [`SpeechHand`] is configured for.
+///
+/// Serialized in lowercase (`"browser"`, `"azure"`, `"openai"`,
+/// `"elevenlabs"`, `"local"`). Within this file the value is only stored in
+/// the configuration and echoed back in results; no backend is invoked here.
+///
+/// `PartialEq`/`Eq` are derived so callers and tests can compare providers
+/// directly.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
+#[serde(rename_all = "lowercase")]
+pub enum TtsProvider {
+    /// The default provider.
+    #[default]
+    Browser,
+    /// Azure-hosted speech service (presumably; not wired up in this file).
+    Azure,
+    /// OpenAI-hosted speech service (presumably; not wired up in this file).
+    OpenAI,
+    /// ElevenLabs-hosted speech service (presumably; not wired up in this file).
+    ElevenLabs,
+    /// A locally running engine (presumably; not wired up in this file).
+    Local,
+}
+
+/// A request accepted by the speech hand.
+///
+/// Deserialized from an internally tagged JSON object: the `action` field
+/// selects the variant using its snake_case name (`"speak"`, `"speak_ssml"`,
+/// `"pause"`, `"resume"`, `"stop"`, `"list_voices"`, `"set_voice"`,
+/// `"set_provider"`), and the remaining fields of the object populate the
+/// variant's fields.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(tag = "action", rename_all = "snake_case")]
+pub enum SpeechAction {
+    /// Speak plain text.
+    Speak {
+        /// The text to speak.
+        text: String,
+        /// Voice id to use; when omitted the configured default voice applies.
+        #[serde(default)]
+        voice: Option<String>,
+        /// Speaking rate; defaults to `1.0` when omitted.
+        #[serde(default = "default_rate")]
+        rate: f32,
+        /// Voice pitch; defaults to `1.0` when omitted.
+        #[serde(default = "default_pitch")]
+        pitch: f32,
+        /// Output volume; defaults to `1.0` when omitted.
+        #[serde(default = "default_volume")]
+        volume: f32,
+        /// Language tag; when omitted the configured default language applies.
+        #[serde(default)]
+        language: Option<String>,
+    },
+    /// Speak content described with SSML markup.
+    SpeakSsml {
+        /// The SSML document to speak.
+        ssml: String,
+        /// Voice id to use; when omitted the configured default voice applies.
+        #[serde(default)]
+        voice: Option<String>,
+    },
+    /// Pause playback.
+    Pause,
+    /// Resume playback.
+    Resume,
+    /// Stop playback and clear the current text.
+    Stop,
+    /// List the known voices.
+    ListVoices {
+        /// Optional filter: only voices whose language starts with this
+        /// prefix are returned (e.g. `"zh"` matches `"zh-CN"`).
+        #[serde(default)]
+        language: Option<String>,
+    },
+    /// Set the default voice, and optionally the default language.
+    SetVoice {
+        /// Voice id stored as the new default voice.
+        voice: String,
+        /// When present, replaces the default language.
+        #[serde(default)]
+        language: Option<String>,
+    },
+    /// Switch the TTS provider.
+    SetProvider {
+        /// The provider to switch to.
+        provider: TtsProvider,
+        /// Provider credential. The handler in this file does not store it;
+        /// it only reports whether one was supplied.
+        #[serde(default)]
+        api_key: Option<String>,
+        /// Provider region. Not used by the handler in this file.
+        #[serde(default)]
+        region: Option<String>,
+    },
+}
+
+/// Serde fallback for an omitted `rate` field: normal speed.
+fn default_rate() -> f32 {
+    1.0_f32
+}
+
+/// Serde fallback for an omitted `pitch` field: unmodified pitch.
+fn default_pitch() -> f32 {
+    1.0_f32
+}
+
+/// Serde fallback for an omitted `volume` field: full volume.
+fn default_volume() -> f32 {
+    1.0_f32
+}
+
+/// Description of a single voice that can be selected for synthesis.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct VoiceInfo {
+    /// Identifier of the voice, e.g. `"zh-CN-XiaoxiaoNeural"`.
+    pub id: String,
+    /// Short display name, e.g. `"Xiaoxiao"`.
+    pub name: String,
+    /// Language tag of the voice, e.g. `"zh-CN"`; voice listing filters on
+    /// a prefix of this value.
+    pub language: String,
+    /// Gender label; the built-in voices use `"female"` and `"male"`.
+    pub gender: String,
+    /// Optional URL of an audio preview; absent from input means `None`.
+    #[serde(default)]
+    pub preview_url: Option<String>,
+}
+
+/// Where the speech hand currently is in its playback lifecycle.
+///
+/// `PartialEq`/`Eq` are derived so the state can be compared directly
+/// (for example in tests) instead of requiring a pattern match.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
+pub enum PlaybackState {
+    /// Nothing is being spoken; this is the initial state and the state
+    /// after a stop.
+    #[default]
+    Idle,
+    /// A speak or resume action was the most recent playback change.
+    Playing,
+    /// A pause action was the most recent playback change.
+    Paused,
+}
+
+/// Defaults applied to speech requests that leave a setting unspecified.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SpeechConfig {
+    /// Provider reported in speak results; changed by the set-provider action.
+    pub provider: TtsProvider,
+    /// Voice used when a request names none; `None` makes requests fall
+    /// back to the literal voice id `"default"`.
+    pub default_voice: Option<String>,
+    /// Language used when a speak request names none.
+    pub default_language: String,
+    /// Rate substituted when a speak request carries a rate of exactly `1.0`.
+    pub default_rate: f32,
+    /// Pitch substituted when a speak request carries a pitch of exactly `1.0`.
+    pub default_pitch: f32,
+    /// Volume substituted when a speak request carries a volume of exactly `1.0`.
+    pub default_volume: f32,
+}
+
+impl Default for SpeechConfig {
+    /// Browser provider, no preferred voice, Simplified Chinese (`zh-CN`),
+    /// and neutral rate, pitch and volume.
+    fn default() -> Self {
+        let default_language = String::from("zh-CN");
+
+        Self {
+            provider: TtsProvider::Browser,
+            default_voice: None,
+            default_language,
+            // Same neutral values serde substitutes for omitted request fields.
+            default_rate: default_rate(),
+            default_pitch: default_pitch(),
+            default_volume: default_volume(),
+        }
+    }
+}
+
+/// Mutable runtime state of the speech hand.
+#[derive(Debug, Clone, Default)]
+pub struct SpeechState {
+    /// Current defaults and provider selection.
+    pub config: SpeechConfig,
+    /// Current playback lifecycle state.
+    pub playback: PlaybackState,
+    /// Text (or SSML) of the most recent speak request; cleared by stop.
+    pub current_text: Option<String>,
+    /// Playback position in milliseconds as reported by pause/resume.
+    /// Within this file it is only ever reset to 0 (by stop), never advanced.
+    pub position_ms: u64,
+    /// Voices returned by the list-voices action.
+    pub available_voices: Vec<VoiceInfo>,
+}
+
+/// Hand that accepts [`SpeechAction`] requests and tracks speech playback state.
+pub struct SpeechHand {
+    /// Static hand metadata (id, name, input schema, tags) returned by `config()`.
+    config: HandConfig,
+    /// Shared runtime state, guarded by an async read/write lock.
+    state: Arc<RwLock<SpeechState>>,
+}
+
+impl SpeechHand {
+    /// Create a new speech hand with the default configuration and the
+    /// built-in voice list.
+    ///
+    /// The advertised input schema mirrors [`SpeechAction`]: it enumerates
+    /// the accepted `action` names, lists every field any action reads, and
+    /// marks `action` as the only universally required property.
+    pub fn new() -> Self {
+        Self {
+            config: HandConfig {
+                id: "speech".to_string(),
+                name: "Speech".to_string(),
+                description: "Text-to-speech synthesis for voice output".to_string(),
+                needs_approval: false,
+                dependencies: vec![],
+                input_schema: Some(serde_json::json!({
+                    "type": "object",
+                    "properties": {
+                        // Names follow the snake_case serde tags of `SpeechAction`.
+                        "action": {
+                            "type": "string",
+                            "enum": [
+                                "speak",
+                                "speak_ssml",
+                                "pause",
+                                "resume",
+                                "stop",
+                                "list_voices",
+                                "set_voice",
+                                "set_provider"
+                            ]
+                        },
+                        "text": { "type": "string" },
+                        "ssml": { "type": "string" },
+                        "voice": { "type": "string" },
+                        "language": { "type": "string" },
+                        "rate": { "type": "number" },
+                        "pitch": { "type": "number" },
+                        "volume": { "type": "number" },
+                        // Names follow the lowercase serde names of `TtsProvider`.
+                        "provider": {
+                            "type": "string",
+                            "enum": ["browser", "azure", "openai", "elevenlabs", "local"]
+                        },
+                        "api_key": { "type": "string" },
+                        "region": { "type": "string" }
+                    },
+                    "required": ["action"]
+                })),
+                tags: vec!["audio".to_string(), "tts".to_string(), "education".to_string()],
+                enabled: true,
+            },
+            state: Arc::new(RwLock::new(SpeechState {
+                config: SpeechConfig::default(),
+                playback: PlaybackState::Idle,
+                available_voices: Self::get_default_voices(),
+                ..Default::default()
+            })),
+        }
+    }
+
+    /// Create a speech hand that starts with `provider` selected instead of
+    /// the default one.
+    ///
+    /// Safe to call from both synchronous and asynchronous code: the state
+    /// is updated through exclusive access to the freshly built value rather
+    /// than by taking the lock, so no blocking lock call (which panics inside
+    /// an async runtime) is involved.
+    pub fn with_provider(provider: TtsProvider) -> Self {
+        let mut hand = Self::new();
+        // `new()` just created the `Arc`, so this is its only owner and
+        // `get_mut` cannot fail.
+        Arc::get_mut(&mut hand.state)
+            .expect("newly created SpeechHand state has exactly one owner")
+            .get_mut()
+            .config
+            .provider = provider;
+        hand
+    }
+
+    /// Build the built-in voice catalogue: one female and one male voice
+    /// for each of `zh-CN` and `en-US`, none with a preview URL.
+    fn get_default_voices() -> Vec<VoiceInfo> {
+        // (id, display name, language, gender)
+        const BUILT_IN: [(&str, &str, &str, &str); 4] = [
+            ("zh-CN-XiaoxiaoNeural", "Xiaoxiao", "zh-CN", "female"),
+            ("zh-CN-YunxiNeural", "Yunxi", "zh-CN", "male"),
+            ("en-US-JennyNeural", "Jenny", "en-US", "female"),
+            ("en-US-GuyNeural", "Guy", "en-US", "male"),
+        ];
+
+        BUILT_IN
+            .iter()
+            .map(|&(id, name, language, gender)| VoiceInfo {
+                id: id.to_owned(),
+                name: name.to_owned(),
+                language: language.to_owned(),
+                gender: gender.to_owned(),
+                preview_url: None,
+            })
+            .collect()
+    }
+
+    /// Execute a speech action against the shared state and describe the
+    /// outcome as a JSON payload.
+    ///
+    /// No audio is produced here: each arm updates [`SpeechState`] and
+    /// returns a successful [`HandResult`] describing what a TTS backend
+    /// would be asked to do.
+    ///
+    /// The state write lock is held for the whole call, so actions on the
+    /// same hand are applied one at a time.
+    pub async fn execute_action(&self, action: SpeechAction) -> Result<HandResult> {
+        let mut state = self.state.write().await;
+
+        match action {
+            SpeechAction::Speak { text, voice, rate, pitch, volume, language } => {
+                // Resolution order: request voice, configured default, "default".
+                let voice_id = voice
+                    .or_else(|| state.config.default_voice.clone())
+                    .unwrap_or_else(|| "default".to_string());
+                let lang = language.unwrap_or_else(|| state.config.default_language.clone());
+                // 1.0 is what serde fills in for an omitted field, so it is
+                // treated as "not specified" and replaced by the configured
+                // default. An explicit 1.0 is indistinguishable from omission.
+                let actual_rate = if rate == 1.0 { state.config.default_rate } else { rate };
+                let actual_pitch = if pitch == 1.0 { state.config.default_pitch } else { pitch };
+                let actual_volume = if volume == 1.0 { state.config.default_volume } else { volume };
+
+                // Rough estimate of 80 ms per character. Characters are
+                // counted rather than bytes so multi-byte text (e.g. the
+                // default zh-CN language) is not over-estimated.
+                let duration_ms = text.chars().count() as u64 * 80;
+
+                state.playback = PlaybackState::Playing;
+                state.current_text = Some(text.clone());
+
+                // In real implementation, would call TTS API
+                Ok(HandResult::success(serde_json::json!({
+                    "status": "speaking",
+                    "text": text,
+                    "voice": voice_id,
+                    "language": lang,
+                    "rate": actual_rate,
+                    "pitch": actual_pitch,
+                    "volume": actual_volume,
+                    "provider": state.config.provider,
+                    "duration_ms": duration_ms,
+                })))
+            }
+            SpeechAction::SpeakSsml { ssml, voice } => {
+                let voice_id = voice
+                    .or_else(|| state.config.default_voice.clone())
+                    .unwrap_or_else(|| "default".to_string());
+
+                state.playback = PlaybackState::Playing;
+                state.current_text = Some(ssml.clone());
+
+                Ok(HandResult::success(serde_json::json!({
+                    "status": "speaking_ssml",
+                    "ssml": ssml,
+                    "voice": voice_id,
+                    "provider": state.config.provider,
+                })))
+            }
+            SpeechAction::Pause => {
+                state.playback = PlaybackState::Paused;
+                Ok(HandResult::success(serde_json::json!({
+                    "status": "paused",
+                    "position_ms": state.position_ms,
+                })))
+            }
+            SpeechAction::Resume => {
+                state.playback = PlaybackState::Playing;
+                Ok(HandResult::success(serde_json::json!({
+                    "status": "resumed",
+                    "position_ms": state.position_ms,
+                })))
+            }
+            SpeechAction::Stop => {
+                state.playback = PlaybackState::Idle;
+                state.current_text = None;
+                state.position_ms = 0;
+                Ok(HandResult::success(serde_json::json!({
+                    "status": "stopped",
+                })))
+            }
+            SpeechAction::ListVoices { language } => {
+                // No filter means every voice; otherwise match on a language
+                // prefix so "zh" selects "zh-CN".
+                let voices: Vec<_> = state
+                    .available_voices
+                    .iter()
+                    .filter(|v| {
+                        language
+                            .as_deref()
+                            .map_or(true, |prefix| v.language.starts_with(prefix))
+                    })
+                    .cloned()
+                    .collect();
+
+                Ok(HandResult::success(serde_json::json!({
+                    "voices": voices,
+                    "count": voices.len(),
+                })))
+            }
+            SpeechAction::SetVoice { voice, language } => {
+                state.config.default_voice = Some(voice.clone());
+                if let Some(lang) = language {
+                    state.config.default_language = lang;
+                }
+                Ok(HandResult::success(serde_json::json!({
+                    "status": "voice_set",
+                    "voice": voice,
+                    "language": state.config.default_language,
+                })))
+            }
+            // `region` is accepted on the wire but not used yet.
+            SpeechAction::SetProvider { provider, api_key, region: _ } => {
+                state.config.provider = provider;
+                // In real implementation, would configure provider
+                Ok(HandResult::success(serde_json::json!({
+                    "status": "provider_set",
+                    "provider": state.config.provider,
+                    "configured": api_key.is_some(),
+                })))
+            }
+        }
+    }
+
+    /// Return a snapshot (clone) of the current speech state, taken under
+    /// the read lock.
+    pub async fn get_state(&self) -> SpeechState {
+        let guard = self.state.read().await;
+        SpeechState::clone(&guard)
+    }
+}
+
+impl Default for SpeechHand {
+    /// Equivalent to [`SpeechHand::new`].
+    fn default() -> Self {
+        SpeechHand::new()
+    }
+}
+
+#[async_trait]
+impl Hand for SpeechHand {
+    /// Static metadata describing this hand.
+    fn config(&self) -> &HandConfig {
+        &self.config
+    }
+
+    /// Decode `input` as a [`SpeechAction`] and run it.
+    ///
+    /// Input that does not decode is reported as an error `HandResult`
+    /// inside `Ok`, not as an `Err`. The context is not consulted.
+    async fn execute(&self, _context: &HandContext, input: Value) -> Result<HandResult> {
+        match serde_json::from_value::<SpeechAction>(input) {
+            Ok(action) => self.execute_action(action).await,
+            Err(parse_err) => Ok(HandResult::error(format!(
+                "Invalid speech action: {}",
+                parse_err
+            ))),
+        }
+    }
+
+    /// Always reports `Idle`; the playback state is not reflected here.
+    fn status(&self) -> HandStatus {
+        HandStatus::Idle
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Build a `Speak` action for `text` with every optional setting left
+    /// at its neutral value.
+    fn speak(text: &str) -> SpeechAction {
+        SpeechAction::Speak {
+            text: text.to_string(),
+            voice: None,
+            rate: 1.0,
+            pitch: 1.0,
+            volume: 1.0,
+            language: None,
+        }
+    }
+
+    #[tokio::test]
+    async fn test_speech_creation() {
+        let hand = SpeechHand::new();
+        assert_eq!(hand.config().id, "speech");
+
+        // A fresh hand is idle with nothing queued.
+        let state = hand.get_state().await;
+        assert!(matches!(state.playback, PlaybackState::Idle));
+        assert!(state.current_text.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_speak() {
+        let hand = SpeechHand::new();
+
+        let result = hand.execute_action(speak("Hello, world!")).await.unwrap();
+        assert!(result.success);
+
+        // Speaking records the text and moves playback to Playing.
+        let state = hand.get_state().await;
+        assert!(matches!(state.playback, PlaybackState::Playing));
+        assert_eq!(state.current_text.as_deref(), Some("Hello, world!"));
+    }
+
+    #[tokio::test]
+    async fn test_pause_resume() {
+        let hand = SpeechHand::new();
+
+        // Speak first
+        hand.execute_action(speak("Test")).await.unwrap();
+
+        // Pause
+        let result = hand.execute_action(SpeechAction::Pause).await.unwrap();
+        assert!(result.success);
+        assert!(matches!(hand.get_state().await.playback, PlaybackState::Paused));
+
+        // Resume
+        let result = hand.execute_action(SpeechAction::Resume).await.unwrap();
+        assert!(result.success);
+        assert!(matches!(hand.get_state().await.playback, PlaybackState::Playing));
+    }
+
+    #[tokio::test]
+    async fn test_stop() {
+        let hand = SpeechHand::new();
+        hand.execute_action(speak("Test")).await.unwrap();
+
+        let result = hand.execute_action(SpeechAction::Stop).await.unwrap();
+        assert!(result.success);
+
+        // Stopping returns to Idle and forgets the text and position.
+        let state = hand.get_state().await;
+        assert!(matches!(state.playback, PlaybackState::Idle));
+        assert!(state.current_text.is_none());
+        assert_eq!(state.position_ms, 0);
+    }
+
+    #[tokio::test]
+    async fn test_list_voices() {
+        let hand = SpeechHand::new();
+        let action = SpeechAction::ListVoices { language: Some("zh".to_string()) };
+
+        let result = hand.execute_action(action).await.unwrap();
+        assert!(result.success);
+    }
+
+    #[tokio::test]
+    async fn test_set_voice() {
+        let hand = SpeechHand::new();
+        let action = SpeechAction::SetVoice {
+            voice: "zh-CN-XiaoxiaoNeural".to_string(),
+            language: Some("zh-CN".to_string()),
+        };
+
+        let result = hand.execute_action(action).await.unwrap();
+        assert!(result.success);
+
+        let state = hand.get_state().await;
+        assert_eq!(state.config.default_voice, Some("zh-CN-XiaoxiaoNeural".to_string()));
+        assert_eq!(state.config.default_language, "zh-CN");
+    }
+}