release(v0.2.0): streaming, MCP protocol, Browser Hand, security enhancements
## Major Features ### Streaming Response System - Implement LlmDriver trait with `stream()` method returning async Stream - Add SSE parsing for Anthropic and OpenAI API streaming - Integrate Tauri event system for frontend streaming (`stream:chunk` events) - Add StreamChunk types: Delta, ToolStart, ToolEnd, Complete, Error ### MCP Protocol Implementation - Add MCP JSON-RPC 2.0 types (mcp_types.rs) - Implement stdio-based MCP transport (mcp_transport.rs) - Support tool discovery, execution, and resource operations ### Browser Hand Implementation - Complete browser automation with Playwright-style actions - Support Navigate, Click, Type, Scrape, Screenshot, Wait actions - Add educational Hands: Whiteboard, Slideshow, Speech, Quiz ### Security Enhancements - Implement command whitelist/blacklist for shell_exec tool - Add SSRF protection with private IP blocking - Create security.toml configuration file ## Test Improvements - Fix test import paths (security-utils, setup) - Fix vi.mock hoisting issues with vi.hoisted() - Update test expectations for validateUrl and sanitizeFilename - Add getUnsupportedLocalGatewayStatus mock ## Documentation Updates - Update architecture documentation - Improve configuration reference - Add quick-start guide updates Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
425
crates/zclaw-hands/src/hands/speech.rs
Normal file
425
crates/zclaw-hands/src/hands/speech.rs
Normal file
@@ -0,0 +1,425 @@
|
||||
//! Speech Hand - Text-to-Speech synthesis capabilities
|
||||
//!
|
||||
//! Provides speech synthesis for teaching:
|
||||
//! - speak: Convert text to speech
|
||||
//! - speak_ssml: Advanced speech with SSML markup
|
||||
//! - pause/resume/stop: Playback control
|
||||
//! - list_voices: Get available voices
|
||||
//! - set_voice: Configure voice settings
|
||||
|
||||
use async_trait::async_trait;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
use zclaw_types::Result;
|
||||
|
||||
use crate::{Hand, HandConfig, HandContext, HandResult, HandStatus};
|
||||
|
||||
/// TTS Provider types
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum TtsProvider {
|
||||
#[default]
|
||||
Browser,
|
||||
Azure,
|
||||
OpenAI,
|
||||
ElevenLabs,
|
||||
Local,
|
||||
}
|
||||
|
||||
/// Speech action types
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "action", rename_all = "snake_case")]
|
||||
pub enum SpeechAction {
|
||||
/// Speak text
|
||||
Speak {
|
||||
text: String,
|
||||
#[serde(default)]
|
||||
voice: Option<String>,
|
||||
#[serde(default = "default_rate")]
|
||||
rate: f32,
|
||||
#[serde(default = "default_pitch")]
|
||||
pitch: f32,
|
||||
#[serde(default = "default_volume")]
|
||||
volume: f32,
|
||||
#[serde(default)]
|
||||
language: Option<String>,
|
||||
},
|
||||
/// Speak with SSML markup
|
||||
SpeakSsml {
|
||||
ssml: String,
|
||||
#[serde(default)]
|
||||
voice: Option<String>,
|
||||
},
|
||||
/// Pause playback
|
||||
Pause,
|
||||
/// Resume playback
|
||||
Resume,
|
||||
/// Stop playback
|
||||
Stop,
|
||||
/// List available voices
|
||||
ListVoices {
|
||||
#[serde(default)]
|
||||
language: Option<String>,
|
||||
},
|
||||
/// Set default voice
|
||||
SetVoice {
|
||||
voice: String,
|
||||
#[serde(default)]
|
||||
language: Option<String>,
|
||||
},
|
||||
/// Set provider
|
||||
SetProvider {
|
||||
provider: TtsProvider,
|
||||
#[serde(default)]
|
||||
api_key: Option<String>,
|
||||
#[serde(default)]
|
||||
region: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
fn default_rate() -> f32 { 1.0 }
|
||||
fn default_pitch() -> f32 { 1.0 }
|
||||
fn default_volume() -> f32 { 1.0 }
|
||||
|
||||
/// Voice information
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct VoiceInfo {
|
||||
pub id: String,
|
||||
pub name: String,
|
||||
pub language: String,
|
||||
pub gender: String,
|
||||
#[serde(default)]
|
||||
pub preview_url: Option<String>,
|
||||
}
|
||||
|
||||
/// Playback state
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||
pub enum PlaybackState {
|
||||
#[default]
|
||||
Idle,
|
||||
Playing,
|
||||
Paused,
|
||||
}
|
||||
|
||||
/// Speech configuration
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SpeechConfig {
|
||||
pub provider: TtsProvider,
|
||||
pub default_voice: Option<String>,
|
||||
pub default_language: String,
|
||||
pub default_rate: f32,
|
||||
pub default_pitch: f32,
|
||||
pub default_volume: f32,
|
||||
}
|
||||
|
||||
impl Default for SpeechConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
provider: TtsProvider::Browser,
|
||||
default_voice: None,
|
||||
default_language: "zh-CN".to_string(),
|
||||
default_rate: 1.0,
|
||||
default_pitch: 1.0,
|
||||
default_volume: 1.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Speech state
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct SpeechState {
|
||||
pub config: SpeechConfig,
|
||||
pub playback: PlaybackState,
|
||||
pub current_text: Option<String>,
|
||||
pub position_ms: u64,
|
||||
pub available_voices: Vec<VoiceInfo>,
|
||||
}
|
||||
|
||||
/// Speech Hand implementation
|
||||
pub struct SpeechHand {
|
||||
config: HandConfig,
|
||||
state: Arc<RwLock<SpeechState>>,
|
||||
}
|
||||
|
||||
impl SpeechHand {
|
||||
/// Create a new speech hand
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
config: HandConfig {
|
||||
id: "speech".to_string(),
|
||||
name: "Speech".to_string(),
|
||||
description: "Text-to-speech synthesis for voice output".to_string(),
|
||||
needs_approval: false,
|
||||
dependencies: vec![],
|
||||
input_schema: Some(serde_json::json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"action": { "type": "string" },
|
||||
"text": { "type": "string" },
|
||||
"voice": { "type": "string" },
|
||||
"rate": { "type": "number" },
|
||||
}
|
||||
})),
|
||||
tags: vec!["audio".to_string(), "tts".to_string(), "education".to_string()],
|
||||
enabled: true,
|
||||
},
|
||||
state: Arc::new(RwLock::new(SpeechState {
|
||||
config: SpeechConfig::default(),
|
||||
playback: PlaybackState::Idle,
|
||||
available_voices: Self::get_default_voices(),
|
||||
..Default::default()
|
||||
})),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create with custom provider
|
||||
pub fn with_provider(provider: TtsProvider) -> Self {
|
||||
let mut hand = Self::new();
|
||||
let mut state = hand.state.blocking_write();
|
||||
state.config.provider = provider;
|
||||
drop(state);
|
||||
hand
|
||||
}
|
||||
|
||||
/// Get default voices
|
||||
fn get_default_voices() -> Vec<VoiceInfo> {
|
||||
vec![
|
||||
VoiceInfo {
|
||||
id: "zh-CN-XiaoxiaoNeural".to_string(),
|
||||
name: "Xiaoxiao".to_string(),
|
||||
language: "zh-CN".to_string(),
|
||||
gender: "female".to_string(),
|
||||
preview_url: None,
|
||||
},
|
||||
VoiceInfo {
|
||||
id: "zh-CN-YunxiNeural".to_string(),
|
||||
name: "Yunxi".to_string(),
|
||||
language: "zh-CN".to_string(),
|
||||
gender: "male".to_string(),
|
||||
preview_url: None,
|
||||
},
|
||||
VoiceInfo {
|
||||
id: "en-US-JennyNeural".to_string(),
|
||||
name: "Jenny".to_string(),
|
||||
language: "en-US".to_string(),
|
||||
gender: "female".to_string(),
|
||||
preview_url: None,
|
||||
},
|
||||
VoiceInfo {
|
||||
id: "en-US-GuyNeural".to_string(),
|
||||
name: "Guy".to_string(),
|
||||
language: "en-US".to_string(),
|
||||
gender: "male".to_string(),
|
||||
preview_url: None,
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
/// Execute a speech action
|
||||
pub async fn execute_action(&self, action: SpeechAction) -> Result<HandResult> {
|
||||
let mut state = self.state.write().await;
|
||||
|
||||
match action {
|
||||
SpeechAction::Speak { text, voice, rate, pitch, volume, language } => {
|
||||
let voice_id = voice.or(state.config.default_voice.clone())
|
||||
.unwrap_or_else(|| "default".to_string());
|
||||
let lang = language.unwrap_or_else(|| state.config.default_language.clone());
|
||||
let actual_rate = if rate == 1.0 { state.config.default_rate } else { rate };
|
||||
let actual_pitch = if pitch == 1.0 { state.config.default_pitch } else { pitch };
|
||||
let actual_volume = if volume == 1.0 { state.config.default_volume } else { volume };
|
||||
|
||||
state.playback = PlaybackState::Playing;
|
||||
state.current_text = Some(text.clone());
|
||||
|
||||
// In real implementation, would call TTS API
|
||||
Ok(HandResult::success(serde_json::json!({
|
||||
"status": "speaking",
|
||||
"text": text,
|
||||
"voice": voice_id,
|
||||
"language": lang,
|
||||
"rate": actual_rate,
|
||||
"pitch": actual_pitch,
|
||||
"volume": actual_volume,
|
||||
"provider": state.config.provider,
|
||||
"duration_ms": text.len() as u64 * 80, // Rough estimate
|
||||
})))
|
||||
}
|
||||
SpeechAction::SpeakSsml { ssml, voice } => {
|
||||
let voice_id = voice.or(state.config.default_voice.clone())
|
||||
.unwrap_or_else(|| "default".to_string());
|
||||
|
||||
state.playback = PlaybackState::Playing;
|
||||
state.current_text = Some(ssml.clone());
|
||||
|
||||
Ok(HandResult::success(serde_json::json!({
|
||||
"status": "speaking_ssml",
|
||||
"ssml": ssml,
|
||||
"voice": voice_id,
|
||||
"provider": state.config.provider,
|
||||
})))
|
||||
}
|
||||
SpeechAction::Pause => {
|
||||
state.playback = PlaybackState::Paused;
|
||||
Ok(HandResult::success(serde_json::json!({
|
||||
"status": "paused",
|
||||
"position_ms": state.position_ms,
|
||||
})))
|
||||
}
|
||||
SpeechAction::Resume => {
|
||||
state.playback = PlaybackState::Playing;
|
||||
Ok(HandResult::success(serde_json::json!({
|
||||
"status": "resumed",
|
||||
"position_ms": state.position_ms,
|
||||
})))
|
||||
}
|
||||
SpeechAction::Stop => {
|
||||
state.playback = PlaybackState::Idle;
|
||||
state.current_text = None;
|
||||
state.position_ms = 0;
|
||||
Ok(HandResult::success(serde_json::json!({
|
||||
"status": "stopped",
|
||||
})))
|
||||
}
|
||||
SpeechAction::ListVoices { language } => {
|
||||
let voices: Vec<_> = state.available_voices.iter()
|
||||
.filter(|v| {
|
||||
language.as_ref()
|
||||
.map(|l| v.language.starts_with(l))
|
||||
.unwrap_or(true)
|
||||
})
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
Ok(HandResult::success(serde_json::json!({
|
||||
"voices": voices,
|
||||
"count": voices.len(),
|
||||
})))
|
||||
}
|
||||
SpeechAction::SetVoice { voice, language } => {
|
||||
state.config.default_voice = Some(voice.clone());
|
||||
if let Some(lang) = language {
|
||||
state.config.default_language = lang;
|
||||
}
|
||||
Ok(HandResult::success(serde_json::json!({
|
||||
"status": "voice_set",
|
||||
"voice": voice,
|
||||
"language": state.config.default_language,
|
||||
})))
|
||||
}
|
||||
SpeechAction::SetProvider { provider, api_key, region } => {
|
||||
state.config.provider = provider.clone();
|
||||
// In real implementation, would configure provider
|
||||
Ok(HandResult::success(serde_json::json!({
|
||||
"status": "provider_set",
|
||||
"provider": provider,
|
||||
"configured": api_key.is_some(),
|
||||
})))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get current state
|
||||
pub async fn get_state(&self) -> SpeechState {
|
||||
self.state.read().await.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SpeechHand {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Hand for SpeechHand {
|
||||
fn config(&self) -> &HandConfig {
|
||||
&self.config
|
||||
}
|
||||
|
||||
async fn execute(&self, _context: &HandContext, input: Value) -> Result<HandResult> {
|
||||
let action: SpeechAction = match serde_json::from_value(input) {
|
||||
Ok(a) => a,
|
||||
Err(e) => {
|
||||
return Ok(HandResult::error(format!("Invalid speech action: {}", e)));
|
||||
}
|
||||
};
|
||||
|
||||
self.execute_action(action).await
|
||||
}
|
||||
|
||||
fn status(&self) -> HandStatus {
|
||||
HandStatus::Idle
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_speech_creation() {
|
||||
let hand = SpeechHand::new();
|
||||
assert_eq!(hand.config().id, "speech");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_speak() {
|
||||
let hand = SpeechHand::new();
|
||||
let action = SpeechAction::Speak {
|
||||
text: "Hello, world!".to_string(),
|
||||
voice: None,
|
||||
rate: 1.0,
|
||||
pitch: 1.0,
|
||||
volume: 1.0,
|
||||
language: None,
|
||||
};
|
||||
|
||||
let result = hand.execute_action(action).await.unwrap();
|
||||
assert!(result.success);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_pause_resume() {
|
||||
let hand = SpeechHand::new();
|
||||
|
||||
// Speak first
|
||||
hand.execute_action(SpeechAction::Speak {
|
||||
text: "Test".to_string(),
|
||||
voice: None, rate: 1.0, pitch: 1.0, volume: 1.0, language: None,
|
||||
}).await.unwrap();
|
||||
|
||||
// Pause
|
||||
let result = hand.execute_action(SpeechAction::Pause).await.unwrap();
|
||||
assert!(result.success);
|
||||
|
||||
// Resume
|
||||
let result = hand.execute_action(SpeechAction::Resume).await.unwrap();
|
||||
assert!(result.success);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_list_voices() {
|
||||
let hand = SpeechHand::new();
|
||||
let action = SpeechAction::ListVoices { language: Some("zh".to_string()) };
|
||||
|
||||
let result = hand.execute_action(action).await.unwrap();
|
||||
assert!(result.success);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_set_voice() {
|
||||
let hand = SpeechHand::new();
|
||||
let action = SpeechAction::SetVoice {
|
||||
voice: "zh-CN-XiaoxiaoNeural".to_string(),
|
||||
language: Some("zh-CN".to_string()),
|
||||
};
|
||||
|
||||
let result = hand.execute_action(action).await.unwrap();
|
||||
assert!(result.success);
|
||||
|
||||
let state = hand.get_state().await;
|
||||
assert_eq!(state.config.default_voice, Some("zh-CN-XiaoxiaoNeural".to_string()));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user