初始化提交

2026-03-01 16:24:24 +08:00
commit 92e5def702
492 changed files with 211343 additions and 0 deletions
--- a/crates/openfang-hands/bundled/clip/HAND.toml
+++ b/crates/openfang-hands/bundled/clip/HAND.toml
@@ -0,0 +1,590 @@
+id = "clip"
+name = "Clip Hand"
+description = "Turns long-form video into viral short clips with captions and thumbnails"
+category = "content"
+icon = "\U0001F3AC"  # U+1F3AC (clapper board emoji)
+tools = ["shell_exec", "file_read", "file_write", "file_list", "web_fetch", "memory_store", "memory_recall"]  # shell_exec, file_write and memory_store are referenced by name in [agent].system_prompt
+
+[[requires]]
+key = "ffmpeg"
+label = "FFmpeg must be installed"
+requirement_type = "binary"
+check_value = "ffmpeg"
+description = "FFmpeg is the core video processing engine used to extract clips, burn captions, crop to vertical, and generate thumbnails."
+
+[requires.install]
+macos = "brew install ffmpeg"
+windows = "winget install Gyan.FFmpeg"
+linux_apt = "sudo apt install ffmpeg"
+linux_dnf = "sudo dnf install ffmpeg-free"
+linux_pacman = "sudo pacman -S ffmpeg"
+manual_url = "https://ffmpeg.org/download.html"
+estimated_time = "2-5 min"
+
+[[requires]]
+key = "ffprobe"
+label = "FFprobe must be installed (ships with FFmpeg)"
+requirement_type = "binary"
+check_value = "ffprobe"
+description = "FFprobe analyzes video metadata (duration, resolution, codecs). It ships bundled with FFmpeg — if FFmpeg is installed, ffprobe is too."
+
+[requires.install]  # same per-platform commands as the ffmpeg requirement, because ffprobe ships with FFmpeg
+macos = "brew install ffmpeg"
+windows = "winget install Gyan.FFmpeg"
+linux_apt = "sudo apt install ffmpeg"
+linux_dnf = "sudo dnf install ffmpeg-free"
+linux_pacman = "sudo pacman -S ffmpeg"
+manual_url = "https://ffmpeg.org/download.html"
+estimated_time = "Bundled with FFmpeg"
+
+[[requires]]
+key = "yt-dlp"
+label = "yt-dlp must be installed"
+requirement_type = "binary"
+check_value = "yt-dlp"
+description = "yt-dlp downloads videos from YouTube, Vimeo, Twitter, and 1000+ other sites. It also grabs existing subtitles to skip transcription."
+
+[requires.install]
+macos = "brew install yt-dlp"
+windows = "winget install yt-dlp.yt-dlp"
+linux_apt = "sudo apt install yt-dlp"
+linux_dnf = "sudo dnf install yt-dlp"
+linux_pacman = "sudo pacman -S yt-dlp"
+pip = "pip install yt-dlp"
+manual_url = "https://github.com/yt-dlp/yt-dlp#installation"
+estimated_time = "1-2 min"
+
+# ─── Configurable settings ───────────────────────────────────────────────────
+
+[[settings]]
+key = "stt_provider"
+label = "Speech-to-Text Provider"
+description = "How audio is transcribed to text for captions and clip selection"
+setting_type = "select"
+default = "auto"  # matches the value of the first option below
+
+[[settings.options]]
+value = "auto"
+label = "Auto-detect (best available)"
+
+[[settings.options]]
+value = "whisper_local"
+label = "Local Whisper"
+binary = "whisper"  # the system prompt's Path E runs this executable
+
+[[settings.options]]
+value = "groq_whisper"
+label = "Groq Whisper API (fast, free tier)"
+provider_env = "GROQ_API_KEY"  # the system prompt's Path B curl call reads $GROQ_API_KEY
+
+[[settings.options]]
+value = "openai_whisper"
+label = "OpenAI Whisper API"
+provider_env = "OPENAI_API_KEY"  # the system prompt's Path C curl call reads $OPENAI_API_KEY
+
+[[settings.options]]
+value = "deepgram"
+label = "Deepgram Nova-2"
+provider_env = "DEEPGRAM_API_KEY"  # the system prompt's Path D curl call reads $DEEPGRAM_API_KEY
+
+[[settings]]
+key = "tts_provider"
+label = "Text-to-Speech Provider"
+description = "Optional voice-over or narration generation for clips"
+setting_type = "select"
+default = "none"  # matches the value of the first option below
+
+[[settings.options]]
+value = "none"
+label = "Disabled (captions only)"
+
+[[settings.options]]
+value = "edge_tts"
+label = "Edge TTS (free)"
+binary = "edge-tts"  # the system prompt's edge_tts step runs this executable
+
+[[settings.options]]
+value = "openai_tts"
+label = "OpenAI TTS"
+provider_env = "OPENAI_API_KEY"  # the system prompt's openai_tts curl call reads $OPENAI_API_KEY
+
+[[settings.options]]
+value = "elevenlabs"
+label = "ElevenLabs"
+provider_env = "ELEVENLABS_API_KEY"  # the system prompt's elevenlabs curl call reads $ELEVENLABS_API_KEY
+
+# ─── Publishing settings ────────────────────────────────────────────────────
+
+[[settings]]
+key = "publish_target"
+label = "Publish Clips To"
+description = "Where to send finished clips after processing. Leave as 'Local only' to skip publishing."
+setting_type = "select"
+default = "local_only"
+
+[[settings.options]]
+value = "local_only"
+label = "Local only (no publishing)"
+
+[[settings.options]]
+value = "telegram"
+label = "Telegram channel"
+
+[[settings.options]]
+value = "whatsapp"
+label = "WhatsApp contact/group"
+
+[[settings.options]]
+value = "both"
+label = "Telegram + WhatsApp"
+
+[[settings]]
+key = "telegram_bot_token"
+label = "Telegram Bot Token"
+description = "From @BotFather on Telegram (e.g. 123456:ABC-DEF...). Bot must be admin in the target channel."
+setting_type = "text"
+default = ""
+
+[[settings]]
+key = "telegram_chat_id"
+label = "Telegram Chat ID"
+description = "Channel: -100XXXXXXXXXX or @channelname. Group: numeric ID. Get it via @userinfobot."
+setting_type = "text"
+default = ""
+
+[[settings]]
+key = "whatsapp_token"
+label = "WhatsApp Access Token"
+description = "Permanent token from Meta Business Settings > System Users. Temporary tokens expire in 24h."
+setting_type = "text"
+default = ""
+
+[[settings]]
+key = "whatsapp_phone_id"
+label = "WhatsApp Phone Number ID"
+description = "From Meta Developer Portal > WhatsApp > API Setup (e.g. 1234567890)"
+setting_type = "text"
+default = ""
+
+[[settings]]
+key = "whatsapp_recipient"
+label = "WhatsApp Recipient"
+description = "Phone number in international format, no + or spaces (e.g. 14155551234)"
+setting_type = "text"
+default = ""
+
+# ─── Agent configuration ─────────────────────────────────────────────────────
+
+[agent]
+name = "clip-hand"
+description = "AI video editor — downloads, transcribes, and creates viral short clips from any video URL or file"
+module = "builtin:chat"
+provider = "default"
+model = "default"
+max_tokens = 8192
+temperature = 0.4
+max_iterations = 40
+system_prompt = """You are Clip Hand — an AI-powered shorts factory that turns any video URL or file into viral short clips.
+
+## CRITICAL RULES — READ FIRST
+- You MUST use the `shell_exec` tool to run ALL commands (yt-dlp, ffmpeg, ffprobe, curl, whisper, etc.)
+- NEVER fabricate or hallucinate command output. Always run the actual command and read its real output.
+- NEVER skip steps. Follow the phases below in order. Each phase requires running real commands.
+- If a command fails, report the actual error. Do not invent fake success output.
+- For long-running commands (yt-dlp download, ffmpeg processing), set `timeout_seconds` to 300 in the shell_exec call. The default 30s is too short for video operations.
+
+## Phase 0 — Platform Detection (ALWAYS DO THIS FIRST)
+
+Before running any command, detect the operating system:
+```
+python -c "import platform; print(platform.system())"
+```
+Or check if a known path exists. Then set your approach:
+- **Windows**: stderr redirect = `2>NUL`, text search = `findstr`, delete = `del`, paths use forward slashes in ffmpeg filters
+- **macOS / Linux**: stderr redirect = `2>/dev/null`, text search = `grep`, delete = `rm`
+
+IMPORTANT cross-platform rules:
+- ffmpeg/ffprobe/yt-dlp/whisper CLI flags are identical on all platforms
+- On Windows, the `subtitles` filter path MUST use forward slashes and escape drive colons: `subtitles=C\\\\:/Users/clip.srt` (forward slashes as path separators, never backslashes)
+- On Windows, prefer `python -c "..."` over shell builtins for text processing
+- Always use `-y` on ffmpeg to avoid interactive prompts on all platforms
+
+---
+
+## Pipeline Overview
+
+Your 8-phase pipeline (Phases 0-7): Platform Detection → Intake → Download → Transcribe → Analyze → Extract (with optional TTS) → Publish (optional) → Report.
+The key insight: you READ the transcript to pick clips based on CONTENT, not visual scene changes.
+
+---
+
+## Phase 1 — Intake
+
+Detect input type and gather metadata.
+
+**URL input** (YouTube, Vimeo, Twitter, etc.):
+```
+yt-dlp --dump-json "URL"
+```
+Extract from JSON: `duration`, `title`, `description`, `chapters`, `subtitles`, `automatic_captions`.
+If duration > 7200 seconds (2 hours), warn the user and ask which segment to focus on.
+
+**Local file input**:
+```
+ffprobe -v quiet -print_format json -show_format -show_streams "file.mp4"
+```
+Extract: duration, resolution, codec info.
+
+---
+
+## Phase 2 — Download
+
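+**For URLs** — download video + attempt to grab existing subtitles (the result may be `source.webm` or `source.mkv` rather than `source.mp4` — use the actual filename wherever later commands say `source.mp4`):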
+```
+yt-dlp -f "bv[height<=1080]+ba/b[height<=1080]" --restrict-filenames --no-playlist -o "source.%(ext)s" "URL"
+```
+Then try to grab existing auto-subs (YouTube often has these — saves transcription time):
+```
+yt-dlp --write-auto-subs --sub-lang en --sub-format json3 --skip-download --restrict-filenames -o "source" "URL"
+```
+If `source.en.json3` exists after the second command, you have YouTube auto-subs — skip whisper entirely.
+
+**For local files** — just verify the file exists and is playable:
+```
+ffprobe -v error "file.mp4"
+```
+
+---
+
+## Phase 3 — Transcribe
+
+Check the **User Configuration** section (if present) for the chosen STT provider. Use the specified provider; if set to "auto" or absent, try each path in priority order.
+
+### Path A: YouTube auto-subs exist (source.en.json3)
+Parse the json3 file directly. The format is:
+```json
+{"events": [{"tStartMs": 1230, "dDurationMs": 500, "segs": [{"utf8": "hello ", "tOffsetMs": 0}, {"utf8": "world", "tOffsetMs": 200}]}]}
+```
+Extract word-level timing: `word_start = (tStartMs + tOffsetMs) / 1000.0` seconds.
+Write a clean transcript with timestamps to `transcript.json`.
+
+### Path B: Groq Whisper API (stt_provider = groq_whisper)
+Extract audio then call the Groq API:
+```
+ffmpeg -i source.mp4 -vn -ar 16000 -ac 1 -y audio.wav
+curl -s -X POST "https://api.groq.com/openai/v1/audio/transcriptions" \
+  -H "Authorization: Bearer $GROQ_API_KEY" \
+  -H "Content-Type: multipart/form-data" \
+  -F "file=@audio.wav" -F "model=whisper-large-v3" \
+  -F "response_format=verbose_json" -F "timestamp_granularities[]=word" \
+  -o transcript_raw.json
+```
+Parse the response `words` array for word-level timing.
+
+### Path C: OpenAI Whisper API (stt_provider = openai_whisper)
+```
+ffmpeg -i source.mp4 -vn -ar 16000 -ac 1 -y audio.wav
+curl -s -X POST "https://api.openai.com/v1/audio/transcriptions" \
+  -H "Authorization: Bearer $OPENAI_API_KEY" \
+  -H "Content-Type: multipart/form-data" \
+  -F "file=@audio.wav" -F "model=whisper-1" \
+  -F "response_format=verbose_json" -F "timestamp_granularities[]=word" \
+  -o transcript_raw.json
+```
+
+### Path D: Deepgram Nova-2 (stt_provider = deepgram)
+```
+ffmpeg -i source.mp4 -vn -ar 16000 -ac 1 -y audio.wav
+curl -s -X POST "https://api.deepgram.com/v1/listen?model=nova-2&smart_format=true&utterances=true&punctuate=true" \
+  -H "Authorization: Token $DEEPGRAM_API_KEY" \
+  -H "Content-Type: audio/wav" \
+  --data-binary @audio.wav -o transcript_raw.json
+```
+Parse `results.channels[0].alternatives[0].words` for word-level timing.
+
+### Path E: Local Whisper (stt_provider = whisper_local or auto fallback)
+```
+ffmpeg -i source.mp4 -vn -ar 16000 -ac 1 -y audio.wav
+whisper audio.wav --model small --output_format json --word_timestamps true --language en
+```
+This produces `audio.json` with segments containing word-level timing.
+If `whisper` is not found, try `whisper-ctranslate2` (same flags, 4x faster).
+
+### Path F: No subtitles, no STT (fallback)
+Fall back to ffmpeg scene detection + silence detection.
+
+Scene detection — run ffmpeg and look for `pts_time:` values in the output:
+```
+ffmpeg -i source.mp4 -filter:v "select='gt(scene,0.3)',showinfo" -f null - 2>&1
+```
+On macOS/Linux, pipe through `grep showinfo`. On Windows, pipe through `findstr showinfo`.
+
+Silence detection — look for `silence_start` and `silence_end` in output:
+```
+ffmpeg -i source.mp4 -af "silencedetect=noise=-30dB:d=1.5" -f null - 2>&1
+```
+In this mode, you pick clips by visual scene changes and silence gaps. Skip Phase 4's transcript analysis.
+
+---
+
+## Phase 4 — Analyze & Pick Segments
+
+THIS IS YOUR CORE VALUE. Read the full transcript and identify 3-5 segments worth clipping.
+
+**What makes a viral clip:**
+- **Hook in the first 3 seconds** — a surprising claim, question, or emotional statement
+- **Self-contained story or insight** — makes sense without the full video
+- **Emotional peaks** — laughter, surprise, anger, vulnerability
+- **Controversial or contrarian takes** — things people want to share or argue about
+- **Insight density** — high ratio of interesting ideas per second
+- **Clean ending** — ends on a punchline, conclusion, or dramatic pause
+
+**Segment selection rules:**
+- Each clip should be 30-90 seconds (sweet spot for shorts)
+- Start clips mid-sentence if the hook is stronger that way ("...and that's when I realized")
+- End on a strong beat — don't trail off
+- Avoid segments that require heavy visual context (charts, demos) unless the audio is compelling
+- Spread clips across the video — don't cluster them all in one section
+
+**For each selected segment, note:**
+1. Exact start timestamp (seconds)
+2. Exact end timestamp (seconds)
+3. Suggested title (compelling, <60 chars)
+4. One-sentence virality reasoning
+
+---
+
+## Phase 5 — Extract & Process
+
+For each selected segment (N = 1, 2, 3, ...):
+
+### Step 1: Extract the clip
+```
+ffmpeg -ss <start> -to <end> -i source.mp4 -c:v libx264 -c:a aac -preset fast -crf 23 -movflags +faststart -y clip_N.mp4
+```
+
+### Step 2: Crop to vertical (9:16)
+```
+ffmpeg -i clip_N.mp4 -vf "crop=ih*9/16:ih:(iw-ih*9/16)/2:0,scale=1080:1920" -c:a copy -y clip_N_vert.mp4
+```
+If the source is already vertical or close to it, use scale+pad instead:
+```
+ffmpeg -i clip_N.mp4 -vf "scale=1080:1920:force_original_aspect_ratio=decrease,pad=1080:1920:(ow-iw)/2:(oh-ih)/2:black" -c:a copy -y clip_N_vert.mp4
+```
+
+### Step 3: Generate SRT captions from transcript
+Build an SRT file (`clip_N.srt`) from the word-level timestamps in your transcript.
+Use file_write to create it — do NOT rely on shell echo/redirection.
+Group words into subtitle lines of ~8-12 words (roughly 2-3 seconds each).
+Adjust timestamps to be relative to the clip start time.
+
+SRT format:
+```
+1
+00:00:00,000 --> 00:00:02,500
+First line of caption text
+
+2
+00:00:02,500 --> 00:00:05,100
+Second line of caption text
+```
+
+### Step 4: Burn captions onto the clip
+IMPORTANT: On Windows, the subtitles filter path must use forward slashes and escape colons.
+If the SRT is in the current directory, just use the filename directly:
+```
+ffmpeg -i clip_N_vert.mp4 -vf "subtitles=clip_N.srt:force_style='FontSize=22,FontName=Arial,PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,Outline=2,Alignment=2,MarginV=40'" -c:a copy -y clip_N_final.mp4
+```
+If using an absolute path on Windows, escape it: `subtitles=C\\\\:/Users/me/clip_N.srt`
+
+### Step 4b: TTS voice-over (if tts_provider is set and not "none")
+Check the **User Configuration** for tts_provider. If a TTS provider is configured:
+
+**edge_tts**:
+```
+edge-tts --text "Caption text for clip N" --voice en-US-AriaNeural --write-media tts_N.mp3
+ffmpeg -i clip_N_final.mp4 -i tts_N.mp3 -filter_complex "[0:a]volume=0.3[orig];[1:a]volume=1.0[tts];[orig][tts]amix=inputs=2:duration=first[out]" -map 0:v -map "[out]" -c:v copy -c:a aac -y clip_N_voiced.mp4
+```
+
+**openai_tts**:
+```
+curl -s -X POST "https://api.openai.com/v1/audio/speech" \
+  -H "Authorization: Bearer $OPENAI_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"model":"tts-1","input":"Caption text for clip N","voice":"alloy"}' \
+  --output tts_N.mp3
+ffmpeg -i clip_N_final.mp4 -i tts_N.mp3 -filter_complex "[0:a]volume=0.3[orig];[1:a]volume=1.0[tts];[orig][tts]amix=inputs=2:duration=first[out]" -map 0:v -map "[out]" -c:v copy -c:a aac -y clip_N_voiced.mp4
+```
+
+**elevenlabs**:
+```
+curl -s -X POST "https://api.elevenlabs.io/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM" \
+  -H "xi-api-key: $ELEVENLABS_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"text":"Caption text for clip N","model_id":"eleven_monolingual_v1"}' \
+  --output tts_N.mp3
+ffmpeg -i clip_N_final.mp4 -i tts_N.mp3 -filter_complex "[0:a]volume=0.3[orig];[1:a]volume=1.0[tts];[orig][tts]amix=inputs=2:duration=first[out]" -map 0:v -map "[out]" -c:v copy -c:a aac -y clip_N_voiced.mp4
+```
+
+If TTS was generated, rename `clip_N_voiced.mp4` to `clip_N_final.mp4` (replace).
+
+### Step 5: Generate thumbnail
+```
+ffmpeg -i clip_N.mp4 -ss 2 -frames:v 1 -q:v 2 -y thumb_N.jpg
+```
+
+### Cleanup
+Remove intermediate files (clip_N.mp4, clip_N_vert.mp4, tts_N.mp3) — keep only clip_N_final.mp4, clip_N.srt, and thumb_N.jpg.
+Use `del clip_N.mp4 clip_N_vert.mp4` on Windows, `rm clip_N.mp4 clip_N_vert.mp4` on macOS/Linux; if TTS was generated, delete `tts_N.mp3` the same way.
+
+---
+
+## Phase 6 — Publish (Optional)
+
+After all clips are processed and before the final report, check if publishing is configured.
+
+### Step 1: Check settings
+Look at the `Publish Clips To` setting from User Configuration:
+- If `local_only`, absent, or empty → skip this phase entirely
+- If `telegram` → publish to Telegram only
+- If `whatsapp` → publish to WhatsApp only
+- If `both` → publish to both platforms
+
+### Step 2: Validate credentials
+**Telegram** requires both:
+- `Telegram Bot Token` (non-empty)
+- `Telegram Chat ID` (non-empty)
+
+**WhatsApp** requires all three:
+- `WhatsApp Access Token` (non-empty)
+- `WhatsApp Phone Number ID` (non-empty)
+- `WhatsApp Recipient` (non-empty)
+
+If any required credential is missing, print a warning and skip that platform. Never fail the job over missing credentials.
+
+### Step 3: Publish to Telegram
+For each `clip_N_final.mp4`:
+```
+curl -s -X POST "https://api.telegram.org/bot<TELEGRAM_BOT_TOKEN>/sendVideo" \
+  -F "chat_id=<TELEGRAM_CHAT_ID>" \
+  -F "video=@clip_N_final.mp4" \
+  -F "caption=<clip title>" \
+  -F "parse_mode=HTML" \
+  -F "supports_streaming=true"
+```
+Check the response for `"ok": true`. If the response contains `"error_code": 413` or mentions file too large, re-encode:
+```
+ffmpeg -i clip_N_final.mp4 -fs 49M -c:v libx264 -crf 28 -preset fast -c:a aac -y clip_N_tg.mp4
+```
+Then retry with the smaller file.
+
+### Step 4: Publish to WhatsApp
+WhatsApp Cloud API requires a two-step flow:
+
+**Step 4a — Upload media:**
+```
+curl -s -X POST "https://graph.facebook.com/v21.0/<WHATSAPP_PHONE_ID>/media" \
+  -H "Authorization: Bearer <WHATSAPP_TOKEN>" \
+  -F "file=@clip_N_final.mp4" \
+  -F "type=video/mp4" \
+  -F "messaging_product=whatsapp"
+```
+Extract `id` from the response JSON.
+
+If the file is over 16MB, re-encode first:
+```
+ffmpeg -i clip_N_final.mp4 -fs 15M -c:v libx264 -crf 30 -preset fast -c:a aac -y clip_N_wa.mp4
+```
+Then upload the smaller file.
+
+**Step 4b — Send message:**
+```
+curl -s -X POST "https://graph.facebook.com/v21.0/<WHATSAPP_PHONE_ID>/messages" \
+  -H "Authorization: Bearer <WHATSAPP_TOKEN>" \
+  -H "Content-Type: application/json" \
+  -d '{"messaging_product":"whatsapp","to":"<WHATSAPP_RECIPIENT>","type":"video","video":{"id":"<MEDIA_ID>","caption":"<clip title>"}}'
+```
+
+### Step 5: Rate limiting
+If publishing more than 3 clips, add a 1-second delay between sends (on Windows, where `sleep` is not available, run `python -c "import time; time.sleep(1)"` instead):
+```
+sleep 1
+```
+
+### Step 6: Publishing summary
+Build a summary table:
+
+| # | Platform | Status | Details |
+|---|----------|--------|---------|
+| 1 | Telegram | Sent | message_id: 1234 |
+| 1 | WhatsApp | Sent | message_id: wamid.xxx |
+| 2 | Telegram | Failed | Re-encoded and retried |
+
+Track counts of successful Telegram and WhatsApp publishes for the report phase.
+
+IMPORTANT: Never expose API tokens in the summary or report. Mask any token references as `***`.
+
+---
+
+## Phase 7 — Report
+
+After all clips are produced, report:
+
+| # | Title | File | Duration | Size |
+|---|-------|------|----------|------|
+| 1 | "..." | clip_1_final.mp4 | 45s | 12MB |
+| 2 | "..." | clip_2_final.mp4 | 38s | 9MB |
+
+Include file paths and thumbnail paths.
+
+Update stats via memory_store:
+- `clip_hand_jobs_completed` — increment by 1
+- `clip_hand_clips_generated` — increment by number of clips made
+- `clip_hand_total_duration_secs` — increment by total clip duration
+- `clip_hand_clips_published_telegram` — increment by number of clips successfully sent to Telegram (0 if not configured)
+- `clip_hand_clips_published_whatsapp` — increment by number of clips successfully sent to WhatsApp (0 if not configured)
+
+---
+
+## Guidelines
+
+- ALWAYS run Phase 0 (platform detection) first — adapt all commands to the detected OS
+- Always verify tools are available before starting (ffmpeg, ffprobe, yt-dlp)
+- Create output files in the same directory as the source (or current directory for URLs)
+- If the user specifies a number of clips, respect it; otherwise produce 3-5
+- If the user provides specific timestamps, skip Phase 4 and use those
+- If download or transcription fails, explain what went wrong and offer alternatives
+- Use `-y` flag on all ffmpeg commands to overwrite without prompting
+- For very long videos (>1hr), process in chunks to avoid memory issues
+- Use file_write tool for creating SRT/text files — never rely on shell echo/heredoc which varies by OS
+- All ffmpeg filter paths must use forward slashes, even on Windows
+- Never expose API tokens (Telegram, WhatsApp) in reports or summaries — always mask as `***`
+- Publishing errors are non-fatal — if a platform fails, log the error and continue with remaining clips/platforms
+- Respect rate limits: add 1-second delay between sends when publishing more than 3 clips
+"""
+
+[dashboard]  # every memory_key below is a counter that [agent].system_prompt tells the agent to update via memory_store in its report phase
+[[dashboard.metrics]]
+label = "Jobs Completed"
+memory_key = "clip_hand_jobs_completed"
+format = "number"
+
+[[dashboard.metrics]]
+label = "Clips Generated"
+memory_key = "clip_hand_clips_generated"
+format = "number"
+
+[[dashboard.metrics]]
+label = "Total Duration"
+memory_key = "clip_hand_total_duration_secs"  # the prompt accumulates total clip duration here; presumably seconds, per the key name
+format = "duration"
+
+[[dashboard.metrics]]
+label = "Published to Telegram"
+memory_key = "clip_hand_clips_published_telegram"
+format = "number"
+
+[[dashboard.metrics]]
+label = "Published to WhatsApp"
+memory_key = "clip_hand_clips_published_whatsapp"
+format = "number"
--- a/crates/openfang-hands/bundled/clip/SKILL.md
+++ b/crates/openfang-hands/bundled/clip/SKILL.md
@@ -0,0 +1,474 @@
+---
+name: clip-hand-skill
+version: "2.0.0"
+description: "Expert knowledge for AI video clipping — yt-dlp downloading, whisper transcription, SRT generation, and ffmpeg processing"
+runtime: prompt_only
+---
+
+# Video Clipping Expert Knowledge
+
+## Cross-Platform Notes
+
+All tools (ffmpeg, ffprobe, yt-dlp, whisper) use **identical CLI flags** on Windows, macOS, and Linux. The differences are only in shell syntax:
+
+| Feature | macOS / Linux | Windows (cmd.exe) |
+|---------|---------------|-------------------|
+| Suppress stderr | `2>/dev/null` | `2>NUL` |
+| Filter output | `\| grep pattern` | `\| findstr pattern` |
+| Delete files | `rm file1 file2` | `del file1 file2` |
+| Null output device | `-f null -` | `-f null -` (same) |
+| ffmpeg subtitle paths | `subtitles=clip.srt` | `subtitles=clip.srt` (relative OK, absolute needs `C\\:/path`) |
+
+IMPORTANT: ffmpeg filter paths (`-vf "subtitles=..."`) always need forward slashes. On Windows with absolute paths, escape the colon: `subtitles=C\\:/Users/me/clip.srt`
+
+Prefer using `file_write` tool for creating SRT/text files instead of shell echo/heredoc.
+
+---
+
+## yt-dlp Reference
+
+### Download with Format Selection
+```
+# Best video up to 1080p + best audio, merged
+yt-dlp -f "bv[height<=1080]+ba/b[height<=1080]" --restrict-filenames -o "source.%(ext)s" "URL"
+
+# 720p max (smaller, faster)
+yt-dlp -f "bv[height<=720]+ba/b[height<=720]" --restrict-filenames -o "source.%(ext)s" "URL"
+
+# Audio only (for transcription-only workflows)
+yt-dlp -x --audio-format wav --restrict-filenames -o "audio.%(ext)s" "URL"
+```
+
+### Metadata Inspection
+```
+# Get full metadata as JSON (duration, title, chapters, available subs)
+yt-dlp --dump-json "URL"
+
+# Key fields: duration, title, description, chapters, subtitles, automatic_captions
+```
+
+### YouTube Auto-Subtitles
+```
+# Download auto-generated subtitles in json3 format (word-level timing)
+yt-dlp --write-auto-subs --sub-lang en --sub-format json3 --skip-download --restrict-filenames -o "source" "URL"
+
+# Download manual subtitles if available
+yt-dlp --write-subs --sub-lang en --sub-format srt --skip-download --restrict-filenames -o "source" "URL"
+
+# List available subtitle languages
+yt-dlp --list-subs "URL"
+```
+
+### Useful Flags
+- `--restrict-filenames` — safe ASCII filenames (no spaces/special chars) — important on all platforms
+- `--no-playlist` — download single video even if URL is in a playlist
+- `-o "template.%(ext)s"` — output template (%(ext)s auto-detects format)
+- `--cookies-from-browser chrome` — use browser cookies for age-restricted content
+- `--extract-audio` / `-x` — extract audio only
+- `--audio-format wav` — convert audio to wav (for whisper)
+
+---
+
+## Whisper Transcription Reference
+
+### Audio Extraction for Whisper
+```
+# Extract mono 16kHz WAV (whisper's preferred input format)
+ffmpeg -i source.mp4 -vn -ar 16000 -ac 1 -y audio.wav
+```
+
+### Basic Transcription
+```
+# Standard transcription with word-level timestamps
+whisper audio.wav --model small --output_format json --word_timestamps true --language en
+
+# Faster alternative (same flags, 4x speed)
+whisper-ctranslate2 audio.wav --model small --output_format json --word_timestamps true --language en
+```
+
+### Model Sizes
+| Model | VRAM | Speed | Quality | Use When |
+|-------|------|-------|---------|----------|
+| tiny | ~1GB | Fastest | Rough | Quick previews, testing pipeline |
+| base | ~1GB | Fast | OK | Short clips, clear speech |
+| small | ~2GB | Good | Good | **Default — best balance** |
+| medium | ~5GB | Slow | Better | Important content, accented speech |
+| large-v3 | ~10GB | Slowest | Best | Final production, multiple languages |
+
+Note: On macOS Apple Silicon, consider `mlx-whisper` as a faster native alternative.
+
+### JSON Output Structure
+```json
+{
+  "text": "full transcript text...",
+  "segments": [
+    {
+      "id": 0,
+      "start": 0.0,
+      "end": 4.52,
+      "text": " Hello everyone, welcome back.",
+      "words": [
+        {"word": " Hello", "start": 0.0, "end": 0.32, "probability": 0.95},
+        {"word": " everyone,", "start": 0.32, "end": 0.78, "probability": 0.91},
+        {"word": " welcome", "start": 0.78, "end": 1.14, "probability": 0.98},
+        {"word": " back.", "start": 1.14, "end": 1.52, "probability": 0.97}
+      ]
+    }
+  ]
+}
+```
+- `segments[].words[]` gives word-level timing when `--word_timestamps true`
+- `probability` indicates confidence (< 0.5 = likely wrong)
+
+---
+
+## YouTube json3 Subtitle Parsing
+
+### Format Structure
+```json
+{
+  "events": [
+    {
+      "tStartMs": 1230,
+      "dDurationMs": 5000,
+      "segs": [
+        {"utf8": "hello ", "tOffsetMs": 0},
+        {"utf8": "world ", "tOffsetMs": 200},
+        {"utf8": "how ", "tOffsetMs": 450},
+        {"utf8": "are you", "tOffsetMs": 700}
+      ]
+    }
+  ]
+}
+```
+
+### Extracting Word Timing
+For each event and each segment within it:
+- `word_start_ms = event.tStartMs + seg.tOffsetMs`
+- `word_start_secs = word_start_ms / 1000.0`
+- `word_text = seg.utf8.trim()`
+
+Events without `segs` are line breaks or formatting — skip them.
+Events with `segs` containing only `"\n"` are newlines — skip them.
+
+---
+
+## SRT Generation from Transcript
+
+### SRT Format
+```
+1
+00:00:00,000 --> 00:00:02,500
+First line of caption text
+
+2
+00:00:02,500 --> 00:00:05,100
+Second line of caption text
+```
+
+### Rules for Building Good SRT
+- Group words into subtitle lines of ~8-12 words (2-3 seconds per line)
+- Break at natural pause points (periods, commas, clause boundaries)
+- Keep each text line under 42 characters for readability on mobile — wrap an 8-12 word entry onto two text lines when needed
+- Adjust timestamps relative to clip start (subtract clip start time from all timestamps)
+- Timestamp format: `HH:MM:SS,mmm` (comma separator, not dot)
+- Each entry: index line, timestamp line, text line(s), blank line
+- Use `file_write` tool to create the SRT file — works identically on all platforms
+
+### Styled Captions with ASS Format
+For animated/styled captions, use ASS subtitle format instead of SRT:
+```
+ffmpeg -i clip.mp4 -vf "subtitles=clip.ass:force_style='FontSize=22,FontName=Arial,Bold=1,PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,Outline=2,Shadow=1,Alignment=2,MarginV=40'" -c:a copy output.mp4
+```
+
+Key ASS style properties:
+- `PrimaryColour=&H00FFFFFF` — white text (AABBGGRR format)
+- `OutlineColour=&H00000000` — black outline
+- `Outline=2` — outline thickness
+- `Alignment=2` — bottom center
+- `MarginV=40` — margin from bottom edge
+- `FontSize=22` — good size for 1080x1920 vertical
+
+---
+
+## FFmpeg Video Processing
+
+### Scene Detection
+```
+ffmpeg -i input.mp4 -filter:v "select='gt(scene,0.3)',showinfo" -f null - 2>&1
+```
+- Threshold 0.1 = very sensitive, 0.5 = only major cuts
+- Parse `pts_time:` from showinfo output for timestamps
+- On macOS/Linux pipe through `grep showinfo`, on Windows pipe through `findstr showinfo`
+
+### Silence Detection
+```
+ffmpeg -i input.mp4 -af "silencedetect=noise=-30dB:d=1.5" -f null - 2>&1
+```
+- `d=1.5` = minimum 1.5 seconds of silence
+- Look for `silence_start` and `silence_end` in output
+
+### Clip Extraction
+```
+# Re-encoded (accurate cuts)
+ffmpeg -ss 00:01:30 -to 00:02:15 -i input.mp4 -c:v libx264 -c:a aac -preset fast -crf 23 -movflags +faststart -y clip.mp4
+
+# Lossless copy (fast but may have keyframe alignment issues)
+ffmpeg -ss 00:01:30 -to 00:02:15 -i input.mp4 -c copy -y clip.mp4
+```
+- `-ss` before `-i` = fast seek (recommended for extraction)
+- `-to` = end timestamp, `-t` = duration
+
+### Vertical Video (9:16 for Shorts/Reels/TikTok)
+```
+# Center crop (when source is 16:9)
+ffmpeg -i input.mp4 -vf "crop=ih*9/16:ih:(iw-ih*9/16)/2:0,scale=1080:1920" -c:a copy output.mp4
+
+# Scale with letterbox padding (preserves full frame)
+ffmpeg -i input.mp4 -vf "scale=1080:1920:force_original_aspect_ratio=decrease,pad=1080:1920:(ow-iw)/2:(oh-ih)/2:black" -c:a copy output.mp4
+```
+
+### Caption Burn-in
+```
+# SRT subtitles with styling (use relative path or forward-slash absolute path)
+ffmpeg -i input.mp4 -vf "subtitles=subs.srt:force_style='FontSize=22,FontName=Arial,PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,Outline=2,Alignment=2,MarginV=40'" -c:a copy output.mp4
+
+# Simple text overlay
+ffmpeg -i input.mp4 -vf "drawtext=text='Caption':fontsize=48:fontcolor=white:borderw=3:bordercolor=black:x=(w-text_w)/2:y=h-th-40" output.mp4
+```
+Windows path escaping: `subtitles=C\\:/Users/me/subs.srt` (double-backslash before colon)
+
+### Thumbnail Generation
+```
+# At specific time (2 seconds in)
+ffmpeg -i input.mp4 -ss 2 -frames:v 1 -q:v 2 -y thumb.jpg
+
+# First keyframe (I-frame), scaled to 1280x720
+ffmpeg -i input.mp4 -vf "select='eq(pict_type,I)',scale=1280:720" -frames:v 1 thumb.jpg
+
+# Contact sheet
+ffmpeg -i input.mp4 -vf "fps=1/10,scale=320:-1,tile=4x4" contact.jpg
+```
+
+### Video Analysis
+```
+# Full metadata (JSON)
+ffprobe -v quiet -print_format json -show_format -show_streams input.mp4
+
+# Duration only
+ffprobe -v error -show_entries format=duration -of csv=p=0 input.mp4
+
+# Resolution
+ffprobe -v error -select_streams v:0 -show_entries stream=width,height -of csv=p=0 input.mp4
+```
+
+## API-Based STT Reference
+
+### Groq Whisper API
+Fastest cloud STT — uses whisper-large-v3 on Groq hardware. Free tier available.
+```
+curl -s -X POST "https://api.groq.com/openai/v1/audio/transcriptions" \
+  -H "Authorization: Bearer $GROQ_API_KEY" \
+  -H "Content-Type: multipart/form-data" \
+  -F "file=@audio.wav" \
+  -F "model=whisper-large-v3" \
+  -F "response_format=verbose_json" \
+  -F "timestamp_granularities[]=word" \
+  -o transcript_raw.json
+```
+Response: `{"text": "...", "words": [{"word": "hello", "start": 0.0, "end": 0.32}]}`
+- Max file size: 25MB. For longer audio, split with ffmpeg first.
+- `timestamp_granularities[]=word` is required for word-level timing.
+
+### OpenAI Whisper API
+```
+curl -s -X POST "https://api.openai.com/v1/audio/transcriptions" \
+  -H "Authorization: Bearer $OPENAI_API_KEY" \
+  -H "Content-Type: multipart/form-data" \
+  -F "file=@audio.wav" \
+  -F "model=whisper-1" \
+  -F "response_format=verbose_json" \
+  -F "timestamp_granularities[]=word" \
+  -o transcript_raw.json
+```
+The response format is the same as Groq's. Max file size: 25MB.
+
+### Deepgram Nova-2
+```
+curl -s -X POST "https://api.deepgram.com/v1/listen?model=nova-2&smart_format=true&utterances=true&punctuate=true" \
+  -H "Authorization: Token $DEEPGRAM_API_KEY" \
+  -H "Content-Type: audio/wav" \
+  --data-binary @audio.wav \
+  -o transcript_raw.json
+```
+Response: `{"results": {"channels": [{"alternatives": [{"words": [{"word": "hello", "start": 0.0, "end": 0.32, "confidence": 0.99}]}]}]}}`
+- Supports streaming, but for clips use batch mode.
+- `smart_format=true` adds punctuation and casing.
+
+---
+
+## TTS Reference
+
+### Edge TTS (free, no API key needed)
+```
+# List available voices
+edge-tts --list-voices
+
+# Generate speech
+edge-tts --text "Your caption text here" --voice en-US-AriaNeural --write-media tts_output.mp3
+
+# Other good voices: en-US-GuyNeural, en-GB-SoniaNeural, en-AU-NatashaNeural
+```
+Install: `pip install edge-tts`
+
+### OpenAI TTS
+```
+curl -s -X POST "https://api.openai.com/v1/audio/speech" \
+  -H "Authorization: Bearer $OPENAI_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"model":"tts-1","input":"Your text here","voice":"alloy"}' \
+  --output tts_output.mp3
+```
+Voices: `alloy`, `echo`, `fable`, `onyx`, `nova`, `shimmer`
+Models: `tts-1` (fast), `tts-1-hd` (quality)
+
+### ElevenLabs
+```
+curl -s -X POST "https://api.elevenlabs.io/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM" \
+  -H "xi-api-key: $ELEVENLABS_API_KEY" \
+  -H "Content-Type: application/json" \
+  -d '{"text":"Your text here","model_id":"eleven_monolingual_v1"}' \
+  --output tts_output.mp3
+```
+Voice ID `21m00Tcm4TlvDq8ikWAM` = Rachel (default). List voices: `GET /v1/voices`
+
+### Audio Merging (TTS + Original)
+```
+# Mix TTS over original audio (original at 30% volume, TTS at 100%)
+ffmpeg -i clip.mp4 -i tts.mp3 \
+  -filter_complex "[0:a]volume=0.3[orig];[1:a]volume=1.0[tts];[orig][tts]amix=inputs=2:duration=first[out]" \
+  -map 0:v -map "[out]" -c:v copy -c:a aac -y clip_voiced.mp4
+
+# Replace audio entirely (no original audio)
+ffmpeg -i clip.mp4 -i tts.mp3 -map 0:v -map 1:a -c:v copy -c:a aac -shortest -y clip_voiced.mp4
+```
+
+---
+
+## Quality & Performance Tips
+
+- Use `-preset ultrafast` for quick previews, `-preset slow` for final output
+- Use `-crf 23` for good quality (18 = high quality, 28 = low quality; lower CRF values give better quality but bigger files)
+- Add `-movflags +faststart` for web-friendly MP4
+- Use `-threads 0` to auto-detect CPU cores
+- Always use `-y` to overwrite without asking
+
+---
+
+## Telegram Bot API Reference
+
+### sendVideo — Upload and send a video to a chat/channel
+```
+curl -s -X POST "https://api.telegram.org/bot<BOT_TOKEN>/sendVideo" \
+  -F "chat_id=<CHAT_ID>" \
+  -F "video=@clip_N_final.mp4" \
+  -F "caption=Clip title here" \
+  -F "parse_mode=HTML" \
+  -F "supports_streaming=true"
+```
+
+### Parameters
+| Parameter | Required | Description |
+|-----------|----------|-------------|
+| `chat_id` | Yes | Channel (`-100XXXXXXXXXX` or `@channelname`), group, or user numeric ID |
+| `video` | Yes | `@filepath` for upload (max 50MB) or a Telegram `file_id` for re-send |
+| `caption` | No | Text caption, up to 1024 characters |
+| `parse_mode` | No | `HTML` or `MarkdownV2` for styled captions |
+| `supports_streaming` | No | `true` enables progressive playback |
+
+### Success Response
+```json
+{"ok": true, "result": {"message_id": 1234, "video": {"file_id": "BAACAgI...", "file_size": 5242880}}}
+```
+
+### Error Response
+```json
+{"ok": false, "error_code": 400, "description": "Bad Request: chat not found"}
+```
+
+### Common Errors
+| Error Code | Description | Fix |
+|------------|-------------|-----|
+| 400 | Chat not found | Verify chat_id; bot must be added to the channel/group |
+| 401 | Unauthorized | Bot token is invalid or revoked — regenerate via @BotFather |
+| 413 | Request entity too large | File exceeds 50MB — re-encode: `ffmpeg -i input.mp4 -fs 49M -c:v libx264 -crf 28 -preset fast -c:a aac -y output.mp4` |
+| 429 | Too many requests | Rate limited — wait the number of seconds given in `parameters.retry_after` in the response, then retry |
+
+### File Size Limit
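+Telegram allows up to **50MB** for video uploads via Bot API. If a clip exceeds this, re-encode it at lower quality. Note that `-fs 49M` is only a hard cap: ffmpeg stops writing once the output reaches that size, so if the result is shorter than the source clip, raise `-crf` or downscale and try again: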
+```
+ffmpeg -i clip_N_final.mp4 -fs 49M -c:v libx264 -crf 28 -preset fast -c:a aac -movflags +faststart -y clip_N_tg.mp4
+```
+
+---
+
+## WhatsApp Business Cloud API Reference
+
+### Two-Step Flow: Upload Media → Send Message
+
+WhatsApp Cloud API requires uploading the video first to get a `media_id`, then sending a message referencing that ID.
+
+### Step 1 — Upload Media
+```
+curl -s -X POST "https://graph.facebook.com/v21.0/<PHONE_NUMBER_ID>/media" \
+  -H "Authorization: Bearer <ACCESS_TOKEN>" \
+  -F "file=@clip_N_final.mp4" \
+  -F "type=video/mp4" \
+  -F "messaging_product=whatsapp"
+```
+
+Success response:
+```json
+{"id": "1234567890"}
+```
+
+### Step 2 — Send Video Message
+```
+curl -s -X POST "https://graph.facebook.com/v21.0/<PHONE_NUMBER_ID>/messages" \
+  -H "Authorization: Bearer <ACCESS_TOKEN>" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messaging_product": "whatsapp",
+    "to": "<RECIPIENT_PHONE>",
+    "type": "video",
+    "video": {
+      "id": "<MEDIA_ID>",
+      "caption": "Clip title here"
+    }
+  }'
+```
+
+Success response:
+```json
+{"messaging_product": "whatsapp", "contacts": [{"wa_id": "14155551234"}], "messages": [{"id": "wamid.HBgL..."}]}
+```
+
+### File Size Limit
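+WhatsApp allows up to **16MB** for video uploads. If a clip exceeds this, re-encode it at lower quality. Note that `-fs 15M` is only a hard cap: ffmpeg stops writing once the output reaches that size, so if the result is shorter than the source clip, raise `-crf` or downscale and try again: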
+```
+ffmpeg -i clip_N_final.mp4 -fs 15M -c:v libx264 -crf 30 -preset fast -c:a aac -movflags +faststart -y clip_N_wa.mp4
+```
+
+### 24-Hour Messaging Window
+WhatsApp requires the recipient to have messaged you within the last 24 hours (for non-template messages). If you get a "template required" error, either:
+- Ask the recipient to send any message to the business number first
+- Use a pre-approved message template instead of a free-form video message
+
+### Common Errors
+| Error Code | Description | Fix |
+|------------|-------------|-----|
+| 100 | Invalid parameter | Check phone_number_id and recipient format (no + prefix, no spaces) |
+| 190 | Invalid/expired access token | Regenerate token in Meta Business Settings; temporary tokens expire in 24h |
+| 131030 | Recipient not in allowed list | In test mode, add recipient to allowed numbers in Meta Developer Portal |
+| 131047 | Re-engagement message / template required | Recipient hasn't messaged within 24h — use a template or ask them to message first |
+| 131053 | Media upload failed | File too large or unsupported format — re-encode as MP4 under 16MB |