Add complete Browser Hand UI system for browser automation: Components: - BrowserHandCard: Main card with status display and screenshot preview - TaskTemplateModal: Template selection and parameter configuration - ScreenshotPreview: Screenshot display with fullscreen capability Templates: - Basic operations: navigate, screenshot, form fill, click, execute JS - Scraping: text, list, images, links, tables - Automation: login+action, multi-page, monitoring, pagination Features: - 15 built-in task templates across 3 categories - Real-time execution status with progress bar - Screenshot preview with zoom and fullscreen - Integration with HandsPanel for seamless UX - Zustand store for state management - Comprehensive test coverage (16 tests) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
536 lines
15 KiB
TypeScript
536 lines
15 KiB
TypeScript
/**
 * Scraping Templates for Browser Hand
 *
 * Contains data scraping and extraction templates.
 */

import type { TaskTemplate, ExecutionContext } from './types';
// ============================================================================
|
||
// Template: Scrape Text
|
||
// ============================================================================
|
||
|
||
const scrapeTextTemplate: TaskTemplate = {
|
||
id: 'scrape_text',
|
||
name: '抓取页面文本',
|
||
description: '从多个选择器提取文本内容',
|
||
category: 'scraping',
|
||
icon: 'FileText',
|
||
params: [
|
||
{
|
||
key: 'url',
|
||
label: '网页地址',
|
||
type: 'url',
|
||
required: true,
|
||
placeholder: 'https://example.com',
|
||
},
|
||
{
|
||
key: 'selectors',
|
||
label: '选择器列表',
|
||
type: 'textarea',
|
||
required: true,
|
||
placeholder: '.title\n.description\n.price',
|
||
description: 'CSS 选择器(每行一个)',
|
||
},
|
||
{
|
||
key: 'waitFor',
|
||
label: '等待元素',
|
||
type: 'text',
|
||
required: false,
|
||
placeholder: '.content',
|
||
description: '等待此元素出现后再抓取',
|
||
},
|
||
],
|
||
execute: async (params, context: ExecutionContext) => {
|
||
const { browser, onProgress, onLog } = context;
|
||
const url = params.url as string;
|
||
const selectorsText = params.selectors as string;
|
||
const waitFor = params.waitFor as string | undefined;
|
||
const selectors = selectorsText.split('\n').map((s) => s.trim()).filter(Boolean);
|
||
|
||
onProgress('正在导航到页面...', 0);
|
||
onLog('info', `访问: ${url}`);
|
||
await browser.goto(url);
|
||
|
||
if (waitFor) {
|
||
onProgress('等待页面加载...', 20);
|
||
onLog('action', `等待元素: ${waitFor}`);
|
||
await browser.wait(waitFor, 10000);
|
||
}
|
||
|
||
onProgress('正在抓取文本...', 50);
|
||
const result: Record<string, string | string[]> = {};
|
||
|
||
for (let i = 0; i < selectors.length; i++) {
|
||
const selector = selectors[i];
|
||
const progress = 50 + Math.floor((i / selectors.length) * 40);
|
||
|
||
onProgress(`正在抓取 ${i + 1}/${selectors.length}...`, progress);
|
||
|
||
try {
|
||
// Try to get multiple elements first
|
||
const multipleResult = await browser.eval(`
|
||
(selector) => {
|
||
const elements = document.querySelectorAll(selector);
|
||
if (elements.length > 1) {
|
||
return Array.from(elements).map(el => el.textContent?.trim() || '');
|
||
} else if (elements.length === 1) {
|
||
return elements[0].textContent?.trim() || '';
|
||
}
|
||
return null;
|
||
}
|
||
`, [selector]);
|
||
|
||
if (multipleResult !== null) {
|
||
result[selector] = multipleResult as string | string[];
|
||
onLog('info', `抓取成功: ${selector}`);
|
||
} else {
|
||
result[selector] = '';
|
||
onLog('warn', `未找到元素: ${selector}`);
|
||
}
|
||
} catch (error) {
|
||
result[selector] = '';
|
||
onLog('error', `抓取失败: ${selector}`, { error: String(error) });
|
||
}
|
||
}
|
||
|
||
onProgress('完成', 100);
|
||
return { url: await browser.url(), data: result };
|
||
},
|
||
};
|
||
|
||
// ============================================================================
|
||
// Template: Scrape List
|
||
// ============================================================================
|
||
|
||
const scrapeListTemplate: TaskTemplate = {
|
||
id: 'scrape_list',
|
||
name: '提取列表数据',
|
||
description: '从重复元素中批量提取结构化数据',
|
||
category: 'scraping',
|
||
icon: 'List',
|
||
params: [
|
||
{
|
||
key: 'url',
|
||
label: '网页地址',
|
||
type: 'url',
|
||
required: true,
|
||
placeholder: 'https://example.com/products',
|
||
},
|
||
{
|
||
key: 'itemSelector',
|
||
label: '列表项选择器',
|
||
type: 'text',
|
||
required: true,
|
||
placeholder: '.product-item',
|
||
description: '每个列表项的 CSS 选择器',
|
||
},
|
||
{
|
||
key: 'fieldMappings',
|
||
label: '字段映射',
|
||
type: 'json',
|
||
required: true,
|
||
default: {},
|
||
description: 'JSON 对象,映射字段名到选择器',
|
||
placeholder: '{"title": ".title", "price": ".price", "link": "a@href"}',
|
||
},
|
||
{
|
||
key: 'limit',
|
||
label: '最大数量',
|
||
type: 'number',
|
||
required: false,
|
||
default: 50,
|
||
min: 1,
|
||
max: 500,
|
||
description: '最多提取多少条数据',
|
||
},
|
||
],
|
||
execute: async (params, context: ExecutionContext) => {
|
||
const { browser, onProgress, onLog } = context;
|
||
const url = params.url as string;
|
||
const itemSelector = params.itemSelector as string;
|
||
const fieldMappings = params.fieldMappings as Record<string, string>;
|
||
const limit = (params.limit as number) ?? 50;
|
||
|
||
onProgress('正在导航到页面...', 0);
|
||
onLog('info', `访问: ${url}`);
|
||
await browser.goto(url);
|
||
|
||
onProgress('等待列表加载...', 30);
|
||
await browser.wait(itemSelector, 10000);
|
||
|
||
onProgress('正在提取列表数据...', 50);
|
||
|
||
const scrapingScript = `
|
||
({ itemSelector, fieldMappings, limit }) => {
|
||
const items = document.querySelectorAll(itemSelector);
|
||
const results = [];
|
||
|
||
for (let i = 0; i < Math.min(items.length, limit); i++) {
|
||
const item = items[i];
|
||
const row = {};
|
||
|
||
for (const [field, selector] of Object.entries(fieldMappings)) {
|
||
// Handle attribute selectors like "a@href"
|
||
const parts = selector.split('@');
|
||
const cssSelector = parts[0];
|
||
const attr = parts[1];
|
||
|
||
const el = item.querySelector(cssSelector);
|
||
if (el) {
|
||
if (attr) {
|
||
row[field] = el.getAttribute(attr) || '';
|
||
} else {
|
||
row[field] = el.textContent?.trim() || '';
|
||
}
|
||
} else {
|
||
row[field] = '';
|
||
}
|
||
}
|
||
|
||
results.push(row);
|
||
}
|
||
|
||
return results;
|
||
}
|
||
`;
|
||
|
||
const result = await browser.eval(scrapingScript, [{
|
||
itemSelector,
|
||
fieldMappings,
|
||
limit,
|
||
}]);
|
||
|
||
const items = result as Array<Record<string, string>>;
|
||
onLog('info', `提取了 ${items.length} 条数据`);
|
||
|
||
onProgress('完成', 100);
|
||
return {
|
||
url: await browser.url(),
|
||
count: items.length,
|
||
data: items,
|
||
};
|
||
},
|
||
};
|
||
|
||
// ============================================================================
|
||
// Template: Scrape Images
|
||
// ============================================================================
|
||
|
||
const scrapeImagesTemplate: TaskTemplate = {
|
||
id: 'scrape_images',
|
||
name: '抓取图片列表',
|
||
description: '提取页面中的图片 URL',
|
||
category: 'scraping',
|
||
icon: 'Image',
|
||
params: [
|
||
{
|
||
key: 'url',
|
||
label: '网页地址',
|
||
type: 'url',
|
||
required: true,
|
||
placeholder: 'https://example.com/gallery',
|
||
},
|
||
{
|
||
key: 'imageSelector',
|
||
label: '图片选择器',
|
||
type: 'text',
|
||
required: false,
|
||
default: 'img',
|
||
placeholder: 'img.gallery-image',
|
||
description: '图片元素的 CSS 选择器',
|
||
},
|
||
{
|
||
key: 'minWidth',
|
||
label: '最小宽度',
|
||
type: 'number',
|
||
required: false,
|
||
default: 100,
|
||
description: '忽略小于此宽度的图片',
|
||
},
|
||
{
|
||
key: 'minHeight',
|
||
label: '最小高度',
|
||
type: 'number',
|
||
required: false,
|
||
default: 100,
|
||
description: '忽略小于此高度的图片',
|
||
},
|
||
],
|
||
execute: async (params, context: ExecutionContext) => {
|
||
const { browser, onProgress, onLog } = context;
|
||
const url = params.url as string;
|
||
const imageSelector = (params.imageSelector as string) ?? 'img';
|
||
const minWidth = (params.minWidth as number) ?? 100;
|
||
const minHeight = (params.minHeight as number) ?? 100;
|
||
|
||
onProgress('正在导航到页面...', 0);
|
||
onLog('info', `访问: ${url}`);
|
||
await browser.goto(url);
|
||
|
||
onProgress('正在提取图片...', 50);
|
||
|
||
const extractScript = `
|
||
({ imageSelector, minWidth, minHeight }) => {
|
||
const images = document.querySelectorAll(imageSelector);
|
||
const results = [];
|
||
|
||
images.forEach(img => {
|
||
const width = img.naturalWidth || img.width;
|
||
const height = img.naturalHeight || img.height;
|
||
|
||
if (width >= minWidth && height >= minHeight) {
|
||
results.push({
|
||
src: img.src,
|
||
alt: img.alt || '',
|
||
width,
|
||
height,
|
||
});
|
||
}
|
||
});
|
||
|
||
return results;
|
||
}
|
||
`;
|
||
|
||
const result = await browser.eval(extractScript, [{
|
||
imageSelector,
|
||
minWidth,
|
||
minHeight,
|
||
}]);
|
||
|
||
const images = result as Array<{
|
||
src: string;
|
||
alt: string;
|
||
width: number;
|
||
height: number;
|
||
}>;
|
||
|
||
onLog('info', `找到 ${images.length} 张图片`);
|
||
|
||
onProgress('完成', 100);
|
||
return {
|
||
url: await browser.url(),
|
||
count: images.length,
|
||
images,
|
||
};
|
||
},
|
||
};
|
||
|
||
// ============================================================================
|
||
// Template: Scrape Links
|
||
// ============================================================================
|
||
|
||
const scrapeLinksTemplate: TaskTemplate = {
|
||
id: 'scrape_links',
|
||
name: '抓取链接列表',
|
||
description: '提取页面中的所有链接',
|
||
category: 'scraping',
|
||
icon: 'Link',
|
||
params: [
|
||
{
|
||
key: 'url',
|
||
label: '网页地址',
|
||
type: 'url',
|
||
required: true,
|
||
placeholder: 'https://example.com',
|
||
},
|
||
{
|
||
key: 'linkSelector',
|
||
label: '链接选择器',
|
||
type: 'text',
|
||
required: false,
|
||
default: 'a[href]',
|
||
placeholder: 'a[href]',
|
||
description: '链接元素的 CSS 选择器',
|
||
},
|
||
{
|
||
key: 'filterPattern',
|
||
label: 'URL 过滤',
|
||
type: 'text',
|
||
required: false,
|
||
placeholder: 'example.com',
|
||
description: '只保留包含此文本的链接',
|
||
},
|
||
{
|
||
key: 'excludePattern',
|
||
label: '排除模式',
|
||
type: 'text',
|
||
required: false,
|
||
placeholder: '#, javascript:',
|
||
description: '排除包含此文本的链接',
|
||
},
|
||
],
|
||
execute: async (params, context: ExecutionContext) => {
|
||
const { browser, onProgress, onLog } = context;
|
||
const url = params.url as string;
|
||
const linkSelector = (params.linkSelector as string) ?? 'a[href]';
|
||
const filterPattern = params.filterPattern as string | undefined;
|
||
const excludePattern = params.excludePattern as string | undefined;
|
||
|
||
onProgress('正在导航到页面...', 0);
|
||
onLog('info', `访问: ${url}`);
|
||
await browser.goto(url);
|
||
|
||
onProgress('正在提取链接...', 50);
|
||
|
||
const extractScript = `
|
||
({ linkSelector, filterPattern, excludePattern }) => {
|
||
const links = document.querySelectorAll(linkSelector);
|
||
const results = [];
|
||
const seen = new Set();
|
||
|
||
links.forEach(a => {
|
||
const href = a.href;
|
||
const text = a.textContent?.trim() || '';
|
||
|
||
if (!href || seen.has(href)) return;
|
||
|
||
// Apply filter
|
||
if (filterPattern && !href.includes(filterPattern) && !text.includes(filterPattern)) {
|
||
return;
|
||
}
|
||
|
||
// Apply exclude
|
||
if (excludePattern) {
|
||
const patterns = excludePattern.split(',').map(p => p.trim());
|
||
for (const p of patterns) {
|
||
if (href.includes(p)) return;
|
||
}
|
||
}
|
||
|
||
seen.add(href);
|
||
results.push({ href, text });
|
||
});
|
||
|
||
return results;
|
||
}
|
||
`;
|
||
|
||
const result = await browser.eval(extractScript, [{
|
||
linkSelector,
|
||
filterPattern,
|
||
excludePattern,
|
||
}]);
|
||
|
||
const links = result as Array<{ href: string; text: string }>;
|
||
onLog('info', `找到 ${links.length} 个链接`);
|
||
|
||
onProgress('完成', 100);
|
||
return {
|
||
url: await browser.url(),
|
||
count: links.length,
|
||
links,
|
||
};
|
||
},
|
||
};
|
||
|
||
// ============================================================================
|
||
// Template: Scrape Table
|
||
// ============================================================================
|
||
|
||
const scrapeTableTemplate: TaskTemplate = {
|
||
id: 'scrape_table',
|
||
name: '抓取表格数据',
|
||
description: '从 HTML 表格中提取数据',
|
||
category: 'scraping',
|
||
icon: 'Table',
|
||
params: [
|
||
{
|
||
key: 'url',
|
||
label: '网页地址',
|
||
type: 'url',
|
||
required: true,
|
||
placeholder: 'https://example.com/data',
|
||
},
|
||
{
|
||
key: 'tableSelector',
|
||
label: '表格选择器',
|
||
type: 'text',
|
||
required: false,
|
||
default: 'table',
|
||
placeholder: 'table.data-table',
|
||
description: '表格元素的 CSS 选择器',
|
||
},
|
||
{
|
||
key: 'headerRow',
|
||
label: '表头行',
|
||
type: 'number',
|
||
required: false,
|
||
default: 1,
|
||
min: 0,
|
||
max: 10,
|
||
description: '表头所在行(0 表示无表头)',
|
||
},
|
||
],
|
||
execute: async (params, context: ExecutionContext) => {
|
||
const { browser, onProgress, onLog } = context;
|
||
const url = params.url as string;
|
||
const tableSelector = (params.tableSelector as string) ?? 'table';
|
||
const headerRow = (params.headerRow as number) ?? 1;
|
||
|
||
onProgress('正在导航到页面...', 0);
|
||
onLog('info', `访问: ${url}`);
|
||
await browser.goto(url);
|
||
|
||
onProgress('正在提取表格数据...', 50);
|
||
|
||
const extractScript = `
|
||
({ tableSelector, headerRow }) => {
|
||
const table = document.querySelector(tableSelector);
|
||
if (!table) return { headers: [], rows: [] };
|
||
|
||
const allRows = table.querySelectorAll('tr');
|
||
|
||
// Extract headers
|
||
let headers = [];
|
||
if (headerRow > 0 && allRows[headerRow - 1]) {
|
||
const headerCells = allRows[headerRow - 1].querySelectorAll('th, td');
|
||
headers = Array.from(headerCells).map(cell => cell.textContent?.trim() || '');
|
||
}
|
||
|
||
// Extract data rows
|
||
const startRow = headerRow > 0 ? headerRow : 0;
|
||
const rows = [];
|
||
|
||
for (let i = startRow; i < allRows.length; i++) {
|
||
const cells = allRows[i].querySelectorAll('td, th');
|
||
const rowData = Array.from(cells).map(cell => cell.textContent?.trim() || '');
|
||
if (rowData.some(d => d)) { // Skip empty rows
|
||
rows.push(rowData);
|
||
}
|
||
}
|
||
|
||
return { headers, rows };
|
||
}
|
||
`;
|
||
|
||
const result = await browser.eval(extractScript, [{ tableSelector, headerRow }]) as {
|
||
headers: string[];
|
||
rows: string[][];
|
||
};
|
||
|
||
onLog('info', `提取了 ${result.rows.length} 行数据,${result.headers.length} 列`);
|
||
|
||
onProgress('完成', 100);
|
||
return {
|
||
url: await browser.url(),
|
||
headers: result.headers,
|
||
rowCount: result.rows.length,
|
||
data: result.rows,
|
||
};
|
||
},
|
||
};
|
||
|
||
// ============================================================================
// Export All Scraping Templates
// ============================================================================

// All scraping-category templates defined in this module, in the order they
// are presented to the template picker.
export const scrapingTemplates: TaskTemplate[] = [
  scrapeTextTemplate,
  scrapeListTemplate,
  scrapeImagesTemplate,
  scrapeLinksTemplate,
  scrapeTableTemplate,
];