/**
 * Scraping Templates for Browser Hand
 *
 * Contains data scraping and extraction templates.
 */

import type { TaskTemplate, ExecutionContext } from './types';

// ============================================================================
// Shared parameter helpers
// ============================================================================

/**
 * Normalizes an optional text parameter.
 *
 * @param value - Raw parameter value.
 * @param fallback - Value to use when `value` is not a string or is blank.
 * @returns The trimmed string, or `fallback` when nothing usable was supplied.
 */
function textOrDefault(value: unknown, fallback: string): string {
  if (typeof value !== 'string') {
    return fallback;
  }
  const trimmed = value.trim();
  // `??` alone would let '' through, and an empty CSS selector makes
  // querySelector/querySelectorAll throw a SyntaxError.
  return trimmed === '' ? fallback : trimmed;
}

/**
 * Normalizes an optional numeric parameter.
 *
 * @param value - Raw parameter value (number or numeric string).
 * @param fallback - Value to use when `value` is missing, blank or not finite.
 * @returns A finite number; `0` is preserved as a valid value.
 */
function numberOrDefault(value: unknown, fallback: number): number {
  if (value === null || value === undefined || value === '') {
    return fallback;
  }
  const parsed = Number(value);
  return Number.isFinite(parsed) ? parsed : fallback;
}

// ============================================================================
// Template: Scrape Text
// ============================================================================

/**
 * Navigates to a page and extracts the text content matched by each selector.
 *
 * Result shape: `{ url, data }`, where `data` maps each selector to a string
 * (one match), a string array (several matches) or `''` (no match / failure).
 */
const scrapeTextTemplate: TaskTemplate = {
  id: 'scrape_text',
  name: '抓取页面文本',
  description: '从多个选择器提取文本内容',
  category: 'scraping',
  icon: 'FileText',
  params: [
    {
      key: 'url',
      label: '网页地址',
      type: 'url',
      required: true,
      placeholder: 'https://example.com',
    },
    {
      key: 'selectors',
      label: '选择器列表',
      type: 'textarea',
      required: true,
      placeholder: '.title\n.description\n.price',
      description: 'CSS 选择器(每行一个)',
    },
    {
      key: 'waitFor',
      label: '等待元素',
      type: 'text',
      required: false,
      placeholder: '.content',
      description: '等待此元素出现后再抓取',
    },
  ],
  execute: async (params, context: ExecutionContext) => {
    const { browser, onProgress, onLog } = context;
    const url = params.url as string;
    const selectorsText = params.selectors as string;
    // Blank input is treated the same as "not provided".
    const waitFor = textOrDefault(params.waitFor, '');

    // One selector per line; blank lines are ignored.
    const selectors = selectorsText
      .split('\n')
      .map((s) => s.trim())
      .filter(Boolean);

    onProgress('正在导航到页面...', 0);
    onLog('info', `访问: ${url}`);
    await browser.goto(url);

    if (waitFor) {
      onProgress('等待页面加载...', 20);
      onLog('action', `等待元素: ${waitFor}`);
      await browser.wait(waitFor, 10000);
    }

    onProgress('正在抓取文本...', 50);
    const result: Record<string, string | string[]> = {};

    for (let i = 0; i < selectors.length; i++) {
      const selector = selectors[i];
      // Spread the 50-90% progress range evenly over the selectors.
      const progress = 50 + Math.floor((i / selectors.length) * 40);
      onProgress(`正在抓取 ${i + 1}/${selectors.length}...`, progress);

      try {
        // Try to get multiple elements first
        const multipleResult = await browser.eval(`
          (selector) => {
            const elements = document.querySelectorAll(selector);
            if (elements.length > 1) {
              return Array.from(elements).map(el => el.textContent?.trim() || '');
            } else if (elements.length === 1) {
              return elements[0].textContent?.trim() || '';
            }
            return null;
          }
        `, [selector]);

        if (multipleResult !== null) {
          result[selector] = multipleResult as string | string[];
          onLog('info', `抓取成功: ${selector}`);
        } else {
          result[selector] = '';
          onLog('warn', `未找到元素: ${selector}`);
        }
      } catch (error) {
        // Best effort: a failing selector must not abort the remaining ones.
        result[selector] = '';
        onLog('error', `抓取失败: ${selector}`, { error: String(error) });
      }
    }

    onProgress('完成', 100);
    return { url: await browser.url(), data: result };
  },
};

// ============================================================================
// Template: Scrape List
// ============================================================================

/**
 * Extracts one structured row per repeated list item.
 *
 * Each `fieldMappings` value is a CSS selector evaluated relative to the item,
 * optionally followed by `@attribute` (e.g. `a@href`) to read an attribute
 * instead of the text content. A bare `@attribute` reads from the item itself.
 *
 * Result shape: `{ url, count, data }`.
 */
const scrapeListTemplate: TaskTemplate = {
  id: 'scrape_list',
  name: '提取列表数据',
  description: '从重复元素中批量提取结构化数据',
  category: 'scraping',
  icon: 'List',
  params: [
    {
      key: 'url',
      label: '网页地址',
      type: 'url',
      required: true,
      placeholder: 'https://example.com/products',
    },
    {
      key: 'itemSelector',
      label: '列表项选择器',
      type: 'text',
      required: true,
      placeholder: '.product-item',
      description: '每个列表项的 CSS 选择器',
    },
    {
      key: 'fieldMappings',
      label: '字段映射',
      type: 'json',
      required: true,
      default: {},
      description: 'JSON 对象,映射字段名到选择器',
      placeholder: '{"title": ".title", "price": ".price", "link": "a@href"}',
    },
    {
      key: 'limit',
      label: '最大数量',
      type: 'number',
      required: false,
      default: 50,
      min: 1,
      max: 500,
      description: '最多提取多少条数据',
    },
  ],
  execute: async (params, context: ExecutionContext) => {
    const { browser, onProgress, onLog } = context;
    const url = params.url as string;
    const itemSelector = params.itemSelector as string;
    const fieldMappings = (params.fieldMappings ?? {}) as Record<string, string>;
    const limit = numberOrDefault(params.limit, 50);

    onProgress('正在导航到页面...', 0);
    onLog('info', `访问: ${url}`);
    await browser.goto(url);

    onProgress('等待列表加载...', 30);
    await browser.wait(itemSelector, 10000);

    onProgress('正在提取列表数据...', 50);

    const scrapingScript = `
      ({ itemSelector, fieldMappings, limit }) => {
        const items = document.querySelectorAll(itemSelector);
        const results = [];
        const count = Math.min(items.length, limit);

        for (let i = 0; i < count; i++) {
          const item = items[i];
          const row = {};

          for (const [field, selector] of Object.entries(fieldMappings || {})) {
            // Handle attribute selectors like "a@href"
            const parts = String(selector).split('@');
            const cssSelector = parts[0].trim();
            const attr = parts[1] ? parts[1].trim() : '';

            // A bare "@attr" mapping targets the list item itself;
            // querySelector('') would throw a SyntaxError.
            const el = cssSelector ? item.querySelector(cssSelector) : item;

            if (!el) {
              row[field] = '';
            } else if (attr) {
              row[field] = el.getAttribute(attr) || '';
            } else {
              row[field] = el.textContent?.trim() || '';
            }
          }

          results.push(row);
        }

        return results;
      }
    `;

    const result = await browser.eval(scrapingScript, [{
      itemSelector,
      fieldMappings,
      limit,
    }]);

    const items = result as Array<Record<string, string>>;
    onLog('info', `提取了 ${items.length} 条数据`);

    onProgress('完成', 100);
    return {
      url: await browser.url(),
      count: items.length,
      data: items,
    };
  },
};

// ============================================================================
// Template: Scrape Images
// ============================================================================

/**
 * Collects `src`, `alt` and dimensions of images that meet a minimum size.
 *
 * Result shape: `{ url, count, images }`.
 */
const scrapeImagesTemplate: TaskTemplate = {
  id: 'scrape_images',
  name: '抓取图片列表',
  description: '提取页面中的图片 URL',
  category: 'scraping',
  icon: 'Image',
  params: [
    {
      key: 'url',
      label: '网页地址',
      type: 'url',
      required: true,
      placeholder: 'https://example.com/gallery',
    },
    {
      key: 'imageSelector',
      label: '图片选择器',
      type: 'text',
      required: false,
      default: 'img',
      placeholder: 'img.gallery-image',
      description: '图片元素的 CSS 选择器',
    },
    {
      key: 'minWidth',
      label: '最小宽度',
      type: 'number',
      required: false,
      default: 100,
      description: '忽略小于此宽度的图片',
    },
    {
      key: 'minHeight',
      label: '最小高度',
      type: 'number',
      required: false,
      default: 100,
      description: '忽略小于此高度的图片',
    },
  ],
  execute: async (params, context: ExecutionContext) => {
    const { browser, onProgress, onLog } = context;
    const url = params.url as string;
    const imageSelector = textOrDefault(params.imageSelector, 'img');
    const minWidth = numberOrDefault(params.minWidth, 100);
    const minHeight = numberOrDefault(params.minHeight, 100);

    onProgress('正在导航到页面...', 0);
    onLog('info', `访问: ${url}`);
    await browser.goto(url);

    onProgress('正在提取图片...', 50);

    const extractScript = `
      ({ imageSelector, minWidth, minHeight }) => {
        const images = document.querySelectorAll(imageSelector);
        const results = [];

        images.forEach(img => {
          // Prefer the intrinsic size; fall back to the rendered size.
          const width = img.naturalWidth || img.width;
          const height = img.naturalHeight || img.height;

          if (width >= minWidth && height >= minHeight) {
            results.push({
              src: img.src,
              alt: img.alt || '',
              width,
              height,
            });
          }
        });

        return results;
      }
    `;

    const result = await browser.eval(extractScript, [{
      imageSelector,
      minWidth,
      minHeight,
    }]);

    const images = result as Array<{
      src: string;
      alt: string;
      width: number;
      height: number;
    }>;

    onLog('info', `找到 ${images.length} 张图片`);

    onProgress('完成', 100);
    return {
      url: await browser.url(),
      count: images.length,
      images,
    };
  },
};

// ============================================================================
// Template: Scrape Links
// ============================================================================

/**
 * Collects unique links from the page.
 *
 * `filterPattern` keeps links whose href or text contains it. `excludePattern`
 * is a comma-separated list; a link is dropped when its href contains any
 * non-blank entry.
 *
 * Result shape: `{ url, count, links }`.
 */
const scrapeLinksTemplate: TaskTemplate = {
  id: 'scrape_links',
  name: '抓取链接列表',
  description: '提取页面中的所有链接',
  category: 'scraping',
  icon: 'Link',
  params: [
    {
      key: 'url',
      label: '网页地址',
      type: 'url',
      required: true,
      placeholder: 'https://example.com',
    },
    {
      key: 'linkSelector',
      label: '链接选择器',
      type: 'text',
      required: false,
      default: 'a[href]',
      placeholder: 'a[href]',
      description: '链接元素的 CSS 选择器',
    },
    {
      key: 'filterPattern',
      label: 'URL 过滤',
      type: 'text',
      required: false,
      placeholder: 'example.com',
      description: '只保留包含此文本的链接',
    },
    {
      key: 'excludePattern',
      label: '排除模式',
      type: 'text',
      required: false,
      placeholder: '#, javascript:',
      description: '排除包含此文本的链接',
    },
  ],
  execute: async (params, context: ExecutionContext) => {
    const { browser, onProgress, onLog } = context;
    const url = params.url as string;
    const linkSelector = textOrDefault(params.linkSelector, 'a[href]');
    const filterPattern = params.filterPattern as string | undefined;
    const excludePattern = params.excludePattern as string | undefined;

    onProgress('正在导航到页面...', 0);
    onLog('info', `访问: ${url}`);
    await browser.goto(url);

    onProgress('正在提取链接...', 50);

    const extractScript = `
      ({ linkSelector, filterPattern, excludePattern }) => {
        const links = document.querySelectorAll(linkSelector);
        const results = [];
        const seen = new Set();

        // Split once up front. Blank entries (e.g. from a trailing comma) are
        // dropped because href.includes('') is always true and would exclude
        // every link.
        const excludes = (excludePattern || '')
          .split(',')
          .map(p => p.trim())
          .filter(Boolean);

        links.forEach(a => {
          const href = a.href;
          const text = a.textContent?.trim() || '';

          if (!href || seen.has(href)) return;

          // Apply filter
          if (filterPattern && !href.includes(filterPattern) && !text.includes(filterPattern)) {
            return;
          }

          // Apply exclude
          if (excludes.some(p => href.includes(p))) return;

          seen.add(href);
          results.push({ href, text });
        });

        return results;
      }
    `;

    const result = await browser.eval(extractScript, [{
      linkSelector,
      filterPattern,
      excludePattern,
    }]);

    const links = result as Array<{ href: string; text: string }>;
    onLog('info', `找到 ${links.length} 个链接`);

    onProgress('完成', 100);
    return {
      url: await browser.url(),
      count: links.length,
      links,
    };
  },
};

// ============================================================================
// Template: Scrape Table
// ============================================================================

/**
 * Extracts header cells and data rows from the first table matching the selector.
 *
 * `headerRow` is 1-based; `0` means the table has no header row. Rows whose
 * cells are all empty are skipped.
 *
 * Result shape: `{ url, headers, rowCount, data }`.
 */
const scrapeTableTemplate: TaskTemplate = {
  id: 'scrape_table',
  name: '抓取表格数据',
  description: '从 HTML 表格中提取数据',
  category: 'scraping',
  icon: 'Table',
  params: [
    {
      key: 'url',
      label: '网页地址',
      type: 'url',
      required: true,
      placeholder: 'https://example.com/data',
    },
    {
      key: 'tableSelector',
      label: '表格选择器',
      type: 'text',
      required: false,
      default: 'table',
      placeholder: 'table.data-table',
      description: '表格元素的 CSS 选择器',
    },
    {
      key: 'headerRow',
      label: '表头行',
      type: 'number',
      required: false,
      default: 1,
      min: 0,
      max: 10,
      description: '表头所在行(0 表示无表头)',
    },
  ],
  execute: async (params, context: ExecutionContext) => {
    const { browser, onProgress, onLog } = context;
    const url = params.url as string;
    const tableSelector = textOrDefault(params.tableSelector, 'table');
    const headerRow = numberOrDefault(params.headerRow, 1);

    onProgress('正在导航到页面...', 0);
    onLog('info', `访问: ${url}`);
    await browser.goto(url);

    onProgress('正在提取表格数据...', 50);

    const extractScript = `
      ({ tableSelector, headerRow }) => {
        const table = document.querySelector(tableSelector);
        if (!table) return { headers: [], rows: [] };

        const allRows = table.querySelectorAll('tr');

        // Extract headers
        let headers = [];
        if (headerRow > 0 && allRows[headerRow - 1]) {
          const headerCells = allRows[headerRow - 1].querySelectorAll('th, td');
          headers = Array.from(headerCells).map(cell => cell.textContent?.trim() || '');
        }

        // Extract data rows
        const startRow = headerRow > 0 ? headerRow : 0;
        const rows = [];

        for (let i = startRow; i < allRows.length; i++) {
          const cells = allRows[i].querySelectorAll('td, th');
          const rowData = Array.from(cells).map(cell => cell.textContent?.trim() || '');
          if (rowData.some(d => d)) { // Skip empty rows
            rows.push(rowData);
          }
        }

        return { headers, rows };
      }
    `;

    const result = await browser.eval(extractScript, [{ tableSelector, headerRow }]) as {
      headers: string[];
      rows: string[][];
    };

    onLog('info', `提取了 ${result.rows.length} 行数据,${result.headers.length} 列`);

    onProgress('完成', 100);
    return {
      url: await browser.url(),
      headers: result.headers,
      rowCount: result.rows.length,
      data: result.rows,
    };
  },
};

// ============================================================================
// Export All Scraping Templates
// ============================================================================

/** All scraping templates, in display order. */
export const scrapingTemplates: TaskTemplate[] = [
  scrapeTextTemplate,
  scrapeListTemplate,
  scrapeImagesTemplate,
  scrapeLinksTemplate,
  scrapeTableTemplate,
];