Add complete Browser Hand UI system for browser automation: Components: - BrowserHandCard: Main card with status display and screenshot preview - TaskTemplateModal: Template selection and parameter configuration - ScreenshotPreview: Screenshot display with fullscreen capability Templates: - Basic operations: navigate, screenshot, form fill, click, execute JS - Scraping: text, list, images, links, tables - Automation: login+action, multi-page, monitoring, pagination Features: - 15 built-in task templates across 3 categories - Real-time execution status with progress bar - Screenshot preview with zoom and fullscreen - Integration with HandsPanel for seamless UX - Zustand store for state management - Comprehensive test coverage (16 tests) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
536 lines
15 KiB
TypeScript
536 lines
15 KiB
TypeScript
/**
 * Scraping Templates for Browser Hand
 *
 * Contains data scraping and extraction templates.
 */

import type { TaskTemplate, ExecutionContext } from './types';
// ============================================================================
|
||
// Template: Scrape Text
|
||
// ============================================================================
|
||
|
||
const scrapeTextTemplate: TaskTemplate = {
|
||
id: 'scrape_text',
|
||
name: '抓取页面文本',
|
||
description: '从多个选择器提取文本内容',
|
||
category: 'scraping',
|
||
icon: 'FileText',
|
||
params: [
|
||
{
|
||
key: 'url',
|
||
label: '网页地址',
|
||
type: 'url',
|
||
required: true,
|
||
placeholder: 'https://example.com',
|
||
},
|
||
{
|
||
key: 'selectors',
|
||
label: '选择器列表',
|
||
type: 'textarea',
|
||
required: true,
|
||
placeholder: '.title\n.description\n.price',
|
||
description: 'CSS 选择器(每行一个)',
|
||
},
|
||
{
|
||
key: 'waitFor',
|
||
label: '等待元素',
|
||
type: 'text',
|
||
required: false,
|
||
placeholder: '.content',
|
||
description: '等待此元素出现后再抓取',
|
||
},
|
||
],
|
||
execute: async (params, context: ExecutionContext) => {
|
||
const { browser, onProgress, onLog } = context;
|
||
const url = params.url as string;
|
||
const selectorsText = params.selectors as string;
|
||
const waitFor = params.waitFor as string | undefined;
|
||
const selectors = selectorsText.split('\n').map((s) => s.trim()).filter(Boolean);
|
||
|
||
onProgress('正在导航到页面...', 0);
|
||
onLog('info', `访问: ${url}`);
|
||
await browser.goto(url);
|
||
|
||
if (waitFor) {
|
||
onProgress('等待页面加载...', 20);
|
||
onLog('action', `等待元素: ${waitFor}`);
|
||
await browser.wait(waitFor, 10000);
|
||
}
|
||
|
||
onProgress('正在抓取文本...', 50);
|
||
const result: Record<string, string | string[]> = {};
|
||
|
||
for (let i = 0; i < selectors.length; i++) {
|
||
const selector = selectors[i];
|
||
const progress = 50 + Math.floor((i / selectors.length) * 40);
|
||
|
||
onProgress(`正在抓取 ${i + 1}/${selectors.length}...`, progress);
|
||
|
||
try {
|
||
// Try to get multiple elements first
|
||
const multipleResult = await browser.eval(`
|
||
(selector) => {
|
||
const elements = document.querySelectorAll(selector);
|
||
if (elements.length > 1) {
|
||
return Array.from(elements).map(el => el.textContent?.trim() || '');
|
||
} else if (elements.length === 1) {
|
||
return elements[0].textContent?.trim() || '';
|
||
}
|
||
return null;
|
||
}
|
||
`, [selector]);
|
||
|
||
if (multipleResult !== null) {
|
||
result[selector] = multipleResult as string | string[];
|
||
onLog('info', `抓取成功: ${selector}`);
|
||
} else {
|
||
result[selector] = '';
|
||
onLog('warn', `未找到元素: ${selector}`);
|
||
}
|
||
} catch (error) {
|
||
result[selector] = '';
|
||
onLog('error', `抓取失败: ${selector}`, { error: String(error) });
|
||
}
|
||
}
|
||
|
||
onProgress('完成', 100);
|
||
return { url: await browser.url(), data: result };
|
||
},
|
||
};
|
||
|
||
// ============================================================================
|
||
// Template: Scrape List
|
||
// ============================================================================
|
||
|
||
const scrapeListTemplate: TaskTemplate = {
|
||
id: 'scrape_list',
|
||
name: '提取列表数据',
|
||
description: '从重复元素中批量提取结构化数据',
|
||
category: 'scraping',
|
||
icon: 'List',
|
||
params: [
|
||
{
|
||
key: 'url',
|
||
label: '网页地址',
|
||
type: 'url',
|
||
required: true,
|
||
placeholder: 'https://example.com/products',
|
||
},
|
||
{
|
||
key: 'itemSelector',
|
||
label: '列表项选择器',
|
||
type: 'text',
|
||
required: true,
|
||
placeholder: '.product-item',
|
||
description: '每个列表项的 CSS 选择器',
|
||
},
|
||
{
|
||
key: 'fieldMappings',
|
||
label: '字段映射',
|
||
type: 'json',
|
||
required: true,
|
||
default: {},
|
||
description: 'JSON 对象,映射字段名到选择器',
|
||
placeholder: '{"title": ".title", "price": ".price", "link": "a@href"}',
|
||
},
|
||
{
|
||
key: 'limit',
|
||
label: '最大数量',
|
||
type: 'number',
|
||
required: false,
|
||
default: 50,
|
||
min: 1,
|
||
max: 500,
|
||
description: '最多提取多少条数据',
|
||
},
|
||
],
|
||
execute: async (params, context: ExecutionContext) => {
|
||
const { browser, onProgress, onLog } = context;
|
||
const url = params.url as string;
|
||
const itemSelector = params.itemSelector as string;
|
||
const fieldMappings = params.fieldMappings as Record<string, string>;
|
||
const limit = (params.limit as number) ?? 50;
|
||
|
||
onProgress('正在导航到页面...', 0);
|
||
onLog('info', `访问: ${url}`);
|
||
await browser.goto(url);
|
||
|
||
onProgress('等待列表加载...', 30);
|
||
await browser.wait(itemSelector, 10000);
|
||
|
||
onProgress('正在提取列表数据...', 50);
|
||
|
||
const scrapingScript = `
|
||
({ itemSelector, fieldMappings, limit }) => {
|
||
const items = document.querySelectorAll(itemSelector);
|
||
const results = [];
|
||
|
||
for (let i = 0; i < Math.min(items.length, limit); i++) {
|
||
const item = items[i];
|
||
const row = {};
|
||
|
||
for (const [field, selector] of Object.entries(fieldMappings)) {
|
||
// Handle attribute selectors like "a@href"
|
||
const parts = selector.split('@');
|
||
const cssSelector = parts[0];
|
||
const attr = parts[1];
|
||
|
||
const el = item.querySelector(cssSelector);
|
||
if (el) {
|
||
if (attr) {
|
||
row[field] = el.getAttribute(attr) || '';
|
||
} else {
|
||
row[field] = el.textContent?.trim() || '';
|
||
}
|
||
} else {
|
||
row[field] = '';
|
||
}
|
||
}
|
||
|
||
results.push(row);
|
||
}
|
||
|
||
return results;
|
||
}
|
||
`;
|
||
|
||
const result = await browser.eval(scrapingScript, [{
|
||
itemSelector,
|
||
fieldMappings,
|
||
limit,
|
||
}]);
|
||
|
||
const items = result as Array<Record<string, string>>;
|
||
onLog('info', `提取了 ${items.length} 条数据`);
|
||
|
||
onProgress('完成', 100);
|
||
return {
|
||
url: await browser.url(),
|
||
count: items.length,
|
||
data: items,
|
||
};
|
||
},
|
||
};
|
||
|
||
// ============================================================================
|
||
// Template: Scrape Images
|
||
// ============================================================================
|
||
|
||
const scrapeImagesTemplate: TaskTemplate = {
|
||
id: 'scrape_images',
|
||
name: '抓取图片列表',
|
||
description: '提取页面中的图片 URL',
|
||
category: 'scraping',
|
||
icon: 'Image',
|
||
params: [
|
||
{
|
||
key: 'url',
|
||
label: '网页地址',
|
||
type: 'url',
|
||
required: true,
|
||
placeholder: 'https://example.com/gallery',
|
||
},
|
||
{
|
||
key: 'imageSelector',
|
||
label: '图片选择器',
|
||
type: 'text',
|
||
required: false,
|
||
default: 'img',
|
||
placeholder: 'img.gallery-image',
|
||
description: '图片元素的 CSS 选择器',
|
||
},
|
||
{
|
||
key: 'minWidth',
|
||
label: '最小宽度',
|
||
type: 'number',
|
||
required: false,
|
||
default: 100,
|
||
description: '忽略小于此宽度的图片',
|
||
},
|
||
{
|
||
key: 'minHeight',
|
||
label: '最小高度',
|
||
type: 'number',
|
||
required: false,
|
||
default: 100,
|
||
description: '忽略小于此高度的图片',
|
||
},
|
||
],
|
||
execute: async (params, context: ExecutionContext) => {
|
||
const { browser, onProgress, onLog } = context;
|
||
const url = params.url as string;
|
||
const imageSelector = (params.imageSelector as string) ?? 'img';
|
||
const minWidth = (params.minWidth as number) ?? 100;
|
||
const minHeight = (params.minHeight as number) ?? 100;
|
||
|
||
onProgress('正在导航到页面...', 0);
|
||
onLog('info', `访问: ${url}`);
|
||
await browser.goto(url);
|
||
|
||
onProgress('正在提取图片...', 50);
|
||
|
||
const extractScript = `
|
||
({ imageSelector, minWidth, minHeight }) => {
|
||
const images = document.querySelectorAll(imageSelector);
|
||
const results = [];
|
||
|
||
images.forEach(img => {
|
||
const width = img.naturalWidth || img.width;
|
||
const height = img.naturalHeight || img.height;
|
||
|
||
if (width >= minWidth && height >= minHeight) {
|
||
results.push({
|
||
src: img.src,
|
||
alt: img.alt || '',
|
||
width,
|
||
height,
|
||
});
|
||
}
|
||
});
|
||
|
||
return results;
|
||
}
|
||
`;
|
||
|
||
const result = await browser.eval(extractScript, [{
|
||
imageSelector,
|
||
minWidth,
|
||
minHeight,
|
||
}]);
|
||
|
||
const images = result as Array<{
|
||
src: string;
|
||
alt: string;
|
||
width: number;
|
||
height: number;
|
||
}>;
|
||
|
||
onLog('info', `找到 ${images.length} 张图片`);
|
||
|
||
onProgress('完成', 100);
|
||
return {
|
||
url: await browser.url(),
|
||
count: images.length,
|
||
images,
|
||
};
|
||
},
|
||
};
|
||
|
||
// ============================================================================
|
||
// Template: Scrape Links
|
||
// ============================================================================
|
||
|
||
const scrapeLinksTemplate: TaskTemplate = {
|
||
id: 'scrape_links',
|
||
name: '抓取链接列表',
|
||
description: '提取页面中的所有链接',
|
||
category: 'scraping',
|
||
icon: 'Link',
|
||
params: [
|
||
{
|
||
key: 'url',
|
||
label: '网页地址',
|
||
type: 'url',
|
||
required: true,
|
||
placeholder: 'https://example.com',
|
||
},
|
||
{
|
||
key: 'linkSelector',
|
||
label: '链接选择器',
|
||
type: 'text',
|
||
required: false,
|
||
default: 'a[href]',
|
||
placeholder: 'a[href]',
|
||
description: '链接元素的 CSS 选择器',
|
||
},
|
||
{
|
||
key: 'filterPattern',
|
||
label: 'URL 过滤',
|
||
type: 'text',
|
||
required: false,
|
||
placeholder: 'example.com',
|
||
description: '只保留包含此文本的链接',
|
||
},
|
||
{
|
||
key: 'excludePattern',
|
||
label: '排除模式',
|
||
type: 'text',
|
||
required: false,
|
||
placeholder: '#, javascript:',
|
||
description: '排除包含此文本的链接',
|
||
},
|
||
],
|
||
execute: async (params, context: ExecutionContext) => {
|
||
const { browser, onProgress, onLog } = context;
|
||
const url = params.url as string;
|
||
const linkSelector = (params.linkSelector as string) ?? 'a[href]';
|
||
const filterPattern = params.filterPattern as string | undefined;
|
||
const excludePattern = params.excludePattern as string | undefined;
|
||
|
||
onProgress('正在导航到页面...', 0);
|
||
onLog('info', `访问: ${url}`);
|
||
await browser.goto(url);
|
||
|
||
onProgress('正在提取链接...', 50);
|
||
|
||
const extractScript = `
|
||
({ linkSelector, filterPattern, excludePattern }) => {
|
||
const links = document.querySelectorAll(linkSelector);
|
||
const results = [];
|
||
const seen = new Set();
|
||
|
||
links.forEach(a => {
|
||
const href = a.href;
|
||
const text = a.textContent?.trim() || '';
|
||
|
||
if (!href || seen.has(href)) return;
|
||
|
||
// Apply filter
|
||
if (filterPattern && !href.includes(filterPattern) && !text.includes(filterPattern)) {
|
||
return;
|
||
}
|
||
|
||
// Apply exclude
|
||
if (excludePattern) {
|
||
const patterns = excludePattern.split(',').map(p => p.trim());
|
||
for (const p of patterns) {
|
||
if (href.includes(p)) return;
|
||
}
|
||
}
|
||
|
||
seen.add(href);
|
||
results.push({ href, text });
|
||
});
|
||
|
||
return results;
|
||
}
|
||
`;
|
||
|
||
const result = await browser.eval(extractScript, [{
|
||
linkSelector,
|
||
filterPattern,
|
||
excludePattern,
|
||
}]);
|
||
|
||
const links = result as Array<{ href: string; text: string }>;
|
||
onLog('info', `找到 ${links.length} 个链接`);
|
||
|
||
onProgress('完成', 100);
|
||
return {
|
||
url: await browser.url(),
|
||
count: links.length,
|
||
links,
|
||
};
|
||
},
|
||
};
|
||
|
||
// ============================================================================
|
||
// Template: Scrape Table
|
||
// ============================================================================
|
||
|
||
const scrapeTableTemplate: TaskTemplate = {
|
||
id: 'scrape_table',
|
||
name: '抓取表格数据',
|
||
description: '从 HTML 表格中提取数据',
|
||
category: 'scraping',
|
||
icon: 'Table',
|
||
params: [
|
||
{
|
||
key: 'url',
|
||
label: '网页地址',
|
||
type: 'url',
|
||
required: true,
|
||
placeholder: 'https://example.com/data',
|
||
},
|
||
{
|
||
key: 'tableSelector',
|
||
label: '表格选择器',
|
||
type: 'text',
|
||
required: false,
|
||
default: 'table',
|
||
placeholder: 'table.data-table',
|
||
description: '表格元素的 CSS 选择器',
|
||
},
|
||
{
|
||
key: 'headerRow',
|
||
label: '表头行',
|
||
type: 'number',
|
||
required: false,
|
||
default: 1,
|
||
min: 0,
|
||
max: 10,
|
||
description: '表头所在行(0 表示无表头)',
|
||
},
|
||
],
|
||
execute: async (params, context: ExecutionContext) => {
|
||
const { browser, onProgress, onLog } = context;
|
||
const url = params.url as string;
|
||
const tableSelector = (params.tableSelector as string) ?? 'table';
|
||
const headerRow = (params.headerRow as number) ?? 1;
|
||
|
||
onProgress('正在导航到页面...', 0);
|
||
onLog('info', `访问: ${url}`);
|
||
await browser.goto(url);
|
||
|
||
onProgress('正在提取表格数据...', 50);
|
||
|
||
const extractScript = `
|
||
({ tableSelector, headerRow }) => {
|
||
const table = document.querySelector(tableSelector);
|
||
if (!table) return { headers: [], rows: [] };
|
||
|
||
const allRows = table.querySelectorAll('tr');
|
||
|
||
// Extract headers
|
||
let headers = [];
|
||
if (headerRow > 0 && allRows[headerRow - 1]) {
|
||
const headerCells = allRows[headerRow - 1].querySelectorAll('th, td');
|
||
headers = Array.from(headerCells).map(cell => cell.textContent?.trim() || '');
|
||
}
|
||
|
||
// Extract data rows
|
||
const startRow = headerRow > 0 ? headerRow : 0;
|
||
const rows = [];
|
||
|
||
for (let i = startRow; i < allRows.length; i++) {
|
||
const cells = allRows[i].querySelectorAll('td, th');
|
||
const rowData = Array.from(cells).map(cell => cell.textContent?.trim() || '');
|
||
if (rowData.some(d => d)) { // Skip empty rows
|
||
rows.push(rowData);
|
||
}
|
||
}
|
||
|
||
return { headers, rows };
|
||
}
|
||
`;
|
||
|
||
const result = await browser.eval(extractScript, [{ tableSelector, headerRow }]) as {
|
||
headers: string[];
|
||
rows: string[][];
|
||
};
|
||
|
||
onLog('info', `提取了 ${result.rows.length} 行数据,${result.headers.length} 列`);
|
||
|
||
onProgress('完成', 100);
|
||
return {
|
||
url: await browser.url(),
|
||
headers: result.headers,
|
||
rowCount: result.rows.length,
|
||
data: result.rows,
|
||
};
|
||
},
|
||
};
|
||
|
||
// ============================================================================
// Export All Scraping Templates
// ============================================================================

// All scraping-category templates defined in this module, in the order they
// are presented to the template picker.
export const scrapingTemplates: TaskTemplate[] = [
  scrapeTextTemplate,
  scrapeListTemplate,
  scrapeImagesTemplate,
  scrapeLinksTemplate,
  scrapeTableTemplate,
];