diff --git a/admin/src/views/Tasks.vue b/admin/src/views/Tasks.vue index 21f35ac..9ad0115 100644 --- a/admin/src/views/Tasks.vue +++ b/admin/src/views/Tasks.vue @@ -202,6 +202,17 @@ /> + + + + + +
+
+ 拦截api配置 + 添加规则 +
+
+ + +
+
@@ -514,6 +544,10 @@ + + + +
@@ -683,6 +717,7 @@

完整 URL: {{ req.url }}

状态码: {{ req.status }}

+

Headers: {{ req.headers }}

响应内容:
{{ formatJSON(req.body) }}
@@ -727,6 +762,313 @@
+ + + + + + + + +
+
+
+ + 单条任务 + +
+
+ +
+ + + + + + + +
+ +
+
+
+ + + + + + + + + + + + + +
+ + {{ editForm.cache.enabled ? '开启 (节省资源)' : '关闭 (实时抓取)' }} +
+
+
+
+ + + + +
+
+ + + + + +
+ + + 不解析 + 智能正文 (GNE) + 大模型提取 (LLM) + 自定义规则 (XPath) + + + +
+ +
+ +
+
+ 常用模板: + + 文章提取 + 商品详情 + 联系方式 + +
+ + + + + +
输入自定义字段名并按回车即可添加
+
+ + + + +
+ +
+
+ XPath 规则配置 + 添加规则 +
+
+ + + +
+
+ +
+
+ 拦截api配置 + 添加规则 +
+
+ + +
+
+
+
+ + + + + +
+ + + + + + + + + + + + + + + + + + +
+ + × + +
+
+ +
+
+
+ 反检测模式 (Stealth) + 绕过大多数常见的机器人检测系统 +
+ +
+
+
+ 自动截图 + 保存网页快照用于调试或取证 +
+ +
+
+
+ 全屏快照 + 捕获整个页面高度而不仅是可视区域 +
+ +
+
+
+ 屏蔽图片/媒体 + 不加载图片和视频资源,加快抓取速度 +
+ +
+
+
+
+ + + + + +
+
代理配置
+ + + + + + + + + + + + + + + +
+
+
+
+ + +
+
@@ -826,12 +1168,60 @@ const resetFilter = () => {
 const showScrapeDialog = ref(false)
 const showTaskDialog = ref(false)
 const showApiConfigDialog = ref(false)
+const showEditDialog = ref(false)
 const apiConfigJson = ref('')
 const activeConfigTab = ref('basic')
+const activeEditConfigTab = ref('basic')
 const matchedRules = ref([])
 const matchedCookies = ref(false)
 let lastCheckedDomain = ''

+// State backing the edit-task dialog (kept separate from the create-task form state)
+const editSubmitMode = ref('single')
+const editSelectedLlmFields = ref(['title', 'content'])
+const editXpathRules = ref([
+  { field: 'title', path: '//h1' },
+  { field: 'content', path: "//div[@class='article-body']" }
+])
+const currentEditTaskId = ref('')
+
+const editForm = ref({
+  url: '',
+  params: {
+    wait_for: 'networkidle',
+    wait_time: 3000,
+    timeout: 30000,
+    selector: '',
+    screenshot: true,
+    is_fullscreen: false,
+    block_images: false,
+    block_media: false,
+    user_agent: '',
+    viewport: {
+      width: 1920,
+      height: 1080
+    },
+    proxy: {
+      server: '',
+      username: '',
+      password: ''
+    },
+    cookies: '',
+    stealth: true,
+    parser: '',
+    parser_config: {
+      fields: ['title', 'content']
+    },
+    intercept_apis: [],
+    intercept_continue: false
+  },
+  cache: {
+    enabled: true,
+    ttl: 3600
+  },
+  priority: 1
+})
+
 const getParserTypeTag = (type) => {
   const map = {
     'gne': 'success',
@@ -956,6 +1346,28 @@ const applyLlmPreset = (type) => {
   }
 }

+// Edit dialog: replace the selected LLM fields with a copy of the chosen preset
+const applyEditLlmPreset = (type) => {
+  const presets = {
+    article: ['title', 'content', 'author', 'publish_time'],
+    product: ['product_name', 'price', 'description', 'specifications'],
+    contact: ['company_name', 'phone', 'email', 'address']
+  }
+  if (presets[type]) {
+    editSelectedLlmFields.value = [...presets[type]]
+    ElMessage.success('已应用模板')
+  }
+}
+
+// Edit dialog: append an empty XPath rule row / remove the row at `index`
+const addEditXpathRule = () => {
+  editXpathRules.value.push({ field: '', path: '' })
+}
+
+const removeEditXpathRule = (index) => {
+  editXpathRules.value.splice(index, 1)
+}
+
 const handleLlmFieldsChange = (val) => {
   scrapeForm.value.params.parser_config.fields = val
 }
@@ -1173,6 +1585,112 @@ const viewTask = async (task) => {
   }
 }

+const copyTask = async (task) => { // Prefill the create-task dialog from an existing task's configuration
+  try {
+    // Fetch the task without its HTML / screenshot payloads
+    const data = await getTask(task.task_id, { include_html: false, include_screenshot: false })
+
+    // Fill the create-task form: `??` keeps an explicit 0, spreads backfill keys the stored task lacks
+    scrapeForm.value = {
+      url: data.url || '',
+      params: {
+        wait_for: data.params?.wait_for || 'networkidle',
+        wait_time: data.params?.wait_time ?? 3000,
+        timeout: data.params?.timeout ?? 30000,
+        selector: data.params?.selector || '',
+        screenshot: data.params?.screenshot !== false,
+        is_fullscreen: data.params?.is_fullscreen || false,
+        block_images: data.params?.block_images || false,
+        block_media: data.params?.block_media || false,
+        user_agent: data.params?.user_agent || '',
+        viewport: { width: 1920, height: 1080, ...data.params?.viewport },
+        proxy: { server: '', username: '', password: '', ...data.params?.proxy },
+        cookies: typeof data.params?.cookies === 'string' ? data.params.cookies : (data.params?.cookies ? JSON.stringify(data.params.cookies) : ''),
+        stealth: data.params?.stealth !== false,
+        parser: data.params?.parser || '',
+        parser_config: data.params?.parser_config || { fields: ['title', 'content'] },
+        intercept_apis: data.params?.intercept_apis || [],
+        intercept_continue: data.params?.intercept_continue || false
+      },
+      cache: data.cache || { enabled: true, ttl: 3600 },
+      priority: data.priority ?? 1
+    }
+
+    // Mirror the parser settings into the refs that drive the LLM / XPath editors
+    if (scrapeForm.value.params.parser === 'llm') {
+      selectedLlmFields.value = scrapeForm.value.params.parser_config?.fields || ['title', 'content']
+    } else if (scrapeForm.value.params.parser === 'xpath') {
+      const rules = scrapeForm.value.params.parser_config?.rules || {}
+      xpathRules.value = Object.entries(rules).map(([field, path]) => ({ field, path }))
+    } else {
+      selectedLlmFields.value = ['title', 'content']
+      xpathRules.value = [
+        { field: 'title', path: '//h1' },
+        { field: 'content', path: "//div[@class='article-body']" }
+      ]
+    }
+
+    activeConfigTab.value = 'basic'
+    showScrapeDialog.value = true
+    ElMessage.success('任务配置已复制到新建任务表单')
+  } catch (error) {
+    ElMessage.error('获取任务详情失败: ' + (error.response?.data?.detail || error.message))
+  }
+}
+
+const editTask = async (task) => { // Load an existing task into the edit dialog
+  try {
+    // Fetch the task without its HTML / screenshot payloads
+    const data = await getTask(task.task_id, { include_html: false, include_screenshot: false })
+    currentEditTaskId.value = task.task_id
+
+    // Fill the edit form: `??` keeps an explicit 0, spreads backfill keys the stored task lacks
+    editForm.value = {
+      url: data.url || '',
+      params: {
+        wait_for: data.params?.wait_for || 'networkidle',
+        wait_time: data.params?.wait_time ?? 3000,
+        timeout: data.params?.timeout ?? 30000,
+        selector: data.params?.selector || '',
+        screenshot: data.params?.screenshot !== false,
+        is_fullscreen: data.params?.is_fullscreen || false,
+        block_images: data.params?.block_images || false,
+        block_media: data.params?.block_media || false,
+        user_agent: data.params?.user_agent || '',
+        viewport: { width: 1920, height: 1080, ...data.params?.viewport },
+        proxy: { server: '', username: '', password: '', ...data.params?.proxy },
+        cookies: typeof data.params?.cookies === 'string' ? data.params.cookies : (data.params?.cookies ? JSON.stringify(data.params.cookies) : ''),
+        stealth: data.params?.stealth !== false,
+        parser: data.params?.parser || '',
+        parser_config: data.params?.parser_config || { fields: ['title', 'content'] },
+        intercept_apis: data.params?.intercept_apis || [],
+        intercept_continue: data.params?.intercept_continue || false
+      },
+      cache: data.cache || { enabled: true, ttl: 3600 },
+      priority: data.priority ?? 1
+    }
+
+    // Mirror the parser settings into the refs that drive the LLM / XPath editors
+    if (editForm.value.params.parser === 'llm') {
+      editSelectedLlmFields.value = editForm.value.params.parser_config?.fields || ['title', 'content']
+    } else if (editForm.value.params.parser === 'xpath') {
+      const rules = editForm.value.params.parser_config?.rules || {}
+      editXpathRules.value = Object.entries(rules).map(([field, path]) => ({ field, path }))
+    } else {
+      editSelectedLlmFields.value = ['title', 'content']
+      editXpathRules.value = [
+        { field: 'title', path: '//h1' },
+        { field: 'content', path: "//div[@class='article-body']" }
+      ]
+    }
+
+    activeEditConfigTab.value = 'basic'
+    showEditDialog.value = true
+  } catch (error) {
+    ElMessage.error('获取任务详情失败: ' + (error.response?.data?.detail || error.message))
+  }
+}
+
// 监听标签页切换,按需加载大数据字段 watch(activeTab, async (newTab) => { if (!currentTask.value) return @@ -1241,10 +1759,6 @@ const submitTask = async () => { if (!data.params.proxy.username) delete data.params.proxy.username if (!data.params.proxy.password) delete data.params.proxy.password } - - if (!data.params.intercept_apis || data.params.intercept_apis.length === 0) { - data.params.intercept_apis = null - } if (data.params.cookies) { const cookieVal = data.params.cookies.trim() @@ -1349,6 +1863,87 @@ const resetForm = () => { activeConfigTab.value = 'basic' } +const submitEditTask = async () => { + // 验证输入 + if (!editForm.value.url) { + ElMessage.warning('请输入目标 URL') + return + } + + loading.value = true + try { + // 深度克隆表单数据 + const baseConfig = JSON.parse(JSON.stringify(editForm.value)) + + // 统一处理参数格式 + const processParams = (data) => { + if (!data.params.user_agent) data.params.user_agent = null + if (!data.params.selector) data.params.selector = null + + // 处理解析配置 + if (data.params.parser === 'llm') { + data.params.parser_config = { fields: editSelectedLlmFields.value } + } else if (data.params.parser === 'xpath') { + const rules = {} + editXpathRules.value.forEach(r => { + if (r.field && r.path) rules[r.field] = r.path + }) + data.params.parser_config = { rules } + } else if (data.params.parser === 'gne') { + data.params.parser_config = {} + } else { + data.params.parser_config = null + } + + if (!data.params.proxy || !data.params.proxy.server) { + data.params.proxy = null + } else { + if (!data.params.proxy.username) delete data.params.proxy.username + if (!data.params.proxy.password) delete data.params.proxy.password + } + + if (!data.params.intercept_apis || data.params.intercept_apis.length === 0) { + data.params.intercept_apis = null + } + + if (data.params.cookies) { + const cookieVal = data.params.cookies.trim() + if ((cookieVal.startsWith('[') && cookieVal.endsWith(']')) || + (cookieVal.startsWith('{') && cookieVal.endsWith('}'))) { + try { + 
data.params.cookies = JSON.parse(cookieVal) + } catch (e) { + console.warn('Cookies parse failed, using as string') + } + } + } else { + data.params.cookies = null + } + + if (!data.params.viewport || !data.params.viewport.width || !data.params.viewport.height) { + data.params.viewport = null + } + return data + } + + const submitData = processParams(baseConfig) + + // 调用 API 更新任务 + // 这里假设后端有更新任务的 API 端点 + // 如果没有专门的更新 API,可以先删除旧任务,再创建新任务 + // 暂时使用 scrapeAsync 来创建新任务 + await scrapeAsync(submitData) + + ElMessage.success('任务更新成功') + showEditDialog.value = false + loadTasks() + } catch (error) { + ElMessage.error('提交失败: ' + (error.response?.data?.detail || error.message)) + } finally { + loading.value = false + } +} + const getStatusType = (status) => { const types = { pending: 'info', @@ -1445,6 +2040,24 @@ onMounted(() => { margin-right: 8px; } +.copy-task-btn { + transition: all 0.3s ease; + background-color: #409EFF; + border-color: #409EFF; +} + +.copy-task-btn:hover { + background-color: #66B1FF; + border-color: #66B1FF; + transform: translateY(-1px); + box-shadow: 0 2px 8px rgba(64, 158, 255, 0.3); +} + +.copy-task-btn:active { + transform: translateY(0); + box-shadow: 0 1px 4px rgba(64, 158, 255, 0.2); +} + /* 列表 UI 优化 */ .config-tabs { border-radius: 12px; diff --git a/app/core/browser.py b/app/core/browser.py index cf87a0b..6272279 100644 --- a/app/core/browser.py +++ b/app/core/browser.py @@ -6,6 +6,7 @@ import asyncio import logging import sys +import tempfile import threading import time from typing import Optional @@ -101,13 +102,13 @@ async def get_browser(self) -> Browser: settings.browser_type, playwright.chromium ) - # 启动浏览器参数 launch_args = [] # 反检测参数 if settings.stealth_mode: launch_args.extend([ + '--disable-web-security', "--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage", diff --git a/app/core/scraper.py b/app/core/scraper.py index 14fc909..09fde83 100644 --- a/app/core/scraper.py +++ b/app/core/scraper.py @@ -19,7 +19,6 @@ 
logger = logging.getLogger(__name__) - class Scraper: """网页抓取器""" @@ -58,7 +57,7 @@ async def scrape( # 创建浏览器上下文参数 context_options = { "java_script_enabled": True, - "user_agent": user_agent + # "user_agent": user_agent } if proxy_config: @@ -321,7 +320,6 @@ async def route_handler(route, request): # 检查请求 URL 是否匹配任何拦截模式 request_url = request.url matched_pattern = None - for pattern in api_patterns: if url_matches_pattern(request_url, pattern): matched_pattern = pattern diff --git a/tests/test_playwright.py b/tests/test_playwright.py new file mode 100644 index 0000000..5c579fa --- /dev/null +++ b/tests/test_playwright.py @@ -0,0 +1,31 @@ +import re + +import asyncio +import logging +import sys +import threading +import time +from typing import Optional +from playwright.sync_api import sync_playwright + +with sync_playwright() as playwright: + # 启动浏览器参数 + launch_args = [] + launch_args.extend([ + '--disable-blink-features=AutomationControlled', + '--disable-web-security', + '--disable-features=IsolateOrigins,site-per-process' + ]) + + browser = playwright.chromium.launch( + executable_path=r'C:\Users\Administrator\AppData\Local\Chromium\Application\chrome.exe' , + headless=False, + args=launch_args + ) + + page = browser.new_page() + page.goto('https://www.bayut.com/for-sale/off-plan/property/uae/') + print(page.title()) + + page.click("aaa") + browser.close() \ No newline at end of file diff --git a/tests/test_playwright2.py b/tests/test_playwright2.py new file mode 100644 index 0000000..bd72b55 --- /dev/null +++ b/tests/test_playwright2.py @@ -0,0 +1,186 @@ +from playwright.sync_api import sync_playwright +import time +import random + + +def check_detection(page): + """检查是否被检测为机器人""" + detection_indicators = page.evaluate(""" + () => { + const indicators = {}; + + // 检查 navigator.webdriver + indicators.webdriver = navigator.webdriver === true; + + // 检查 chrome 属性 + indicators.hasChrome = typeof window.chrome !== 'undefined'; + indicators.chromeRuntime = 
window.chrome && window.chrome.runtime; + + // 检查插件数量 + indicators.pluginsLength = navigator.plugins.length; + + // 检查 languages + indicators.languages = navigator.languages; + + // 检查 permissions + indicators.permissions = navigator.permissions; + + return indicators; + } + """) + + print("检测指标:", detection_indicators) + + if detection_indicators.get('webdriver'): + print("⚠️ 被检测到自动化!") + return True + return False + + +with sync_playwright() as p: + # 1. 使用无头模式或添加参数 + browser = p.chromium.launch( + executable_path=r'C:\Users\Administrator\AppData\Local\Chromium\Application\chrome.exe' , + + headless=False, # 使用有头模式 + args=[ + '--disable-blink-features=AutomationControlled', + '--disable-dev-shm-usage', + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-web-security', + '--disable-features=IsolateOrigins,site-per-process', + '--disable-site-isolation-trials', + '--disable-background-timer-throttling', + '--disable-backgrounding-occluded-windows', + '--disable-renderer-backgrounding', + '--disable-back-forward-cache', + '--disable-component-update', + '--disable-domain-reliability', + '--disable-client-side-phishing-detection', + '--disable-sync', + '--metrics-recording-only', + '--disable-default-apps', + '--mute-audio', + '--no-default-browser-check', + '--no-first-run', + '--use-fake-device-for-media-stream', + '--use-fake-ui-for-media-stream', + '--autoplay-policy=no-user-gesture-required', + ] + ) + + # 2. 创建上下文时添加额外参数 + context = browser.new_context( + viewport={'width': 1920, 'height': 1080}, + user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + locale='zh-CN', + timezone_id='Asia/Shanghai', + permissions=['geolocation'], + # 设置更真实的设备信息 + device_scale_factor=1, + has_touch=False, + is_mobile=False, + java_script_enabled=True, + ) + + # 3. 
注入 JavaScript 移除自动化特征 + context.add_init_script(""" + // 移除 webdriver 属性 + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + + // 修改 languages + Object.defineProperty(navigator, 'languages', { + get: () => ['zh-CN', 'zh', 'en-US', 'en'] + }); + + // 修改 plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5] + }); + + // 覆盖 chrome 属性 + window.chrome = { + runtime: {}, + loadTimes: function() {}, + csi: function() {}, + app: {} + }; + + // 隐藏 permissions 中的 automation + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? + Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + + // 修改 navigator 属性 + Object.defineProperty(navigator, 'platform', { + get: () => 'Win32' + }); + + // 添加一些真实浏览器才会有的属性 + Object.defineProperty(document, 'hidden', { + get: () => false + }); + + Object.defineProperty(document, 'visibilityState', { + get: () => 'visible' + }); + """) + + page = context.new_page() + + check_detection(page) + + # 4. 访问页面 + page.goto('https://www.bayut.com/for-sale/off-plan/property/uae/', wait_until='networkidle') + + # 5. 随机化鼠标移动 + # simulate_human_mouse_movement(page) + + # 6. 
模拟人类输入 + # if page.is_visible('input'): + # simulate_human_typing(page, 'input', '搜索内容') + + # 保持页面打开 + time.sleep(10) + + page.click("aaaa") + browser.close() + +def simulate_human_mouse_movement(page): + """模拟人类鼠标移动""" + # 移动到随机位置 + page.mouse.move( + random.randint(100, 500), + random.randint(100, 500) + ) + + # 随机移动鼠标 + for _ in range(random.randint(3, 7)): + page.mouse.move( + random.randint(0, 1000), + random.randint(0, 800), + steps=random.randint(20, 50) # 步骤越多越像真人 + ) + time.sleep(random.uniform(0.1, 0.3)) + +def simulate_human_typing(page, selector, text): + """模拟人类打字""" + page.click(selector) + time.sleep(random.uniform(0.5, 1.5)) + + for char in text: + page.keyboard.type(char) + time.sleep(random.uniform(0.05, 0.2)) # 随机延迟 + + # 偶尔删除重输 + if random.random() < 0.3: + for _ in range(random.randint(1, 3)): + page.keyboard.press('Backspace') + time.sleep(random.uniform(0.1, 0.3)) + page.keyboard.type(text[-random.randint(1, 3):]) \ No newline at end of file diff --git a/tests/test_playwright_async.py b/tests/test_playwright_async.py new file mode 100644 index 0000000..65c8d6a --- /dev/null +++ b/tests/test_playwright_async.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Playwright 最简异步 Demo +功能:启动浏览器,访问页面,截图,关闭 +""" + +import asyncio +from playwright.async_api import async_playwright + + +async def simplest_demo(): + """ + 最简单的 Playwright 异步示例 + """ + print("🚀 开始最简单的 Playwright Demo") + + # 1. 创建 Playwright 实例 + async with async_playwright() as p: + # 2. 启动浏览器(Chromium) + browser = await p.chromium.launch( + headless=False , # 显示浏览器窗口 + args=[ + '--disable-blink-features=AutomationControlled', + '--disable-web-security', + '--disable-features=IsolateOrigins,site-per-process' + ] + ) + + # 3. 创建页面上下文 + context = await browser.new_context() + + # 4. 创建新页面 + page = await context.new_page() + + # 5. 导航到页面 + print("🌐 正在访问百度...") + await page.goto('https://www.bayut.com/for-sale/off-plan/property/uae/') + + # 6. 
获取页面标题 + title = await page.title() + print(f"📄 页面标题: {title}") + + # 7. 截图 + await page.screenshot(path='baidu_screenshot.png') + print("📸 截图已保存: baidu_screenshot.png") + + # 8. 等待几秒钟查看效果 + print("⏳ 等待 5 秒...") + await asyncio.sleep(5) + + # 9. 关闭浏览器(自动清理资源) + await browser.close() + + print("✅ Demo 完成!") + + +# 运行异步函数 +asyncio.run(simplest_demo()) \ No newline at end of file