@@ -826,12 +1168,60 @@ const resetFilter = () => {
const showScrapeDialog = ref(false)
const showTaskDialog = ref(false)
const showApiConfigDialog = ref(false)
+// Visibility flag of the edit-task dialog: editTask sets it to true once the
+// form is filled, submitEditTask sets it back to false after a successful submit.
+const showEditDialog = ref(false)
const apiConfigJson = ref('')
const activeConfigTab = ref('basic')
+// Active tab key of the edit dialog; editTask resets it to 'basic' before opening.
+const activeEditConfigTab = ref('basic')
const matchedRules = ref([])
const matchedCookies = ref(false)
let lastCheckedDomain = ''
+// ---------------------------------------------------------------------------
+// State backing the edit-task dialog
+// ---------------------------------------------------------------------------
+
+// NOTE(review): presumably the edit-dialog counterpart of the create dialog's
+// submit mode; nothing in the visible code reads it yet - confirm.
+const editSubmitMode = ref('single')
+
+// Field names sent as parser_config.fields when the edit form's parser is 'llm'.
+const editSelectedLlmFields = ref(['title', 'content'])
+
+// Rows of { field, path }; collapsed into parser_config.rules when the parser is 'xpath'.
+const editXpathRules = ref([
+  { field: 'title', path: '//h1' },
+  { field: 'content', path: "//div[@class='article-body']" }
+])
+
+// Id of the task currently loaded into the edit dialog (assigned by editTask).
+const currentEditTaskId = ref('')
+
+// Builds the default request parameters the edit form starts from.
+const createDefaultEditParams = () => {
+  const viewport = { width: 1920, height: 1080 }
+  const proxy = { server: '', username: '', password: '' }
+  const parserConfig = { fields: ['title', 'content'] }
+  return {
+    wait_for: 'networkidle',
+    wait_time: 3000,
+    timeout: 30000,
+    selector: '',
+    screenshot: true,
+    is_fullscreen: false,
+    block_images: false,
+    block_media: false,
+    user_agent: '',
+    viewport,
+    proxy,
+    cookies: '',
+    stealth: true,
+    parser: '',
+    parser_config: parserConfig,
+    intercept_apis: [],
+    intercept_continue: false
+  }
+}
+
+// Edit form model; editTask replaces it wholesale with the selected task's data.
+const editForm = ref({
+  url: '',
+  params: createDefaultEditParams(),
+  cache: { enabled: true, ttl: 3600 },
+  priority: 1
+})
+
+
const getParserTypeTag = (type) => {
const map = {
'gne': 'success',
@@ -956,6 +1346,28 @@ const applyLlmPreset = (type) => {
}
}
+/**
+ * Apply one of the built-in LLM field presets to the edit dialog.
+ *
+ * When no preset is found for `type`, nothing happens. Otherwise the edit
+ * dialog's selected fields are replaced with a copy of the preset and a
+ * success toast is shown.
+ *
+ * @param {string} type - Preset key: 'article', 'product' or 'contact'.
+ */
+const applyEditLlmPreset = (type) => {
+  const presetFields = {
+    article: ['title', 'content', 'author', 'publish_time'],
+    product: ['product_name', 'price', 'description', 'specifications'],
+    contact: ['company_name', 'phone', 'email', 'address']
+  }[type]
+
+  if (!presetFields) return
+
+  // Copy so later edits to the selection never touch the preset array.
+  editSelectedLlmFields.value = [...presetFields]
+  ElMessage.success('已应用模板')
+}
+
+// XPath rule management for the edit dialog
+/** Append an empty { field, path } row to the edit dialog's XPath rule list. */
+const addEditXpathRule = () => {
+  const blankRule = { field: '', path: '' }
+  editXpathRules.value.push(blankRule)
+}
+
+/**
+ * Remove a single row from the edit dialog's XPath rule list.
+ *
+ * @param {number} index - Position of the row to remove.
+ */
+const removeEditXpathRule = (index) => {
+  const rows = editXpathRules.value
+  rows.splice(index, 1)
+}
+
+// Mirror the given LLM field selection into scrapeForm's parser_config.fields.
const handleLlmFieldsChange = (val) => {
scrapeForm.value.params.parser_config.fields = val
}
@@ -1173,6 +1585,112 @@ const viewTask = async (task) => {
}
}
+// Fallback parser selections used when a task carries none of its own.
+const defaultLlmFields = () => ['title', 'content']
+const defaultXpathRows = () => [
+  { field: 'title', path: '//h1' },
+  { field: 'content', path: "//div[@class='article-body']" }
+]
+
+/**
+ * Convert a task-detail payload into the object shape shared by scrapeForm
+ * and editForm, filling in defaults for everything the task does not define.
+ *
+ * @param {object} data - Task detail as returned by getTask.
+ * @returns {object} A fresh { url, params, cache, priority } form object.
+ */
+const buildFormFromTask = (data) => {
+  const params = data.params ?? {}
+
+  // The submit handlers call .trim() on the cookies value and JSON.parse it
+  // before sending, so a stored task may hold an array/object here. Turn it
+  // back into text; otherwise the later .trim() throws a TypeError.
+  let cookieText = ''
+  if (typeof params.cookies === 'string') {
+    cookieText = params.cookies
+  } else if (params.cookies) {
+    cookieText = JSON.stringify(params.cookies)
+  }
+
+  return {
+    url: data.url || '',
+    params: {
+      wait_for: params.wait_for || 'networkidle',
+      // `??` rather than `||`: an explicit 0 is kept instead of being replaced.
+      wait_time: params.wait_time ?? 3000,
+      timeout: params.timeout ?? 30000,
+      selector: params.selector || '',
+      screenshot: params.screenshot !== false,
+      is_fullscreen: params.is_fullscreen || false,
+      block_images: params.block_images || false,
+      block_media: params.block_media || false,
+      user_agent: params.user_agent || '',
+      // Spread over the defaults so keys missing from the stored value
+      // (e.g. proxy credentials deleted on submit) still exist on the form.
+      viewport: { width: 1920, height: 1080, ...params.viewport },
+      proxy: { server: '', username: '', password: '', ...params.proxy },
+      cookies: cookieText,
+      stealth: params.stealth !== false,
+      parser: params.parser || '',
+      parser_config: params.parser_config ?? { fields: defaultLlmFields() },
+      intercept_apis: params.intercept_apis || [],
+      intercept_continue: params.intercept_continue || false
+    },
+    cache: { enabled: true, ttl: 3600, ...data.cache },
+    priority: data.priority ?? 1
+  }
+}
+
+/**
+ * Work out the LLM field list and XPath rows that match a form's parser.
+ * The selection belonging to the parser that is NOT active is reset to its
+ * default, so nothing stale from a previously opened task leaks through.
+ *
+ * @param {object} params - The `params` part of a form built by buildFormFromTask.
+ * @returns {{ llmFields: string[], xpathRows: Array<{field: string, path: string}> }}
+ */
+const deriveParserSelections = (params) => {
+  const config = params.parser_config ?? {}
+
+  const llmFields = params.parser === 'llm' && Array.isArray(config.fields)
+    ? [...config.fields]
+    : defaultLlmFields()
+
+  const xpathRows = params.parser === 'xpath'
+    ? Object.entries(config.rules ?? {}).map(([field, path]) => ({ field, path }))
+    : defaultXpathRows()
+
+  return { llmFields, xpathRows }
+}
+
+/**
+ * Load a task's configuration into the create-task form and open that dialog,
+ * so the task can be re-submitted as a new one.
+ *
+ * @param {{ task_id: string }} task - Row of the task list to copy.
+ */
+const copyTask = async (task) => {
+  try {
+    // Detail only; the heavy HTML/screenshot fields are not needed for a copy.
+    const data = await getTask(task.task_id, { include_html: false, include_screenshot: false })
+
+    const form = buildFormFromTask(data)
+    const { llmFields, xpathRows } = deriveParserSelections(form.params)
+
+    scrapeForm.value = form
+    selectedLlmFields.value = llmFields
+    xpathRules.value = xpathRows
+
+    activeConfigTab.value = 'basic'
+    showScrapeDialog.value = true
+    ElMessage.success('任务配置已复制到新建任务表单')
+  } catch (error) {
+    // Keep the cause visible for debugging; the toast alone hides it.
+    console.error('copyTask failed:', error)
+    ElMessage.error('获取任务详情失败')
+  }
+}
+
+/**
+ * Load a task's configuration into the edit form and open the edit dialog.
+ *
+ * @param {{ task_id: string }} task - Row of the task list to edit.
+ */
+const editTask = async (task) => {
+  try {
+    const data = await getTask(task.task_id, { include_html: false, include_screenshot: false })
+
+    const form = buildFormFromTask(data)
+    const { llmFields, xpathRows } = deriveParserSelections(form.params)
+
+    // Assigned only after the detail was fetched and converted, so a failed
+    // load never leaves the id pointing at a task that is not in the form.
+    currentEditTaskId.value = task.task_id
+    editForm.value = form
+    editSelectedLlmFields.value = llmFields
+    editXpathRules.value = xpathRows
+
+    activeEditConfigTab.value = 'basic'
+    showEditDialog.value = true
+  } catch (error) {
+    console.error('editTask failed:', error)
+    ElMessage.error('获取任务详情失败')
+  }
+}
+
+
// 监听标签页切换,按需加载大数据字段
watch(activeTab, async (newTab) => {
if (!currentTask.value) return
@@ -1241,10 +1759,6 @@ const submitTask = async () => {
if (!data.params.proxy.username) delete data.params.proxy.username
if (!data.params.proxy.password) delete data.params.proxy.password
}
-
- if (!data.params.intercept_apis || data.params.intercept_apis.length === 0) {
- data.params.intercept_apis = null
- }
if (data.params.cookies) {
const cookieVal = data.params.cookies.trim()
@@ -1349,6 +1863,87 @@ const resetForm = () => {
activeConfigTab.value = 'basic'
}
+/**
+ * Validate the edit-dialog form, normalise it into a request payload and submit it.
+ *
+ * The payload currently goes through scrapeAsync, i.e. the call that creates a
+ * task; currentEditTaskId is not sent anywhere.
+ * TODO(review): switch to a dedicated update endpoint once the backend has one.
+ *
+ * Shows a warning and returns early when the URL is empty. On success the
+ * dialog is closed and the task list reloaded; on failure the backend's
+ * `detail` (or the error message) is shown and the dialog stays open.
+ */
+const submitEditTask = async () => {
+  // Validate input: a whitespace-only URL counts as empty.
+  const targetUrl = String(editForm.value.url ?? '').trim()
+  if (!targetUrl) {
+    ElMessage.warning('请输入目标 URL')
+    return
+  }
+
+  loading.value = true
+  try {
+    // A JSON round-trip yields a plain, detached copy, so the normalisation
+    // below never mutates the reactive form the dialog is bound to.
+    const baseConfig = JSON.parse(JSON.stringify(editForm.value))
+    baseConfig.url = targetUrl
+
+    // Convert "empty" form values into what the request expects.
+    const processParams = (data) => {
+      const params = data.params
+
+      if (!params.user_agent) params.user_agent = null
+      if (!params.selector) params.selector = null
+
+      // Parser configuration depends on the selected parser.
+      if (params.parser === 'llm') {
+        params.parser_config = { fields: [...editSelectedLlmFields.value] }
+      } else if (params.parser === 'xpath') {
+        // Object.fromEntries creates own properties, so a field literally
+        // named "__proto__" is kept instead of being silently dropped.
+        const rules = Object.fromEntries(
+          editXpathRules.value
+            .filter((rule) => rule.field && rule.path)
+            .map((rule) => [rule.field, rule.path])
+        )
+        params.parser_config = { rules }
+      } else if (params.parser === 'gne') {
+        params.parser_config = {}
+      } else {
+        params.parser_config = null
+      }
+
+      if (!params.proxy || !params.proxy.server) {
+        params.proxy = null
+      } else {
+        if (!params.proxy.username) delete params.proxy.username
+        if (!params.proxy.password) delete params.proxy.password
+      }
+
+      // NOTE(review): submitTask no longer turns an empty intercept_apis into
+      // null, so the same shape (always an array) is kept here - confirm the
+      // backend expects a list.
+      if (!Array.isArray(params.intercept_apis)) params.intercept_apis = []
+
+      const rawCookies = params.cookies
+      if (typeof rawCookies === 'string') {
+        const cookieText = rawCookies.trim()
+        const looksLikeJson =
+          (cookieText.startsWith('[') && cookieText.endsWith(']')) ||
+          (cookieText.startsWith('{') && cookieText.endsWith('}'))
+
+        // Whitespace-only input is treated as "no cookies".
+        params.cookies = cookieText || null
+        if (looksLikeJson) {
+          try {
+            params.cookies = JSON.parse(cookieText)
+          } catch (e) {
+            // Best effort: fall back to sending the raw text.
+            console.warn('Cookies parse failed, using as string')
+          }
+        }
+      } else if (!rawCookies) {
+        params.cookies = null
+      }
+      // Otherwise cookies are already structured (array/object loaded from an
+      // existing task) and are passed through; calling .trim() on them would throw.
+
+      if (!params.viewport || !params.viewport.width || !params.viewport.height) {
+        params.viewport = null
+      }
+      return data
+    }
+
+    const submitData = processParams(baseConfig)
+
+    // No dedicated update API is used yet: this creates a task via scrapeAsync.
+    await scrapeAsync(submitData)
+
+    ElMessage.success('任务更新成功')
+    showEditDialog.value = false
+    loadTasks()
+  } catch (error) {
+    // `detail` may be structured (e.g. a list of validation errors); stringify
+    // it rather than showing "[object Object]".
+    const detail = error.response?.data?.detail
+    let reason = error.message
+    if (detail) {
+      reason = typeof detail === 'string' ? detail : JSON.stringify(detail)
+    }
+    ElMessage.error(`提交失败: ${reason}`)
+  } finally {
+    loading.value = false
+  }
+}
+
+
const getStatusType = (status) => {
const types = {
pending: 'info',
@@ -1445,6 +2040,24 @@ onMounted(() => {
margin-right: 8px;
}
+/* "Copy task" button: solid blue fill, animated state changes. */
+.copy-task-btn {
+  border-color: #409EFF;
+  background-color: #409EFF;
+  transition: all 0.3s ease;
+}
+
+/* Hover: lighter blue, lifted by 1px with a soft blue shadow. */
+.copy-task-btn:hover {
+  border-color: #66B1FF;
+  background-color: #66B1FF;
+  box-shadow: 0 2px 8px rgba(64, 158, 255, 0.3);
+  transform: translateY(-1px);
+}
+
+/* Pressed: back to the resting position with a tighter shadow. */
+.copy-task-btn:active {
+  box-shadow: 0 1px 4px rgba(64, 158, 255, 0.2);
+  transform: translateY(0);
+}
+
+
/* 列表 UI 优化 */
.config-tabs {
border-radius: 12px;
diff --git a/app/core/browser.py b/app/core/browser.py
index cf87a0b..6272279 100644
--- a/app/core/browser.py
+++ b/app/core/browser.py
@@ -6,6 +6,7 @@
import asyncio
import logging
import sys
+import tempfile
import threading
import time
from typing import Optional
@@ -101,13 +102,13 @@ async def get_browser(self) -> Browser:
settings.browser_type,
playwright.chromium
)
-
# 启动浏览器参数
launch_args = []
# 反检测参数
if settings.stealth_mode:
launch_args.extend([
+ '--disable-web-security',
"--no-sandbox",
"--disable-setuid-sandbox",
"--disable-dev-shm-usage",
diff --git a/app/core/scraper.py b/app/core/scraper.py
index 14fc909..09fde83 100644
--- a/app/core/scraper.py
+++ b/app/core/scraper.py
@@ -19,7 +19,6 @@
logger = logging.getLogger(__name__)
-
class Scraper:
"""网页抓取器"""
@@ -58,7 +57,7 @@ async def scrape(
# 创建浏览器上下文参数
context_options = {
"java_script_enabled": True,
- "user_agent": user_agent
+ # "user_agent": user_agent
}
if proxy_config:
@@ -321,7 +320,6 @@ async def route_handler(route, request):
# 检查请求 URL 是否匹配任何拦截模式
request_url = request.url
matched_pattern = None
-
for pattern in api_patterns:
if url_matches_pattern(request_url, pattern):
matched_pattern = pattern
diff --git a/tests/test_playwright.py b/tests/test_playwright.py
new file mode 100644
index 0000000..5c579fa
--- /dev/null
+++ b/tests/test_playwright.py
@@ -0,0 +1,31 @@
+import re
+
+import asyncio
+import logging
+import sys
+import threading
+import time
+from typing import Optional
+from playwright.sync_api import sync_playwright
+
+# Manual smoke script: launches a headed Chromium, opens one listing page and
+# prints its title. Everything runs at import time (there is no test function).
+with sync_playwright() as playwright:
+ # Browser launch arguments
+ launch_args = []
+ launch_args.extend([
+ '--disable-blink-features=AutomationControlled',
+ '--disable-web-security',
+ '--disable-features=IsolateOrigins,site-per-process'
+ ])
+
+ # NOTE(review): executable_path is a machine-specific Windows path; the
+ # script cannot start on a machine without that Chromium install.
+ browser = playwright.chromium.launch(
+ executable_path=r'C:\Users\Administrator\AppData\Local\Chromium\Application\chrome.exe' ,
+ headless=False,
+ args=launch_args
+ )
+
+ page = browser.new_page()
+ page.goto('https://www.bayut.com/for-sale/off-plan/property/uae/')
+ print(page.title())
+
+ # NOTE(review): "aaa" is a tag selector that presumably matches nothing, so
+ # this click is expected to time out and raise before browser.close() runs -
+ # looks like a way to keep the window open for inspection; confirm or remove.
+ page.click("aaa")
+ browser.close()
\ No newline at end of file
diff --git a/tests/test_playwright2.py b/tests/test_playwright2.py
new file mode 100644
index 0000000..bd72b55
--- /dev/null
+++ b/tests/test_playwright2.py
@@ -0,0 +1,186 @@
+from playwright.sync_api import sync_playwright
+import time
+import random
+
+
+def check_detection(page):
+ """Report whether the page exposes the ``navigator.webdriver`` automation flag.
+
+ Evaluates a script in ``page`` that collects several fingerprint values
+ (webdriver flag, ``window.chrome`` presence, plugin count, languages,
+ permissions), prints them, and returns ``True`` only when
+ ``navigator.webdriver`` is strictly ``true``; otherwise ``False``.
+ The other collected values are printed but do not affect the result.
+ """
+ # The script below is sent to the browser verbatim; its comments are part of
+ # that runtime string and are therefore left untouched.
+ detection_indicators = page.evaluate("""
+ () => {
+ const indicators = {};
+
+ // 检查 navigator.webdriver
+ indicators.webdriver = navigator.webdriver === true;
+
+ // 检查 chrome 属性
+ indicators.hasChrome = typeof window.chrome !== 'undefined';
+ indicators.chromeRuntime = window.chrome && window.chrome.runtime;
+
+ // 检查插件数量
+ indicators.pluginsLength = navigator.plugins.length;
+
+ // 检查 languages
+ indicators.languages = navigator.languages;
+
+ // 检查 permissions
+ indicators.permissions = navigator.permissions;
+
+ return indicators;
+ }
+ """)
+
+ print("检测指标:", detection_indicators)
+
+ # Only the webdriver flag decides the outcome.
+ if detection_indicators.get('webdriver'):
+ print("⚠️ 被检测到自动化!")
+ return True
+ return False
+
+
+# Manual anti-detection experiment: launches a headed Chromium with a long list
+# of flags, patches common automation fingerprints via an init script, then
+# opens one listing page. Everything runs at import time.
+with sync_playwright() as p:
+ # 1. Launch the browser with extra command-line flags
+ # NOTE(review): executable_path is a machine-specific Windows path.
+ browser = p.chromium.launch(
+ executable_path=r'C:\Users\Administrator\AppData\Local\Chromium\Application\chrome.exe' ,
+
+ headless=False, # run headed (visible window)
+ args=[
+ '--disable-blink-features=AutomationControlled',
+ '--disable-dev-shm-usage',
+ '--no-sandbox',
+ '--disable-setuid-sandbox',
+ '--disable-web-security',
+ '--disable-features=IsolateOrigins,site-per-process',
+ '--disable-site-isolation-trials',
+ '--disable-background-timer-throttling',
+ '--disable-backgrounding-occluded-windows',
+ '--disable-renderer-backgrounding',
+ '--disable-back-forward-cache',
+ '--disable-component-update',
+ '--disable-domain-reliability',
+ '--disable-client-side-phishing-detection',
+ '--disable-sync',
+ '--metrics-recording-only',
+ '--disable-default-apps',
+ '--mute-audio',
+ '--no-default-browser-check',
+ '--no-first-run',
+ '--use-fake-device-for-media-stream',
+ '--use-fake-ui-for-media-stream',
+ '--autoplay-policy=no-user-gesture-required',
+ ]
+ )
+
+ # 2. Create the context with extra options
+ context = browser.new_context(
+ viewport={'width': 1920, 'height': 1080},
+ user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+ locale='zh-CN',
+ timezone_id='Asia/Shanghai',
+ permissions=['geolocation'],
+ # More realistic device settings
+ device_scale_factor=1,
+ has_touch=False,
+ is_mobile=False,
+ java_script_enabled=True,
+ )
+
+ # 3. Inject JavaScript that hides automation fingerprints.
+ # The script is a runtime string; its own comments are left untouched.
+ context.add_init_script("""
+ // 移除 webdriver 属性
+ Object.defineProperty(navigator, 'webdriver', {
+ get: () => undefined
+ });
+
+ // 修改 languages
+ Object.defineProperty(navigator, 'languages', {
+ get: () => ['zh-CN', 'zh', 'en-US', 'en']
+ });
+
+ // 修改 plugins
+ Object.defineProperty(navigator, 'plugins', {
+ get: () => [1, 2, 3, 4, 5]
+ });
+
+ // 覆盖 chrome 属性
+ window.chrome = {
+ runtime: {},
+ loadTimes: function() {},
+ csi: function() {},
+ app: {}
+ };
+
+ // 隐藏 permissions 中的 automation
+ const originalQuery = window.navigator.permissions.query;
+ window.navigator.permissions.query = (parameters) => (
+ parameters.name === 'notifications' ?
+ Promise.resolve({ state: Notification.permission }) :
+ originalQuery(parameters)
+ );
+
+ // 修改 navigator 属性
+ Object.defineProperty(navigator, 'platform', {
+ get: () => 'Win32'
+ });
+
+ // 添加一些真实浏览器才会有的属性
+ Object.defineProperty(document, 'hidden', {
+ get: () => false
+ });
+
+ Object.defineProperty(document, 'visibilityState', {
+ get: () => 'visible'
+ });
+ """)
+
+ page = context.new_page()
+
+ # Runs before any navigation, i.e. against the freshly created blank page.
+ check_detection(page)
+
+ # 4. Visit the page
+ page.goto('https://www.bayut.com/for-sale/off-plan/property/uae/', wait_until='networkidle')
+
+ # 5. Randomised mouse movement (disabled)
+ # NOTE(review): simulate_human_mouse_movement / simulate_human_typing are
+ # defined AFTER this with-block, so enabling the calls below as-is would
+ # raise NameError.
+ # simulate_human_mouse_movement(page)
+
+ # 6. Simulated human typing (disabled)
+ # if page.is_visible('input'):
+ # simulate_human_typing(page, 'input', '搜索内容')
+
+ # Keep the page open for a while
+ time.sleep(10)
+
+ # NOTE(review): "aaaa" presumably matches no element, so this click is
+ # expected to time out and raise before browser.close() runs - confirm intent.
+ page.click("aaaa")
+ browser.close()
+
+def simulate_human_mouse_movement(page):
+    """Move the mouse of ``page`` through a short series of random points.
+
+    First jumps to a random point with both coordinates in 100..500, then
+    performs 3 to 7 further moves to random points (x in 0..1000, y in
+    0..800), each interpolated over 20 to 50 steps and followed by a pause
+    of 0.1 to 0.3 seconds.
+    """
+    pointer = page.mouse
+
+    # Initial jump to a random position.
+    start_x = random.randint(100, 500)
+    start_y = random.randint(100, 500)
+    pointer.move(start_x, start_y)
+
+    # Wander around randomly.
+    remaining = random.randint(3, 7)
+    while remaining > 0:
+        target_x = random.randint(0, 1000)
+        target_y = random.randint(0, 800)
+        # More steps make the motion look more human-like.
+        step_count = random.randint(20, 50)
+        pointer.move(target_x, target_y, steps=step_count)
+        time.sleep(random.uniform(0.1, 0.3))
+        remaining -= 1
+
+
+def simulate_human_typing(page, selector, text):
+    """Type ``text`` into the element matching ``selector`` like a person would.
+
+    Clicks the element, pauses 0.5 to 1.5 seconds, then types one character at
+    a time with a random 0.05 to 0.2 second delay. Afterwards, with a 30%
+    chance, deletes the last 1 to 3 characters and retypes exactly those
+    characters, so the field always ends up containing ``text``.
+
+    Args:
+        page: Playwright page (needs ``click`` and ``keyboard.type``/``press``).
+        selector: Selector of the input element to click and type into.
+        text: The text to type; an empty string types nothing.
+    """
+    page.click(selector)
+    time.sleep(random.uniform(0.5, 1.5))
+
+    for char in text:
+        page.keyboard.type(char)
+        time.sleep(random.uniform(0.05, 0.2))  # random delay between keystrokes
+
+    # Occasionally delete the tail and retype it. The same count is used for
+    # both halves (and never exceeds len(text)); drawing two independent random
+    # counts would leave the field with missing or duplicated characters.
+    if text and random.random() < 0.3:
+        redo_count = random.randint(1, min(3, len(text)))
+        for _ in range(redo_count):
+            page.keyboard.press('Backspace')
+            time.sleep(random.uniform(0.1, 0.3))
+        page.keyboard.type(text[-redo_count:])
\ No newline at end of file
diff --git a/tests/test_playwright_async.py b/tests/test_playwright_async.py
new file mode 100644
index 0000000..65c8d6a
--- /dev/null
+++ b/tests/test_playwright_async.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Minimal asynchronous Playwright demo.
+What it does: launch a browser, open a page, take a screenshot, close the browser.
+"""
+
+import asyncio
+from playwright.async_api import async_playwright
+
+
+async def simplest_demo():
+ """
+ Minimal asynchronous Playwright example.
+
+ Launches a headed Chromium, opens one page, prints its title, saves a
+ screenshot to ``baidu_screenshot.png`` in the current directory, waits
+ five seconds and closes the browser.
+ """
+ print("🚀 开始最简单的 Playwright Demo")
+
+ # 1. Create the Playwright instance
+ async with async_playwright() as p:
+ # 2. Launch the browser (Chromium)
+ browser = await p.chromium.launch(
+ headless=False , # show the browser window
+ args=[
+ '--disable-blink-features=AutomationControlled',
+ '--disable-web-security',
+ '--disable-features=IsolateOrigins,site-per-process'
+ ]
+ )
+
+ # 3. Create a browser context
+ context = await browser.new_context()
+
+ # 4. Create a new page
+ page = await context.new_page()
+
+ # 5. Navigate to the page
+ # NOTE(review): the printed messages and the screenshot file name still
+ # refer to Baidu, but the URL opened below is a bayut.com listing page.
+ print("🌐 正在访问百度...")
+ await page.goto('https://www.bayut.com/for-sale/off-plan/property/uae/')
+
+ # 6. Read the page title
+ title = await page.title()
+ print(f"📄 页面标题: {title}")
+
+ # 7. Take a screenshot
+ await page.screenshot(path='baidu_screenshot.png')
+ print("📸 截图已保存: baidu_screenshot.png")
+
+ # 8. Wait a few seconds so the result can be inspected
+ print("⏳ 等待 5 秒...")
+ await asyncio.sleep(5)
+
+ # 9. Close the browser
+ await browser.close()
+
+ print("✅ Demo 完成!")
+
+
+# Run the async demo. This executes at import time: there is no
+# `if __name__ == "__main__"` guard around it.
+asyncio.run(simplest_demo())
\ No newline at end of file