From c67acf39ea993471b3705489b8c20f543c689b2a Mon Sep 17 00:00:00 2001 From: liuyan Date: Thu, 6 Mar 2025 15:50:15 +0800 Subject: [PATCH] feat: Add Bing search and URL crawler support - Extend search capabilities to include Bing search engine - Implement URL content crawler with customizable selectors - Update package.json with new scripts and bin entries for Bing search and URL crawler - Modify MCP server to support Bing search and URL crawling tools - Update README and README.zh-CN with new features and usage instructions - Remove unused @types/pino dependency --- README.md | 87 ++++++++---- README.zh-CN.md | 106 ++++++++++----- bin/bing-search | 3 + bin/bing-search-mcp | 3 + bin/bing-search-mcp.cmd | 3 + bin/bing-search.cmd | 3 + bin/search-mcp | 3 + bin/search-mcp.cmd | 3 + bin/url-crawler | 21 +++ package.json | 21 ++- pnpm-lock.yaml | 11 -- src/bing-index.ts | 39 ++++++ src/index.ts | 60 +++++++- src/mcp-server.ts | 295 +++++++++++++++++++++++++++++++++++++++- src/search.ts | 274 ++++++++++++++++++++++++++++++++----- src/url-crawler-test.ts | 197 +++++++++++++++++++++++++++ 16 files changed, 1015 insertions(+), 114 deletions(-) create mode 100755 bin/bing-search create mode 100755 bin/bing-search-mcp create mode 100644 bin/bing-search-mcp.cmd create mode 100644 bin/bing-search.cmd create mode 100755 bin/search-mcp create mode 100644 bin/search-mcp.cmd create mode 100755 bin/url-crawler create mode 100644 src/bing-index.ts create mode 100644 src/url-crawler-test.ts diff --git a/README.md b/README.md index ff15ed4..f6b009f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# Google Search Tool +# Search Tool -A Playwright-based Node.js tool that bypasses search engine anti-scraping mechanisms to execute Google searches and extract results. It can be used directly as a command-line tool or as a Model Context Protocol (MCP) server to provide real-time search capabilities to AI assistants like Claude. +A Playwright-based Node.js tool that bypasses search engine anti-scraping mechanisms to execute Google and Bing searches and extract results. It can be used directly as a command-line tool or as a Model Context Protocol (MCP) server to provide real-time search capabilities to AI assistants like Claude. [![Star History Chart](https://api.star-history.com/svg?repos=web-agent-master/google-search&type=Date)](https://star-history.com/#web-agent-master/google-search&Date) @@ -9,6 +9,8 @@ A Playwright-based Node.js tool that bypasses search engine anti-scraping mechan ## Key Features - **Local SERP API Alternative**: No need to rely on paid search engine results API services, all searches are executed locally +- **Multiple Search Engines Support**: Currently supports Google and Bing search engines +- **URL Content Crawler**: Extract content from any web page with customizable selectors and metadata extraction - **Advanced Anti-Bot Detection Bypass Techniques**: - Intelligent browser fingerprint management that simulates real user behavior - Automatic saving and restoration of browser state to reduce verification frequency @@ -24,6 +26,7 @@ A Playwright-based Node.js tool that bypasses search engine anti-scraping mechan - Command-line parameter support for search keywords - MCP server support for AI assistant integration - Returns search results with title, link, and snippet +- URL crawler with customizable content extraction and metadata support - JSON format output - Support for both headless and headed modes (for debugging) - Detailed logging output @@ -69,35 +72,52 @@ This tool has been specially adapted for Windows environments: ## Usage -### Command Line Tool +### Command Line ```bash -# Direct command line usage -google-search "search keywords" - -# Using command line options -google-search --limit 5 --timeout 60000 --no-headless "search keywords" +# Google search +npx google-search "your search query" +# Or with options +npx google-search --limit 5 "your search query" + +# Bing search +npx bing-search "your search query" +# Or with options +npx bing-search --limit 5 "your search query" + +# URL crawler +npx url-crawler "https://example.com" +# Or with options +npx url-crawler -s "article.main-content" -w "div.loaded-content" -t 30000 "https://example.com" +``` -# Or using npx -npx google-search-cli "search keywords" +You can also use the subcommands: -# Run in development mode -pnpm dev "search keywords" +```bash +# Google search +npx google-search google "your search query" -# Run in debug mode (showing browser interface) -pnpm debug "search keywords" +# Bing search +npx google-search bing "your search query" ``` -#### Command Line Options +### Options -- `-l, --limit `: Result count limit (default: 10) -- `-t, --timeout `: Timeout in milliseconds (default: 60000) -- `--no-headless`: Show browser interface (for debugging) -- `--remote-debugging-port `: Enable remote debugging port (default: 9222) -- `--state-file `: Browser state file path (default: ./browser-state.json) +#### Search Options +- `--limit `: Limit the number of results (default: 10) +- `--timeout `: Set timeout in milliseconds (default: 30000) +- `--state-file `: Specify browser state file path (default: ./browser-state.json) +- `--no-save-state`: Don't save browser state +- `--locale `: Specify search result language (default: zh-CN) + +#### URL Crawler Options +- `-s, --selector `: CSS selector to extract specific content +- `-w, --wait-for `: Wait for specified element to appear before extracting content +- `-t, --timeout `: Timeout in milliseconds (default: 30000) +- `--no-metadata`: Don't extract metadata +- `--no-headless`: Run browser in headed mode - `--no-save-state`: Don't save browser state -- `-V, --version`: Display version number -- `-h, --help`: Display help information +- `--state-file `: Specify browser state file path (default: ~/.url-crawler-browser-state.json) #### Output Example @@ -125,15 +145,34 @@ pnpm debug "search keywords" } ``` +#### URL Crawler Output Example + +```json +{ + "url": "https://example.com", + "title": "Example Domain", + "content": "Example Domain\n\nThis domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.\n\nMore information...", + "metadata": { + "viewport": "width=device-width, initial-scale=1" + }, + "timestamp": "2025-03-06T07:44:05.698Z" +} +``` + ### MCP Server This project provides Model Context Protocol (MCP) server functionality, allowing AI assistants like Claude to directly use Google search capabilities. MCP is an open protocol that enables AI assistants to safely access external tools and data. ```bash -# Build the project -pnpm build +# Start the MCP server +npx google-search-mcp ``` +The MCP server provides three tools: +- `google-search`: For Google search +- `bing-search`: For Bing search +- `url-crawler`: For crawling and extracting content from URLs + #### Integration with Claude Desktop 1. Edit the Claude Desktop configuration file: diff --git a/README.zh-CN.md b/README.zh-CN.md index 9c2debb..acfc1c9 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -1,19 +1,23 @@ -# Google 搜索工具 +# 搜索工具 -这是一个基于 Playwright 的 Node.js 工具,能够绕过搜索引擎的反爬虫机制,执行 Google 搜索并提取结果。它可作为命令行工具直接使用,或通过 Model Context Protocol (MCP) 服务器为 Claude 等 AI 助手提供实时搜索能力。 +一个基于 Playwright 的 Node.js 工具,能够绕过搜索引擎的反爬虫机制,执行 Google 和 Bing 搜索并提取结果。它可以直接作为命令行工具使用,也可以作为 Model Context Protocol (MCP) 服务器为 Claude 等 AI 助手提供实时搜索能力。 [![Star History Chart](https://api.star-history.com/svg?repos=web-agent-master/google-search&type=Date)](https://star-history.com/#web-agent-master/google-search&Date) -## 核心亮点 +[English Documentation](README.md) -- **本地化 SERP API 替代方案**:无需依赖付费的搜索引擎结果 API 服务,完全在本地执行搜索操作 +## 主要特点 + +- **本地 SERP API 替代方案**:无需依赖付费的搜索引擎结果 API 服务,所有搜索在本地执行 +- **多搜索引擎支持**:目前支持 Google 和 Bing 搜索引擎 +- **URL 内容爬取器**:可提取任何网页内容,支持自定义选择器和元数据提取 - **先进的反机器人检测绕过技术**: - 智能浏览器指纹管理,模拟真实用户行为 - 自动保存和恢复浏览器状态,减少验证频率 - - 无头/有头模式智能切换,遇到验证时自动转为有头模式让用户完成验证 - - 多种设备和区域设置随机化,降低被检测风险 -- **MCP 服务器集成**:为 Claude 等 AI 助手提供实时搜索能力,无需额外 API 密钥 -- **完全开源免费**:所有代码开源,无使用限制,可自由定制和扩展 + - 智能无头/有头模式切换,在需要验证时自动切换到有头模式 + - 设备和区域设置的随机化,降低检测风险 +- **MCP 服务器集成**:为 Claude 等 AI 助手提供实时搜索能力,无需额外的 API 密钥 +- **完全开源和免费**:所有代码开源,无使用限制,可自由定制和扩展 ## 技术特性 @@ -22,6 +26,7 @@ - 支持命令行参数输入搜索关键词 - 支持作为 MCP 服务器,为 Claude 等 AI 助手提供搜索能力 - 返回搜索结果的标题、链接和摘要 +- URL 爬取器支持自定义内容提取和元数据支持 - 以 JSON 格式输出结果 - 支持无头模式和有头模式(调试用) - 提供详细的日志输出 @@ -67,36 +72,52 @@ pnpm link ## 使用方法 -### 命令行工具 +### 命令行 ```bash -# 直接使用命令行 -google-search "搜索关键词" - -# 使用命令行选项 -google-search --limit 5 --timeout 60000 --no-headless "搜索关键词" - +# Google 搜索 +npx google-search "你的搜索查询" +# 或者带选项 +npx google-search --limit 5 "你的搜索查询" + +# Bing 搜索 +npx bing-search "你的搜索查询" +# 或者带选项 +npx bing-search --limit 5 "你的搜索查询" + +# URL 爬取器 +npx url-crawler "https://example.com" +# 或者带选项 +npx url-crawler -s "article.main-content" -w "div.loaded-content" -t 30000 "https://example.com" +``` -# 或者使用 npx -npx google-search-cli "搜索关键词" +你也可以使用子命令: -# 开发模式运行 -pnpm dev "搜索关键词" +```bash +# Google 搜索 +npx google-search google "你的搜索查询" -# 调试模式运行(显示浏览器界面) -pnpm debug "搜索关键词" +# Bing 搜索 +npx google-search bing "你的搜索查询" ``` -#### 命令行选项 +### 选项 + +#### 搜索选项 +- `--limit `:限制结果数量(默认:10) +- `--timeout `:设置超时时间(毫秒)(默认:30000) +- `--state-file `:指定浏览器状态文件路径(默认:./browser-state.json) +- `--no-save-state`:不保存浏览器状态 +- `--locale `:指定搜索结果语言(默认:zh-CN) -- `-l, --limit `: 结果数量限制(默认:10) -- `-t, --timeout `: 超时时间(毫秒,默认:60000) -- `--no-headless`: 显示浏览器界面(调试用) -- `--remote-debugging-port `: 启用远程调试端口(默认:9222) -- `--state-file `: 浏览器状态文件路径(默认:./browser-state.json) -- `--no-save-state`: 不保存浏览器状态 -- `-V, --version`: 显示版本号 -- `-h, --help`: 显示帮助信息 +#### URL 爬取器选项 +- `-s, --selector `:CSS选择器,用于提取特定内容 +- `-w, --wait-for `:等待指定元素出现后再提取内容 +- `-t, --timeout `:超时时间(毫秒)(默认:30000) +- `--no-metadata`:不提取元数据 +- `--no-headless`:使用有头模式运行浏览器 +- `--no-save-state`:不保存浏览器状态 +- `--state-file `:指定浏览器状态文件路径(默认:~/.url-crawler-browser-state.json) #### 输出示例 @@ -124,15 +145,32 @@ pnpm debug "搜索关键词" } ``` -### MCP 服务器 +#### URL 爬取器输出示例 -本项目提供 Model Context Protocol (MCP) 服务器功能,让 Claude 等 AI 助手直接使用 Google 搜索能力。MCP 是一个开放协议,使 AI 助手能安全访问外部工具和数据。 +```json +{ + "url": "https://example.com", + "title": "Example Domain", + "content": "Example Domain\n\nThis domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.\n\nMore information...", + "metadata": { + "viewport": "width=device-width, initial-scale=1" + }, + "timestamp": "2025-03-06T07:44:05.698Z" +} +``` + +### MCP 服务器 ```bash -# 构建项目 -pnpm build +# 启动 MCP 服务器 +npx google-search-mcp ``` +MCP 服务器提供两个工具: +- `google-search`:用于 Google 搜索 +- `bing-search`:用于 Bing 搜索 +- `url-crawler`:用于爬取和提取URL内容 + #### 与 Claude Desktop 集成 1. 编辑 Claude Desktop 配置文件 diff --git a/bin/bing-search b/bin/bing-search new file mode 100755 index 0000000..1a09234 --- /dev/null +++ b/bin/bing-search @@ -0,0 +1,3 @@ +#!/usr/bin/env node + +import '../dist/src/bing-index.js'; \ No newline at end of file diff --git a/bin/bing-search-mcp b/bin/bing-search-mcp new file mode 100755 index 0000000..71dcc3d --- /dev/null +++ b/bin/bing-search-mcp @@ -0,0 +1,3 @@ +#!/usr/bin/env node + +import '../dist/src/mcp-server.js'; \ No newline at end of file diff --git a/bin/bing-search-mcp.cmd b/bin/bing-search-mcp.cmd new file mode 100644 index 0000000..6e78ce3 --- /dev/null +++ b/bin/bing-search-mcp.cmd @@ -0,0 +1,3 @@ +@IF EXIST "%~dp0\node.exe" ( + "%~dp0\node.exe" "%~dp0\bing-search-mcp" %* +) \ No newline at end of file diff --git a/bin/bing-search.cmd b/bin/bing-search.cmd new file mode 100644 index 0000000..9715aab --- /dev/null +++ b/bin/bing-search.cmd @@ -0,0 +1,3 @@ +@IF EXIST "%~dp0\node.exe" ( + "%~dp0\node.exe" "%~dp0\bing-search" %* +) \ No newline at end of file diff --git a/bin/search-mcp b/bin/search-mcp new file mode 100755 index 0000000..71dcc3d --- /dev/null +++ b/bin/search-mcp @@ -0,0 +1,3 @@ +#!/usr/bin/env node + +import '../dist/src/mcp-server.js'; \ No newline at end of file diff --git a/bin/search-mcp.cmd b/bin/search-mcp.cmd new file mode 100644 index 0000000..159316e --- /dev/null +++ b/bin/search-mcp.cmd @@ -0,0 +1,3 @@ +@IF EXIST "%~dp0\node.exe" ( + "%~dp0\node.exe" "%~dp0\search-mcp" %* +) \ No newline at end of file diff --git a/bin/url-crawler b/bin/url-crawler new file mode 100755 index 0000000..b4fa513 --- /dev/null +++ b/bin/url-crawler @@ -0,0 +1,21 @@ +#!/usr/bin/env node + +// 检查是否在开发环境中运行 +import { fileURLToPath } from 'url'; +import { dirname, resolve } from 'path'; +import { existsSync } from 'fs'; +import { createRequire } from 'module'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); +const isDevMode = process.env.NODE_ENV === 'development' || !existsSync(resolve(__dirname, '../dist')); + +if (isDevMode) { + // 开发模式:使用ts-node运行TypeScript源文件 + const require = createRequire(import.meta.url); + require('ts-node').register(); + await import('../src/url-crawler-test.js'); +} else { + // 生产模式:运行编译后的JavaScript文件 + await import('../dist/src/url-crawler-test.js'); +} \ No newline at end of file diff --git a/package.json b/package.json index b1f9679..f815c01 100644 --- a/package.json +++ b/package.json @@ -1,13 +1,17 @@ { - "name": "google-search-cli", + "name": "search-cli", "version": "1.0.0", - "description": "基于 Playwright 的 Google 搜索 CLI 工具", + "description": "基于 Playwright 的 Google 和 Bing 搜索 CLI 工具", "type": "module", "main": "dist/index.js", "types": "dist/index.d.ts", "bin": { "google-search": "./bin/google-search", - "google-search-mcp": "./bin/google-search-mcp" + "google-search-mcp": "./bin/google-search-mcp", + "bing-search": "./bin/bing-search", + "bing-search-mcp": "./bin/bing-search-mcp", + "search-mcp": "./bin/search-mcp", + "url-crawler": "./bin/url-crawler" }, "scripts": { "build": "tsc", @@ -21,7 +25,13 @@ "link": "npm link", "clean": "node -e \"const fs = require('fs'); const path = require('path'); if (fs.existsSync('dist')) fs.rmSync('dist', { recursive: true, force: true });\"", "mcp": "ts-node src/mcp-server.ts", - "mcp:build": "npm run build && node dist/src/mcp-server.js" + "mcp:build": "npm run build && node dist/src/mcp-server.js", + "bing": "ts-node src/bing-index.ts", + "bing:build": "npm run build && node dist/src/bing-index.js", + "bing:test": "ts-node src/bing-index.ts \"playwright typescript\"", + "bing:debug": "ts-node src/bing-index.ts --no-headless \"playwright typescript\"", + "url-crawler": "ts-node src/url-crawler-test.ts", + "url-crawler:build": "npm run build && node dist/src/url-crawler-test.js" }, "repository": { "type": "git", @@ -57,5 +67,6 @@ }, "engines": { "node": ">=16.0.0" - } + }, + "packageManager": "yarn@1.22.22+sha1.ac34549e6aa8e7ead463a7407e1c7390f61a6610" } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c62c6d6..ee576bb 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -30,9 +30,6 @@ importers: '@types/node': specifier: ^22.13.9 version: 22.13.9 - '@types/pino': - specifier: ^7.0.5 - version: 7.0.5 ts-node: specifier: ^10.9.2 version: 10.9.2(@types/node@22.13.9)(typescript@5.8.2) @@ -75,10 +72,6 @@ packages: '@types/node@22.13.9': resolution: {integrity: sha512-acBjXdRJ3A6Pb3tqnw9HZmyR3Fiol3aGxRCK1x3d+6CDAMjl7I649wpSd+yNURCjbOUGu9tqtLKnTGxmK6CyGw==} - '@types/pino@7.0.5': - resolution: {integrity: sha512-wKoab31pknvILkxAF8ss+v9iNyhw5Iu/0jLtRkUD74cNfOOLJNnqfFKAv0r7wVaTQxRZtWrMpGfShwwBjOcgcg==} - deprecated: This is a stub types definition. pino provides its own type definitions, so you do not need this installed. - accepts@2.0.0: resolution: {integrity: sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==} engines: {node: '>= 0.6'} @@ -626,10 +619,6 @@ snapshots: dependencies: undici-types: 6.20.0 - '@types/pino@7.0.5': - dependencies: - pino: 9.6.0 - accepts@2.0.0: dependencies: mime-types: 3.0.0 diff --git a/src/bing-index.ts b/src/bing-index.ts new file mode 100644 index 0000000..67cd5a2 --- /dev/null +++ b/src/bing-index.ts @@ -0,0 +1,39 @@ +#!/usr/bin/env node + +import { Command } from "commander"; +import { bingSearch } from "./search.js"; +import { CommandOptions } from "./types.js"; + +// 获取包信息 +import packageJson from "../package.json" with { type: "json" }; + +// 创建命令行程序 +const program = new Command(); + +// 配置命令行选项 +program + .name("bing-search") + .description("基于 Playwright 的 Bing 搜索 CLI 工具") + .version(packageJson.version) + .argument("", "搜索关键词") + .option("-l, --limit ", "结果数量限制", parseInt, 10) + .option("-t, --timeout ", "超时时间(毫秒)", parseInt, 30000) + .option("--no-headless", "已废弃: 现在总是先尝试无头模式,如果遇到人机验证会自动切换到有头模式") + .option("--state-file ", "浏览器状态文件路径", "./browser-state.json") + .option("--no-save-state", "不保存浏览器状态") + .option("--locale ", "搜索结果语言", "zh-CN") + .action(async (query: string, options: CommandOptions) => { + try { + // 执行搜索 + const results = await bingSearch(query, options); + + // 输出结果 + console.log(JSON.stringify(results, null, 2)); + } catch (error) { + console.error("错误:", error); + process.exit(1); + } + }); + +// 解析命令行参数 +program.parse(process.argv); \ No newline at end of file diff --git a/src/index.ts b/src/index.ts index 605650a..c7f1791 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,7 +1,7 @@ #!/usr/bin/env node import { Command } from "commander"; -import { googleSearch } from "./search.js"; +import { googleSearch, bingSearch } from "./search.js"; import { CommandOptions } from "./types.js"; // 获取包信息 @@ -12,15 +12,67 @@ const program = new Command(); // 配置命令行选项 program - .name("google-search") - .description("基于 Playwright 的 Google 搜索 CLI 工具") - .version(packageJson.version) + .name("search-cli") + .description("基于 Playwright 的 Google 和 Bing 搜索 CLI 工具") + .version(packageJson.version); + +// Google 搜索命令 +program + .command("google") + .description("使用 Google 搜索") + .argument("", "搜索关键词") + .option("-l, --limit ", "结果数量限制", parseInt, 10) + .option("-t, --timeout ", "超时时间(毫秒)", parseInt, 30000) + .option("--no-headless", "已废弃: 现在总是先尝试无头模式,如果遇到人机验证会自动切换到有头模式") + .option("--state-file ", "浏览器状态文件路径", "./browser-state.json") + .option("--no-save-state", "不保存浏览器状态") + .option("--locale ", "搜索结果语言", "zh-CN") + .action(async (query: string, options: CommandOptions) => { + try { + // 执行搜索 + const results = await googleSearch(query, options); + + // 输出结果 + console.log(JSON.stringify(results, null, 2)); + } catch (error) { + console.error("错误:", error); + process.exit(1); + } + }); + +// Bing 搜索命令 +program + .command("bing") + .description("使用 Bing 搜索") + .argument("", "搜索关键词") + .option("-l, --limit ", "结果数量限制", parseInt, 10) + .option("-t, --timeout ", "超时时间(毫秒)", parseInt, 30000) + .option("--no-headless", "已废弃: 现在总是先尝试无头模式,如果遇到人机验证会自动切换到有头模式") + .option("--state-file ", "浏览器状态文件路径", "./browser-state.json") + .option("--no-save-state", "不保存浏览器状态") + .option("--locale ", "搜索结果语言", "zh-CN") + .action(async (query: string, options: CommandOptions) => { + try { + // 执行搜索 + const results = await bingSearch(query, options); + + // 输出结果 + console.log(JSON.stringify(results, null, 2)); + } catch (error) { + console.error("错误:", error); + process.exit(1); + } + }); + +// 默认命令(向后兼容) +program .argument("", "搜索关键词") .option("-l, --limit ", "结果数量限制", parseInt, 10) .option("-t, --timeout ", "超时时间(毫秒)", parseInt, 30000) .option("--no-headless", "已废弃: 现在总是先尝试无头模式,如果遇到人机验证会自动切换到有头模式") .option("--state-file ", "浏览器状态文件路径", "./browser-state.json") .option("--no-save-state", "不保存浏览器状态") + .option("--locale ", "搜索结果语言", "zh-CN") .action(async (query: string, options: CommandOptions) => { try { // 执行搜索 diff --git a/src/mcp-server.ts b/src/mcp-server.ts index cef640c..13e3073 100644 --- a/src/mcp-server.ts +++ b/src/mcp-server.ts @@ -3,7 +3,7 @@ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; import { z } from "zod"; -import { googleSearch } from "./search.js"; +import { googleSearch, bingSearch } from "./search.js"; import * as os from "os"; import * as path from "path"; import * as fs from "fs"; @@ -15,7 +15,7 @@ let globalBrowser: Browser | undefined = undefined; // 创建MCP服务器实例 const server = new McpServer({ - name: "google-search-server", + name: "search-server", version: "1.0.0", }); @@ -105,6 +105,297 @@ server.tool( } ); +// 注册Bing搜索工具 +server.tool( + "bing-search", + "使用Bing搜索引擎查询实时网络信息,返回包含标题、链接和摘要的搜索结果。适用于需要获取最新信息、查找特定主题资料、研究当前事件或验证事实的场景。结果以JSON格式返回,包含查询内容和匹配结果列表。", + { + query: z + .string() + .describe( + "搜索查询字符串。为获得最佳结果:1)使用具体关键词而非模糊短语;2)可使用引号\"精确短语\"强制匹配;3)使用site:域名限定特定网站;4)使用-排除词过滤结果;5)使用OR连接备选词;6)优先使用专业术语;7)控制在2-5个关键词以获得平衡结果。例如:'气候变化 研究报告 2024 site:gov -观点' 或 '\"机器学习算法\" 教程 (Python OR Julia)'" + ), + limit: z + .number() + .optional() + .describe("返回的搜索结果数量 (默认: 10,建议范围: 1-20)"), + timeout: z + .number() + .optional() + .describe("搜索操作的超时时间(毫秒) (默认: 30000,可根据网络状况调整)"), + }, + async (params) => { + try { + const { query, limit, timeout } = params; + logger.info({ query }, "执行Bing搜索"); + + // 获取用户主目录下的状态文件路径 + const stateFilePath = path.join( + os.homedir(), + ".bing-search-browser-state.json" + ); + logger.info({ stateFilePath }, "使用状态文件路径"); + + // 检查状态文件是否存在 + const stateFileExists = fs.existsSync(stateFilePath); + + // 初始化警告消息 + let warningMessage = ""; + + if (!stateFileExists) { + warningMessage = + "⚠️ 注意:浏览器状态文件不存在。首次使用时,如果遇到人机验证,系统会自动切换到有头模式让您完成验证。完成后,系统会保存状态文件,后续搜索将更加顺畅。"; + logger.warn(warningMessage); + } + + // 使用全局浏览器实例执行搜索 + const results = await bingSearch( + query, + { + limit: limit, + timeout: timeout, + stateFile: stateFilePath, + }, + globalBrowser + ); + + // 构建返回结果,包含警告信息 + let responseText = JSON.stringify(results, null, 2); + if (warningMessage) { + responseText = warningMessage + "\n\n" + responseText; + } + + return { + content: [ + { + type: "text", + text: responseText, + }, + ], + }; + } catch (error) { + logger.error({ error }, "搜索工具执行错误"); + + return { + isError: true, + content: [ + { + type: "text", + text: `搜索失败: ${ + error instanceof Error ? error.message : String(error) + }`, + }, + ], + }; + } + } +); + +// 注册URL爬取工具 +server.tool( + "url-crawler", + "爬取指定URL的网页内容,返回页面标题、正文内容和元数据。适用于需要获取特定网页详细内容、提取文章正文、分析网页结构或获取网页元数据的场景。结果以JSON格式返回,包含URL和提取的内容。", + { + url: z + .string() + .describe( + "要爬取的完整URL地址,必须包含协议前缀(http://或https://)。例如:'https://example.com/article/123'" + ), + selector: z + .string() + .optional() + .describe( + "可选的CSS选择器,用于提取页面特定部分内容。不提供时将提取整个页面内容。例如:'article.main-content'或'div.news-body'" + ), + timeout: z + .number() + .optional() + .describe("爬取操作的超时时间(毫秒) (默认: 30000,可根据网络状况调整)"), + waitForSelector: z + .string() + .optional() + .describe( + "可选的CSS选择器,指定在提取内容前等待该元素出现。适用于动态加载内容的网页。例如:'div.loaded-content'" + ), + extractMetadata: z + .boolean() + .optional() + .describe( + "是否提取页面元数据,如Open Graph标签、Twitter卡片信息等 (默认: true)" + ), + }, + async (params) => { + try { + const { url, selector, timeout = 30000, waitForSelector, extractMetadata = true } = params; + logger.info({ url }, "执行URL爬取"); + + // 获取用户主目录下的状态文件路径 + const stateFilePath = path.join( + os.homedir(), + ".url-crawler-browser-state.json" + ); + logger.info({ stateFilePath }, "使用状态文件路径"); + + // 检查状态文件是否存在 + const stateFileExists = fs.existsSync(stateFilePath); + + // 初始化警告消息 + let warningMessage = ""; + + if (!stateFileExists) { + warningMessage = + "⚠️ 注意:浏览器状态文件不存在。首次使用时,如果遇到人机验证,系统会自动切换到有头模式让您完成验证。完成后,系统会保存状态文件,后续爬取将更加顺畅。"; + logger.warn(warningMessage); + } + + // 使用全局浏览器实例 + if (!globalBrowser) { + throw new Error("浏览器实例未初始化"); + } + + // 创建新的浏览器上下文 + const context = await globalBrowser.newContext({ + viewport: { width: 1280, height: 800 }, + userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + storageState: stateFileExists ? stateFilePath : undefined, + }); + + // 创建新页面 + const page = await context.newPage(); + + try { + // 设置超时 + page.setDefaultTimeout(timeout); + + // 导航到URL + await page.goto(url, { waitUntil: "domcontentloaded" }); + + // 如果指定了等待选择器,则等待该元素出现 + if (waitForSelector) { + await page.waitForSelector(waitForSelector, { timeout }); + } + + // 提取页面标题 + const title = await page.title(); + + // 提取页面内容 + let content = ""; + if (selector) { + // 如果提供了选择器,则提取特定元素的内容 + const elements = await page.$$(selector); + for (const element of elements) { + const text = await element.textContent(); + if (text) { + content += text.trim() + "\n"; + } + } + } else { + // 否则提取整个页面的文本内容 + content = await page.evaluate(() => { + // 移除脚本、样式和隐藏元素 + const scripts = Array.from(document.querySelectorAll('script, style, [style*="display:none"], [style*="display: none"]')); + scripts.forEach(s => s.remove()); + + // 获取正文内容 + return document.body.innerText; + }); + } + + // 提取元数据 + let metadata: Record = {}; + if (extractMetadata) { + metadata = await page.evaluate(() => { + const meta: Record = {}; + + // 提取所有meta标签 + const metaTags = document.querySelectorAll('meta'); + metaTags.forEach(tag => { + const name = tag.getAttribute('name') || tag.getAttribute('property'); + const content = tag.getAttribute('content'); + if (name && content) { + meta[name] = content; + } + }); + + // 提取Open Graph标签 + const ogTags = document.querySelectorAll('meta[property^="og:"]'); + ogTags.forEach(tag => { + const property = tag.getAttribute('property'); + const content = tag.getAttribute('content'); + if (property && content) { + meta[property] = content; + } + }); + + // 提取Twitter卡片信息 + const twitterTags = document.querySelectorAll('meta[name^="twitter:"]'); + twitterTags.forEach(tag => { + const name = tag.getAttribute('name'); + const content = tag.getAttribute('content'); + if (name && content) { + meta[name] = content; + } + }); + + return meta; + }); + } + + // 如果启用了状态保存,保存浏览器状态 + if (!stateFileExists) { + await context.storageState({ path: stateFilePath }); + logger.info("已保存浏览器状态到文件"); + } + + // 构建结果 + const result = { + url, + title, + content: content.trim(), + metadata, + timestamp: new Date().toISOString(), + }; + + // 关闭上下文 + await context.close(); + + // 构建返回结果,包含警告信息 + let responseText = JSON.stringify(result, null, 2); + if (warningMessage) { + responseText = warningMessage + "\n\n" + responseText; + } + + return { + content: [ + { + type: "text", + text: responseText, + }, + ], + }; + } catch (error) { + // 确保关闭上下文 + await context.close(); + throw error; + } + } catch (error) { + logger.error({ error }, "URL爬取工具执行错误"); + + return { + isError: true, + content: [ + { + type: "text", + text: `URL爬取失败: ${ + error instanceof Error ? error.message : String(error) + }`, + }, + ], + }; + } + } +); + // 启动服务器 async function main() { try { diff --git a/src/search.ts b/src/search.ts index e0f8f70..32e1cf7 100644 --- a/src/search.ts +++ b/src/search.ts @@ -1,5 +1,5 @@ -import { chromium, devices, BrowserContextOptions, Browser } from "playwright"; -import { SearchResponse, SearchResult, CommandOptions } from "./types.js"; +import { Browser, BrowserContextOptions, chromium, devices } from "playwright"; +import { CommandOptions, SearchResponse, SearchResult } from "./types.js"; import * as fs from "fs"; import * as path from "path"; import * as os from "os"; @@ -62,8 +62,9 @@ function getHostMachineConfig(userLocale?: string): FingerprintConfig { // Node.js 无法直接获取系统颜色方案,使用合理的默认值 // 可以根据时间推断:晚上使用深色模式,白天使用浅色模式 const hour = new Date().getHours(); - const colorScheme = - hour >= 19 || hour < 7 ? ("dark" as const) : ("light" as const); + const colorScheme = hour >= 19 || hour < 7 + ? ("dark" as const) + : ("light" as const); // 其他设置使用合理的默认值 const reducedMotion = "no-preference" as const; // 大多数用户不会启用减少动画 @@ -107,7 +108,7 @@ function getHostMachineConfig(userLocale?: string): FingerprintConfig { export async function googleSearch( query: string, options: CommandOptions = {}, - existingBrowser?: Browser + existingBrowser?: Browser, ): Promise { // 设置默认选项 const { @@ -133,7 +134,7 @@ export async function googleSearch( if (fs.existsSync(stateFile)) { logger.info( { stateFile }, - "发现浏览器状态文件,将使用保存的浏览器状态以避免反机器人检测" + "发现浏览器状态文件,将使用保存的浏览器状态以避免反机器人检测", ); storageState = stateFile; @@ -150,7 +151,7 @@ export async function googleSearch( } else { logger.info( { stateFile }, - "未找到浏览器状态文件,将创建新的浏览器会话和指纹" + "未找到浏览器状态文件,将创建新的浏览器会话和指纹", ); } @@ -215,7 +216,7 @@ export async function googleSearch( } else { logger.info( { headless }, - `准备以${headless ? "无头" : "有头"}模式启动浏览器...` + `准备以${headless ? "无头" : "有头"}模式启动浏览器...`, ); // 初始化浏览器,添加更多参数以避免检测 @@ -282,7 +283,7 @@ export async function googleSearch( if (hostConfig.deviceName !== deviceName) { logger.info( { deviceType: hostConfig.deviceName }, - "根据宿主机器设置使用设备类型" + "根据宿主机器设置使用设备类型", ); // 使用新的设备配置 contextOptions = { ...devices[hostConfig.deviceName] }; @@ -306,7 +307,7 @@ export async function googleSearch( colorScheme: hostConfig.colorScheme, deviceType: hostConfig.deviceName, }, - "已根据宿主机器生成新的浏览器指纹配置" + "已根据宿主机器生成新的浏览器指纹配置", ); } @@ -325,7 +326,7 @@ export async function googleSearch( } const context = await browser.newContext( - storageState ? { ...contextOptions, storageState } : contextOptions + storageState ? { ...contextOptions, storageState } : contextOptions, ); // 设置额外的浏览器属性以避免检测 @@ -352,7 +353,7 @@ export async function googleSearch( if (typeof WebGLRenderingContext !== "undefined") { const getParameter = WebGLRenderingContext.prototype.getParameter; WebGLRenderingContext.prototype.getParameter = function ( - parameter: number + parameter: number, ) { // 随机化 UNMASKED_VENDOR_WEBGL 和 UNMASKED_RENDERER_WEBGL if (parameter === 37445) { @@ -412,7 +413,7 @@ export async function googleSearch( const isBlockedPage = sorryPatterns.some( (pattern) => currentUrl.includes(pattern) || - (response && response.url().toString().includes(pattern)) + (response && response.url().toString().includes(pattern)), ); if (isBlockedPage) { @@ -426,7 +427,7 @@ export async function googleSearch( // 如果是外部提供的浏览器,不关闭它,而是创建一个新的浏览器实例 if (browserWasProvided) { logger.info( - "使用外部浏览器实例时遇到人机验证,创建新的浏览器实例..." + "使用外部浏览器实例时遇到人机验证,创建新的浏览器实例...", ); // 创建一个新的浏览器实例,不再使用外部提供的实例 const newBrowser = await chromium.launch({ @@ -493,7 +494,7 @@ export async function googleSearch( url: (url) => { const urlStr = url.toString(); return sorryPatterns.every( - (pattern) => !urlStr.includes(pattern) + (pattern) => !urlStr.includes(pattern), ); }, }); @@ -552,7 +553,7 @@ export async function googleSearch( if (isBlockedAfterSearch) { if (headless) { logger.warn( - "搜索后检测到人机验证页面,将以有头模式重新启动浏览器..." + "搜索后检测到人机验证页面,将以有头模式重新启动浏览器...", ); // 关闭当前页面和上下文 @@ -562,7 +563,7 @@ export async function googleSearch( // 如果是外部提供的浏览器,不关闭它,而是创建一个新的浏览器实例 if (browserWasProvided) { logger.info( - "使用外部浏览器实例时搜索后遇到人机验证,创建新的浏览器实例..." + "使用外部浏览器实例时搜索后遇到人机验证,创建新的浏览器实例...", ); // 创建一个新的浏览器实例,不再使用外部提供的实例 const newBrowser = await chromium.launch({ @@ -629,7 +630,7 @@ export async function googleSearch( url: (url) => { const urlStr = url.toString(); return sorryPatterns.every( - (pattern) => !urlStr.includes(pattern) + (pattern) => !urlStr.includes(pattern), ); }, }); @@ -673,7 +674,7 @@ export async function googleSearch( if (isBlockedDuringResults) { if (headless) { logger.warn( - "等待搜索结果时检测到人机验证页面,将以有头模式重新启动浏览器..." + "等待搜索结果时检测到人机验证页面,将以有头模式重新启动浏览器...", ); // 关闭当前页面和上下文 @@ -683,7 +684,7 @@ export async function googleSearch( // 如果是外部提供的浏览器,不关闭它,而是创建一个新的浏览器实例 if (browserWasProvided) { logger.info( - "使用外部浏览器实例时等待搜索结果遇到人机验证,创建新的浏览器实例..." + "使用外部浏览器实例时等待搜索结果遇到人机验证,创建新的浏览器实例...", ); // 创建一个新的浏览器实例,不再使用外部提供的实例 const newBrowser = await chromium.launch({ @@ -744,7 +745,7 @@ export async function googleSearch( } } else { logger.warn( - "等待搜索结果时检测到人机验证页面,请在浏览器中完成验证..." + "等待搜索结果时检测到人机验证页面,请在浏览器中完成验证...", ); // 等待用户完成验证并重定向回搜索页面 await page.waitForNavigation({ @@ -752,7 +753,7 @@ export async function googleSearch( url: (url) => { const urlStr = url.toString(); return sorryPatterns.every( - (pattern) => !urlStr.includes(pattern) + (pattern) => !urlStr.includes(pattern), ); }, }); @@ -816,7 +817,7 @@ export async function googleSearch( maxResults: number; titleSelector: string; snippetSelector: string; - } + }, ) => { return elements .slice(0, params.maxResults) @@ -824,7 +825,7 @@ export async function googleSearch( const titleElement = el.querySelector(params.titleSelector); const linkElement = el.querySelector("a"); const snippetElement = el.querySelector( - params.snippetSelector + params.snippetSelector, ); return { @@ -840,14 +841,14 @@ export async function googleSearch( }) .filter( (item: { title: string; link: string; snippet: string }) => - item.title && item.link + item.title && item.link, ); // 过滤掉空结果 }, { maxResults: limit, titleSelector: selector.title, snippetSelector: selector.snippet, - } + }, ); if (results.length > 0) { @@ -879,10 +880,9 @@ export async function googleSearch( .slice(0, maxResults) .map((el: Element) => { const title = el.textContent || ""; - const link = - el instanceof HTMLAnchorElement - ? el.href - : el.getAttribute("href") || ""; + const link = el instanceof HTMLAnchorElement + ? el.href + : el.getAttribute("href") || ""; // 尝试获取周围的文本作为摘要 let snippet = ""; let parent = el.parentElement; @@ -898,10 +898,10 @@ export async function googleSearch( }) .filter( (item: { title: string; link: string; snippet: string }) => - item.title && item.link + item.title && item.link, ); // 过滤掉空结果 }, - limit + limit, ); } @@ -927,7 +927,7 @@ export async function googleSearch( fs.writeFileSync( fingerprintFile, JSON.stringify(savedState, null, 2), - "utf8" + "utf8", ); logger.info({ fingerprintFile }, "指纹配置已保存"); } catch (fingerprintError) { @@ -971,7 +971,7 @@ export async function googleSearch( fs.writeFileSync( fingerprintFile, JSON.stringify(savedState, null, 2), - "utf8" + "utf8", ); logger.info({ fingerprintFile }, "指纹配置已保存"); } catch (fingerprintError) { @@ -1009,3 +1009,209 @@ export async function googleSearch( // 首先尝试以无头模式执行搜索 return performSearch(useHeadless); } + +/** + * 执行Bing搜索 + * @param query 搜索查询 + * @param options 命令行选项 + * @param existingBrowser 可选的现有浏览器实例 + * @returns 搜索响应 + */ +export async function bingSearch( + query: string, + options: CommandOptions = {}, + existingBrowser?: Browser, +): Promise { + const { + limit = 10, + timeout = 30000, + headless = true, + stateFile = "./browser-state.json", + noSaveState = false, + locale = "zh-CN", + } = options; + + // 指纹状态文件路径 + const fingerprintFile = "./browser-state-fingerprint.json"; + + // 加载保存的状态 + let savedState: SavedState = {}; + try { + if (fs.existsSync(fingerprintFile)) { + const fingerprintData = fs.readFileSync(fingerprintFile, "utf8"); + savedState.fingerprint = JSON.parse(fingerprintData); + } + } catch (error) { + logger.warn(`无法加载指纹配置: ${error}`); + } + + // 如果没有保存的指纹配置,使用宿主机器配置 + if (!savedState.fingerprint) { + savedState.fingerprint = getHostMachineConfig(locale); + // 保存新的指纹配置 + try { + fs.writeFileSync( + fingerprintFile, + JSON.stringify(savedState.fingerprint, null, 2), + ); + } catch (error) { + logger.warn(`无法保存指纹配置: ${error}`); + } + } + + // 获取设备配置 + const getDeviceConfig = (): [string, any] => { + const { deviceName } = savedState.fingerprint || {}; + const deviceType = deviceName || "Desktop Chrome"; + const deviceConfig = devices[deviceType]; + return [deviceType, deviceConfig]; + }; + + // 随机延迟函数 + const getRandomDelay = (min: number, max: number) => { + return Math.floor(Math.random() * (max - min + 1) + min); + }; + + async function performSearch(headless: boolean): Promise { + const [deviceType, deviceConfig] = getDeviceConfig(); + logger.info(`使用设备配置: ${deviceType}`); + + // 浏览器上下文选项 + const contextOptions: BrowserContextOptions = { + ...deviceConfig, + locale: savedState.fingerprint?.locale || locale, + timezoneId: savedState.fingerprint?.timezoneId, + colorScheme: savedState.fingerprint?.colorScheme, + reducedMotion: savedState.fingerprint?.reducedMotion, + forcedColors: savedState.fingerprint?.forcedColors, + }; + + // 如果存在状态文件,尝试加载 + if (fs.existsSync(stateFile) && !noSaveState) { + try { + contextOptions.storageState = stateFile; + } catch (error) { + logger.warn(`无法加载浏览器状态: ${error}`); + } + } + + // 使用现有浏览器或创建新的浏览器 + const browser = existingBrowser || await chromium.launch({ headless }); + const context = await browser.newContext(contextOptions); + const page = await context.newPage(); + + try { + // 设置超时 + page.setDefaultTimeout(timeout); + + // 访问 Bing 搜索页面 + await page.goto("https://www.bing.com/"); + logger.info("已加载 Bing 搜索页面"); + + // 等待搜索框出现 + await page.waitForSelector("#sb_form_q", { timeout: 10000 }); + await page.waitForTimeout(getRandomDelay(1000, 3000)); + await page.waitForTimeout(getRandomDelay(1000, 3000)); + await page.focus("#sb_form_q"); + await page.click("#sb_form_q"); + + await page.waitForTimeout(getRandomDelay(1000, 3000)); + + //获得sb_form_q坐标并用鼠标点击 + const sbFormQ = await page.$("#sb_form_q"); + + const sbFormQRect = await sbFormQ?.boundingBox(); + if (sbFormQRect) { + await page.mouse.click(sbFormQRect.x + sbFormQRect.width / 2, sbFormQRect.y + sbFormQRect.height / 2); + } + await page.waitForTimeout(getRandomDelay(1000, 3000)); + await page.waitForTimeout(getRandomDelay(1000, 3000)); + await page.waitForTimeout(getRandomDelay(1000, 3000)); + await page.waitForTimeout(getRandomDelay(1000, 3000)); + await page.fill("#sb_form_q", query); // 使用 fill 输入内容,更稳定 + + logger.info(`已输入查询: ${query}`); + + // 提交搜索 + await page.waitForTimeout(getRandomDelay(1000, 3000)); + await page.waitForTimeout(getRandomDelay(1000, 3000)); + await page.waitForTimeout(getRandomDelay(1000, 3000)); + await page.waitForTimeout(getRandomDelay(1000, 3000)); + await page.keyboard.press("Control+Enter"); + await page.waitForTimeout(getRandomDelay(1000, 3000)); + + // 等待搜索结果加载 + await page.waitForSelector(".b_algo h2", { timeout: 15000 }); + + // 等待搜索结果加载 + await page.waitForSelector(".b_algo h2", { timeout: 15000 }); + logger.info("搜索结果已加载"); + + // 提取搜索结果 + const results = await page.evaluate((resultLimit) => { + const searchResults: SearchResult[] = []; + + // 选择所有搜索结果项 + const resultElements = document.querySelectorAll("#b_results .b_algo"); + + // 遍历结果元素 + for (let i = 0; i < Math.min(resultElements.length, resultLimit); i++) { + const element = resultElements[i]; + + // 提取标题和链接 + const titleElement = element.querySelector("h2 a"); + const title = titleElement?.textContent?.trim() || ""; + const link = titleElement?.getAttribute("href") || ""; + + // 提取摘要 + const snippetElement = element.querySelector(".b_caption p"); + const snippet = snippetElement?.textContent?.trim() || ""; + + // 只添加有效的结果 + if (title && link) { + searchResults.push({ title, link, snippet }); + } + } + + return searchResults; + }, limit); + + // 保存浏览器状态 + if (!noSaveState) { + try { + await context.storageState({ path: stateFile }); + logger.info("已保存浏览器状态"); + } catch (error) { + logger.warn(`无法保存浏览器状态: ${error}`); + } + } + + // 关闭浏览器(如果不是外部提供的) + if (!existingBrowser) { + await browser.close(); + } + + return { + query, + results, + }; + } catch (error) { + // 关闭浏览器(如果不是外部提供的) + if (!existingBrowser) { + await browser.close(); + } + throw error; + } + } + + try { + // 首先尝试无头模式 + logger.info("尝试使用无头模式进行搜索"); + return await performSearch(false); + } catch (error) { + // 如果无头模式失败,尝试有头模式 + logger.warn(`无头模式搜索失败: ${error}`); + logger.info("切换到有头模式"); + return await performSearch(false); + } +} diff --git a/src/url-crawler-test.ts b/src/url-crawler-test.ts new file mode 100644 index 0000000..a205a52 --- /dev/null +++ b/src/url-crawler-test.ts @@ -0,0 +1,197 @@ +#!/usr/bin/env node + +import { Command } from "commander"; +import { chromium } from "playwright"; +import * as path from "path"; +import * as os from "os"; +import * as fs from "fs"; +import logger from "./logger.js"; + +// 定义命令行参数 +const program = new Command(); +program + .name("url-crawler") + .description("爬取指定URL的网页内容") + .version("1.0.0") + .argument("", "要爬取的URL") + .option("-s, --selector ", "CSS选择器,用于提取特定内容") + .option("-w, --wait-for ", "等待指定元素出现后再提取内容") + .option("-t, --timeout ", "超时时间(毫秒)", "30000") + .option("--no-metadata", "不提取元数据") + .option("--no-headless", "使用有头模式运行浏览器") + .option("--no-save-state", "不保存浏览器状态") + .option("--state-file ", "浏览器状态文件路径", path.join(os.homedir(), ".url-crawler-browser-state.json")) + .parse(process.argv); + +// 获取命令行参数 +const url = program.args[0]; +const options = program.opts(); + +// 主函数 +async function main() { + logger.info({ url, options }, "开始爬取URL"); + + // 检查状态文件是否存在 + const stateFileExists = fs.existsSync(options.stateFile); + let storageState: string | undefined = undefined; + + if (stateFileExists && !options.noSaveState) { + logger.info({ stateFile: options.stateFile }, "使用保存的浏览器状态"); + storageState = options.stateFile; + } + + // 启动浏览器 + const browser = await chromium.launch({ + headless: options.headless, + args: [ + "--disable-blink-features=AutomationControlled", + "--disable-features=IsolateOrigins,site-per-process", + "--disable-site-isolation-trials", + "--disable-web-security", + "--no-sandbox", + "--disable-setuid-sandbox", + "--disable-dev-shm-usage", + "--disable-accelerated-2d-canvas", + "--no-first-run", + "--no-zygote", + "--disable-gpu", + "--hide-scrollbars", + "--mute-audio", + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-breakpad", + "--disable-component-extensions-with-background-pages", + "--disable-extensions", + "--disable-features=TranslateUI", + "--disable-ipc-flooding-protection", + "--disable-renderer-backgrounding", + "--enable-features=NetworkService,NetworkServiceInProcess", + "--force-color-profile=srgb", + "--metrics-recording-only", + ], + ignoreDefaultArgs: ["--enable-automation"], + }); + + try { + // 创建新的浏览器上下文 + const context = await browser.newContext({ + viewport: { width: 1280, height: 800 }, + userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + storageState: storageState, + }); + + // 创建新页面 + const page = await context.newPage(); + + try { + // 设置超时 + page.setDefaultTimeout(parseInt(options.timeout)); + + // 导航到URL + await page.goto(url, { waitUntil: "domcontentloaded" }); + + // 如果指定了等待选择器,则等待该元素出现 + if (options.waitFor) { + await page.waitForSelector(options.waitFor, { timeout: parseInt(options.timeout) }); + } + + // 提取页面标题 + const title = await page.title(); + + // 提取页面内容 + let content = ""; + if (options.selector) { + // 如果提供了选择器,则提取特定元素的内容 + const elements = await page.$$(options.selector); + for (const element of elements) { + const text = await element.textContent(); + if (text) { + content += text.trim() + "\n"; + } + } + } else { + // 否则提取整个页面的文本内容 + content = await page.evaluate(() => { + // 移除脚本、样式和隐藏元素 + const scripts = Array.from(document.querySelectorAll('script, style, [style*="display:none"], [style*="display: none"]')); + scripts.forEach(s => s.remove()); + + // 获取正文内容 + return document.body.innerText; + }); + } + + // 提取元数据 + let metadata: Record = {}; + if (options.metadata) { + metadata = await page.evaluate(() => { + const meta: Record = {}; + + // 提取所有meta标签 + const metaTags = document.querySelectorAll('meta'); + metaTags.forEach(tag => { + const name = tag.getAttribute('name') || tag.getAttribute('property'); + const content = tag.getAttribute('content'); + if (name && content) { + meta[name] = content; + } + }); + + // 提取Open Graph标签 + const ogTags = document.querySelectorAll('meta[property^="og:"]'); + ogTags.forEach(tag => { + const property = tag.getAttribute('property'); + const content = tag.getAttribute('content'); + if (property && content) { + meta[property] = content; + } + }); + + // 提取Twitter卡片信息 + const twitterTags = document.querySelectorAll('meta[name^="twitter:"]'); + twitterTags.forEach(tag => { + const name = tag.getAttribute('name'); + const content = tag.getAttribute('content'); + if (name && content) { + meta[name] = content; + } + }); + + return meta; + }); + } + + // 如果启用了状态保存,保存浏览器状态 + if (!stateFileExists && !options.noSaveState) { + await context.storageState({ path: options.stateFile }); + logger.info("已保存浏览器状态到文件"); + } + + // 构建结果 + const result = { + url, + title, + content: content.trim(), + metadata, + timestamp: new Date().toISOString(), + }; + + // 输出结果 + console.log(JSON.stringify(result, null, 2)); + + } finally { + // 关闭上下文 + await context.close(); + } + } finally { + // 关闭浏览器 + await browser.close(); + } +} + +// 执行主函数 +main().catch(error => { + logger.error({ error }, "URL爬取失败"); + process.exit(1); +}); \ No newline at end of file