diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-docker.yml
new file mode 100644
index 0000000..59022fc
--- /dev/null
+++ b/.github/workflows/build-docker.yml
@@ -0,0 +1,78 @@
+name: Build and Push Docker Images
+
+env:
+  NODE_OPTIONS: '--max_old_space_size=6144'
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+    paths:
+      - 'blocklets/snap-kit/dockerfile'
+
+jobs:
+  build:
+    if: "!contains(toJSON(github.event.commits.*.message), '[skip ci]')"
+    name: Build and Push Docker Image
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - name: Check Dockerfile changes
+        id: check_changes
+        run: |
+          CHANGED=$(git diff --name-only HEAD^ HEAD | grep "blocklets/snap-kit/dockerfile" || echo "")
+          echo "CHANGED=$CHANGED" >> $GITHUB_ENV
+          if [ -z "$CHANGED" ]; then
+            echo "changed=false" >> $GITHUB_OUTPUT
+            echo "No changes to Dockerfile, skipping build"
+          else
+            echo "changed=true" >> $GITHUB_OUTPUT
+            echo "Dockerfile changed, proceeding with build"
+          fi
+        shell: bash
+
+      - name: Get version
+        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
+        id: get_version
+        run: |
+          VERSION=$(cat version)
+          echo "VERSION=$VERSION" >> $GITHUB_ENV
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+
+      - name: Send Notification Before Build
+        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
+        run: |
+          curl -H 'Content-type: application/json' -X POST -d '{"text": "Starting to build Docker image for arcblock/snap-kit from blocklets/snap-kit"}' ${{ secrets.SLACK_WEBHOOK }}
+
+      - name: Set up Docker Buildx
+        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
+        uses: docker/setup-buildx-action@v2
+
+      - name: Login to DockerHub
+        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKER_HUB_USER_NAME }}
+          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
+
+      - name: Build and push
+        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
+        uses: docker/build-push-action@v4
+        with:
+          context: blocklets/snap-kit
+          file: blocklets/snap-kit/dockerfile
+          push: true
+          platforms: linux/amd64,linux/arm64
+          tags: |
+            arcblock/snap-kit:latest
+            arcblock/snap-kit:${{ env.VERSION }}
+
+      - name: Send Notification After Build
+        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
+        run: |
+          curl -H 'Content-type: application/json' -X POST -d '{"text": "Docker image arcblock/snap-kit:${{ env.VERSION }} from blocklets/snap-kit has been built and pushed successfully"}' ${{ secrets.SLACK_WEBHOOK }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 493eca6..c65cb50 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 1.0.6 (2025-5-23)
+
+- feat: add actions for building docker images; update dockerfile for font and locale support
+
 ## 1.0.5 (2025-5-22)
 
 - chore: update screenshot
diff --git a/blocklets/snap-kit/blocklet.yml b/blocklets/snap-kit/blocklet.yml
index 8146289..009bd40 100644
--- a/blocklets/snap-kit/blocklet.yml
+++ b/blocklets/snap-kit/blocklet.yml
@@ -16,7 +16,7 @@ repository:
   type: git
   url: git+https://github.com/blocklet/create-blocklet.git
 specVersion: 1.2.8
-version: 1.0.5
+version: 1.0.6
 logo: logo.png
 files:
   - dist
@@ -57,7 +57,8 @@ capabilities:
 screenshots:
   - 1.jpeg
 docker:
-  image: arcblock/blocklet-with-puppeteer
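+  # Built by the build-docker workflow added in this change; bundles CJK and emoji fonts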
+  image: arcblock/snap-kit
   shell: gosu
   workdir: /var/lib/blocklet
   installNodeModules: true
diff --git a/blocklets/snap-kit/dockerfile b/blocklets/snap-kit/dockerfile
index 7859267..7ed71e0 100644
--- a/blocklets/snap-kit/dockerfile
+++ b/blocklets/snap-kit/dockerfile
@@ -13,12 +13,19 @@ RUN apt-get update && \
     apt-get install -y --no-install-recommends \
     build-essential socat curl gnupg wget \
     gosu strace passwd git ca-certificates libcairo2-dev libpango1.0-dev \
-    libjpeg-dev libgif-dev && \
+    libjpeg-dev libgif-dev \
+    # Install fontconfig and fonts for Chinese and emoji
+    fontconfig \
+    fonts-noto-cjk fonts-noto-color-emoji && \
     rm -rf /var/lib/apt/lists/*
 
 # Create necessary directories
 RUN mkdir -p /data/bin /var/lib/blocklet /home/node/.npm-global/lib
 
+# Set locale to C.UTF-8 for broad UTF-8 support
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+
 # Set environment variables
 ENV DOCKER_DATA=/var/lib/blocklet
 ENV NODE_VERSION=22
diff --git a/package.json b/package.json
index e8938b1..75b6bc1 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "crawler",
   "private": true,
-  "version": "1.0.5",
+  "version": "1.0.6",
   "scripts": {
     "dev": "blocklet dev",
     "lint": "pnpm -r lint",
diff --git a/packages/crawler/package.json b/packages/crawler/package.json
index 8604566..dfe2a99 100644
--- a/packages/crawler/package.json
+++ b/packages/crawler/package.json
@@ -15,7 +15,8 @@
     "build:cjs": "tsc -p tsconfig.cjs.json",
     "build:esm": "tsc -p tsconfig.esm.json",
     "build": "npm run build:cjs && npm run build:esm",
-    "prepublishOnly": "npm run build"
+    "prepublishOnly": "npm run build",
+    "fix:sqlite": "cd node_modules/sqlite3 && npm run rebuild"
   },
   "files": [
     "dist",
diff --git a/packages/crawler/src/crawler.ts b/packages/crawler/src/crawler.ts
index 9f57839..9ecf1dd 100644
--- a/packages/crawler/src/crawler.ts
+++ b/packages/crawler/src/crawler.ts
@@ -11,7 +11,7 @@ import { config, logger } from './config';
 import { Job, JobState } from './db/job';
 import { Snapshot, SnapshotModel } from './db/snapshot';
 import { initPage } from './puppeteer';
-import { formatUrl, isAcceptCrawler, md5 } from './utils';
+import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';
 
 const { BaseState } = require('@abtnode/models');
 
@@ -24,7 +24,7 @@ export function createCrawlQueue() {
     store: new SequelizeStore(db, 'crawler'),
     concurrency: 1,
     onJob: async (job: JobState) => {
-      logger.debug('job start:', job);
+      logger.info('Starting to execute crawl job', job);
 
       const canCrawl = await isAcceptCrawler(job.url);
       if (!canCrawl) {
@@ -167,7 +167,7 @@ export const getPageContent = async ({
   width = 1440,
   height = 900,
   quality = 80,
-  timeout = 60 * 1000,
+  timeout = 90 * 1000,
   fullPage = false,
 }: {
   url: string;
@@ -185,7 +185,7 @@ export const getPageContent = async ({
   const page = await initPage();
 
   if (width && height) {
-    await page.setViewport({ width, height });
+    await page.setViewport({ width, height, deviceScaleFactor: 2 });
   }
 
   let html: string | null = null;
@@ -207,13 +207,30 @@ export const getPageContent = async ({
   }
 
   // await for networkidle0
-  // https://pptr.dev/api/puppeteer.page.goforward/#remarks
+  // https://pptr.dev/api/puppeteer.page.waitfornetworkidle
   await page.waitForNetworkIdle({
-    idleTime: 2 * 1000,
+    idleTime: 1.5 * 1000,
   });
 
   // get screenshot
   if (includeScreenshot) {
+    // Try to find the tallest element and set the browser to the same height
+    if (fullPage) {
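+      // findMaxScrollHeight (added in utils.ts below) also inspects nested scroll
+      // containers, so pages that scroll inside an inner element are measured in full.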
+      const maxScrollHeight = await findMaxScrollHeight(page);
+
+      logger.info('findMaxScrollHeight', { maxScrollHeight });
+
+      if (maxScrollHeight) {
+        await page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
+        await page.evaluate((scrollHeight) => {
+          window.scrollTo(0, scrollHeight || 0);
+          document.documentElement.scrollTo(0, scrollHeight || 0);
+        }, maxScrollHeight);
+      }
+    }
+
     try {
       screenshot = await page.screenshot({ fullPage, quality, type: 'webp' });
     } catch (err) {
@@ -247,6 +262,7 @@
 export async function createCrawlJob(params: JobState, callback?: (snapshot: SnapshotModel | null) => void) {
   params = {
     ...params,
+    id: randomUUID(),
     url: formatUrl(params.url),
   };
 
@@ -261,18 +277,17 @@ export async function createCrawlJob(params: JobState, callback?: (snapshot: Sna
     fullPage: params.fullPage,
   });
 
-  logger.info('create crawl job', params);
-
   if (existsJob) {
     logger.warn(`Crawl job already exists for ${params.url}, skip`);
     return existsJob.id;
   }
 
-  const jobId = randomUUID();
-  const job = crawlQueue.push({ ...params, id: jobId });
+  logger.info('create crawl job', params);
+
+  const job = crawlQueue.push(params);
 
   job.on('finished', ({ result }) => {
-    logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
+    logger.info(`Crawl completed ${params.url}`, { job: params, result });
     callback?.(result);
   });
 
@@ -281,7 +296,7 @@ export async function createCrawlJob(params: JobState, callback?: (snapshot: Sna
     callback?.(null);
   });
 
-  return jobId;
+  return params.id;
 }
 
 // @ts-ignore
diff --git a/packages/crawler/src/puppeteer.ts b/packages/crawler/src/puppeteer.ts
index 2172733..6cf9a6a 100644
--- a/packages/crawler/src/puppeteer.ts
+++ b/packages/crawler/src/puppeteer.ts
@@ -53,7 +53,7 @@ export async function ensureBrowser() {
 
   const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';
 
-  logger.info('executablePath', executablePath);
+  logger.debug('Chromium executablePath', executablePath);
 
   if (!fs.existsSync(executablePath)) {
     logger.info('start download browser', puppeteerConfig);
@@ -148,7 +148,7 @@ export async function launchBrowser() {
         '--font-render-hinting=none',
       ],
     });
-    logger.info('Launch browser success');
+    logger.info('Browser launched');
   } catch (error) {
     logger.error('launch browser failed: ', error);
     // cleanup browser endpoint
@@ -231,21 +231,21 @@ export const closeBrowser = async ({ trimCache = true }: { trimCache?: boolean }
     const pages = await browser.pages();
     await Promise.all(pages.map((page) => page.close()));
   } catch (err) {
-    logger.error('Failed to close all pages:', err);
+    logger.warn('Failed to close all pages:', err);
   }
 
   // close browser
   try {
     await browser.close();
   } catch (err) {
-    logger.error('Failed to close browser:', err);
+    logger.warn('Failed to close browser:', err);
   }
 
   // clear cache
   try {
     if (trimCache) {
       await puppeteer.trimCache();
-      logger.info('Trim cache success');
+      logger.debug('Trim cache success');
     }
 
     // try to clear temporary directory
@@ -257,7 +257,7 @@ export const closeBrowser = async ({ trimCache = true }: { trimCache?: boolean }
       global.gc();
     }
   } catch (err) {
-    logger.error('Failed to clear browser cache:', err);
+    logger.warn('Failed to clear browser cache:', err);
   }
 
   browser = null;
diff --git a/packages/crawler/src/utils.ts b/packages/crawler/src/utils.ts
index 9c4d1ce..f13789f 100644
--- a/packages/crawler/src/utils.ts
+++ b/packages/crawler/src/utils.ts
@@ -1,3 +1,4 @@
+import { Page } from '@blocklet/puppeteer';
 import { components, env } from '@blocklet/sdk/lib/config';
 import axios from 'axios';
 import flattenDeep from 'lodash/flattenDeep';
@@ -238,3 +239,29 @@ export const formatUrl = (url: string) => {
 export function md5(content: string | Uint8Array) {
   return createHash('md5').update(content).digest('hex');
 }
+
+export async function findMaxScrollHeight(page: Page) {
+  const maxHeightHandler = await page.evaluateHandle(() => {
+    const elements = Array.from(document.querySelectorAll('*'));
+    let maxHeight = document.body.scrollHeight;
+
+    for (const el of elements) {
+      const style = window.getComputedStyle(el);
+      if (style.overflowY === 'auto' || style.overflowY === 'scroll') {
+        if (el.scrollHeight > el.clientHeight && el.scrollHeight > maxHeight) {
+          maxHeight = el.scrollHeight;
+        }
+      }
+    }
+
+    return maxHeight;
+  });
+
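+  // jsonValue() copies the evaluated height back out of the page as a plain number;
+  // the handle is then disposed so the in-page object can be garbage collected.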
+  const maxHeight = await maxHeightHandler.jsonValue();
+
+  await maxHeightHandler.dispose();
+
+  return maxHeight;
+}
diff --git a/prd/20250521-review.md b/prd/20250521-review.md
deleted file mode 100644
index 613e251..0000000
--- a/prd/20250521-review.md
+++ /dev/null
@@ -1,23 +0,0 @@
-# Review action items
-
-- Query API supports lookup by job ID ✅
-
-- Crawler API supports synchronous responses ✅
-
-- Crawler API returns the job ID ✅
-
-- Queues share a single db ✅
-
-- API supports quality/timeout parameters ✅
-
-- Use webp as the image format ✅
-
-- Split storage into HTML and WEBP folders ✅
-
-- Use hashes for file names ✅
-
-- Require an access-key on the API ✅
-
-- Deploy in docker mode ✅
-
-- Rework the blocklet crawler
diff --git a/version b/version
index 1464c52..ece61c6 100644
--- a/version
+++ b/version
@@ -1 +1 @@
-1.0.5
\ No newline at end of file
+1.0.6
\ No newline at end of file