Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 78 additions & 0 deletions .github/workflows/build-docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
name: Build and Push Docker Images

env:
  NODE_OPTIONS: '--max_old_space_size=6144'

on:
  # Manual trigger always builds, regardless of the Dockerfile-change check below.
  workflow_dispatch:
  push:
    branches:
      - master
    paths:
      - 'blocklets/snap-kit/dockerfile'

jobs:
  build:
    # Skip when every commit message in the push contains "[skip ci]".
    # On workflow_dispatch there are no commits, so this evaluates false -> job runs.
    if: "!contains(toJSON(github.event.commits.*.message) , '[skip ci]')"
    name: Build and Push Docker Image
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
        with:
          # Need the parent commit so the HEAD^..HEAD diff below works.
          fetch-depth: 2

      # Belt-and-braces re-check of the push 'paths' filter; its 'changed'
      # output gates every subsequent step.
      - name: Check Dockerfile changes
        id: check_changes
        run: |
          CHANGED=$(git diff --name-only HEAD^ HEAD | grep "blocklets/snap-kit/dockerfile" || echo "")
          echo "CHANGED=$CHANGED" >> $GITHUB_ENV
          if [ -z "$CHANGED" ]; then
            echo "changed=false" >> $GITHUB_OUTPUT
            echo "No changes to Dockerfile, skipping build"
          else
            echo "changed=true" >> $GITHUB_OUTPUT
            echo "Dockerfile changed, proceeding with build"
          fi
        shell: bash

      - name: Get version
        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        id: get_version
        run: |
          # The repo-root 'version' file holds the image tag (e.g. 1.0.6).
          VERSION=$(cat version)
          echo "VERSION=$VERSION" >> $GITHUB_ENV
          echo "version=$VERSION" >> $GITHUB_OUTPUT

      - name: Send Notification Before Build
        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        run: |
          curl -H 'Content-type: application/json' -X POST -d '{"text": "Starting to build Docker image for arcblock/snap-kit from blocklets/snap-kit"}' ${{ secrets.SLACK_WEBHOOK }}

      # Buildx is required for the multi-platform (amd64 + arm64) build below.
      - name: Set up Docker Buildx
        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        uses: docker/setup-buildx-action@v2

      - name: Login to DockerHub
        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_HUB_USER_NAME }}
          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}

      - name: Build and push
        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        uses: docker/build-push-action@v4
        with:
          context: blocklets/snap-kit
          file: blocklets/snap-kit/dockerfile
          push: true
          platforms: linux/amd64,linux/arm64
          tags: |
            arcblock/snap-kit:latest
            arcblock/snap-kit:${{ env.VERSION }}

      - name: Send Notification After Build
        if: steps.check_changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        run: |
          curl -H 'Content-type: application/json' -X POST -d '{"text": "Docker image arcblock/snap-kit:${{ env.VERSION }} from blocklets/snap-kit has been built and pushed successfully"}' ${{ secrets.SLACK_WEBHOOK }}
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 1.0.6 (2025-5-23)

- feat: add actions for building docker images; update dockerfile for font and locale support

## 1.0.5 (2025-5-22)

- chore: update screenshot
Expand Down
4 changes: 2 additions & 2 deletions blocklets/snap-kit/blocklet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ repository:
type: git
url: git+https://github.com/blocklet/create-blocklet.git
specVersion: 1.2.8
version: 1.0.5
version: 1.0.6
logo: logo.png
files:
- dist
Expand Down Expand Up @@ -57,7 +57,7 @@ capabilities:
screenshots:
- 1.jpeg
docker:
image: arcblock/blocklet-with-puppeteer
image: arcblock/snap-kit
shell: gosu
workdir: /var/lib/blocklet
installNodeModules: true
Expand Down
9 changes: 8 additions & 1 deletion blocklets/snap-kit/dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,19 @@ RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential socat curl gnupg wget \
gosu strace passwd git ca-certificates libcairo2-dev libpango1.0-dev \
libjpeg-dev libgif-dev && \
libjpeg-dev libgif-dev \
# Install fontconfig and fonts for Chinese and emoji
fontconfig \
fonts-noto-cjk fonts-noto-color-emoji && \
rm -rf /var/lib/apt/lists/*

# Create necessary directories
RUN mkdir -p /data/bin /var/lib/blocklet /home/node/.npm-global/lib

# Set locale to C.UTF-8 for broad UTF-8 support
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8

# Set environment variables
ENV DOCKER_DATA=/var/lib/blocklet
ENV NODE_VERSION=22
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "crawler",
"private": true,
"version": "1.0.5",
"version": "1.0.6",
"scripts": {
"dev": "blocklet dev",
"lint": "pnpm -r lint",
Expand Down
3 changes: 2 additions & 1 deletion packages/crawler/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
"build:cjs": "tsc -p tsconfig.cjs.json",
"build:esm": "tsc -p tsconfig.esm.json",
"build": "npm run build:cjs && npm run build:esm",
"prepublishOnly": "npm run build"
"prepublishOnly": "npm run build",
"fix:sqlite": "cd node_modules/sqlite3 && npm run rebuild"
},
"files": [
"dist",
Expand Down
39 changes: 27 additions & 12 deletions packages/crawler/src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import { config, logger } from './config';
import { Job, JobState } from './db/job';
import { Snapshot, SnapshotModel } from './db/snapshot';
import { initPage } from './puppeteer';
import { formatUrl, isAcceptCrawler, md5 } from './utils';
import { findMaxScrollHeight, formatUrl, isAcceptCrawler, md5 } from './utils';

const { BaseState } = require('@abtnode/models');

Expand All @@ -24,7 +24,7 @@ export function createCrawlQueue() {
store: new SequelizeStore(db, 'crawler'),
concurrency: 1,
onJob: async (job: JobState) => {
logger.debug('job start:', job);
logger.info('Starting to execute crawl job', job);

const canCrawl = await isAcceptCrawler(job.url);
if (!canCrawl) {
Expand Down Expand Up @@ -167,7 +167,7 @@ export const getPageContent = async ({
width = 1440,
height = 900,
quality = 80,
timeout = 60 * 1000,
timeout = 90 * 1000,
fullPage = false,
}: {
url: string;
Expand All @@ -185,7 +185,7 @@ export const getPageContent = async ({
const page = await initPage();

if (width && height) {
await page.setViewport({ width, height });
await page.setViewport({ width, height, deviceScaleFactor: 2 });
}

let html: string | null = null;
Expand All @@ -207,13 +207,28 @@ export const getPageContent = async ({
}

// await for networkidle0
// https://pptr.dev/api/puppeteer.page.goforward/#remarks
// https://pptr.dev/api/puppeteer.page.waitfornetworkidle
await page.waitForNetworkIdle({
idleTime: 2 * 1000,
idleTime: 1.5 * 1000,
});

// get screenshot
if (includeScreenshot) {
// Try to find the tallest element and set the browser to the same height
if (fullPage) {
const maxScrollHeight = await findMaxScrollHeight(page);

logger.info('findMaxScrollHeight', { maxScrollHeight });

if (maxScrollHeight) {
await page.setViewport({ width, height: maxScrollHeight || height, deviceScaleFactor: 2 });
await page.evaluate((scrollHeight) => {
window.scrollTo(0, scrollHeight || 0);
document.documentElement.scrollTo(0, scrollHeight || 0);
}, maxScrollHeight);
}
}

try {
screenshot = await page.screenshot({ fullPage, quality, type: 'webp' });
} catch (err) {
Expand Down Expand Up @@ -247,6 +262,7 @@ export const getPageContent = async ({
export async function createCrawlJob(params: JobState, callback?: (snapshot: SnapshotModel | null) => void) {
params = {
...params,
id: randomUUID(),
url: formatUrl(params.url),
};

Expand All @@ -261,18 +277,17 @@ export async function createCrawlJob(params: JobState, callback?: (snapshot: Sna
fullPage: params.fullPage,
});

logger.info('create crawl job', params);

if (existsJob) {
logger.warn(`Crawl job already exists for ${params.url}, skip`);
return existsJob.id;
}

const jobId = randomUUID();
const job = crawlQueue.push({ ...params, id: jobId });
logger.info('create crawl job', params);

const job = crawlQueue.push(params);

job.on('finished', ({ result }) => {
logger.info(`Crawl completed ${params.url}, status: ${result ? 'success' : 'failed'}`, { job: params, result });
logger.info(`Crawl completed ${params.url}`, { job: params, result });
callback?.(result);
});

Expand All @@ -281,7 +296,7 @@ export async function createCrawlJob(params: JobState, callback?: (snapshot: Sna
callback?.(null);
});

return jobId;
return params.id;
}

// @ts-ignore
Expand Down
12 changes: 6 additions & 6 deletions packages/crawler/src/puppeteer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ export async function ensureBrowser() {

const executablePath = process.env.PUPPETEER_EXECUTABLE_PATH || '/usr/bin/chromium';

logger.info('executablePath', executablePath);
logger.debug('Chromium executablePath', executablePath);

if (!fs.existsSync(executablePath)) {
logger.info('start download browser', puppeteerConfig);
Expand Down Expand Up @@ -148,7 +148,7 @@ export async function launchBrowser() {
'--font-render-hinting=none',
],
});
logger.info('Launch browser success');
logger.info('Launch browser');
} catch (error) {
logger.error('launch browser failed: ', error);
// cleanup browser endpoint
Expand Down Expand Up @@ -231,21 +231,21 @@ export const closeBrowser = async ({ trimCache = true }: { trimCache?: boolean }
const pages = await browser.pages();
await Promise.all(pages.map((page) => page.close()));
} catch (err) {
logger.error('Failed to close all pages:', err);
logger.warn('Failed to close all pages:', err);
}

// close browser
try {
await browser.close();
} catch (err) {
logger.error('Failed to close browser:', err);
logger.warn('Failed to close browser:', err);
}

// clear cache
try {
if (trimCache) {
await puppeteer.trimCache();
logger.info('Trim cache success');
logger.debug('Trim cache success');
}

// try to clear temporary directory
Expand All @@ -257,7 +257,7 @@ export const closeBrowser = async ({ trimCache = true }: { trimCache?: boolean }
global.gc();
}
} catch (err) {
logger.error('Failed to clear browser cache:', err);
logger.warn('Failed to clear browser cache:', err);
}

browser = null;
Expand Down
25 changes: 25 additions & 0 deletions packages/crawler/src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { Page } from '@blocklet/puppeteer';
import { components, env } from '@blocklet/sdk/lib/config';
import axios from 'axios';
import flattenDeep from 'lodash/flattenDeep';
Expand Down Expand Up @@ -238,3 +239,27 @@ export const formatUrl = (url: string) => {
export function md5(content: string | Uint8Array) {
return createHash('md5').update(content).digest('hex');
}

/**
 * Find the tallest scrollable height on the page: the document body's
 * scrollHeight, or the largest scrollHeight of any element whose computed
 * overflow-y is 'auto' or 'scroll' and which actually overflows
 * (scrollHeight > clientHeight).
 *
 * Used before a full-page screenshot to grow the viewport so lazily-rendered
 * content inside inner scroll containers is captured.
 *
 * @param page - Puppeteer page to inspect; assumed already navigated.
 * @returns The maximum scroll height found, in CSS pixels.
 */
export async function findMaxScrollHeight(page: Page): Promise<number> {
  // page.evaluate serializes a primitive return value directly, so no JSHandle
  // round-trip is needed. The previous evaluateHandle/jsonValue/dispose version
  // also failed to await dispose() (it returns a Promise), leaking the handle
  // until page close.
  return page.evaluate(() => {
    let maxHeight = document.body.scrollHeight;

    for (const el of Array.from(document.querySelectorAll('*'))) {
      const style = window.getComputedStyle(el);
      const scrollable = style.overflowY === 'auto' || style.overflowY === 'scroll';
      if (scrollable && el.scrollHeight > el.clientHeight && el.scrollHeight > maxHeight) {
        maxHeight = el.scrollHeight;
      }
    }

    return maxHeight;
  });
}
23 changes: 0 additions & 23 deletions prd/20250521-review.md

This file was deleted.

2 changes: 1 addition & 1 deletion version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.0.5
1.0.6
Loading