Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/npm-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ jobs:
- uses: actions/checkout@v3
- uses: pnpm/action-setup@v4
with:
version: 10
version: 10.7.0
- uses: actions/setup-node@v3
with:
node-version: 22.14.0
registry-url: https://registry.npmjs.org/
- run: pnpm i --frozen-lockfile
- run: pnpm clean
- run: pnpm build
- run: npm publish --access public
- run: pnpm -r publish --access public
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
5 changes: 2 additions & 3 deletions .github/workflows/pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@ jobs:
- uses: actions/checkout@v3
- uses: pnpm/action-setup@v4
with:
version: 10
version: 10.7.0
- uses: actions/setup-node@v3
with:
node-version: 22.14.0
- run: pnpm i --frozen-lockfile
- run: pnpm lint
- run: pnpm pretty:check
- run: pnpm neat
- run: pnpm test
4 changes: 0 additions & 4 deletions .prettierignore

This file was deleted.

6 changes: 0 additions & 6 deletions .prettierrc.json

This file was deleted.

30 changes: 30 additions & 0 deletions biome.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"$schema": "https://biomejs.dev/schemas/1.9.4/schema.json",
"vcs": {
"enabled": false,
"clientKind": "git",
"useIgnoreFile": true
},
"files": {
"ignoreUnknown": false,
"ignore": [".turbo", "dist", "node_modules", "coverage"]
},
"formatter": {
"enabled": true,
"indentStyle": "tab"
},
"organizeImports": {
"enabled": true
},
"linter": {
"enabled": true,
"rules": {
"recommended": true
}
},
"javascript": {
"formatter": {
"quoteStyle": "double"
}
}
}
12 changes: 0 additions & 12 deletions eslint.config.mjs

This file was deleted.

76 changes: 25 additions & 51 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,53 +1,27 @@
{
"name": "@qnaplus/scraper",
"version": "3.0.0",
"description": "Utility package to scrape questions from the VEX Robotics Q&A.",
"main": "dist/index.js",
"repository": {
"url": "git+ssh://git@github.com/qnaplus/scraper.git"
},
"author": "Battlesquid <25509915+Battlesquid@users.noreply.github.com>",
"license": "GPL-3.0",
"scripts": {
"build": "tsc",
"build:clean": "pnpm clean && pnpm build",
"test": "vitest",
"clean": "rimraf dist",
"reset": "pnpm clean && rimraf node_modules",
"lint": "eslint src/**",
"lint:fix": "pnpm lint --fix",
"neat": "pnpm lint:fix && pnpm pretty",
"pretty": "prettier . --write",
"pretty:check": "prettier . --check"
},
"dependencies": {
"@crawlee/core": "^3.11.5",
"cheerio": "^1.0.0",
"got-scraping": "^4.0.6",
"node-pdf-parser": "^1.0.3",
"pino": "^9.4.0"
},
"devDependencies": {
"@eslint/js": "^9.12.0",
"eslint": "^9.12.0",
"pino-pretty": "^11.2.2",
"prettier": "^3.3.3",
"rimraf": "^6.0.1",
"typescript": "5.5.4",
"typescript-eslint": "^8.9.0",
"vitest": "^3.0.8"
},
"volta": {
"node": "22.14.0",
"pnpm": "10.6.2"
},
"files": [
"dist"
],
"pnpm": {
"onlyBuiltDependencies": [
"canvas",
"esbuild"
]
}
"name": "scraper",
"version": "1.0.0",
"description": "",
"private": true,
"author": "battlesqui_d",
"license": "GPL-3.0",
"devDependencies": {
"@biomejs/biome": "1.9.4",
"rimraf": "^6.0.1",
"syncpack": "^13.0.2",
"typescript": "5.5.4"
},
"scripts": {
"build": "tsc --build",
"clean": "pnpm -r clean",
"reset": "pnpm -r reset && rimraf node_modules",
"test": "pnpm -r test",
"neat": "biome check .",
"neat:fix": "biome check . --write"
},
"volta": {
"pnpm": "10.7.0",
"node": "22.14.0"
},
"packageManager": "pnpm@10.7.0"
}
33 changes: 33 additions & 0 deletions packages/scraper-strategies/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"name": "@qnaplus/scraper-strategies",
"version": "3.0.0",
"description": "Request strategies for @qnaplus/scraper",
"main": "dist/index.js",
"repository": {
"url": "git+ssh://git@github.com/qnaplus/scraper.git"
},
"author": "Battlesquid <25509915+Battlesquid@users.noreply.github.com>",
"license": "GPL-3.0",
"scripts": {
"build": "tsc",
"build:clean": "pnpm clean && pnpm build",
"clean": "rimraf dist tsconfig.tsbuildinfo",
"reset": "pnpm clean && rimraf node_modules"
},
"dependencies": {
"@qnaplus/node-curl-impersonate": "^1.0.0",
"@qnaplus/scraper": "workspace:*",
"cycletls": "^1.0.27",
"form-data": "^4.0.1",
"header-generator": "^2.1.62",
"pino": "catalog:"
},
"devDependencies": {
"@qnaplus/typescript-config": "workspace:1.0.0",
"pino-pretty": "catalog:"
},
"volta": {
"extends": "../../package.json"
},
"files": ["dist"]
}
77 changes: 77 additions & 0 deletions packages/scraper-strategies/src/curl_impersonate_client.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import {
BrowserPresets,
type BrowserType,
type ChromePresetVersion,
type CurlResultOk,
type FirefoxPresetVersion,
RequestBuilder,
type RequestPreset,
type SafariPresetVersion,
getCompatibleBrowsers,
} from "@qnaplus/node-curl-impersonate";
import { FetchClient, type FetchClientResponse } from "@qnaplus/scraper";

/**
 * Scraping client that rotates through curl-impersonate browser presets
 * until a request is not blocked by the target server.
 */
export class CurlImpersonateScrapingClient extends FetchClient<FetchClientResponse> {
	/**
	 * Fetch `url`, trying every compatible browser preset in turn.
	 * Returns the first response whose status is not in the blocked list;
	 * if every preset is blocked, returns the last response received.
	 *
	 * @param url - the URL to request
	 * @returns the fetched response (`ok` is true only for HTTP 200)
	 */
	async fetch(url: string): Promise<FetchClientResponse> {
		const browsers = getCompatibleBrowsers();
		// Status codes that indicate the current preset was detected/blocked.
		const badStatusCodes = [403];
		// Sentinel returned if the preset lists are empty; otherwise always
		// overwritten by the last attempted request.
		let latestResponse: FetchClientResponse = {
			ok: false,
			body: "",
			status: -1,
			url: "",
		};

		// TODO: when a bad status code is not received, cache the preset that was used.
		// Continue using that preset until failure, then search again for another working preset.
		for (const browser of browsers) {
			for (const version of Object.keys(BrowserPresets[browser.name])) {
				const { response, details } = await this.doPresetRequest(url, {
					name: browser.name,
					version: version as
						| ChromePresetVersion
						| FirefoxPresetVersion
						| SafariPresetVersion,
				});
				latestResponse = {
					body: response,
					ok: details.response_code === 200,
					url: details.url_effective,
					status: details.response_code,
				};
				if (!badStatusCodes.includes(details.response_code)) {
					return latestResponse;
				}
				this.logger?.trace(
					`Request did not return an accepted response code (preset: ${browser.name} v${version})`,
				);
			}
		}
		// Serialize the response for the log: interpolating the object directly
		// would print "[object Object]" (the original also dropped the closing
		// parenthesis of the message).
		this.logger?.trace(
			`All presets failed (latest response: ${JSON.stringify(latestResponse)})`,
		);
		return latestResponse;
	}

	async buffer(): Promise<ArrayBufferLike | null> {
		// TODO: implement reading response buffer
		return null;
	}

	// No persistent resources are held; nothing to tear down.
	teardown(): Promise<void> | void {}

	/**
	 * Issue a single request impersonating the given browser preset,
	 * following redirects.
	 *
	 * @throws Error when curl reports anything on stderr.
	 */
	private async doPresetRequest<T extends BrowserType>(
		url: string,
		preset: RequestPreset<T>,
	): Promise<CurlResultOk> {
		const response = await new RequestBuilder()
			.url(url)
			.preset(preset)
			.follow()
			.send();
		if (response.stderr !== undefined) {
			throw new Error(response.stderr); // TODO figure out how to handle this
		}
		return response;
	}
}
71 changes: 71 additions & 0 deletions packages/scraper-strategies/src/cycletls_client.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import { FetchClient, type FetchClientResponse } from "@qnaplus/scraper";
import initCycleTLS, { type CycleTLSClient } from "cycletls";
import { HeaderGenerator } from "header-generator";
import type { Logger } from "pino";

// Alias: CycleTLS responses are adapted to the generic FetchClient response shape.
export type CycleTLSResponse = FetchClientResponse;

export class CycleTLSScrapingClient extends FetchClient<CycleTLSResponse> {
private static instance: CycleTLSScrapingClient | null = null;
private static logger?: Logger;
private static client: CycleTLSClient;
private static initialized = false;

private constructor(logger?: Logger) {
super(logger);
}

async fetch(url: string): Promise<CycleTLSResponse> {
const response = await this.getResponse(url);
return {
body: response.body as string,
ok: response.status === 200,
url: response.finalUrl,
status: response.status,
};
}

async buffer(): Promise<ArrayBufferLike | null> {
// TODO: implement reading response buffer
return null;
}

static async initialize(logger?: Logger): Promise<void> {
if (CycleTLSScrapingClient.initialized) {
return;
}
CycleTLSScrapingClient.logger = logger;
CycleTLSScrapingClient.client = await initCycleTLS();
CycleTLSScrapingClient.initialized = true;
}

static getInstance(): CycleTLSScrapingClient {
if (!CycleTLSScrapingClient.initialized) {
throw Error("Initialize was not called on CycleTLSScrapingClient.");
}
if (CycleTLSScrapingClient.instance === null) {
CycleTLSScrapingClient.instance = new CycleTLSScrapingClient(
CycleTLSScrapingClient.logger,
);
}
return CycleTLSScrapingClient.instance;
}

teardown(): Promise<void> | void {
CycleTLSScrapingClient.logger?.trace("Tearing down CycleTLS");
return CycleTLSScrapingClient.client.exit().then(() => {
CycleTLSScrapingClient.initialized = false;
});
}

private async getResponse(url: string) {
const client = CycleTLSScrapingClient.client;
const headerGenerator = new HeaderGenerator();
const options = {
headers: headerGenerator.getHeaders(),
ja3: "771,4865-4867-4866-49195-49199-52393-52392-49196-49200-49162-49161-49171-49172-51-57-47-53-10,0-23-65281-10-11-35-16-5-51-43-13-45-28-21,29-23-24-25-256-257,0",
};
const response = await client.get(url, options);
return response;
}
}
2 changes: 2 additions & 0 deletions packages/scraper-strategies/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// Barrel file: re-exports every scraping strategy client.
// NOTE: order determines module evaluation order; keep as-is.
export * from "./cycletls_client";
export * from "./curl_impersonate_client";
5 changes: 5 additions & 0 deletions packages/scraper-strategies/src/utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Workaround for V8 retaining the full parent string of a slice:
// https://bugs.chromium.org/p/v8/issues/detail?id=2869
// Prepending a character and slicing it back off forces a flat copy, so the
// returned string no longer pins the original buffer in memory.
export const unleak = (str: string | undefined | null): string => {
	const source = str == null ? "" : str;
	return " ".concat(source).slice(1);
};
14 changes: 14 additions & 0 deletions packages/scraper-strategies/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"extends": "@qnaplus/typescript-config/base.json",
"compilerOptions": {
"rootDir": "src",
"outDir": "dist"
},
"references": [
{
"path": "../scraper"
}
],
"include": ["src"],
"exclude": ["node_modules"]
}
Loading