diff --git a/jest.config.mjs b/jest.config.mjs index 557dffd..3ab2287 100644 --- a/jest.config.mjs +++ b/jest.config.mjs @@ -13,7 +13,8 @@ export default { testMatch: [ '**/tests/unit/**/*.test.js', '**/tests/mock/**/*.test.js', - '**/tests/e2e/**/*.test.js' + '**/tests/e2e/**/*.test.js', + '**/tests/xpaste/**/*.test.js' ], setupFiles: ['./tests/jest.setup.mjs'] }; \ No newline at end of file diff --git a/package.json b/package.json index 5a584d5..cf6e80a 100644 --- a/package.json +++ b/package.json @@ -18,6 +18,7 @@ "examples": "yarn build && yarn examples:javascript && yarn examples:python" }, "dependencies": { + "archiver": "^7.0.1", "cheerio": "^1.0.0", "express": "^4.18.2", "node-fetch": "^2.7.0", @@ -44,6 +45,6 @@ ], "license": "UNLICENSED", "engines": { - "node": ">=22.0.0 <23.0.0" + "node": ">=20.0.0" } } diff --git a/scripts/xpaste/capture-fullpage.js b/scripts/xpaste/capture-fullpage.js new file mode 100644 index 0000000..219fc46 --- /dev/null +++ b/scripts/xpaste/capture-fullpage.js @@ -0,0 +1,19 @@ +import fetch from 'node-fetch'; +import fs from 'fs'; + +const baseUrl = 'http://localhost:3000'; +const targetUrl = 'https://xpaste.pro/p/t4q0Lsp0'; + +console.log('Capturing full-page screenshot of xpaste.pro/p/t4q0Lsp0...'); +const imageResponse = await fetch(`${baseUrl}/image?url=${encodeURIComponent(targetUrl)}&fullPage=true&engine=playwright`); + +if (!imageResponse.ok) { + console.error('Failed to capture screenshot:', await imageResponse.text()); + process.exit(1); +} + +const imageBuffer = await imageResponse.buffer(); +const outputPath = 'tests/xpaste/data/t4q0Lsp0-screenshot.png'; +fs.writeFileSync(outputPath, imageBuffer); +console.log(`Full-page screenshot saved to ${outputPath}`); +console.log(`Screenshot size: ${imageBuffer.length} bytes`); diff --git a/scripts/xpaste/regenerate-markdown.js b/scripts/xpaste/regenerate-markdown.js new file mode 100644 index 0000000..610fb2f --- /dev/null +++ b/scripts/xpaste/regenerate-markdown.js @@ -0,0 +1,26 @@ +import { readFileSync, writeFileSync } from 'fs'; +import { convertHtmlToMarkdown } from '../../src/lib.js'; + +const html = readFileSync('./tests/xpaste/data/t4q0Lsp0-page.html', 'utf-8'); +const markdown = convertHtmlToMarkdown(html, 'https://xpaste.pro/p/t4q0Lsp0'); +writeFileSync('./tests/xpaste/data/t4q0Lsp0-page.md', markdown); + +console.log('✅ Markdown regenerated successfully'); +console.log('\nFirst 30 lines:'); +console.log(markdown.split('\n').slice(0, 30).join('\n')); + +console.log('\n\n=== Checking key elements ==='); +const lines = markdown.split('\n'); +const headingLine = lines.findIndex(l => l.includes('Упакуем пароль')); +const formatLine = lines.findIndex(l => l.includes('Формат:')); +const languageLine = lines.findIndex(l => l.includes('[Ru]') || l.includes('[En]')); + +console.log(`Heading "Упакуем пароль..." at line: ${headingLine + 1}`); +console.log(`Format metadata at line: ${formatLine + 1}`); +console.log(`Language links at line: ${languageLine + 1}`); + +if (headingLine < formatLine) { + console.log('✅ Heading comes before metadata (correct order)'); +} else { + console.log('❌ Heading comes after metadata (incorrect order)'); +} diff --git a/scripts/xpaste/test-html-markdown.js b/scripts/xpaste/test-html-markdown.js new file mode 100644 index 0000000..7ee5ddb --- /dev/null +++ b/scripts/xpaste/test-html-markdown.js @@ -0,0 +1,69 @@ +#!/usr/bin/env node +/** + * Test script to fetch xpaste.pro HTML and convert to markdown + * This helps us understand what the markdown extraction looks like + */ + +import fetch from 'node-fetch'; +import { convertHtmlToMarkdown } from '../../src/lib.js'; +import fs from 'fs'; +import path from 'path'; +import { fileURLToPath } from 'url'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +async function testXpasteHtmlToMarkdown() { + const url = 'https://xpaste.pro/p/t4q0Lsp0'; + + console.log('Fetching HTML from:', url); + const response = await fetch(url); + const html = await response.text(); + + // Save the HTML for reference + const htmlPath = path.join(__dirname, '../../tests/xpaste/data/t4q0Lsp0-page.html'); + fs.writeFileSync(htmlPath, html); + console.log('Saved HTML to:', htmlPath); + + // Convert to markdown + console.log('\nConverting to markdown...'); + const markdown = convertHtmlToMarkdown(html, url); + + // Save the markdown + const mdPath = path.join(__dirname, '../../tests/xpaste/data/t4q0Lsp0-page.md'); + fs.writeFileSync(mdPath, markdown); + console.log('Saved markdown to:', mdPath); + + // Analyze the markdown + const lines = markdown.split('\n'); + console.log('\nMarkdown statistics:'); + console.log('- Total lines:', lines.length); + console.log('- Total characters:', markdown.length); + + // Check for key elements from the screenshot + const checks = [ + { name: 'Page title/header', pattern: /xpaste|упакует/i }, + { name: 'Format info', pattern: /формат|format.*text/i }, + { name: 'Creation date', pattern: /07\.07\.2021|время создания/i }, + { name: 'SQL query #1', pattern: /User@Host.*1703313381.*1138102510/i }, + { name: 'SQL query content', pattern: /SELECT.*phpbb_posts/i }, + { name: 'Footer text', pattern: /soulbridge|справка|политика/i }, + ]; + + console.log('\nContent checks:'); + checks.forEach(check => { + const found = check.pattern.test(markdown); + console.log(`- ${check.name}: ${found ? '✓' : '✗'}`); + }); + + // Show first 500 chars of markdown + console.log('\nFirst 500 characters of markdown:'); + console.log(markdown.substring(0, 500)); + console.log('...\n'); + + // Show last 300 chars of markdown + console.log('Last 300 characters of markdown:'); + console.log('...'); + console.log(markdown.substring(markdown.length - 300)); +} + +testXpasteHtmlToMarkdown().catch(console.error); diff --git a/src/browser.js b/src/browser.js index 3fff7d1..c4405dd 100644 --- a/src/browser.js +++ b/src/browser.js @@ -129,13 +129,17 @@ function createPuppeteerPageAdapter(page) { * @param {Object} page - Playwright page object * @returns {PageAdapter} */ -function createPlaywrightPageAdapter(page) { +function createPlaywrightPageAdapter(page, context) { return { async setExtraHTTPHeaders(headers) { await page.setExtraHTTPHeaders(headers); }, async setUserAgent(userAgent) { - await page.setUserAgent(userAgent); + // Playwright sets user agent at context level, but we can also use route + // For simplicity, we'll store it and use it in goto if needed + // Note: In Playwright, user agent should ideally be set when creating the context + // Since we can't change it after page creation, we'll just ignore this for now + // or we could recreate the page with the new user agent }, async setViewport(viewport) { // Playwright uses setViewportSize instead of setViewport diff --git a/src/image.js b/src/image.js index adcfced..9a2808b 100644 --- a/src/image.js +++ b/src/image.js @@ -7,6 +7,7 @@ export async function imageHandler(req, res) { // Ensure URL is absolute const absoluteUrl = url.startsWith('http') ? url : `https://${url}`; const engine = getBrowserEngine(req); + const fullPage = req.query.fullPage === 'true'; const browser = await createBrowser(engine); try { const page = await browser.newPage(); @@ -22,8 +23,8 @@ export async function imageHandler(req, res) { }); // Wait for 5 seconds after page load await new Promise(resolve => setTimeout(resolve, 5000)); - // Take a screenshot of just the viewport (not the full page) - const buffer = await page.screenshot({ type: 'png' }); + // Take a screenshot (viewport or full page based on query parameter) + const buffer = await page.screenshot({ type: 'png', fullPage }); res.set('Content-Type', 'image/png'); res.set('Content-Disposition', 'inline; filename="screenshot.png"'); res.end(buffer); diff --git a/src/index.js b/src/index.js index 8dba3a8..c332085 100644 --- a/src/index.js +++ b/src/index.js @@ -5,6 +5,7 @@ import { markdownHandler } from './markdown.js'; import { imageHandler } from './image.js'; import { streamHandler } from './stream.js'; import { fetchHandler } from './fetch.js'; +import { txtHandler } from './txt.js'; const app = express(); const port = process.env.PORT || 3000; @@ -14,6 +15,7 @@ app.get('/markdown', markdownHandler); app.get('/image', imageHandler); app.get('/stream', streamHandler); app.get('/fetch', fetchHandler); +app.get('/txt', txtHandler); // Start the server if this is the main module const isMainModule = process.argv[1] && fileURLToPath(import.meta.url) === process.argv[1]; diff --git a/src/lib.js b/src/lib.js index d0ba86a..8f6a69b 100644 --- a/src/lib.js +++ b/src/lib.js @@ -20,6 +20,23 @@ export function convertHtmlToMarkdown(html, baseUrl) { // Load HTML into Cheerio const $ = cheerio.load(html); + // Reorder header/main/footer elements to match visual layout + // Some sites (like xpaste.pro) use CSS to position header at top, + // but in HTML it comes after main. We need to fix the order for markdown. + const $header = $('header').first(); + const $main = $('main').first(); + const $footer = $('footer').first(); + + if ($header.length && $main.length && $header.index() > $main.index()) { + // Header comes after main in DOM, but should come before in markdown + $main.before($header); + } + + if ($footer.length && $main.length && $footer.index() < $main.index()) { + // Footer comes before main in DOM, but should come after in markdown + $main.after($footer); + } + // Remove