Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion jest.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ export default {
testMatch: [
'**/tests/unit/**/*.test.js',
'**/tests/mock/**/*.test.js',
'**/tests/e2e/**/*.test.js'
'**/tests/e2e/**/*.test.js',
'**/tests/xpaste/**/*.test.js'
],
setupFiles: ['./tests/jest.setup.mjs']
};
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"examples": "yarn build && yarn examples:javascript && yarn examples:python"
},
"dependencies": {
"archiver": "^7.0.1",
"cheerio": "^1.0.0",
"express": "^4.18.2",
"node-fetch": "^2.7.0",
Expand All @@ -44,6 +45,6 @@
],
"license": "UNLICENSED",
"engines": {
"node": ">=22.0.0 <23.0.0"
"node": ">=20.0.0"
}
}
19 changes: 19 additions & 0 deletions scripts/xpaste/capture-fullpage.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import fetch from 'node-fetch';
import fs from 'fs';

// Capture a full-page PNG of the sample xpaste.pro paste through the /image
// endpoint of the service at localhost:3000 and store it under tests/xpaste/data.
const SERVICE_ORIGIN = 'http://localhost:3000';
const PASTE_URL = 'https://xpaste.pro/p/t4q0Lsp0';
const SCREENSHOT_PATH = 'tests/xpaste/data/t4q0Lsp0-screenshot.png';

console.log('Capturing full-page screenshot of xpaste.pro/p/t4q0Lsp0...');

// Request a full-page render from the Playwright engine.
const imageQuery = `url=${encodeURIComponent(PASTE_URL)}&fullPage=true&engine=playwright`;
const screenshotResponse = await fetch(`${SERVICE_ORIGIN}/image?${imageQuery}`);

if (!screenshotResponse.ok) {
  // Print the service's error body and stop with a non-zero exit code.
  const failureBody = await screenshotResponse.text();
  console.error('Failed to capture screenshot:', failureBody);
  process.exit(1);
}

const pngBytes = await screenshotResponse.buffer();
fs.writeFileSync(SCREENSHOT_PATH, pngBytes);
console.log(`Full-page screenshot saved to ${SCREENSHOT_PATH}`);
console.log(`Screenshot size: ${pngBytes.length} bytes`);
26 changes: 26 additions & 0 deletions scripts/xpaste/regenerate-markdown.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import { readFileSync, writeFileSync } from 'fs';
import { convertHtmlToMarkdown } from '../../src/lib.js';

// Regenerate the stored markdown fixture from the saved xpaste.pro HTML page,
// then print a preview and verify that the paste heading precedes the metadata.
const html = readFileSync('./tests/xpaste/data/t4q0Lsp0-page.html', 'utf-8');
const markdown = convertHtmlToMarkdown(html, 'https://xpaste.pro/p/t4q0Lsp0');
writeFileSync('./tests/xpaste/data/t4q0Lsp0-page.md', markdown);

// Split once and reuse for both the preview and the element checks.
const lines = markdown.split('\n');

console.log('✅ Markdown regenerated successfully');
console.log('\nFirst 30 lines:');
console.log(lines.slice(0, 30).join('\n'));

console.log('\n\n=== Checking key elements ===');
const headingLine = lines.findIndex((l) => l.includes('Упакуем пароль'));
const formatLine = lines.findIndex((l) => l.includes('Формат:'));
const languageLine = lines.findIndex((l) => l.includes('[Ru]') || l.includes('[En]'));

// findIndex yields -1 when nothing matches; report that explicitly instead of
// printing a misleading 1-based "line 0".
const describeLine = (index) => (index === -1 ? 'not found' : String(index + 1));

console.log(`Heading "Упакуем пароль..." at line: ${describeLine(headingLine)}`);
console.log(`Format metadata at line: ${describeLine(formatLine)}`);
console.log(`Language links at line: ${describeLine(languageLine)}`);

if (headingLine === -1 || formatLine === -1) {
  // Without both markers the order cannot be judged; a missing heading (-1)
  // would otherwise compare as "before" the metadata.
  console.log('❌ Heading or metadata not found (cannot verify order)');
} else if (headingLine < formatLine) {
  console.log('✅ Heading comes before metadata (correct order)');
} else {
  console.log('❌ Heading comes after metadata (incorrect order)');
}
69 changes: 69 additions & 0 deletions scripts/xpaste/test-html-markdown.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env node
/**
* Test script to fetch xpaste.pro HTML and convert to markdown
* This helps us understand what the markdown extraction looks like
*/

import fetch from 'node-fetch';
import { convertHtmlToMarkdown } from '../../src/lib.js';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

const __dirname = path.dirname(fileURLToPath(import.meta.url));

/**
 * Fetch the sample xpaste.pro paste page, save its HTML and the markdown
 * produced by convertHtmlToMarkdown under tests/xpaste/data, then print
 * statistics, content checks and a preview of the markdown.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the page request does not return a successful status.
 */
async function testXpasteHtmlToMarkdown() {
  const url = 'https://xpaste.pro/p/t4q0Lsp0';

  console.log('Fetching HTML from:', url);
  const response = await fetch(url);
  // Fail before writing anything so an HTTP error page never overwrites the
  // previously saved HTML/markdown files.
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: HTTP ${response.status} ${response.statusText}`);
  }
  const html = await response.text();

  // Save the HTML for reference
  const htmlPath = path.join(__dirname, '../../tests/xpaste/data/t4q0Lsp0-page.html');
  fs.writeFileSync(htmlPath, html);
  console.log('Saved HTML to:', htmlPath);

  // Convert to markdown
  console.log('\nConverting to markdown...');
  const markdown = convertHtmlToMarkdown(html, url);

  // Save the markdown
  const mdPath = path.join(__dirname, '../../tests/xpaste/data/t4q0Lsp0-page.md');
  fs.writeFileSync(mdPath, markdown);
  console.log('Saved markdown to:', mdPath);

  // Analyze the markdown
  const lines = markdown.split('\n');
  console.log('\nMarkdown statistics:');
  console.log('- Total lines:', lines.length);
  console.log('- Total characters:', markdown.length);

  // Check for key elements from the screenshot
  const checks = [
    { name: 'Page title/header', pattern: /xpaste|упакует/i },
    { name: 'Format info', pattern: /формат|format.*text/i },
    { name: 'Creation date', pattern: /07\.07\.2021|время создания/i },
    { name: 'SQL query #1', pattern: /User@Host.*1703313381.*1138102510/i },
    { name: 'SQL query content', pattern: /SELECT.*phpbb_posts/i },
    { name: 'Footer text', pattern: /soulbridge|справка|политика/i },
  ];

  console.log('\nContent checks:');
  for (const check of checks) {
    const found = check.pattern.test(markdown);
    console.log(`- ${check.name}: ${found ? '✓' : '✗'}`);
  }

  // Show first 500 chars of markdown
  console.log('\nFirst 500 characters of markdown:');
  console.log(markdown.substring(0, 500));
  console.log('...\n');

  // Show last 300 chars of markdown
  console.log('Last 300 characters of markdown:');
  console.log('...');
  console.log(markdown.substring(markdown.length - 300));
}

testXpasteHtmlToMarkdown().catch(console.error);
8 changes: 6 additions & 2 deletions src/browser.js
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,17 @@ function createPuppeteerPageAdapter(page) {
* @param {Object} page - Playwright page object
* @returns {PageAdapter}
*/
function createPlaywrightPageAdapter(page) {
function createPlaywrightPageAdapter(page, context) {
return {
async setExtraHTTPHeaders(headers) {
await page.setExtraHTTPHeaders(headers);
},
async setUserAgent(userAgent) {
await page.setUserAgent(userAgent);
// Playwright sets user agent at context level, but we can also use route
// For simplicity, we'll store it and use it in goto if needed
// Note: In Playwright, user agent should ideally be set when creating the context
// Since we can't change it after page creation, we'll just ignore this for now
// or we could recreate the page with the new user agent
},
async setViewport(viewport) {
// Playwright uses setViewportSize instead of setViewport
Expand Down
5 changes: 3 additions & 2 deletions src/image.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export async function imageHandler(req, res) {
// Ensure URL is absolute
const absoluteUrl = url.startsWith('http') ? url : `https://${url}`;
const engine = getBrowserEngine(req);
const fullPage = req.query.fullPage === 'true';
const browser = await createBrowser(engine);
try {
const page = await browser.newPage();
Expand All @@ -22,8 +23,8 @@ export async function imageHandler(req, res) {
});
// Wait for 5 seconds after page load
await new Promise(resolve => setTimeout(resolve, 5000));
// Take a screenshot of just the viewport (not the full page)
const buffer = await page.screenshot({ type: 'png' });
// Take a screenshot (viewport or full page based on query parameter)
const buffer = await page.screenshot({ type: 'png', fullPage });
res.set('Content-Type', 'image/png');
res.set('Content-Disposition', 'inline; filename="screenshot.png"');
res.end(buffer);
Expand Down
2 changes: 2 additions & 0 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { markdownHandler } from './markdown.js';
import { imageHandler } from './image.js';
import { streamHandler } from './stream.js';
import { fetchHandler } from './fetch.js';
import { txtHandler } from './txt.js';

const app = express();
const port = process.env.PORT || 3000;
Expand All @@ -14,6 +15,7 @@ app.get('/markdown', markdownHandler);
app.get('/image', imageHandler);
app.get('/stream', streamHandler);
app.get('/fetch', fetchHandler);
app.get('/txt', txtHandler);

// Start the server if this is the main module
const isMainModule = process.argv[1] && fileURLToPath(import.meta.url) === process.argv[1];
Expand Down
47 changes: 47 additions & 0 deletions src/lib.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,23 @@ export function convertHtmlToMarkdown(html, baseUrl) {
// Load HTML into Cheerio
const $ = cheerio.load(html);

// Reorder header/main/footer elements to match visual layout
// Some sites (like xpaste.pro) use CSS to position header at top,
// but in HTML it comes after main. We need to fix the order for markdown.
const $header = $('header').first();
const $main = $('main').first();
const $footer = $('footer').first();

if ($header.length && $main.length && $header.index() > $main.index()) {
// Header comes after main in DOM, but should come before in markdown
$main.before($header);
}

if ($footer.length && $main.length && $footer.index() < $main.index()) {
// Footer comes before main in DOM, but should come after in markdown
$main.after($footer);
}

// Remove <style>, <script>, and <noscript> tags
$('style, script, noscript').remove();

Expand Down Expand Up @@ -289,4 +306,34 @@ export function ensureUtf8(html) {
);
}
return html;
}

/**
 * Map a URL to the location that serves its plain-text content.
 *
 * xpaste.pro paste pages (`/p/<id>`) are rewritten to their `/raw` endpoint.
 * Every other URL, a paste URL that already ends with `/raw`, and any string
 * that cannot be parsed as a URL are returned unchanged.
 *
 * @param {string} url - URL to normalize.
 * @returns {string} URL to fetch the text content from.
 */
export function normalizeUrlForTextContent(url) {
  let urlObj;
  try {
    urlObj = new URL(url);
  } catch {
    // If URL parsing fails, return original URL
    return url;
  }

  const isXpastePaste = urlObj.hostname === 'xpaste.pro' && urlObj.pathname.startsWith('/p/');
  if (!isXpastePaste || urlObj.pathname.endsWith('/raw')) {
    return url;
  }

  // Drop trailing slashes so "/p/<id>/" does not turn into "/p/<id>//raw".
  const { pathname } = urlObj;
  let end = pathname.length;
  while (end > 0 && pathname[end - 1] === '/') {
    end -= 1;
  }
  const pastePath = pathname.slice(0, end);

  // "/p/" with no paste id has nothing to point a /raw endpoint at.
  if (pastePath === '/p') {
    return url;
  }

  // Extend the pathname rather than the raw string so that a query string or
  // fragment stays after the path instead of ending up in front of "/raw".
  urlObj.pathname = `${pastePath}/raw`;
  return urlObj.toString();
}

/**
 * Tell whether a URL points at a paste on a supported text paste service.
 * Currently this means an xpaste.pro address whose path starts with `/p/`.
 *
 * @param {string} url - URL to inspect.
 * @returns {boolean} True for xpaste.pro paste URLs; false otherwise, including
 *   when `url` cannot be parsed.
 */
export function isTextPasteUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }

  const { hostname, pathname } = parsed;
  if (hostname !== 'xpaste.pro') {
    return false;
  }
  return pathname.startsWith('/p/');
}
59 changes: 58 additions & 1 deletion src/markdown.js
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
import { fetchHtml, convertHtmlToMarkdown } from './lib.js';
import fetch from 'node-fetch';
import { fetchHtml, convertHtmlToMarkdown, isTextPasteUrl, normalizeUrlForTextContent } from './lib.js';
import archiver from 'archiver';

export async function markdownHandler(req, res) {
const url = req.query.url;
if (!url) return res.status(400).send('Missing `url` parameter');

try {
// Check if this is a text paste URL (like xpaste.pro)
if (isTextPasteUrl(url)) {
return await handleTextPasteMarkdown(req, res, url);
}

// Regular HTML to markdown conversion
const html = await fetchHtml(url);
// Pass baseUrl to convertHtmlToMarkdown so all URLs are absolute
const markdown = convertHtmlToMarkdown(html, url);
Expand All @@ -12,4 +21,52 @@ export async function markdownHandler(req, res) {
console.error(err);
res.status(500).send('Error converting to Markdown');
}
}

/**
 * Respond with the markdown rendering of a text-paste page (e.g. xpaste.pro).
 *
 * The HTML page itself is converted (not the raw paste) so the output keeps the
 * UI elements and metadata visible on the page. Markdown shorter than 1500
 * lines is sent directly as `text/markdown`; longer output is streamed as a
 * zip archive containing `index.md` plus the full markdown file.
 *
 * @param {Object} req - Express request (not read here).
 * @param {Object} res - Express response the markdown or zip is written to.
 * @param {string} url - Paste URL; expected to have passed isTextPasteUrl, so
 *   it parses and its path starts with `/p/`.
 * @returns {Promise<void>}
 */
async function handleTextPasteMarkdown(req, res, url) {
  // Fetch the HTML version of the page to get the full visual content
  // (including UI elements, metadata, etc.) as it appears in screenshots
  const html = await fetchHtml(url);
  const markdown = convertHtmlToMarkdown(html, url);

  // Count lines to determine if we should create a zip archive
  const lineCount = markdown.split('\n').length;

  // If content is less than 1500 lines, return markdown directly
  if (lineCount < 1500) {
    res.type('text/markdown').send(markdown);
    return;
  }

  // Take the path segment right after "p" as the paste id. Ignoring empty
  // segments keeps a trailing slash ("/p/<id>/") or a "/raw" suffix from
  // producing an empty or wrong id in the file names below.
  const segments = new URL(url).pathname.split('/').filter(Boolean);
  const pasteId = segments[1] ?? 'paste';
  const filename = `xpaste-pro-${pasteId}.md`;

  // Create a zip archive with index.md and the full markdown file
  const archive = archiver('zip', {
    zlib: { level: 9 }
  });

  // Set response headers for zip download
  res.setHeader('Content-Type', 'application/zip');
  res.setHeader('Content-Disposition', `attachment; filename="${pasteId}.zip"`);

  // Pipe archive to response
  archive.pipe(res);

  // Add index.md with link to the full markdown file
  const indexMarkdown = [
    `# ${url}`,
    '',
    `Content from: ${url}`,
    '',
    `The full content is available in [${filename}](${filename}) (${lineCount} lines).`,
    ''
  ].join('\n');
  archive.append(indexMarkdown, { name: 'index.md' });

  // Add the full markdown file
  archive.append(markdown, { name: filename });

  // Finalize the archive
  await archive.finalize();
}
46 changes: 46 additions & 0 deletions src/txt.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import fetch from 'node-fetch';
import { normalizeUrlForTextContent } from './lib.js';

/**
 * Express handler that downloads the text content behind `?url=` and returns
 * it as a plain-text attachment.
 *
 * Responds 400 when `url` is missing and 500 when the upstream request fails,
 * returns a non-OK status, or reports a content type without `text/`.
 *
 * @param {Object} req - Express request; reads `req.query.url`.
 * @param {Object} res - Express response.
 * @returns {Promise<*>} The result of `res.send` for the 400 case, otherwise undefined.
 */
export async function txtHandler(req, res) {
  const { url } = req.query;
  if (!url) {
    return res.status(400).send('Missing `url` parameter');
  }

  try {
    // Paste-service URLs are rewritten to their raw endpoint before fetching.
    const upstream = await fetch(normalizeUrlForTextContent(url));
    if (!upstream.ok) {
      throw new Error(`HTTP ${upstream.status}: ${upstream.statusText}`);
    }

    // A missing content-type header is treated as plain text.
    const upstreamType = upstream.headers.get('content-type') || 'text/plain';
    const isTextual = upstreamType.includes('text/');
    if (!isTextual) {
      throw new Error(`Expected text content, got ${upstreamType}`);
    }

    const body = await upstream.text();

    // Serve the content as a downloadable UTF-8 text file.
    res.setHeader('Content-Type', 'text/plain; charset=utf-8');
    res.setHeader('Content-Disposition', `attachment; filename="${getFilenameFromUrl(url)}"`);
    res.send(body);
  } catch (err) {
    console.error('Text fetch error:', err);
    res.status(500).send('Error fetching text content');
  }
}

/**
 * Build a `.txt` download filename from a URL: the hostname (dots replaced by
 * dashes) followed by each non-empty path segment, all joined with dashes.
 * For example `https://xpaste.pro/p/abc` becomes `xpaste-pro-p-abc.txt`.
 *
 * @param {string} url - URL the content was requested for.
 * @returns {string} The derived filename, or `download.txt` when `url` cannot
 *   be parsed.
 */
function getFilenameFromUrl(url) {
  let urlObj;
  try {
    urlObj = new URL(url);
  } catch {
    return 'download.txt';
  }

  const hostname = urlObj.hostname.replace(/\./g, '-');
  // Joining host and segments with an explicit separator keeps the hostname
  // from being glued onto the first path segment ("xpaste-prop-..."); dropping
  // empty segments avoids a dangling dash for a trailing slash.
  const segments = urlObj.pathname.split('/').filter(Boolean);
  return `${[hostname, ...segments].join('-')}.txt`;
}
Loading