link-assistant · konard · Nov 13, 2025 · Nov 13, 2025 · Nov 13, 2025 · Nov 14, 2025
diff --git a/jest.config.mjs b/jest.config.mjs
@@ -13,7 +13,8 @@ export default {
   testMatch: [
     '**/tests/unit/**/*.test.js',
     '**/tests/mock/**/*.test.js',
-    '**/tests/e2e/**/*.test.js'
+    '**/tests/e2e/**/*.test.js',
+    '**/tests/xpaste/**/*.test.js'
   ],
   setupFiles: ['./tests/jest.setup.mjs']
 };
diff --git a/package.json b/package.json
@@ -18,6 +18,7 @@
     "examples": "yarn build && yarn examples:javascript && yarn examples:python"
   },
   "dependencies": {
+    "archiver": "^7.0.1",
     "cheerio": "^1.0.0",
     "express": "^4.18.2",
     "node-fetch": "^2.7.0",
@@ -44,6 +45,6 @@
   ],
   "license": "UNLICENSED",
   "engines": {
-    "node": ">=22.0.0 <23.0.0"
+    "node": ">=20.0.0"
   }
 }
diff --git a/scripts/xpaste/capture-fullpage.js b/scripts/xpaste/capture-fullpage.js
@@ -0,0 +1,19 @@
+import fetch from 'node-fetch';
+import fs from 'fs';
+
+// Capture a full-page Playwright screenshot of the paste via the local service.
+const baseUrl = 'http://localhost:3000';
+const targetUrl = 'https://xpaste.pro/p/t4q0Lsp0';
+const outputPath = 'tests/xpaste/data/t4q0Lsp0-screenshot.png';
+const query = `url=${encodeURIComponent(targetUrl)}&fullPage=true&engine=playwright`;
+console.log('Capturing full-page screenshot of xpaste.pro/p/t4q0Lsp0...');
+const imageResponse = await fetch(`${baseUrl}/image?${query}`);
+if (!imageResponse.ok) {
+  console.error('Failed to capture screenshot:', await imageResponse.text());
+  process.exit(1);
+}
+
+const imageBuffer = await imageResponse.buffer();
+fs.writeFileSync(outputPath, imageBuffer);
+console.log(`Full-page screenshot saved to ${outputPath}`);
+console.log(`Screenshot size: ${imageBuffer.length} bytes`);
diff --git a/scripts/xpaste/regenerate-markdown.js b/scripts/xpaste/regenerate-markdown.js
@@ -0,0 +1,26 @@
+import { readFileSync, writeFileSync } from 'fs';
+import { convertHtmlToMarkdown } from '../../src/lib.js';
+
+// Rebuild the markdown fixture from the saved HTML and sanity-check its order.
+const html = readFileSync('./tests/xpaste/data/t4q0Lsp0-page.html', 'utf-8');
+const markdown = convertHtmlToMarkdown(html, 'https://xpaste.pro/p/t4q0Lsp0');
+writeFileSync('./tests/xpaste/data/t4q0Lsp0-page.md', markdown);
+const lines = markdown.split('\n');
+console.log('✅ Markdown regenerated successfully');
+console.log('\nFirst 30 lines:');
+console.log(lines.slice(0, 30).join('\n'));
+console.log('\n\n=== Checking key elements ===');
+const headingLine = lines.findIndex(l => l.includes('Упакуем пароль'));
+const formatLine = lines.findIndex(l => l.includes('Формат:'));
+const languageLine = lines.findIndex(l => l.includes('[Ru]') || l.includes('[En]'));
+const lineLabel = (index) => (index === -1 ? 'not found' : index + 1); // findIndex gives -1 when absent
+console.log(`Heading "Упакуем пароль..." at line: ${lineLabel(headingLine)}`);
+console.log(`Format metadata at line: ${lineLabel(formatLine)}`);
+console.log(`Language links at line: ${lineLabel(languageLine)}`);
+if (headingLine === -1 || formatLine === -1) {
+  console.log('❌ Heading or metadata not found (order cannot be checked)');
+} else if (headingLine < formatLine) {
+  console.log('✅ Heading comes before metadata (correct order)');
+} else {
+  console.log('❌ Heading comes after metadata (incorrect order)');
+}
diff --git a/scripts/xpaste/test-html-markdown.js b/scripts/xpaste/test-html-markdown.js
@@ -0,0 +1,69 @@
+#!/usr/bin/env node
+/**
+ * Test script to fetch xpaste.pro HTML and convert to markdown
+ * This helps us understand what the markdown extraction looks like
+ */
+
+import fetch from 'node-fetch';
+import { convertHtmlToMarkdown } from '../../src/lib.js';
+import fs from 'fs';
+import path from 'path';
+import { fileURLToPath } from 'url';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+/**
+ * Fetch the sample xpaste.pro page, store its HTML and the converted markdown
+ * under tests/xpaste/data, and print a summary of the conversion result.
+ * @returns {Promise<void>}
+ * @throws {Error} When the page request does not return a 2xx status.
+ */
+async function testXpasteHtmlToMarkdown() {
+  const url = 'https://xpaste.pro/p/t4q0Lsp0';
+
+  console.log('Fetching HTML from:', url);
+  const response = await fetch(url);
+  // Do not overwrite the saved fixtures with an HTTP error page.
+  if (!response.ok) {
+    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+  }
+  const html = await response.text();
+
+  // Save the HTML for reference
+  const htmlPath = path.join(__dirname, '../../tests/xpaste/data/t4q0Lsp0-page.html');
+  fs.writeFileSync(htmlPath, html);
+  console.log('Saved HTML to:', htmlPath);
+
+  console.log('\nConverting to markdown...');
+  const markdown = convertHtmlToMarkdown(html, url);
+  const mdPath = path.join(__dirname, '../../tests/xpaste/data/t4q0Lsp0-page.md');
+  fs.writeFileSync(mdPath, markdown);
+  console.log('Saved markdown to:', mdPath);
+
+  console.log('\nMarkdown statistics:');
+  console.log('- Total lines:', markdown.split('\n').length);
+  console.log('- Total characters:', markdown.length);
+
+  // Check for key elements from the screenshot
+  const checks = [
+    { name: 'Page title/header', pattern: /xpaste|упакует/i },
+    { name: 'Format info', pattern: /формат|format.*text/i },
+    { name: 'Creation date', pattern: /07\.07\.2021|время создания/i },
+    { name: 'SQL query #1', pattern: /User@Host.*1703313381.*1138102510/i },
+    { name: 'SQL query content', pattern: /SELECT.*phpbb_posts/i },
+    { name: 'Footer text', pattern: /soulbridge|справка|политика/i },
+  ];
+  console.log('\nContent checks:');
+  for (const { name, pattern } of checks) {
+    console.log(`- ${name}: ${pattern.test(markdown) ? '✓' : '✗'}`);
+  }
+
+  console.log('\nFirst 500 characters of markdown:');
+  console.log(markdown.substring(0, 500));
+  console.log('...\n');
+  console.log('Last 300 characters of markdown:');
+  console.log('...');
+  console.log(markdown.substring(markdown.length - 300));
+}
+
+testXpasteHtmlToMarkdown().catch(console.error);
diff --git a/src/browser.js b/src/browser.js
@@ -129,13 +129,17 @@ function createPuppeteerPageAdapter(page) {
  * @param {Object} page - Playwright page object
  * @returns {PageAdapter}
  */
-function createPlaywrightPageAdapter(page) {
+function createPlaywrightPageAdapter(page, context) {
   return {
     async setExtraHTTPHeaders(headers) {
       await page.setExtraHTTPHeaders(headers);
     },
     async setUserAgent(userAgent) {
-      await page.setUserAgent(userAgent);
+      // NOTE: intentionally a no-op. Playwright fixes the user agent when the
+      // browser context is created (the `userAgent` context option) and offers
+      // no page-level setter, so the requested value is currently ignored.
+      // TODO: pass the user agent when creating the context, or recreate the
+      // context/page, so that the caller's user agent is actually applied.
     },
     async setViewport(viewport) {
       // Playwright uses setViewportSize instead of setViewport

diff --git a/src/image.js b/src/image.js
@@ -7,6 +7,7 @@ export async function imageHandler(req, res) {
     // Ensure URL is absolute
     const absoluteUrl = url.startsWith('http') ? url : `https://${url}`;
     const engine = getBrowserEngine(req);
+    const fullPage = req.query.fullPage === 'true';
     const browser = await createBrowser(engine);
     try {
       const page = await browser.newPage();
@@ -22,8 +23,8 @@ export async function imageHandler(req, res) {
       });
       // Wait for 5 seconds after page load
       await new Promise(resolve => setTimeout(resolve, 5000));
-      // Take a screenshot of just the viewport (not the full page)
-      const buffer = await page.screenshot({ type: 'png' });
+      // Take a screenshot (viewport or full page based on query parameter)
+      const buffer = await page.screenshot({ type: 'png', fullPage });
       res.set('Content-Type', 'image/png');
       res.set('Content-Disposition', 'inline; filename="screenshot.png"');
       res.end(buffer);

diff --git a/src/index.js b/src/index.js
@@ -5,6 +5,7 @@ import { markdownHandler } from './markdown.js';
 import { imageHandler } from './image.js';
 import { streamHandler } from './stream.js';
 import { fetchHandler } from './fetch.js';
+import { txtHandler } from './txt.js';
 
 const app = express();
 const port = process.env.PORT || 3000;
@@ -14,6 +15,7 @@ app.get('/markdown', markdownHandler);
 app.get('/image', imageHandler);
 app.get('/stream', streamHandler);
 app.get('/fetch', fetchHandler);
+app.get('/txt', txtHandler);
 
 // Start the server if this is the main module
 const isMainModule = process.argv[1] && fileURLToPath(import.meta.url) === process.argv[1];

diff --git a/src/lib.js b/src/lib.js
@@ -20,6 +20,23 @@ export function convertHtmlToMarkdown(html, baseUrl) {
   // Load HTML into Cheerio
   const $ = cheerio.load(html);
 
+  // Reorder header/main/footer elements to match visual layout
+  // Some sites (like xpaste.pro) use CSS to position header at top,
+  // but in HTML it comes after main. We need to fix the order for markdown.
+  const $header = $('header').first();
+  const $main = $('main').first();
+  const $footer = $('footer').first();
+
+  if ($header.length && $main.length && $header.index() > $main.index()) {
+    // Header comes after main in DOM, but should come before in markdown
+    $main.before($header);
+  }
+
+  if ($footer.length && $main.length && $footer.index() < $main.index()) {
+    // Footer comes before main in DOM, but should come after in markdown
+    $main.after($footer);
+  }
+
   // Remove <style>, <script>, and <noscript> tags
   $('style, script, noscript').remove();
 
@@ -289,4 +306,34 @@ export function ensureUtf8(html) {
     );
   }
   return html;
+}
+
+/**
+ * Map a paste URL to the URL that serves its raw text.
+ * xpaste.pro `/p/<id>` links get `/raw` appended to the path (query string and
+ * fragment are preserved); any other or unparsable URL is returned unchanged.
+ * @param {string} url - URL requested by the client.
+ * @returns {string} URL to fetch the text content from.
+ */
+export function normalizeUrlForTextContent(url) {
+  try {
+    const urlObj = new URL(url);
+    const isPaste = urlObj.hostname === 'xpaste.pro' && urlObj.pathname.startsWith('/p/');
+    if (!isPaste || urlObj.pathname.endsWith('/raw')) return url;
+    // Edit the path, not the whole string, so `?query`/`#hash` stay at the end.
+    urlObj.pathname = `${urlObj.pathname.replace(/\/+$/, '')}/raw`;
+    return urlObj.toString();
+  } catch {
+    return url;
+  }
+}
+
+/** Tell whether `url` is an xpaste.pro paste page (`/p/...`); false if unparsable. */
+export function isTextPasteUrl(url) {
+  try {
+    const { hostname, pathname } = new URL(url);
+    return hostname === 'xpaste.pro' && pathname.startsWith('/p/');
+  } catch {
+    return false;
+  }
 }
diff --git a/src/markdown.js b/src/markdown.js
@@ -1,9 +1,18 @@
-import { fetchHtml, convertHtmlToMarkdown } from './lib.js';
+import fetch from 'node-fetch';
+import { fetchHtml, convertHtmlToMarkdown, isTextPasteUrl, normalizeUrlForTextContent } from './lib.js';
+import archiver from 'archiver';
 
 export async function markdownHandler(req, res) {
   const url = req.query.url;
   if (!url) return res.status(400).send('Missing `url` parameter');
+
   try {
+    // Check if this is a text paste URL (like xpaste.pro)
+    if (isTextPasteUrl(url)) {
+      return await handleTextPasteMarkdown(req, res, url);
+    }
+
+    // Regular HTML to markdown conversion
     const html = await fetchHtml(url);
     // Pass baseUrl to convertHtmlToMarkdown so all URLs are absolute
     const markdown = convertHtmlToMarkdown(html, url);
@@ -12,4 +21,52 @@ export async function markdownHandler(req, res) {
     console.error(err);
     res.status(500).send('Error converting to Markdown');
   }
+}
+
+/**
+ * Respond with the markdown rendering of a text paste page.
+ * Output shorter than 1500 lines is sent inline as text/markdown; longer output
+ * is streamed as a zip holding index.md plus the full markdown file.
+ * @param {Object} req - Express request (currently unused).
+ * @param {Object} res - Express response the result is written to.
+ * @param {string} url - Absolute URL of the paste page.
+ * @returns {Promise<void>}
+ */
+async function handleTextPasteMarkdown(req, res, url) {
+  // Convert the HTML page rather than the raw text so UI elements and
+  // metadata appear in the markdown as they do in screenshots.
+  const html = await fetchHtml(url);
+  const markdown = convertHtmlToMarkdown(html, url);
+  const lineCount = markdown.split('\n').length;
+  if (lineCount < 1500) {
+    res.type('text/markdown').send(markdown);
+    return;
+  }
+
+  // Skip empty segments so a trailing slash cannot produce an empty paste id.
+  const pasteId = new URL(url).pathname.split('/').filter(Boolean).pop();
+  const filename = `xpaste-pro-${pasteId}.md`;
+  const archive = archiver('zip', { zlib: { level: 9 } });
+  // An 'error' event without a listener is thrown as an uncaught exception;
+  // log it and abort the half-sent response instead.
+  archive.on('error', (err) => {
+    console.error(err);
+    res.destroy(err);
+  });
+
+  res.setHeader('Content-Type', 'application/zip');
+  res.setHeader('Content-Disposition', `attachment; filename="${pasteId}.zip"`);
+  archive.pipe(res);
+
+  const indexMarkdown = [
+    `# ${url}`,
+    '',
+    `Content from: ${url}`,
+    '',
+    `The full content is available in [${filename}](${filename}) (${lineCount} lines).`,
+    '',
+  ].join('\n');
+  archive.append(indexMarkdown, { name: 'index.md' });
+  archive.append(markdown, { name: filename });
+  await archive.finalize();
 }
diff --git a/src/txt.js b/src/txt.js
@@ -0,0 +1,46 @@
+import fetch from 'node-fetch';
+import { normalizeUrlForTextContent } from './lib.js';
+
+/**
+ * Express handler (mounted at GET /txt): fetch the text behind `url` and return
+ * it as a download. Replies 400 without `url`, 500 on any failure or non-text content.
+ */
+export async function txtHandler(req, res) {
+  const { url } = req.query;
+  if (!url) return res.status(400).send('Missing `url` parameter');
+
+  try {
+    // Paste pages are fetched through their raw-text variant.
+    const response = await fetch(normalizeUrlForTextContent(url));
+    if (!response.ok) {
+      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+    }
+
+    // A missing Content-Type header is treated as plain text.
+    const contentType = response.headers.get('content-type') || 'text/plain';
+    if (!contentType.includes('text/')) {
+      throw new Error(`Expected text content, got ${contentType}`);
+    }
+
+    // Always serve the result as a UTF-8 text attachment.
+    const body = await response.text();
+    const downloadName = getFilenameFromUrl(url);
+    res.setHeader('Content-Type', 'text/plain; charset=utf-8');
+    res.setHeader('Content-Disposition', `attachment; filename="${downloadName}"`);
+    res.send(body);
+  } catch (err) {
+    console.error('Text fetch error:', err);
+    res.status(500).send('Error fetching text content');
+  }
+}
+
+// Build "<host>-<path>.txt" with "-" between every part; "download.txt" if unparsable.
+function getFilenameFromUrl(url) {
+  try {
+    const { hostname, pathname } = new URL(url);
+    const parts = [...hostname.split('.'), ...pathname.split('/')].filter(Boolean);
+    return `${parts.join('-')}.txt`;
+  } catch {
+    return 'download.txt';
+  }
+}