Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llmstxt-generator/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
HYPERBROWSER_API_KEY=your_hyperbrowser_api_key_here
OPENAI_API_KEY=your_openai_api_key_here
OPENAI_API_BASE_URL=
34 changes: 34 additions & 0 deletions llmstxt-generator/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# dependencies (bun install)
node_modules

# output
out
dist
*.tgz

# code coverage
coverage
*.lcov

# logs
logs
*.log
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# caches
.eslintcache
.cache
*.tsbuildinfo

# IntelliJ based IDEs
.idea

# Finder (MacOS) folder config
.DS_Store
80 changes: 80 additions & 0 deletions llmstxt-generator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# llms.txt generator

A CLI tool to **crawl websites** using [Hyperbrowser](https://hyperbrowser.ai) and generate an `llms.txt` file containing **short, LLM-ready descriptions of each page**. Summaries are created with OpenAI.

Useful for:

- SEO optimization
- Helping LLMs index your site with structured, concise descriptions
- Quick website overviews

---

## 🚀 Features

- 🌍 Crawl any website with [Hyperbrowser](https://hyperbrowser.ai)
- 🤖 Summarize each page into **3–5 word titles + short descriptions**
- 📄 Auto-generates `llms.txt` in your chosen directory
- 🔗 Saves discovered URLs into `crawl-urls.txt`

---

## 📦 Installation

Clone the repo and install dependencies:

```bash
cd llmstxt-generator
bun install
```

### CLI Usage

```bash
bun run index.ts
✔ Enter the start URL: https://hyperbrowser.ai
✔ Max number of pages to crawl: 5
✔ Output directory: output
```

## Example output

```bash
✔ 🌍 Crawl completed.
🔗 URL Found: https://hyperbrowser.ai
🔗 URL Found: https://hyperbrowser.ai/blog
🔗 URL Found: https://hyperbrowser.ai/privacy-policy
🔗 URL Found: https://hyperbrowser.ai/terms
🔗 URL Found: https://hyperbrowser.ai/blog/introducing-hyperbrowser-mcp-server

Summarizing |████████████████████████████████████████| 100% || 5/5 Pages
✅ All summaries generated.
📄 llms.txt written to output/llms.txt

```

## Example llms.txt file

```
# https://hyperbrowser.ai llms.txt

- [Hyperbrowser](https://hyperbrowser.ai): Browser-as-a-Service for AI agents and apps.
- [Web Scraping Tools](https://hyperbrowser.ai/blog): Compare SSR, CSR, and rendering approaches.
- [Hyperbrowser](https://hyperbrowser.ai/): Privacy policy explains data collection and usage.
- [Terms of Service](https://hyperbrowser.ai/terms): Governs use of Hyperbrowser and privacy policy.
- [Hyperbrowser MCP Server](https://hyperbrowser.ai/blog/introducing-hyperbrowser-mcp-server): Connect LLMs to the web with AI-powered tools for scraping, automation, and data extraction.

```

## API Costs 💰

- Hyperbrowser: ~$0.01-0.05 per page (depending on pages to crawl)
- OpenAI GPT-4o-mini: ~$0.03-0.10 per page summary (depending on content length)

## Contributing 🤝

Feel free to submit issues, feature requests, or pull requests to improve the tool!

## License 📝

MIT License - feel free to use this tool to generate llms.txt for your websites!
222 changes: 222 additions & 0 deletions llmstxt-generator/bun.lock

Large diffs are not rendered by default.

208 changes: 208 additions & 0 deletions llmstxt-generator/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
import inquirer from "inquirer";
import ora from "ora";
import fs from "fs";
import path from "path";
import { Hyperbrowser } from "@hyperbrowser/sdk";
import OpenAI from "openai";
import { SingleBar, Presets } from "cli-progress";

// ----------------------------TYPES---------------------------
// CLI options gathered from the interactive prompts in main().
interface Options {
  url: string;
  maxUrls: number;
  outputDir: string;
}

// One crawled page: its URL plus the page content as markdown
// (undefined when Hyperbrowser returned no markdown for that page).
interface CrawlResult {
  url: string;
  markdown: string | undefined;
}
// -------------------------------------------------------------

// Hyperbrowser client used for crawling. Reads the API key from the
// HYPERBROWSER_API_KEY environment variable (see .env.example).
const browser = new Hyperbrowser({
  apiKey: process.env.HYPERBROWSER_API_KEY,
});

// OpenAI client used for summarization. OPENAI_API_BASE_URL lets the
// tool target any OpenAI-compatible endpoint; when unset (or empty)
// it falls back to the official API.
const ai = new OpenAI({
  baseURL: process.env.OPENAI_API_BASE_URL || "https://api.openai.com/v1",
  apiKey: process.env.OPENAI_API_KEY!,
});

/**
 * Generate a single llms.txt entry for one crawled page using OpenAI.
 *
 * @param markdown - Page content as markdown; empty/missing content
 *   short-circuits to a placeholder without calling the API.
 * @param url - URL of the page, embedded in the generated entry.
 * @param maxChars - Maximum number of characters of page content sent to
 *   the model (truncation keeps token usage bounded). Defaults to 2000,
 *   matching the previous hard-coded limit.
 * @returns A "- [Title](URL): description" line, or a fallback string when
 *   the model returns no content.
 */
async function summarizePage(
  markdown: string,
  url: string,
  maxChars = 2000
): Promise<string> {
  if (!markdown) return "No description available.";

  const prompt = `
You are generating entries for an llms.txt file.

Rules:
- Output MUST be in the format:
- [Title](URL): Short description (max 12 words).
- Title: 3–5 words only, no "Short Title", no placeholders.
- Description: concise, human-readable, no extra punctuation.
- Do not use bold, quotes, or headings.

Examples:
- [Web Data Extraction](https://firecrawl.dev): Transform websites into clean, LLM-ready data.
- [Scaling Web Apps](https://example.com/blog): Compare SSR, CSR, and rendering approaches.

Now generate entry for this page:
URL: ${url}
Content:
${markdown.slice(0, maxChars)}
`;

  const res = await ai.chat.completions.create({
    model: "gpt-4o-mini",
    messages: [{ role: "user", content: prompt }],
    // Low temperature keeps titles/descriptions consistent across runs.
    temperature: 0.3,
  });

  return res.choices[0]?.message.content?.trim() || "Summary unavailable.";
}

/**
 * Summarize every crawled page and write the combined llms.txt file.
 *
 * Shows a progress bar while summarizing; a failed summary for one page is
 * logged and skipped so the remaining pages still get entries.
 *
 * @param crawlData - Pages returned by crawl() ({ url, markdown } records).
 *   (Previously typed Omit<CrawlResult, "html">[] — a no-op since
 *   CrawlResult has no "html" field; CrawlResult[] is the same type.)
 * @param siteName - Hostname used in the llms.txt header line.
 * @param outputDir - Directory (created if missing) that receives llms.txt.
 */
export async function generateLLMSTxt(
  crawlData: CrawlResult[],
  siteName: string,
  outputDir: string
) {
  if (!crawlData || crawlData.length === 0) {
    console.error("⚠️ No crawl data to process.");
    return;
  }

  let content = `# https://${siteName} llms.txt\n\n`;

  // 🔥 Progress bar
  const bar = new SingleBar(
    {
      format: "Summarizing |{bar}| {percentage}% || {value}/{total} Pages",
      barCompleteChar: "█",
      barIncompleteChar: "░",
      hideCursor: true,
    },
    Presets.shades_classic
  );

  bar.start(crawlData.length, 0);

  try {
    for (const page of crawlData) {
      if (!page?.url) {
        console.warn(`⚠️ Skipping page, missing URL.`);
        bar.increment();
        continue;
      }

      try {
        const summary = await summarizePage(page.markdown || "", page.url);

        // The model may emit several lines; keep non-empty ones and make
        // sure each is formatted as a "- " list entry.
        const lines = summary.split("\n").filter((l) => l.trim());
        for (const line of lines) {
          let clean = line.trim();

          if (!clean.startsWith("- ")) {
            clean = `- ${clean}`;
          }

          content += `${clean}\n`;
        }
      } catch (err) {
        // One bad page must not abort the whole run.
        console.error(`❌ Failed to summarize ${page.url}:`, err);
      }

      bar.increment();
    }
  } finally {
    // Always stop the bar (restores the hidden cursor) even if an
    // unexpected error escapes the loop above.
    bar.stop();
  }

  console.log("✅ All summaries generated.");

  fs.mkdirSync(outputDir, { recursive: true });

  const filePath = path.join(outputDir, "llms.txt");
  fs.writeFileSync(filePath, content, "utf-8");

  console.log(`📄 llms.txt written to ${filePath}`);
}

/**
 * Crawl `url` with Hyperbrowser, visiting up to `maxPages` pages.
 *
 * Side effect: writes every discovered URL (one per line) to
 * crawl-urls.txt in the current working directory.
 *
 * @returns An array of { url, markdown } records; empty array on any
 *   failure (crawl error or thrown exception), never a rejection.
 */
async function crawl(url: string, maxPages: number) {
  const spinner = ora("Crawling website...").start();

  try {
    const res = await browser.crawl.startAndWait(
      {
        url,
        maxPages,
      },
      true
    );

    // Check for a crawl error BEFORE declaring success — previously the
    // spinner printed "🌍 Crawl completed." even when the crawl failed.
    if (res.error) {
      spinner.fail("❌ Crawl failed.");
      console.error(res.error);
      return [];
    }

    spinner.succeed("🌍 Crawl completed.");

    let content = "";
    const crawlData = res.data?.map((val) => {
      console.log("🔗 URL Found:", val.url);
      // One URL per line (no trailing space, so line-splitting consumers
      // get clean URLs).
      content += `${val.url}\n`;
      return { url: val.url, markdown: val.markdown };
    });

    fs.writeFileSync("crawl-urls.txt", content, "utf-8");

    return crawlData || [];
  } catch (err) {
    spinner.fail("❌ Crawl failed.");
    console.error(err);
    return [];
  }
}

/**
 * Interactive entry point: prompt for the start URL, page limit, and
 * output directory, run the crawl, then generate llms.txt from the
 * crawled pages.
 */
async function main() {
  const responses = await inquirer.prompt([
    {
      type: "input",
      name: "url",
      message: "Enter the start URL:",
      validate: (val: string) =>
        val.startsWith("http")
          ? true
          : "Please enter a valid URL (http/https).",
    },
    {
      type: "number",
      name: "maxUrls",
      message: "Max number of pages to crawl:",
      default: 50,
    },
    {
      type: "input",
      name: "outputDir",
      message: "Output directory:",
      default: "output",
    },
  ]);

  const options: Options = {
    url: responses.url,
    maxUrls: responses.maxUrls,
    outputDir: responses.outputDir,
  };

  const pages = await crawl(options.url, options.maxUrls);

  // Guard clause: nothing crawled means nothing to summarize.
  if (!pages || pages.length === 0) {
    console.error("❌ No crawl data found, exiting...");
    return;
  }

  const siteName = new URL(options.url).hostname;
  await generateLLMSTxt(pages, siteName, options.outputDir);
}
main();
21 changes: 21 additions & 0 deletions llmstxt-generator/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"name": "llmstxt-generator",
"module": "index.ts",
"type": "module",
"private": true,
"devDependencies": {
"@types/bun": "latest",
"@types/cli-progress": "^3.11.6"
},
"peerDependencies": {
"typescript": "^5"
},
"dependencies": {
"@hyperbrowser/sdk": "^0.61.0",
"cli-progress": "^3.12.0",
"commander": "^14.0.1",
"inquirer": "^12.9.6",
"openai": "^5.22.0",
"ora": "5.4.1"
}
}
28 changes: 28 additions & 0 deletions llmstxt-generator/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"compilerOptions": {
// Environment setup & latest features
"lib": ["ESNext"],
"target": "ESNext",
"module": "ESNext",
"moduleDetection": "force",
"jsx": "react-jsx",
"allowJs": true,

// Bundler mode
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"verbatimModuleSyntax": true,
"noEmit": true,

// Best practices
"strict": true,
"skipLibCheck": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedIndexedAccess": true,

// Some stricter flags (disabled by default)
"noUnusedLocals": false,
"noUnusedParameters": false,
"noPropertyAccessFromIndexSignature": false
}
}