Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llmstxt-generator/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
HYPERBROWSER_API_KEY=your_hyperbrowser_api_key_here
OPENAI_API_KEY=your_openai_api_key_here
OPENAI_API_BASE_URL=
34 changes: 34 additions & 0 deletions llmstxt-generator/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# dependencies (bun install)
node_modules

# output
out
dist
*.tgz

# code coverage
coverage
*.lcov

# logs
logs
*.log
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# caches
.eslintcache
.cache
*.tsbuildinfo

# IntelliJ based IDEs
.idea

# Finder (MacOS) folder config
.DS_Store
80 changes: 80 additions & 0 deletions llmstxt-generator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# llms.txt generator

A CLI tool to **crawl websites** using [Hyperbrowser](https://hyperbrowser.ai) and generate an `llms.txt` file containing **short, LLM-ready descriptions of each page**. Summaries are created with OpenAI.

Useful for:

- SEO optimization
- Helping LLMs index your site with structured, concise descriptions
- Quick website overviews

---

## 🚀 Features

- 🌍 Crawl any website with [Hyperbrowser](https://hyperbrowser.ai)
- 🤖 Summarize each page into **3–5 word titles + short descriptions**
- 📄 Auto-generates `llms.txt` in your chosen directory
- 🔗 Saves discovered URLs into `crawl-urls.txt`

---

## 📦 Installation

Clone the repo and install dependencies:

```bash
cd llmstxt-generator
bun install
```

### CLI Usage

```bash
bun run index.ts
✔ Enter the start URL: https://hyperbrowser.ai
✔ Max number of pages to crawl: 5
✔ Output directory: output
```

## Example output

```bash
✔ 🌍 Crawl completed.
🔗 URL Found: https://hyperbrowser.ai
🔗 URL Found: https://hyperbrowser.ai/blog
🔗 URL Found: https://hyperbrowser.ai/privacy-policy
🔗 URL Found: https://hyperbrowser.ai/terms
🔗 URL Found: https://hyperbrowser.ai/blog/introducing-hyperbrowser-mcp-server

Summarizing |████████████████████████████████████████| 100% || 5/5 Pages
✅ All summaries generated.
📄 llms.txt written to output/llms.txt

```

## Example llms.txt file

```
# https://hyperbrowser.ai llms.txt

- [Hyperbrowser](https://hyperbrowser.ai): Browser-as-a-Service for AI agents and apps.
- [Web Scraping Tools](https://hyperbrowser.ai/blog): Compare SSR, CSR, and rendering approaches.
- [Hyperbrowser](https://hyperbrowser.ai/): Privacy policy explains data collection and usage.
- [Terms of Service](https://hyperbrowser.ai/terms): Governs use of Hyperbrowser and privacy policy.
- [Hyperbrowser MCP Server](https://hyperbrowser.ai/blog/introducing-hyperbrowser-mcp-server): Connect LLMs to the web with AI-powered tools for scraping, automation, and data extraction.

```

## API Costs 💰

- Hyperbrowser: ~$0.01-0.05 per page (depending on pages to crawl)
- OpenAI GPT-4o-mini: ~$0.03-0.10 per page summary (depending on content length)

## Contributing 🤝

Feel free to submit issues, feature requests, or pull requests to improve the tool!

## License 📝

MIT License - feel free to use this tool to generate llms.txt for your websites!
222 changes: 222 additions & 0 deletions llmstxt-generator/bun.lock

Large diffs are not rendered by default.

208 changes: 208 additions & 0 deletions llmstxt-generator/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
import inquirer from "inquirer";
import ora from "ora";
import fs from "fs";
import path from "path";
import { Hyperbrowser } from "@hyperbrowser/sdk";
import OpenAI from "openai";
import { SingleBar, Presets } from "cli-progress";

// ----------------------------TYPES---------------------------
// CLI options gathered from the interactive prompts in main().
interface Options {
  url: string;
  maxUrls: number;
  outputDir: string;
}

// One crawled page: its URL plus the page content as markdown
// (undefined when Hyperbrowser returned no markdown for that page).
interface CrawlResult {
  url: string;
  markdown: string | undefined;
}
// -------------------------------------------------------------

// Hyperbrowser client used for crawling. Reads the API key from the
// HYPERBROWSER_API_KEY environment variable (see .env.example).
const browser = new Hyperbrowser({
  apiKey: process.env.HYPERBROWSER_API_KEY,
});

// OpenAI client used for summarization. OPENAI_API_BASE_URL lets the
// tool target any OpenAI-compatible endpoint; when unset (or empty)
// it falls back to the official API.
const ai = new OpenAI({
  baseURL: process.env.OPENAI_API_BASE_URL || "https://api.openai.com/v1",
  apiKey: process.env.OPENAI_API_KEY!,
});

/**
 * Generate a single llms.txt entry for one crawled page using OpenAI.
 *
 * @param markdown - Page content as markdown; empty/missing content
 *   short-circuits to a placeholder without calling the API.
 * @param url - URL of the page, embedded in the generated entry.
 * @param maxChars - Maximum number of characters of page content sent to
 *   the model (truncation keeps token usage bounded). Defaults to 2000,
 *   matching the previous hard-coded limit.
 * @returns A "- [Title](URL): description" line, or a fallback string when
 *   the model returns no content.
 */
async function summarizePage(
  markdown: string,
  url: string,
  maxChars = 2000
): Promise<string> {
  if (!markdown) return "No description available.";

  const prompt = `
You are generating entries for an llms.txt file.

Rules:
- Output MUST be in the format:
- [Title](URL): Short description (max 12 words).
- Title: 3–5 words only, no "Short Title", no placeholders.
- Description: concise, human-readable, no extra punctuation.
- Do not use bold, quotes, or headings.

Examples:
- [Web Data Extraction](https://firecrawl.dev): Transform websites into clean, LLM-ready data.
- [Scaling Web Apps](https://example.com/blog): Compare SSR, CSR, and rendering approaches.

Now generate entry for this page:
URL: ${url}
Content:
${markdown.slice(0, maxChars)}
`;

  const res = await ai.chat.completions.create({
    model: "gpt-4o-mini",
    messages: [{ role: "user", content: prompt }],
    // Low temperature keeps titles/descriptions consistent across runs.
    temperature: 0.3,
  });

  return res.choices[0]?.message.content?.trim() || "Summary unavailable.";
}

/**
 * Summarize every crawled page and write the combined llms.txt file.
 *
 * Shows a progress bar while summarizing; a failed summary for one page is
 * logged and skipped so the remaining pages still get entries.
 *
 * @param crawlData - Pages returned by crawl() ({ url, markdown } records).
 *   (Previously typed Omit<CrawlResult, "html">[] — a no-op since
 *   CrawlResult has no "html" field; CrawlResult[] is the same type.)
 * @param siteName - Hostname used in the llms.txt header line.
 * @param outputDir - Directory (created if missing) that receives llms.txt.
 */
export async function generateLLMSTxt(
  crawlData: CrawlResult[],
  siteName: string,
  outputDir: string
) {
  if (!crawlData || crawlData.length === 0) {
    console.error("⚠️ No crawl data to process.");
    return;
  }

  let content = `# https://${siteName} llms.txt\n\n`;

  // 🔥 Progress bar
  const bar = new SingleBar(
    {
      format: "Summarizing |{bar}| {percentage}% || {value}/{total} Pages",
      barCompleteChar: "█",
      barIncompleteChar: "░",
      hideCursor: true,
    },
    Presets.shades_classic
  );

  bar.start(crawlData.length, 0);

  try {
    for (const page of crawlData) {
      if (!page?.url) {
        console.warn(`⚠️ Skipping page, missing URL.`);
        bar.increment();
        continue;
      }

      try {
        const summary = await summarizePage(page.markdown || "", page.url);

        // The model may emit several lines; keep non-empty ones and make
        // sure each is formatted as a "- " list entry.
        const lines = summary.split("\n").filter((l) => l.trim());
        for (const line of lines) {
          let clean = line.trim();

          if (!clean.startsWith("- ")) {
            clean = `- ${clean}`;
          }

          content += `${clean}\n`;
        }
      } catch (err) {
        // One bad page must not abort the whole run.
        console.error(`❌ Failed to summarize ${page.url}:`, err);
      }

      bar.increment();
    }
  } finally {
    // Always stop the bar (restores the hidden cursor) even if an
    // unexpected error escapes the loop above.
    bar.stop();
  }

  console.log("✅ All summaries generated.");

  fs.mkdirSync(outputDir, { recursive: true });

  const filePath = path.join(outputDir, "llms.txt");
  fs.writeFileSync(filePath, content, "utf-8");

  console.log(`📄 llms.txt written to ${filePath}`);
}

/**
 * Crawl `url` with Hyperbrowser, visiting up to `maxPages` pages.
 *
 * Side effect: writes every discovered URL (one per line) to
 * crawl-urls.txt in the current working directory.
 *
 * @returns An array of { url, markdown } records; empty array on any
 *   failure (crawl error or thrown exception), never a rejection.
 */
async function crawl(url: string, maxPages: number) {
  const spinner = ora("Crawling website...").start();

  try {
    const res = await browser.crawl.startAndWait(
      {
        url,
        maxPages,
      },
      true
    );

    // Check for a crawl error BEFORE declaring success — previously the
    // spinner printed "🌍 Crawl completed." even when the crawl failed.
    if (res.error) {
      spinner.fail("❌ Crawl failed.");
      console.error(res.error);
      return [];
    }

    spinner.succeed("🌍 Crawl completed.");

    let content = "";
    const crawlData = res.data?.map((val) => {
      console.log("🔗 URL Found:", val.url);
      // One URL per line (no trailing space, so line-splitting consumers
      // get clean URLs).
      content += `${val.url}\n`;
      return { url: val.url, markdown: val.markdown };
    });

    fs.writeFileSync("crawl-urls.txt", content, "utf-8");

    return crawlData || [];
  } catch (err) {
    spinner.fail("❌ Crawl failed.");
    console.error(err);
    return [];
  }
}

/**
 * Interactive entry point: prompt for the start URL, page limit, and
 * output directory, run the crawl, then generate llms.txt from the
 * crawled pages.
 */
async function main() {
  const responses = await inquirer.prompt([
    {
      type: "input",
      name: "url",
      message: "Enter the start URL:",
      validate: (val: string) =>
        val.startsWith("http")
          ? true
          : "Please enter a valid URL (http/https).",
    },
    {
      type: "number",
      name: "maxUrls",
      message: "Max number of pages to crawl:",
      default: 50,
    },
    {
      type: "input",
      name: "outputDir",
      message: "Output directory:",
      default: "output",
    },
  ]);

  const options: Options = {
    url: responses.url,
    maxUrls: responses.maxUrls,
    outputDir: responses.outputDir,
  };

  const pages = await crawl(options.url, options.maxUrls);

  // Guard clause: nothing crawled means nothing to summarize.
  if (!pages || pages.length === 0) {
    console.error("❌ No crawl data found, exiting...");
    return;
  }

  const siteName = new URL(options.url).hostname;
  await generateLLMSTxt(pages, siteName, options.outputDir);
}
main();
21 changes: 21 additions & 0 deletions llmstxt-generator/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"name": "llmstxt-generator",
"module": "index.ts",
"type": "module",
"private": true,
"devDependencies": {
"@types/bun": "latest",
"@types/cli-progress": "^3.11.6"
},
"peerDependencies": {
"typescript": "^5"
},
"dependencies": {
"@hyperbrowser/sdk": "^0.61.0",
"cli-progress": "^3.12.0",
"commander": "^14.0.1",
"inquirer": "^12.9.6",
"openai": "^5.22.0",
"ora": "5.4.1"
}
}
28 changes: 28 additions & 0 deletions llmstxt-generator/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"compilerOptions": {
// Environment setup & latest features
"lib": ["ESNext"],
"target": "ESNext",
"module": "ESNext",
"moduleDetection": "force",
"jsx": "react-jsx",
"allowJs": true,

// Bundler mode
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"verbatimModuleSyntax": true,
"noEmit": true,

// Best practices
"strict": true,
"skipLibCheck": true,
"noFallthroughCasesInSwitch": true,
"noUncheckedIndexedAccess": true,

// Some stricter flags (disabled by default)
"noUnusedLocals": false,
"noUnusedParameters": false,
"noPropertyAccessFromIndexSignature": false
}
}