Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions .github/scripts/generate_llms_full.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
"""

import json
import asyncio
import aiohttp
import re
from pathlib import Path
from dataclasses import dataclass
Expand Down Expand Up @@ -98,7 +96,7 @@ def extract_article_content(html_content: str) -> str:


async def fetch_page_content(
session: aiohttp.ClientSession, url: str, semaphore: asyncio.Semaphore
session: "aiohttp.ClientSession", url: str, semaphore: "asyncio.Semaphore"
) -> tuple[str, str]:
"""Fetch page content and return (url, content)."""
async with semaphore:
Expand All @@ -121,16 +119,19 @@ async def fetch_page_content(
else:
print(f" Warning: {url} returned {response.status}")
return (url, "")
except asyncio.TimeoutError:
print(f" Timeout: {url}")
return (url, "")
except Exception as e:
print(f" Error fetching {url}: {e}")
print(f" Timeout or error: {url}: {e}")
return (url, "")


async def fetch_all_pages(urls: list[str]) -> dict[str, str]:
"""Fetch all pages concurrently with rate limiting."""
import asyncio
try:
import aiohttp
except ImportError:
return {}

semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)
results = {}

Expand Down Expand Up @@ -242,7 +243,7 @@ def generate_llms_full_txt(manifest: dict, contents: dict[str, str]) -> str:
return "\n".join(lines)


async def main():
def main():
"""Main function."""
print("=" * 60)
print("Microsoft Foundry llms-full.txt Generator")
Expand Down Expand Up @@ -386,4 +387,4 @@ async def main():


if __name__ == "__main__":
asyncio.run(main())
main()
55 changes: 23 additions & 32 deletions .github/scripts/scrape_foundry_docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,24 @@

import json
import asyncio
import aiohttp
import re
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional
from urllib.parse import urljoin
from urllib.request import Request, urlopen
import time

try:
import aiohttp
HAS_AIOHTTP = True
except ImportError:
HAS_AIOHTTP = False

# Base URLs
BASE_URL = "https://learn.microsoft.com/en-us/azure/ai-foundry/"
TOC_URL = "https://learn.microsoft.com/en-us/azure/ai-foundry/toc.json?view=foundry"
VIEW_PARAM = "?view=foundry"
BASE_URL = "https://learn.microsoft.com/en-us/azure/foundry/"
TOC_URL = "https://learn.microsoft.com/en-us/azure/foundry/toc.json"
VIEW_PARAM = ""

# Output paths
OUTPUT_DIR = Path(__file__).parent.parent.parent / "docs"
Expand Down Expand Up @@ -54,27 +60,20 @@ class DocSection:


def normalize_url(href: str, base_path: str = BASE_URL) -> str:
"""Convert relative href to full URL with view parameter."""
"""Convert relative href to full URL."""
if href.startswith("http"):
# External URL - don't modify
if "learn.microsoft.com" in href and "view=" not in href:
return f"{href}{VIEW_PARAM}"
return href

if href.startswith("/"):
# Absolute path within learn.microsoft.com
url = f"https://learn.microsoft.com{href}"
elif href.startswith(".."):
# Relative path going up - handle azure/ai-services context
# Relative path going up
url = urljoin(base_path, href)
else:
# Relative path within ai-foundry
# Relative path within foundry docs
url = f"{base_path}{href}"

# Add view parameter if not present
if "view=" not in url:
url = f"{url}{VIEW_PARAM}"

return url


Expand Down Expand Up @@ -162,16 +161,11 @@ def organize_into_sections(


async def fetch_page_content(
session: aiohttp.ClientSession, url: str, semaphore: asyncio.Semaphore
session: "aiohttp.ClientSession", url: str, semaphore: "asyncio.Semaphore"
) -> str:
"""Fetch page content from Microsoft Learn API (markdown endpoint)."""
async with semaphore:
try:
# Try the markdown endpoint first
md_url = url.replace("?view=foundry", ".md?view=foundry")
if "?" not in url:
md_url = f"{url}.md"

# Use the regular URL and parse HTML
headers = {"User-Agent": "Mozilla/5.0 (compatible; LLMsTxtGenerator/1.0)"}

Expand Down Expand Up @@ -236,9 +230,6 @@ def generate_llms_txt(
lines.append(
"- The Foundry SDK is available for Python, C#, JavaScript/TypeScript, and Java"
)
lines.append(
"- All URLs require `?view=foundry` parameter to access the new documentation"
)
lines.append("")

# Define section order
Expand Down Expand Up @@ -297,7 +288,7 @@ def generate_llms_txt(
return "\n".join(lines)


async def main():
def main():
"""Main function to scrape docs and generate llms.txt."""
print("=" * 60)
print("Microsoft Foundry Documentation Scraper")
Expand All @@ -308,12 +299,13 @@ async def main():

# Fetch TOC
print("\n[1/4] Fetching Table of Contents...")
async with aiohttp.ClientSession() as session:
async with session.get(TOC_URL) as response:
if response.status != 200:
print(f"Error: Failed to fetch TOC (status {response.status})")
return
toc_data = await response.json()
headers = {"User-Agent": "Mozilla/5.0 (compatible; LLMsTxtGenerator/1.0)"}
req = Request(TOC_URL, headers=headers)
with urlopen(req, timeout=30) as resp:
if resp.status != 200:
print(f"Error: Failed to fetch TOC (status {resp.status})")
return
toc_data = json.loads(resp.read())

# Extract pages
print("[2/4] Extracting page URLs...")
Expand All @@ -340,7 +332,6 @@ async def main():
manifest = {
"title": "Microsoft Foundry",
"base_url": BASE_URL,
"view_param": VIEW_PARAM,
"sections": {
section: [
{"title": t, "href": h, "url": normalize_url(h)} for t, h in pages
Expand All @@ -361,4 +352,4 @@ async def main():


if __name__ == "__main__":
asyncio.run(main())
main()
Loading
Loading