"""
This script starts from a single Confluence page ID, then walks the tree of
child pages entirely via the REST API. All pages are saved locally as HTML, so
later stages of the pipeline can clean and index them.
Replace ``ROOT_PAGE_ID`` with the ID of your root wiki page and either export
``CONFLUENCE_TOKEN`` in your environment or hard‑code ``BEARER_TOKEN`` for
quick tests.
All downloaded files go to ``confluence_pages/`` in the current directory.
"""

import os
import logging
import time
import random

import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

BASE_URL = "https://confluence.org.com"  # change to your Confluence base URL
BEARER_TOKEN = os.getenv("CONFLUENCE_TOKEN", "MY_BEARER_TOKEN")
OUTPUT_DIR = "downloaded_wikis/"
ROOT_PAGE_ID = "123456"  # <-- put your main page ID here


def create_session(token: str) -> requests.Session:
    """Return a requests.Session with auth + retry handling."""
    session = requests.Session()
    session.headers.update({
        "Authorization": f"Bearer {token}",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/114.0.0.0 Safari/537.36"
        ),
        "Accept": "application/json",
    })
    retry = Retry(
        total=10,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        respect_retry_after_header=True,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session
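# With backoff_factor=1 the transport-level retry sleeps grow roughly
# exponentially (about 1 s, 2 s, 4 s, ... between attempts, capped by urllib3),
# and a Retry-After header on 429/503 responses is honoured when present.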


def safe_get_json(session: requests.Session, url: str, *, params=None, timeout=10):
    """GET a URL that returns JSON, with built-in delay and retry."""
    for _ in range(10):
        try:
            time.sleep(0.5 + random.random() * 0.5)  # 0.5-1.0 s polite delay
            resp = session.get(url, params=params, timeout=timeout)
            if resp.status_code == 429:
                retry_after = int(resp.headers.get("Retry-After", "5"))
                logging.warning("Rate limited. Sleeping for %ss", retry_after)
                time.sleep(retry_after)
                continue
            resp.raise_for_status()
            return resp.json()
        except Exception as exc:
            logging.warning("Temporary issue fetching %s: %s. Retrying...", url, exc)
            time.sleep(5)
    raise RuntimeError(f"Failed to GET {url} after multiple retries")
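# safe_get_json is the single choke point for all API traffic, so the polite
# delay above throttles the whole crawl. A typical call looks like:
#
#   data = safe_get_json(session, f"{BASE_URL}/rest/api/content/123456")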


def get_page_content(session: requests.Session, page_id: str):
    """Return (title, html) for a single Confluence page."""
    url = f"{BASE_URL}/rest/api/content/{page_id}"
    params = {"expand": "body.storage"}
    data = safe_get_json(session, url, params=params)
    title = data.get("title", f"page_{page_id}")
    html = data["body"]["storage"]["value"]
    return title, html
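# With ``expand=body.storage`` the response carries the page's raw
# storage-format HTML; abridged, it looks roughly like:
#
#   {"id": "123456", "title": "My Page",
#    "body": {"storage": {"value": "<p>...</p>", "representation": "storage"}}}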


def get_child_pages(session: requests.Session, page_id: str, *, limit=50, start=0):
    """Return the list of direct child pages of *page_id* and how many came back."""
    url = f"{BASE_URL}/rest/api/content/{page_id}/child/page"
    params = {"limit": limit, "start": start}
    data = safe_get_json(session, url, params=params)
    results = data.get("results", [])
    return results, len(results)
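# The API response also carries "start", "limit" and "size" fields for
# pagination, but counting "results" is enough here: the caller stops as soon
# as a batch comes back smaller than the requested limit.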


def sanitize_filename(text: str, max_len=80) -> str:
    """Return a filename-safe version of *text* (alnum, "-", "_", space, ".") truncated to *max_len*."""
    return "".join(c if c.isalnum() or c in "-_ ." else "_" for c in text)[:max_len]


def save_html(page_id: str, title: str, html: str):
    """Write *html* to OUTPUT_DIR as ``<page_id>__<sanitized title>.html``."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    filename = f"{page_id}__{sanitize_filename(title)}.html"
    path = os.path.join(OUTPUT_DIR, filename)
    with open(path, "w", encoding="utf-8") as fp:
        fp.write(html)
    logging.info("Saved %s", path)


def crawl_page(session: requests.Session, page_id: str, visited: set[str]):
    """Depth-first crawl: save *page_id*, then recurse into its children."""
    if page_id in visited:
        return
    visited.add(page_id)

    # Fetch & save this page.
    try:
        title, html = get_page_content(session, page_id)
        save_html(page_id, title, html)
    except Exception as exc:
        logging.error("Failed to fetch/save page %s: %s", page_id, exc)
        return

    # Recurse into child pages (handle pagination).
    limit = 50
    start = 0
    while True:
        try:
            children, count = get_child_pages(session, page_id, limit=limit, start=start)
        except Exception as exc:
            logging.warning("Failed to fetch children of page %s: %s", page_id, exc)
            break
        if count == 0:
            break
        for child in children:
            child_id = child["id"]
            crawl_page(session, child_id, visited)
        if count < limit:  # fewer than limit -> last page of results
            break
        start += limit
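# Note: the crawl recurses once per tree level, so an extremely deep page
# hierarchy could in principle hit Python's default recursion limit (~1000);
# an explicit stack of page IDs would be a drop-in alternative if that matters.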


def main():
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    session = create_session(BEARER_TOKEN)
    visited: set[str] = set()
    try:
        crawl_page(session, ROOT_PAGE_ID, visited)
    except KeyboardInterrupt:
        logging.warning("Interrupted by user.")
    logging.info("Done. Crawled %d pages.", len(visited))


if __name__ == "__main__":
    main()