-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathmyModules.py
More file actions
491 lines (449 loc) · 21.7 KB
/
myModules.py
File metadata and controls
491 lines (449 loc) · 21.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
import shutil
import requests
import os.path
import json
import time
from requests.auth import HTTPBasicAuth
from requests.exceptions import RequestException
from bs4 import BeautifulSoup as bs
import sys
import pypandoc
from PIL import Image
import re
"""
Arguments needed to run these functions centrally:
* outdirs: outdir, attach_dir, emoticonDir, styles_dir
* page details: Title, ID, Parent, orig URL, Space
* space details: Title, ID, site
* Confluence API: Username, Password
CURRENT STATE
* fixed getting output folders
* next up: getAttachments
"""
#
# Set path for where script is
#
# Absolute folder containing this script; used by mk_outdirs() to locate the
# bundled stylesheet (styles/confluence.css).
script_dir = os.path.dirname(os.path.abspath(__file__))
# Default export sub-folder names (relative, with trailing slash).
# set_variables() returns the same values; these module-level copies are kept
# for backward compatibility with any external importers.
attach_dir = "_images/"
emoticons_dir = "_images/"
styles_dir = "_static/"
def request_get_with_retry(url, retries=5, backoff_factor=1.5, timeout=30, **kwargs):
    """Execute a GET request with exponential backoff for transient network errors.

    Args:
        url: URL to fetch.
        retries: Maximum number of attempts; must be at least 1.
        backoff_factor: Base of the exponential sleep between attempts
            (sleep is backoff_factor ** attempt seconds).
        timeout: Per-request timeout in seconds, forwarded to requests.get.
        **kwargs: Extra keyword arguments forwarded to requests.get
            (e.g. auth, allow_redirects).

    Returns:
        requests.Response: The first successful (2xx) response.

    Raises:
        ValueError: If retries is less than 1.
        requests.exceptions.RequestException: The last error once all attempts
            fail. HTTP errors are retried too, since raise_for_status() raises
            HTTPError, a RequestException subclass.
    """
    if retries < 1:
        # Previously retries <= 0 skipped the loop and executed `raise None`,
        # producing a confusing TypeError. Fail fast with a clear message.
        raise ValueError("retries must be at least 1")
    last_error = None
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout, **kwargs)
            response.raise_for_status()
            return response
        except RequestException as error:
            last_error = error
            if attempt == retries - 1:
                break
            sleep_time = backoff_factor ** attempt
            print(
                f"WARNING: request failed ({error}). Retrying in {sleep_time:.1f}s "
                f"[{attempt + 1}/{retries}] for URL: {url}"
            )
            time.sleep(sleep_time)
    raise last_error
def set_variables():
    """Return the export folder names used to mirror a Sphinx layout.

    Returns:
        dict: Keys 'attach_dir', 'emoticons_dir' and 'styles_dir' mapped to
        their relative folder names ("_images/", "_images/", "_static/").
    """
    # The former trailing re-assignments of attach_dir/emoticons_dir/styles_dir
    # were dead code: they created locals that shadowed the module globals
    # (no `global` statement) and were discarded on return. Removed.
    return {
        'attach_dir': "_images/",
        'emoticons_dir': "_images/",
        'styles_dir': "_static/",
    }
#
# Create the output folders, set to match the Sphinx structure
#
def set_dirs(arg_outdir="output"):  # setting default to output
    """Build the output paths for attachments, emoticons and styles.

    Args:
        arg_outdir: Base output folder (defaults to "output").

    Returns:
        list: [attachments path, emoticons path, styles path].
    """
    folder_names = set_variables()
    return [
        os.path.join(arg_outdir, folder_names['attach_dir']),
        os.path.join(arg_outdir, folder_names['emoticons_dir']),
        os.path.join(arg_outdir, folder_names['styles_dir']),
    ]
def mk_outdirs(arg_outdir="output"):  # setting default to output
    """Create the output folder tree and seed it with the default stylesheet.

    Args:
        arg_outdir: Base output folder (defaults to "output").

    Returns:
        list: [attachments path, emoticons path, styles path], as set_dirs().
    """
    outdir_list = set_dirs(arg_outdir)
    outdir_attach, outdir_emoticons, outdir_styles = outdir_list
    # makedirs(exist_ok=True) replaces the previous exists()/mkdir() pairs:
    # it is race-free and also creates any missing parent folders.
    os.makedirs(arg_outdir, exist_ok=True)
    os.makedirs(outdir_attach, exist_ok=True)
    os.makedirs(outdir_emoticons, exist_ok=True)
    os.makedirs(outdir_styles, exist_ok=True)
    # Use os.path.join so the existence check and the copy destination are the
    # same path. The old code checked "<dir>//confluence.css" (string concat)
    # but wrote "<dir>confluence.css" (f-string) — equivalent on POSIX only by
    # accident of the double slash.
    css_dest = os.path.join(outdir_styles, 'confluence.css')
    if not os.path.exists(css_dest):
        shutil.copy(os.path.join(script_dir, "styles", "confluence.css"), css_dest)
    return outdir_list
def get_space_title(arg_site,arg_space_id,arg_username,arg_api_token):
    """Get Title of a space
    Args:
        arg_site: The site name
        arg_space_id: ID of the space
        arg_username: Username for auth
        arg_api_token: API token for auth
    Returns:
        response (string): The title of the space
    """
    space_url = f"https://{arg_site}.atlassian.net/wiki/api/v2/spaces/{arg_space_id}"
    space_details = request_get_with_retry(space_url, auth=(arg_username, arg_api_token), timeout=30).json()
    return space_details['name']
def get_spaces_all(arg_site,arg_username,arg_api_token):
    """Return every space on the site, following cursor pagination.

    Args:
        arg_site: The site name.
        arg_username: Username for auth.
        arg_api_token: API token for auth.

    Returns:
        list: Raw space dicts from the v2 spaces API ('results' entries).
    """
    server_url = f"https://{arg_site}.atlassian.net/wiki/api/v2/spaces/?limit=250"
    # Decode each response body exactly once (the old code re-parsed
    # response.json() up to four times per page). The explicit
    # raise_for_status() was redundant: request_get_with_retry already raises
    # on non-2xx responses.
    payload = request_get_with_retry(server_url, auth=(arg_username,arg_api_token), timeout=30).json()
    space_list = payload['results']
    # Follow the opaque cursor links. Splitting on 'cursor' keeps the '=' from
    # the 'next' link, so the URL below re-assembles to "...&cursor=<token>".
    while 'next' in payload['_links']:
        cursorserver_url = f"{server_url}&cursor{payload['_links']['next'].split('cursor')[1]}"
        payload = request_get_with_retry(cursorserver_url, auth=(arg_username,arg_api_token), timeout=30).json()
        space_list = space_list + payload['results']
    return space_list
def get_pages_from_space(arg_site,arg_space_id,arg_username,arg_api_token):
    """Return all current pages of a space, following cursor pagination.

    Args:
        arg_site: The site name.
        arg_space_id: ID of the space.
        arg_username: Username for auth.
        arg_api_token: API token for auth.

    Returns:
        list: Raw page dicts from the v2 pages API ('results' entries).
    """
    server_url = f"https://{arg_site}.atlassian.net/wiki/api/v2/spaces/{arg_space_id}/pages?status=current&limit=250"
    # Decode each response body exactly once (the old code re-parsed
    # response.json() up to four times per page, and initialized page_list to
    # an empty list that was immediately overwritten).
    payload = request_get_with_retry(server_url, auth=(arg_username,arg_api_token), timeout=30).json()
    page_list = payload['results']
    # Splitting on 'cursor' keeps the '=' from the 'next' link, so the URL
    # below re-assembles to "...&cursor=<token>".
    while 'next' in payload['_links']:
        cursorserver_url = f"{server_url}&cursor{payload['_links']['next'].split('cursor')[1]}"
        payload = request_get_with_retry(cursorserver_url, auth=(arg_username,arg_api_token), timeout=30).json()
        page_list = page_list + payload['results']
    return page_list
def get_body_export_view(arg_site,arg_page_id,arg_username,arg_api_token):
    """Fetch a page with its body rendered in 'export_view' format.

    Returns the raw requests.Response; callers call .json() themselves.
    """
    export_url = (
        f"https://{arg_site}.atlassian.net/wiki/rest/api/content/"
        f"{arg_page_id}?expand=body.export_view"
    )
    return request_get_with_retry(export_url, auth=(arg_username, arg_api_token), timeout=30)
def get_page_name(arg_site,arg_page_id,arg_username,arg_api_token):
    """Return "<id>_<title>" for a page, used to build export file names.

    Args:
        arg_site: The site name.
        arg_page_id: ID of the page.
        arg_username: Username for auth.
        arg_api_token: API token for auth.

    Returns:
        str: The page ID and title joined by an underscore.
    """
    server_url = f"https://{arg_site}.atlassian.net/wiki/rest/api/content/{arg_page_id}"
    # Parse the response body once (the old code called .json() twice,
    # decoding the same payload twice).
    page = request_get_with_retry(server_url, auth=(arg_username, arg_api_token), timeout=30).json()
    return page['id'] + "_" + page['title']
def get_page_parent(arg_site,arg_page_id,arg_username,arg_api_token):
    """Return the parentId of a page via the v2 pages API."""
    page_url = f"https://{arg_site}.atlassian.net/wiki/api/v2/pages/{arg_page_id}"
    page_details = request_get_with_retry(page_url, auth=(arg_username, arg_api_token), timeout=30)
    return page_details.json()['parentId']
def remove_illegal_characters(text):
    """Replace characters that are unsafe in file names with '_'.

    Any run of characters outside [word characters, '_', '.', '-', ' '] is
    collapsed into a single underscore.

    Args:
        text: Candidate file name. (Renamed from `input`, which shadowed the
            builtin; all callers in this file pass it positionally.)

    Returns:
        str: The sanitized name.
    """
    return re.sub(r'[^\w_\.\- ]+', '_', text)
def windows_safe_filename(name, max_len=180):
    """Sanitize *name* into a valid Windows file name.

    Replaces reserved punctuation and control characters with '_', removes
    trailing spaces/dots (illegal on Windows), and truncates to max_len.

    Args:
        name: Proposed file name (no directory part).
        max_len: Maximum length of the returned name.

    Returns:
        str: A non-empty Windows-safe name ("untitled" if nothing survives).

    NOTE(review): reserved device names (CON, NUL, COM1, ...) are not handled
    here — confirm whether any page titles can collide with them.
    """
    name = re.sub(r'[<>:"/\\|?*]', '_', name)   # reserved punctuation
    name = re.sub(r'[\x00-\x1f]', '_', name)    # control characters
    name = re.sub(r'[\s\.]+$', '', name)        # trailing spaces/dots
    name = name.strip()
    if len(name) > max_len:
        # Truncation can expose a new trailing dot/space, which Windows
        # rejects or silently strips — clean up again after the cut.
        name = re.sub(r'[\s\.]+$', '', name[:max_len])
    if not name:
        name = "untitled"
    return name
def get_attachments(arg_site,arg_page_id,arg_outdir_attach,arg_username,arg_api_token):
    """Download every attachment of a page into arg_outdir_attach.

    Files already on disk are not re-downloaded; failed downloads are logged
    and skipped rather than aborting the export.

    Args:
        arg_site: The site name.
        arg_page_id: ID of the page.
        arg_outdir_attach: Destination folder for attachment files.
        arg_username: Username for auth.
        arg_api_token: API token for auth.

    Returns:
        list: Sanitized attachment file names (cached and failed ones included).
    """
    my_attachments_list = []
    server_url = f"https://{arg_site}.atlassian.net/wiki/rest/api/content/{arg_page_id}?expand=children.attachment"
    response = request_get_with_retry(server_url, auth=(arg_username, arg_api_token), timeout=30)
    my_attachments = response.json()['children']['attachment']['results']
    for attachment in my_attachments:
        attachment_title = remove_illegal_characters(requests.utils.unquote(attachment['title']).replace(" ","_").replace(":","-")) # I want attachments without spaces
        attachment_file_path = os.path.join(arg_outdir_attach,attachment_title)
        if not os.path.exists(attachment_file_path):
            print(f"Downloading: {attachment_title}")
            # Pre-bind the URL so the warning below can never hit a NameError
            # when the failure happens while the URL is still being built.
            attachment_url = None
            try:
                attachment_url = f"https://{arg_site}.atlassian.net/wiki{attachment['_links']['download']}"
                request_attachment = request_get_with_retry(attachment_url, auth=(arg_username, arg_api_token), allow_redirects=True, timeout=30)
                # Context manager: the old open(...).write(...) leaked the
                # file handle if the write raised.
                with open(attachment_file_path, 'wb') as attachment_file:
                    attachment_file.write(request_attachment.content)
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit stay fatal.
            except (RequestException, OSError, KeyError):
                print(f"WARNING: Skipping attachment file {attachment_file_path} due to issues. url: {attachment_url}")
        my_attachments_list.append(attachment_title)
    return(my_attachments_list)
# get page labels
def get_page_labels(arg_site,arg_page_id,arg_username,arg_api_token):
    """Return the page's labels as a single comma-separated string."""
    server_url = f"https://{arg_site}.atlassian.net/wiki/api/v2/pages/{arg_page_id}/labels"
    payload = request_get_with_retry(server_url, auth=(arg_username,arg_api_token), timeout=30).json()
    label_names = []
    for label in payload['results']:
        label_names.append(label['name'])
        print(f"Label: {label['name']}")
    joined_labels = ", ".join(label_names)
    print(f"Page labels: {joined_labels}")
    return joined_labels
def get_page_properties_children(arg_site,arg_html,arg_outdir,arg_username,arg_api_token):
    """Collect the child pages referenced by a Page Properties report.

    Args:
        arg_site: The site name.
        arg_html: HTML (export view) of the report page.
        arg_outdir: Output folder (unused here; kept for interface parity).
        arg_username: Username for auth.
        arg_api_token: API token for auth.

    Returns:
        list: [list of child page IDs,
               dict of {page_id: {"ID": page_id, "Name": file-safe title}}].
    """
    my_page_properties_children = []
    my_page_properties_children_dict = {}
    soup = bs(arg_html, "html.parser")
    my_page_properties_items = soup.findAll('td',class_="title")
    for item in my_page_properties_items:
        my_page_id = str(item['data-content-id'])
        my_page_properties_children.append(my_page_id)
        # get_page_name returns "<id>_<title>". Split on the FIRST underscore
        # (the id/title separator — the id is numeric): the previous
        # rsplit('_', 1) truncated any title that itself contained
        # underscores. Then replace characters unsafe in file names.
        my_page_name = get_page_name(arg_site,int(my_page_id),arg_username,arg_api_token).split('_',1)[1].replace(":","-").replace(" ","_").replace("%20","_")
        my_page_properties_children_dict[my_page_id] = {"ID": my_page_id, "Name": my_page_name}
    # The manual counter always equalled the number of items found.
    print(f"{len(my_page_properties_children)} Page Properties Children Pages")
    return[my_page_properties_children,my_page_properties_children_dict]
def get_editor_version(arg_site,arg_page_id,arg_username,arg_api_token):
    """Fetch a page with its editor metadata property expanded.

    Returns the raw requests.Response; callers inspect .json() themselves.
    """
    metadata_url = (
        f"https://{arg_site}.atlassian.net/wiki/rest/api/content/"
        f"{arg_page_id}?expand=metadata.properties.editor"
    )
    return request_get_with_retry(metadata_url, auth=(arg_username, arg_api_token), timeout=30)
def dump_html(
    arg_site,
    arg_html,
    arg_title,
    arg_page_id,
    arg_outdir_base,
    arg_outdir_content,
    arg_page_labels,
    arg_page_parent,
    arg_username,
    arg_api_token,
    arg_sphinx_compatible=True,
    arg_sphinx_tags=False,
    arg_type="",
    arg_html_output=False,
    arg_rst_output=True,
    arg_show_labels=False
):
    """Create HTML and RST files
    Args:
        arg_site: Name of the Confluence Site
        arg_html: HTML Content to use for page
        arg_title: Title of the page
        arg_page_id: Page ID
        arg_outdir_base: Base output folder
        arg_outdir_content: Output folder for Content
        arg_page_labels: Labels of the page
        arg_page_parent: Parent of the page
        arg_username: Username for authentication
        arg_api_token: API Token for authentication
        arg_sphinx_compatible: Place _static and _images folder at root of output folder
        arg_sphinx_tags: Add tags to output RST (NOTE(review): not referenced anywhere in this function body)
        arg_type: For Page Properties, the type of page: "report", "child" or "common" if it's not for Page Properties
        arg_html_output: When True, keep the generated HTML file; when False it is deleted after RST conversion
        arg_rst_output: When True, convert the HTML to RST with pandoc; when False, return right after writing HTML
        arg_show_labels: When True, append a "Page labels" footer to the RST output
    Returns:
        HTML, RST and all attachments, embeds and emoticons
    """
    my_vars = set_variables()
    my_emoticons_list = []  # emoticon file names already handled for this page
    my_outdir_content = arg_outdir_content
    #my_outdir_content = os.path.join(arg_outdir_base,str(arg_page_id) + "-" + str(arg_title)) # this is for html and rst files
    if not os.path.exists(my_outdir_content):
        os.mkdir(my_outdir_content)
    #myOutdir = os.path.join(arg_outdir,str(arg_page_id) + "-" + str(arg_title))
    my_outdirs = mk_outdirs(arg_outdir_base) # this is for everything for _images and _static
    # NOTE(review): duplicate call — my_vars was already assigned at the top.
    my_vars = set_variables() # create a dict with the 3 folder paths: attach, emoticons, styles
    soup = bs(arg_html, "html.parser")
    #
    # removing elements we don't need like
    # * <div class="expand-control"...
    # * <pre class="syntaxhighlighter-pre"...
    #
    my_undesirables = soup.findAll('div',class_="expand-control")
    for div in my_undesirables:
        div.decompose()
    # Find all pre tags
    pre_tags = soup.find_all('pre')
    # Remove the class 'syntaxhighlighter-pre' from each pre tag
    for pre in pre_tags:
        pre['class'] = [c for c in pre.get('class', []) if c != 'syntaxhighlighter-pre']
    # continuing: derive a Windows-safe file name from the page title
    safe_title = windows_safe_filename(arg_title.replace(" ", "_").replace(":", "-").replace("/", "-"))
    html_file_name = f"{safe_title}.html"
    html_file_path = os.path.join(my_outdir_content,html_file_name)
    # Download all page attachments into the shared attachments folder.
    my_attachments = get_attachments(arg_site,arg_page_id,str(my_outdirs[0]),arg_username,arg_api_token)
    #
    # used for pageprops mode
    #
    #if (arg_type == "child"):
    #my_report_children_dict = get_page_properties_children(arg_site,arg_html,arg_outdir,arg_username,arg_api_token)[1] # get list of all page properties children
    #my_report_children_dict[arg_page_id].update({"Filename": arg_html_file_name})
    if (arg_type == "report"):
        # Rewrite the report table's child links to point at local HTML files.
        my_report_children_dict= get_page_properties_children(arg_site,arg_html,my_outdir_content,arg_username,arg_api_token)[1] # dict
        my_page_properties_items = soup.findAll('td',class_="title") # list
        for item in my_page_properties_items:
            # NOTE(review): `id` shadows the builtin of the same name.
            id = item['data-content-id']
            item.a['href'] = (f"{my_report_children_dict[id]['Name']}.html")
    #
    # dealing with "confluence-embedded-image confluence-external-resource"
    #
    my_embeds_externals = soup.findAll('img',class_="confluence-embedded-image confluence-external-resource")
    my_embeds_externals_counter = 0
    for embed_ext in my_embeds_externals:
        orig_embed_external_path = embed_ext['src'] # online link to file
        orig_embed_external_name = orig_embed_external_path.rsplit('/',1)[-1].rsplit('?')[0] # just the file name
        # Prefix with page id + counter so external images from different
        # pages cannot collide on the same local name.
        my_embed_external_name = remove_illegal_characters((f"{arg_page_id}-{my_embeds_externals_counter}-{requests.utils.unquote(orig_embed_external_name)}").replace(" ", "_").replace(":","-")) # local filename
        my_embed_external_path = os.path.join(my_outdirs[0],my_embed_external_name) # local filename and path
        if arg_sphinx_compatible == True:
            my_embed_external_path_relative = os.path.join(str('../' + my_vars['attach_dir']),my_embed_external_name)
        else:
            my_embed_external_path_relative = os.path.join(my_vars['attach_dir'],my_embed_external_name)
        try:
            if not os.path.exists(my_embed_external_path):
                # External resource: downloaded without Confluence auth.
                to_download = request_get_with_retry(orig_embed_external_path, allow_redirects=True, timeout=30)
                # NOTE(review): file handle is not explicitly closed here.
                open(my_embed_external_path,'wb').write(to_download.content)
            img = Image.open(my_embed_external_path)
        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit.
        except:
            print(f"WARNING: Skipping embed file {my_embed_external_path} due to issues. url: {orig_embed_external_path}")
        else:
            # NOTE(review): unlike the loop below, `img` is never initialized
            # to None before the try — if the structure ever changes so that
            # Image.open is skipped, this would raise NameError.
            if img is not None:
                # Cap the rendered width at 600px, keep smaller images as-is.
                if img.width < 600:
                    embed_ext['width'] = img.width
                else:
                    embed_ext['width'] = 600
                # NOTE(review): missing call parentheses — `img.close` is a
                # no-op attribute access; the image file stays open until GC.
                img.close
            embed_ext['height'] = "auto"
            # Clicking the inline image opens the full-size local copy.
            embed_ext['onclick'] = f"window.open(\"{my_embed_external_path_relative}\")"
            embed_ext['src'] = str(my_embed_external_path_relative)
            embed_ext['data-image-src'] = str(my_embed_external_path_relative)
        my_embeds_externals_counter = my_embeds_externals_counter + 1
    #
    # dealing with "confluence-embedded-image"
    #
    my_embeds = soup.findAll('img',class_=re.compile("^confluence-embedded-image"))
    print(str(len(my_embeds)) + " embedded images.")
    for embed in my_embeds:
        orig_embed_path = embed['src'] # online link to file
        orig_embed_name = orig_embed_path.rsplit('/',1)[-1].rsplit('?')[0] # online file name
        my_embed_name = remove_illegal_characters(requests.utils.unquote(orig_embed_name).replace(" ", "_")) # local file name
        # NOTE(review): plain string concatenation relies on my_outdirs[0]
        # ending with a separator; os.path.join (as used above) would be safer.
        my_embed_path = my_outdirs[0] + my_embed_name # local file path
        if arg_sphinx_compatible == True:
            my_embed_path_relative = f"../{my_vars['attach_dir']}{my_embed_name}"
        else:
            my_embed_path_relative = f"{my_vars['attach_dir']}{my_embed_name}"
        img = None
        try:
            if not os.path.exists(my_embed_path):
                # Attachment-backed image: downloaded with Confluence auth.
                to_download = request_get_with_retry(orig_embed_path, allow_redirects=True, auth=(arg_username, arg_api_token), timeout=30)
                # NOTE(review): file handle is not explicitly closed here.
                open(my_embed_path,'wb').write(to_download.content)
            img = Image.open(my_embed_path)
        # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit.
        except:
            print(f"WARNING: Skipping embed file {my_embed_path} due to issues. url: {orig_embed_path}")
        else:
            if img is not None:
                # Cap the rendered width at 600px, keep smaller images as-is.
                if img.width < 600:
                    embed['width'] = img.width
                else:
                    embed['width'] = 600
                # NOTE(review): missing call parentheses — `img.close` is a
                # no-op attribute access; the image file stays open until GC.
                img.close
            embed['height'] = "auto"
            embed['onclick'] = f"window.open(\"{my_embed_path_relative}\")"
            embed['src'] = my_embed_path_relative
    #
    # dealing with "emoticon" and expands' "grey_arrow_down.png"
    #
    my_emoticons = soup.findAll('img',class_=re.compile("emoticon|expand-control-image"))
    print(f"{len(my_emoticons)} emoticons.")
    for emoticon in my_emoticons:
        my_emoticon_title = emoticon['src'].rsplit('/',1)[-1] # just filename
        if arg_sphinx_compatible == True:
            my_emoticon_path = f"../{my_vars['emoticons_dir']}{my_emoticon_title}"
        else:
            my_emoticon_path = f"{my_vars['emoticons_dir']}{my_emoticon_title}"
        # Each distinct emoticon is downloaded at most once per page.
        if my_emoticon_title not in my_emoticons_list:
            my_emoticons_list.append(my_emoticon_title)
            print(f"Getting emoticon: {my_emoticon_title}")
            file_path = os.path.join(my_outdirs[1],remove_illegal_characters(my_emoticon_title))
            if not os.path.exists(file_path):
                emoticon_src = emoticon['src']
                try:
                    request_emoticons = request_get_with_retry(emoticon_src, auth=(arg_username, arg_api_token), timeout=30)
                    # NOTE(review): file handle is not explicitly closed here.
                    open(file_path, 'wb').write(request_emoticons.content)
                # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit.
                except:
                    print(f"WARNING: Skipping emoticon file {file_path} due to issues. url: {emoticon_src}")
        # Every emoticon tag is re-pointed at the local copy, cached or new.
        emoticon['src'] = my_emoticon_path
    # Re-fetch the page (export view) only to reconstruct its original URL
    # for the header link below.
    my_body_export_view = get_body_export_view(arg_site,arg_page_id,arg_username,arg_api_token).json()
    page_url = f"{my_body_export_view['_links']['base']}{my_body_export_view['_links']['webui']}"
    if arg_sphinx_compatible == True:
        styles_dir_relative = f"../{my_vars['styles_dir']}"
    else:
        styles_dir_relative = my_vars['styles_dir']
    # HTML wrapper: title, stylesheet link, and page metadata as <meta> tags.
    my_header = (f"<html>\n"
                 f"<head>\n"
                 f"<title>{arg_title}</title>\n"
                 f"<link rel=\"stylesheet\" href=\"{styles_dir_relative}confluence.css\" type=\"text/css\" />\n"
                 f"<meta name=\"generator\" content=\"confluenceExportHTML\" />\n"
                 f"<META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n"
                 f"<meta name=\"ConfluencePageLabels\" content=\"{arg_page_labels}\">\n"
                 f"<meta name=\"ConfluencePageID\" content=\"{arg_page_id}\">\n"
                 f"<meta name=\"ConfluencePageParent\" content=\"{arg_page_parent}\">\n"
                 f"</head>\n"
                 f"<body>\n"
                 f"<h2>{arg_title}</h2>\n"
                 f"<p>Original URL: <a href=\"{page_url}\"> {arg_title}</a><hr>\n"
                 )
    myFooter = (f"</body>\n"
                f"</html>"
                )
    #
    # At the end of the page, put a link to all attachments.
    #
    if arg_sphinx_compatible == True:
        attach_dir = "../" + my_vars['attach_dir']
    else:
        attach_dir = my_vars['attach_dir']
    # my_pre_footer is only defined when there are attachments; its later use
    # is guarded by the same len() check.
    if len(my_attachments) > 0:
        my_pre_footer = "<h2>Attachments</h2><ol>"
        for attachment in my_attachments:
            my_pre_footer += (f"<li><a href=\"{os.path.join(attach_dir,attachment)}\">{attachment}</a></li>")
        my_pre_footer += "</ol></br>"
    #
    # Putting HTML together
    #
    pretty_html = soup.prettify()
    html_file = open(html_file_path, 'w', encoding='utf-8')
    html_file.write(my_header)
    html_file.write(pretty_html)
    if len(my_attachments) > 0:
        html_file.write(my_pre_footer)
    html_file.write(myFooter)
    html_file.close()
    if arg_html_output == True:
        print(f"Exported HTML file {html_file_path}")
    #
    # convert html to rst
    #
    # NOTE(review): this early return skips the os.remove() cleanup at the
    # bottom, so with arg_rst_output=False the HTML file is always kept
    # regardless of arg_html_output — presumably intended (HTML is then the
    # only output), but confirm.
    if not arg_rst_output:
        return
    rst_file_name = html_file_name.replace(".html", ".rst")
    rst_file_path = os.path.join(my_outdir_content,rst_file_name)
    try:
        # pandoc reads the HTML file just written and emits standalone,
        # unwrapped RST using list-table directives for tables.
        output_rst = pypandoc.convert_file(str(html_file_path), 'rst', format='html',extra_args=['--standalone','--wrap=none','--list-tables'])
    # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit.
    except:
        print("There was an issue generating an RST file from the page.")
    else:
        ##
        ## RST Header with Page Metadata
        ##
        if (arg_sphinx_compatible == True):
            # Sphinx-style field list consumed by a custom extension.
            rst_page_header = (f":conf_pagetype: {arg_type}\n"
                               f":conf_pageid: {arg_page_id}\n"
                               f":conf_parent: {arg_page_parent}\n"
                               f":conf_labels: {arg_page_labels}\n"
                               f":doc_title: {arg_title}\n"
                               f"\n"
                               )
        else:
            # Plain docutils meta directive.
            rst_page_header = (f".. meta::\n"
                               f"    :confluencePageId: {arg_page_id} \n"
                               f"    :confluencePageLabels: {arg_page_labels} \n"
                               f"    :confluencePageParent: {arg_page_parent} \n"
                               f"\n"
                               )
        ## Footer with list of page labels
        if arg_show_labels == True:
            footer_rst = (f"...."
                          f"\n"
                          f"\n**Page labels**: {arg_page_labels} \n")
        else:
            footer_rst = ""
        rst_file = open(rst_file_path, 'w', encoding='utf-8')
        rst_file.write(rst_page_header)
        rst_file.write(output_rst)
        rst_file.write(footer_rst)
        rst_file.close()
        print(f"Exported RST file: {rst_file_path}")
    # The intermediate HTML file is deleted unless explicitly requested.
    if arg_html_output == False:
        os.remove(html_file_path)