-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathconvert.py
More file actions
110 lines (90 loc) · 3.44 KB
/
convert.py
File metadata and controls
110 lines (90 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import mimetypes
import os
import shutil
import subprocess
import tempfile

from bs4 import BeautifulSoup
from PIL import Image, UnidentifiedImageError

from core.files import IMAGE_MIMETYPES
from utils.logger import get_logger
from plugins.pandoc_plugin import plugin_settings
logger = get_logger(__name__)

# Executable (as an argv prefix) used for every pandoc invocation.
PANDOC_CMD = ['pandoc']

# Pandoc option that is followed by the directory where media is extracted.
EXTRACT_MEDIA = "--extract-media"

# Runtime-system flags appended to every pandoc call. They cap the size of
# Pandoc's heap (in megabytes, taken from the plugin settings) so that a
# maliciously formatted file cannot trigger a runaway conversion process.
MEMORY_LIMIT_ARG = [
    '+RTS',
    '-M{limit}M'.format(limit=plugin_settings.MEMORY_LIMIT_MB),
    '-RTS',
]
def resize_image(img_path, size):
    """ Resizes the image at the given path in place, re-saving it as PNG
    :param img_path: A string with the path to the image file; the file is
        overwritten with the resized image
    :param size: A sequence whose first two items are the target width and
        height
    :return: None. Nothing is written when the file is missing (a warning is
        logged) or when PIL cannot identify it as an image
    """
    try:
        img = Image.open(img_path)
    except FileNotFoundError:
        logger.warning("File not found, can't resize: %s" % img_path)
        return
    except UnidentifiedImageError:
        # Could be an SVG
        return

    # Release the handle on the source file before the same path is
    # overwritten below; the resized copy no longer depends on it.
    with img:
        resized = img.resize((size[0], size[1]), Image.LANCZOS)
    resized.save(img_path, "png")
def _inches(value):
    """ Parses a CSS length given in inches (e.g. "1.5in") into a float """
    value = value.strip().lower()
    # Remove the unit as a suffix; str.strip("in") would strip characters.
    if value.endswith("in"):
        value = value[:-2]
    return float(value)


def get_img_size(style, default_dpi):
    """ Works out the pixel size described by an inline CSS style
    :param style: A string of CSS declarations such as
        "width:2in;height:1.5in"
    :param default_dpi: A pair with the horizontal and vertical dots per
        inch used to convert inches into pixels
    :return: A (width, height) tuple of ints in pixels, or False when the
        style lacks a width or a height, uses a length that is not expressed
        in inches, or describes a non-positive size
    """
    styles = {}
    for declaration in style.split(";"):
        # partition() copes with empty fragments (e.g. a trailing ";") and
        # with values that themselves contain ":".
        name, separator, value = declaration.partition(":")
        if separator:
            styles[name.strip().lower()] = value.strip()

    w = styles.get("width", False)
    h = styles.get("height", False)
    if not (w and h):
        return False

    try:
        size = (
            int(_inches(w) * default_dpi[0]),
            int(_inches(h) * default_dpi[1]),
        )
    except (ValueError, OverflowError):
        # Unsupported unit (e.g. "50%", "100px") or a non-finite number.
        return False

    if size[0] <= 0 or size[1] <= 0:
        return False
    return size
def _is_within_dir(path, directory):
    """ Tells whether path is located inside directory (no symlink resolution) """
    directory = os.path.abspath(directory)
    try:
        return os.path.commonpath([directory, os.path.abspath(path)]) == directory
    except ValueError:
        # Raised e.g. when the two paths live on different drives.
        return False


def generate_html_from_doc(doc_path, extract_images=False, resize_images=False, default_dpi=(96, 96)):
    """ Generates an HTML galley from the given document path
    :param doc_path: A string with the path to the file to be converted
    :param extract_images: When true, pandoc extracts the document media
        into a new temporary directory
    :param resize_images: When true, extracted images carrying a style with
        a width and a height in inches are resized to that size
    :param default_dpi: A pair with the horizontal and vertical dots per
        inch used to convert the sizes in inches into pixels
    :return: The string with the produced HTML and a list of paths to the
        extracted images. The temporary directory holding them is not
        removed on success, since the returned paths point into it
    :raises TypeError: When the file extension is not .docx or .rtf
    :raises PandocError: When pandoc exits with a non-zero status
    """
    _, extension = os.path.splitext(doc_path)
    # Compare case-insensitively so that e.g. "paper.DOCX" is accepted.
    if extension.lower() not in ('.docx', '.rtf'):
        raise TypeError("File Extension {} not supported".format(extension))

    # Only create the temporary directory when pandoc will write into it,
    # otherwise every call would leave an empty directory behind.
    images_temp_path = None
    extract_images_arg = []
    if extract_images:
        images_temp_path = tempfile.mkdtemp()
        extract_images_arg = [EXTRACT_MEDIA, images_temp_path]

    pandoc_command = (
        PANDOC_CMD
        + MEMORY_LIMIT_ARG
        + extract_images_arg
        + ['-s', doc_path, '-t', 'html']
    )
    logger.info("[PANDOC] Running command '{}'".format(pandoc_command))
    try:
        pandoc_return = subprocess.run(
            pandoc_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # Nothing is returned to the caller, so nobody else can clean up.
        if images_temp_path is not None:
            shutil.rmtree(images_temp_path, ignore_errors=True)
        stderr = e.stderr
        if isinstance(stderr, bytes):
            # Decode so the message shows pandoc's text, not a bytes repr.
            stderr = stderr.decode("utf-8", errors="replace")
        raise PandocError(
            "PandocError: {}".format(stderr),
            cmd=pandoc_command,
        ) from e

    # Formatting
    pandoc_soup = BeautifulSoup(pandoc_return.stdout, 'html.parser')
    for img in pandoc_soup.find_all("img"):
        tmp_img_path = img.get("src")
        if tmp_img_path:
            # Replace pandoc path to image with filename
            img["src"] = os.path.basename(tmp_img_path)
            if resize_images and img.has_attr("style"):
                size = get_img_size(img["style"], default_dpi)
                if size:
                    # The path comes from the converted (untrusted) document:
                    # only touch files inside our own extraction directory.
                    if (
                        images_temp_path is not None
                        and _is_within_dir(tmp_img_path, images_temp_path)
                    ):
                        resize_image(tmp_img_path, size)
                    else:
                        logger.warning(
                            "[PANDOC] Not resizing image outside of the "
                            "extraction directory: %s" % tmp_img_path
                        )
        # Undo pandoc guess of height/width attributes of images.
        if img.has_attr("style"):
            del img["style"]

    if images_temp_path is None:
        image_paths = []
    else:
        image_paths = [
            os.path.join(base, f)
            for base, dirs, files in os.walk(images_temp_path)
            for f in files
            if mimetypes.guess_type(f)[0] in IMAGE_MIMETYPES
        ]
    logger.debug("[PANDOC] Extracted {} images".format(len(image_paths)))

    return str(pandoc_soup), image_paths
class PandocError(Exception):
    """ Raised when a pandoc invocation fails
    :param msg: A string describing the failure
    :param cmd: The pandoc command that was run, if known
    """

    def __init__(self, msg, cmd=None):
        # Pass only the message: super() already binds self, and passing it
        # again would make args (self, msg) and garble str(exception).
        super().__init__(msg)
        self.cmd = cmd