forked from openlibhums/pandoc_plugin
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvert.py
More file actions
79 lines (65 loc) · 2.5 KB
/
convert.py
File metadata and controls
79 lines (65 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import mimetypes
import os
import subprocess
import tempfile
from bs4 import BeautifulSoup
from core.files import IMAGE_MIMETYPES
from utils import models, setting_handler
from utils.logger import get_logger
from plugins.pandoc_plugin import plugin_settings
logger = get_logger(__name__)

# Base command line; further arguments are appended to this list per call.
PANDOC_CMD = ["pandoc"]

# Pandoc option that takes the directory into which media is extracted.
EXTRACT_MEDIA = "--extract-media"

# Argument added to all calls to pandoc that caps the size of Pandoc's heap,
# preventing maliciously formatted files from triggering a runaway
# conversion process. The cap (in megabytes) comes from the plugin settings.
MEMORY_LIMIT_ARG = [
    "+RTS",
    "-M{}M".format(plugin_settings.MEMORY_LIMIT_MB),
    "-RTS",
]
def generate_html_from_doc(doc_path, extract_images=False):
    """ Generates an HTML galley from the given document path

    :param doc_path: A string with the path to the file to be converted.
        Only ``.docx`` and ``.rtf`` files are accepted; the extension is
        matched case-insensitively.
    :param extract_images: When true, pandoc is asked to extract the
        document's media into a freshly created temporary directory. That
        directory is not removed on success, because the returned image
        paths point into it.
    :return: A tuple with the produced HTML as a string and a list of paths
        to the extracted images (empty when ``extract_images`` is false)
    :raises TypeError: If the file extension is not supported
    :raises PandocError: If pandoc exits with a non-zero status
    """
    import shutil

    _, extension = os.path.splitext(doc_path)
    if extension.lower() not in ('.docx', '.rtf'):
        raise TypeError("File Extension {} not supported".format(extension))

    # Only create the scratch directory when pandoc will actually write to
    # it; otherwise every call would leave an empty directory behind.
    if extract_images:
        images_temp_path = tempfile.mkdtemp()
        extract_images_arg = [EXTRACT_MEDIA, images_temp_path]
    else:
        images_temp_path = None
        extract_images_arg = []

    pandoc_command = (
        PANDOC_CMD
        + MEMORY_LIMIT_ARG
        + extract_images_arg
        + ['-s', doc_path, '-t', 'html']
    )

    logger.info("[PANDOC] Running command '{}'".format(pandoc_command))
    try:
        pandoc_return = subprocess.run(
            pandoc_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # Nothing will be returned to the caller, so the scratch directory
        # would otherwise be orphaned.
        if images_temp_path is not None:
            shutil.rmtree(images_temp_path, ignore_errors=True)
        # stderr is captured as bytes; decode it so the message is readable
        # instead of a b'...' literal.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace")
        raise PandocError(
            "PandocError: {}".format(stderr_text),
            cmd=pandoc_command,
        ) from e

    # Formatting
    pandoc_soup = BeautifulSoup(pandoc_return.stdout, 'html.parser')
    for img in pandoc_soup.find_all("img"):
        # Replace pandoc path to image with filename
        src = img.get("src")
        if src:
            img["src"] = os.path.basename(src)
        # Undo pandoc guess of height/width attributes of images.
        del img["style"]

    if images_temp_path is None:
        image_paths = []
    else:
        image_paths = [
            os.path.join(base, f)
            for base, _dirs, files in os.walk(images_temp_path)
            for f in files
            if mimetypes.guess_type(f)[0] in IMAGE_MIMETYPES
        ]
    logger.debug("[PANDOC] Extracted {} images".format(len(image_paths)))

    return str(pandoc_soup), image_paths
class PandocError(Exception):
    """Raised when a pandoc invocation fails.

    :param msg: Human-readable description of the failure; becomes the
        exception message (``str(exc)``)
    :param cmd: Optional command that was run, stored on ``self.cmd``
    """

    def __init__(self, msg, cmd=None):
        # Pass only the message: super() already binds self, so passing it
        # again would put the exception itself into ``args``.
        super().__init__(msg)
        self.cmd = cmd