pandoc_plugin/convert.py at master · openlibhums/pandoc_plugin · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import mimetypes
import os
import subprocess
import tempfile

from bs4 import BeautifulSoup
from PIL import Image, UnidentifiedImageError

from core.files import IMAGE_MIMETYPES
from utils.logger import get_logger

from plugins.pandoc_plugin import plugin_settings

logger = get_logger(__name__)

# Runtime-system flags appended to every pandoc invocation. They cap the
# size of Pandoc's heap (in megabytes, taken from the plugin settings) so
# that a maliciously formatted file cannot trigger a runaway conversion
# process.
MEMORY_LIMIT_ARG = [
    '+RTS',
    '-M{}M'.format(plugin_settings.MEMORY_LIMIT_MB),
    '-RTS',
]
# Pandoc option that takes the directory where embedded media is written.
EXTRACT_MEDIA = "--extract-media"
# Base of the command line; the remaining arguments are appended to it.
PANDOC_CMD = ['pandoc']

def resize_image(img_path, size):
    """ Resizes the image at the given path in place
    The resized image is written back to ``img_path`` in PNG format,
    whatever the extension of the file is.
    :param img_path: A string with the path to the image to be resized
    :param size: A sequence whose first two items are the target width and
        height in pixels
    :return: None. A missing file is logged and skipped; a file that Pillow
        cannot identify is skipped silently.
    """
    try:
        img = Image.open(img_path)
    except FileNotFoundError:
        logger.warning("File not found, can't resize: %s" % img_path)
        return
    except UnidentifiedImageError:
        # Could be an SVG
        return

    # Close the source file deterministically before overwriting it: the
    # resized copy is a new, fully loaded image that no longer needs it.
    with img:
        resized = img.resize((size[0], size[1]), Image.LANCZOS)

    resized.save(img_path, "png")

def get_img_size(style, default_dpi):
    """ Works out a pixel size from the width/height of an inline style
    :param style: A string with the contents of a ``style`` attribute,
        e.g. ``"width:3.5in;height:2in"``
    :param default_dpi: A pair (horizontal, vertical) of dots per inch used
        to convert the inch measurements into pixels
    :return: A tuple ``(width, height)`` of ints in pixels, or False when
        the style does not declare both dimensions as numbers (optionally
        suffixed with ``in``)
    """
    styles = {}
    for declaration in style.split(";"):
        # partition() tolerates empty declarations (trailing ';', empty
        # style) and values that themselves contain ':'.
        name, separator, value = declaration.partition(":")
        if not separator:
            continue
        styles[name.strip().lower()] = value.strip()

    w = styles.get("width", False)
    h = styles.get("height", False)
    if not (w and h):
        return False

    try:
        return (
            int(_inches_to_float(w) * default_dpi[0]),
            int(_inches_to_float(h) * default_dpi[1]),
        )
    except (ValueError, OverflowError):
        # Not a plain inch measurement (e.g. '50%', '100px', 'auto'):
        # report "no usable size" instead of aborting the conversion.
        return False


def _inches_to_float(value):
    """ Parses a measurement such as ``"3.5in"`` (or a bare number) """
    value = value.strip()
    if value.lower().endswith("in"):
        value = value[:-2]
    return float(value)

def generate_html_from_doc(doc_path, extract_images=False, resize_images=False, default_dpi=(96, 96)):
    """ Generates an HTML galley from the given document path
    :param doc_path: A string with the path to the file to be converted.
        Only ``.docx`` and ``.rtf`` files (any letter case) are accepted
    :param extract_images: When true, pandoc is asked to extract the
        embedded media into a new temporary directory. This function does
        not remove that directory; the returned paths point into it
    :param resize_images: When true, images whose ``style`` attribute
        declares a width and height are resized to that size
    :param default_dpi: A pair (horizontal, vertical) of dots per inch used
        to convert the declared image size into pixels
    :return: The string with the produced HTML and a list of paths to the
        extracted images
    :raises TypeError: If the file extension is not supported
    :raises PandocError: If pandoc exits with a non-zero status
    """
    _, extension = os.path.splitext(doc_path)
    # Compare case-insensitively so that e.g. "PAPER.DOCX" is accepted.
    if extension.lower() not in ['.docx', '.rtf']:
        raise TypeError("File Extension {} not supported".format(extension))

    # Only create the temporary directory when pandoc will write into it,
    # otherwise an empty directory would be left behind on every call.
    if extract_images:
        images_temp_path = tempfile.mkdtemp()
        extract_images_arg = [EXTRACT_MEDIA, images_temp_path]
    else:
        images_temp_path = None
        extract_images_arg = []

    pandoc_command = (
            PANDOC_CMD
            + MEMORY_LIMIT_ARG
            + extract_images_arg
            + ['-s', doc_path, '-t', 'html']
    )
    logger.info("[PANDOC] Running command '{}'".format(pandoc_command))
    try:
        pandoc_return = subprocess.run(
                pandoc_command,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True,
        )
    except subprocess.CalledProcessError as e:
        # stderr is captured as bytes; decode it so the message is readable
        # rather than a b'...' repr.
        stderr = e.stderr
        if isinstance(stderr, bytes):
            stderr = stderr.decode("utf-8", errors="replace")
        raise PandocError(
            "PandocError: {}".format(stderr),
            cmd=pandoc_command,
        ) from e

    # Formatting
    pandoc_soup = BeautifulSoup(pandoc_return.stdout, 'html.parser')
    for img in pandoc_soup.find_all("img"):
        tmp_img_path = img.get("src")
        if tmp_img_path:
            # Replace pandoc path to image with filename
            img["src"] = os.path.basename(tmp_img_path)

            if resize_images and img.has_attr("style"):
                size = get_img_size(img["style"], default_dpi)
                if size:
                    resize_image(tmp_img_path, size)

        # Undo pandoc guess of height/width attributes of images.
        del img["style"]

    image_paths = []
    if images_temp_path is not None:
        image_paths = [
            os.path.join(base, f)
            for base, _dirs, files in os.walk(images_temp_path)
            for f in files
            if mimetypes.guess_type(f)[0] in IMAGE_MIMETYPES
        ]
    logger.debug("[PANDOC] Extracted {} images".format(len(image_paths)))

    return str(pandoc_soup), image_paths


class PandocError(Exception):
    """ Raised when a call to pandoc fails
    :param msg: A string describing the failure; it becomes ``str(error)``
    :param cmd: Optionally, the pandoc command that failed
    """
    def __init__(self, msg, cmd=None):
        # Pass only the message: passing ``self`` as well would make it part
        # of ``args`` and garble the string form of the exception.
        super().__init__(msg)
        self.cmd = cmd