Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions resources/lib/item_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import xbmcgui

from .utils import (
datetime_from_string, get_art_url, image_url, get_current_datetime
datetime_from_string, get_art_url, image_url, get_current_datetime, plainify_html
)
from .lazylogger import LazyLogger

Expand Down Expand Up @@ -300,7 +300,7 @@ def extract_item_info(item, gui_options):
item_details.resume_time = int(reasonable_ticks / 10000)

item_details.series_name = item.get("SeriesName", '')
item_details.plot = item.get("Overview", '')
item_details.plot = plainify_html(item.get("Overview", ''))

runtime = item.get("RunTimeTicks")
if item_details.is_folder is False and runtime:
Expand Down
123 changes: 123 additions & 0 deletions resources/lib/plainhtmlparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from html.parser import HTMLParser


class PlainHTMLParser(HTMLParser):
    '''
    HTMLParser implementation that strips HTML tags, preserving the content.
    This is not intended to interpret HTML, nor output sanitized and secure HTML
    that's safe to use in a web browser.

    This parses a string that may contain HTML, and removes HTML tags, and content
    that isn't intended for users to read, such as <script> and <style>. It will
    preserve the content of tags that is semantically intended to be read by the
    user, such as <a>, <p>, and <span>.

    Implementation should always be locale independent. It works with the HTML,
    not string/ASCII content.
    '''

    # Known HTML elements, excluding obsolete and deprecated ones.
    #
    # This allows us to separate stylized text from actual HTML. For example,
    # if a description or movie name contains "<3", it will be preserved
    # because "3" is not in the list of known HTML elements.
    #
    # Stored as a frozenset for O(1) membership tests.
    # See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element
    html_elements = frozenset([
        "html", "base", "head", "link", "meta", "style", "title", "body",
        "address", "article", "aside", "footer", "header", "h1", "h2", "h3",
        "h4", "h5", "h6", "hgroup", "main", "nav", "section", "search",
        "blockquote", "dd", "div", "dl", "dt", "figcaption", "figure", "hr",
        "li", "menu", "ol", "p", "pre", "ul", "a", "abbr", "b", "bdi", "bdo",
        "br", "cite", "code", "data", "dfn", "em", "i", "kbd", "mark", "q",
        "rp", "rt", "ruby", "s", "samp", "small", "span", "strong", "sub",
        "sup", "time", "u", "var", "wbr", "area", "audio", "img", "map",
        "track", "video", "embed", "iframe", "object", "picture", "portal",
        "source", "svg", "math", "canvas", "noscript", "script", "del", "ins",
        "caption", "col", "colgroup", "table", "tbody", "td", "tfoot", "th",
        "thead", "tr", "button", "datalist", "fieldset", "form", "input",
        "label", "legend", "meter", "optgroup", "option", "output", "progress",
        "select", "textarea", "details", "dialog", "summary", "slot", "template"
    ])

    # Inline HTML elements. Lets us know which elements don't need to have
    # whitespace appended around their text content.
    inline_elements = frozenset([
        "a", "b", "em", "i", "s", "span", "strong", "sub", "sup", "u"
    ])

    # Denylist of tags with content we don't want to display.
    #
    # We drop <summary> because it's part of the <details> tag, but since we'll
    # always display the details anyway, there is no need for the summary of it.
    tag_denylist = frozenset([
        "head", "meta", "style", "canvas", "noscript", "script", "summary"
    ])

    def __init__(self):
        super().__init__(convert_charrefs=True)
        # Stack of currently-open known HTML elements.
        # NOTE(review): void elements such as <br> and <img> (when written
        # without the self-closing slash) are pushed here and never popped,
        # since they have no end tag — confirm whether this affects spacing
        # for real-world input before tightening.
        self.elements = []
        # Completed plain-text fragments, joined on close().
        self.accumulator = []
        # Text data seen since the last tag boundary, flushed as one unit so
        # the whitespace decision is made once per run of text.
        self.pending_data = []
        # Final plain-text string; populated by close().
        self.result = None

    def handle_starttag(self, tag, _):
        '''Track known start tags; emit unknown ones verbatim.'''
        # Ignore markup nested inside a denylisted element.
        if self.elements and self.elements[-1] in PlainHTMLParser.tag_denylist:
            return

        self.handle_pending_data()

        if tag not in PlainHTMLParser.html_elements:
            # Not real HTML (e.g. "<3 "), so keep the original text.
            self.accumulator.append(self.get_starttag_text())
            return

        self.elements.append(tag)

    def handle_endtag(self, tag):
        '''Pop the matching open element; emit unknown end tags verbatim.'''
        self.handle_pending_data()

        if self.elements and self.elements[-1] == tag:
            self.elements.pop()
            return
        elif self.elements and self.elements[-1] in PlainHTMLParser.tag_denylist:
            return

        if tag not in PlainHTMLParser.html_elements:
            # Preserve unrecognized end tags. get_starttag_text() would
            # return the most recent *start* tag here — wrong for an end
            # tag — so reconstruct the end-tag text instead.
            self.accumulator.append("</" + tag + ">")

    def handle_startendtag(self, tag, _):
        '''Handle self-closing tags (e.g. "<br/>"); emit unknown ones verbatim.'''
        if self.elements and self.elements[-1] in PlainHTMLParser.tag_denylist:
            return

        self.handle_pending_data()

        if tag not in PlainHTMLParser.html_elements:
            self.accumulator.append(self.get_starttag_text())

    def handle_data(self, data):
        '''Buffer text content unless it's inside a denylisted element.'''
        if self.elements and self.elements[-1] in PlainHTMLParser.tag_denylist:
            return

        self.pending_data.append(data)

    def close(self):
        '''Finish parsing and store the plain-text output in self.result.'''
        super().close()
        self.handle_pending_data()
        self.result = "".join(self.accumulator)

    def handle_pending_data(self):
        '''Flush buffered text, inserting a space at block-element boundaries.'''
        if not self.pending_data:
            return

        data_concat = "".join(self.pending_data)

        # Text that starts inside a non-inline (block-level) element gets a
        # separating space so adjacent blocks don't run together.
        if self.accumulator and self.elements and self.elements[-1] not in PlainHTMLParser.inline_elements:
            data_concat = " " + data_concat

        self.accumulator.append(data_concat)
        self.pending_data.clear()
14 changes: 14 additions & 0 deletions resources/lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

from .lazylogger import LazyLogger
from .kodi_utils import HomeWindow
from .plainhtmlparser import PlainHTMLParser

# hack to get datetime strptime loaded
throwaway = time.strptime('20110101', '%Y%m%d')
Expand Down Expand Up @@ -451,3 +452,16 @@ def get_bitrate(enum_value):
7000, 8000, 9000, 10000, 12000, 14000, 16000, 18000,
20000, 25000, 30000, 35000, 40000, 100000, 1000000, 2147483]
return bitrate[int(enum_value) if enum_value else 24] * 1000


def plainify_html(body):
    '''
    Return *body* with HTML markup stripped, keeping human-readable content.

    Raises:
        ValueError: if body is None.
    '''
    if body is None:
        raise ValueError("body must not be None")

    stripper = PlainHTMLParser()
    stripper.feed(body)
    stripper.close()
    return stripper.result