diff --git a/pycaption/base.py b/pycaption/base.py
index 1fa77895..e55b39b4 100644
--- a/pycaption/base.py
+++ b/pycaption/base.py
@@ -1,4 +1,5 @@
 import os
+import re
 from collections import defaultdict
 from datetime import timedelta
 from numbers import Number
@@ -306,6 +307,10 @@ class CaptionSet:
     by all the children.
     """
 
+    # Patterns removed from text nodes by strip_html_tags() / strip_ass_tags().
+    RE_HTML_STRIP = re.compile(r"<[^>]+>")  # any non-empty <...> span
+    RE_ASS_STRIP = re.compile(r"{[^}]+}")  # any non-empty {...} span
+
     def __init__(self, captions, styles={}, layout_info=None):
         """
         :param captions: A dictionary of the format {'language': CaptionList}
@@ -377,6 +382,47 @@ def adjust_caption_timing(self, offset=0, rate_skew=1.0):
                     out_captions.append(caption)
             self.set_captions(lang, out_captions)
 
+    def _strip_text_nodes(self, pattern):
+        """
+        Delete every match of ``pattern`` from the content of each text node,
+        for all languages in this caption set. Nodes are modified in place.
+
+        :param pattern: a compiled regular expression
+        """
+        for lang in self.get_languages():
+            captions = self.get_captions(lang)
+            out_captions = CaptionList()
+            for caption in captions:
+                for node in caption.nodes:
+                    if node.type_ == CaptionNode.TEXT:
+                        node.content = pattern.sub("", node.content)
+                out_captions.append(caption)
+            self.set_captions(lang, out_captions)
+
+    def strip_html_tags(self):
+        """Remove ``<...>`` tags from the text nodes of every caption."""
+        self._strip_text_nodes(self.RE_HTML_STRIP)
+
+    def strip_ass_tags(self):
+        """Remove ``{...}`` (ASS-style) tags from the text nodes of every caption."""
+        self._strip_text_nodes(self.RE_ASS_STRIP)
+
+    def remove_empty_captions(self):
+        """
+        Drop every caption that has no text node with non-whitespace content.
+        """
+        for lang in self.get_languages():
+            captions = self.get_captions(lang)
+            out_captions = CaptionList()
+            for caption in captions:
+                has_text = any(
+                    node.type_ == CaptionNode.TEXT and node.content.strip()
+                    for node in caption.nodes
+                )
+                if has_text:
+                    out_captions.append(caption)
+            self.set_captions(lang, out_captions)
+
 
 # Functions
 def merge_concurrent_captions(caption_set):
diff --git a/pycaption/srt.py b/pycaption/srt.py
index 359978f2..b1770e79 100644
--- a/pycaption/srt.py
+++ b/pycaption/srt.py
@@ -8,7 +8,11 @@
 
 import re
 from PIL import Image, ImageFont, ImageDraw
+import warnings
 
+# NOTE(review): this alters the process-wide warning filters at import time.
+# 'once' shows each distinct DeprecationWarning a single time.
+warnings.simplefilter('once', DeprecationWarning)
 
 class SRTReader(BaseReader):
     RE_HTML = re.compile(r'<[^>]+>')
@@ -22,6 +26,22 @@ def detect(self, content):
             return False
 
     def read(self, content, lang='en-US', strip_html=False, strip_ass_tags=False):
+        if strip_html:
+            warnings.warn(
+                "Using strip_html in the read function is deprecated. "
+                "Use CaptionSet.strip_html_tags() instead",
+                DeprecationWarning,
+                stacklevel=2,  # attribute the warning to the caller of read()
+            )
+
+        if strip_ass_tags:
+            warnings.warn(
+                "Using strip_ass_tags in the read function is deprecated. "
+                "Use CaptionSet.strip_ass_tags() instead",
+                DeprecationWarning,
+                stacklevel=2,  # attribute the warning to the caller of read()
+            )
+
         if not isinstance(content, str):
             raise InvalidInputError('The content is not a unicode string.')
 