From 767f1cb1bb2e4aeba844de3ff57923103648539d Mon Sep 17 00:00:00 2001 From: Andrej Peterka Date: Thu, 12 Jun 2025 10:27:12 +0200 Subject: [PATCH 1/3] Move stripping html and ASS tags to CaptionSet class --- pycaption/base.py | 26 ++++++++++++++++++++++++++ pycaption/srt.py | 10 ++++++++++ 2 files changed, 36 insertions(+) diff --git a/pycaption/base.py b/pycaption/base.py index 1fa77895..eed2c37e 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -1,4 +1,5 @@ import os +import re from collections import defaultdict from datetime import timedelta from numbers import Number @@ -306,6 +307,9 @@ class CaptionSet: by all the children. """ + RE_HTML_STRIP = re.compile(r"<[^>]+>") + RE_ASS_STRIP = re.compile(r"{[^}]+}") + def __init__(self, captions, styles={}, layout_info=None): """ :param captions: A dictionary of the format {'language': CaptionList} @@ -377,6 +381,28 @@ def adjust_caption_timing(self, offset=0, rate_skew=1.0): out_captions.append(caption) self.set_captions(lang, out_captions) + def strip_html_tags(self): + for lang in self.get_languages(): + captions = self.get_captions(lang) + out_captions = CaptionList() + for caption in captions: + for node in caption.nodes: + if node.type_ == CaptionNode.TEXT: + node.content = self.RE_HTML_STRIP.sub("", node.content) + out_captions.append(caption) + self.set_captions(lang, out_captions) + + def strip_ass_tags(self): + for lang in self.get_languages(): + captions = self.get_captions(lang) + out_captions = CaptionList() + for caption in captions: + for node in caption.nodes: + if node.type_ == CaptionNode.TEXT: + node.content = self.RE_ASS_STRIP.sub("", node.content) + out_captions.append(caption) + self.set_captions(lang, out_captions) + # Functions def merge_concurrent_captions(caption_set): diff --git a/pycaption/srt.py b/pycaption/srt.py index 359978f2..b1770e79 100644 --- a/pycaption/srt.py +++ b/pycaption/srt.py @@ -8,7 +8,9 @@ import re from PIL import Image, ImageFont, ImageDraw +import warnings +warnings.simplefilter('once', DeprecationWarning) class SRTReader(BaseReader): RE_HTML = re.compile(r'<[^>]+>') @@ -22,6 +24,14 @@ def detect(self, content): return False def read(self, content, lang='en-US', strip_html=False, strip_ass_tags=False): + if strip_html: + warnings.warn("Using strip_html in the read function is deprecated. " + "Use CaptionSet.strip_html_tags() instead", DeprecationWarning) + + if strip_ass_tags: + warnings.warn("Using strip_ass_tags in the read function is deprecated. " + "Use CaptionSet.strip_ass_tags() instead", DeprecationWarning) + if not isinstance(content, str): raise InvalidInputError('The content is not a unicode string.') From dda2c73b6540313aa9e1d231000aafda3d17ac8f Mon Sep 17 00:00:00 2001 From: Andrej Peterka Date: Thu, 12 Jun 2025 10:36:28 +0200 Subject: [PATCH 2/3] Add function to CaptionSet to remove empty captions --- pycaption/base.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pycaption/base.py b/pycaption/base.py index eed2c37e..189dcff0 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -403,6 +403,25 @@ def strip_ass_tags(self): out_captions.append(caption) self.set_captions(lang, out_captions) + def remove_empty_captions(self): + for lang in self.get_languages(): + captions = self.get_captions(lang) + out_captions = CaptionList() + for caption in captions: + valid_nodes = [] + for node in caption.nodes: + if node.type_ == CaptionNode.TEXT: + if node.content.strip(): + valid_nodes.append(node) + else: + valid_nodes.append(node) + + valid_text_nodes = [node for node in valid_nodes if node.type_ == CaptionNode.TEXT] + if valid_text_nodes: + out_captions.append(caption) + + self.set_captions(lang, out_captions) + # Functions def merge_concurrent_captions(caption_set): From 496be9112d87fab7f3c9e418c6b29a60cc6b7537 Mon Sep 17 00:00:00 2001 From: Andrej Peterka Date: Thu, 12 Jun 2025 10:39:21 +0200 Subject: [PATCH 3/3] Refactor --- pycaption/base.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/pycaption/base.py b/pycaption/base.py index 189dcff0..e55b39b4 100644 --- a/pycaption/base.py +++ b/pycaption/base.py @@ -408,18 +408,11 @@ def remove_empty_captions(self): captions = self.get_captions(lang) out_captions = CaptionList() for caption in captions: - valid_nodes = [] - for node in caption.nodes: - if node.type_ == CaptionNode.TEXT: - if node.content.strip(): - valid_nodes.append(node) - else: - valid_nodes.append(node) - - valid_text_nodes = [node for node in valid_nodes if node.type_ == CaptionNode.TEXT] + valid_text_nodes = [ + node for node in caption.nodes if node.type_ == CaptionNode.TEXT and node.content.strip() + ] if valid_text_nodes: out_captions.append(caption) - self.set_captions(lang, out_captions)