castlabs · andrej-peterka · Jun 12, 2025 · Jun 12, 2025 · Jun 12, 2025 · Jun 12, 2025
diff --git a/pycaption/base.py b/pycaption/base.py
@@ -1,4 +1,5 @@
 import os
+import re
 from collections import defaultdict
 from datetime import timedelta
 from numbers import Number
@@ -306,6 +307,9 @@ class CaptionSet:
     by all the children.
     """
 
+    RE_HTML_STRIP = re.compile(r"<[^>]+>")  # "<", one or more non-">" characters, then ">"
+    RE_ASS_STRIP = re.compile(r"{[^}]+}")  # "{", one or more non-"}" characters, then "}"
+
     def __init__(self, captions, styles={}, layout_info=None):
         """
         :param captions: A dictionary of the format {'language': CaptionList}
@@ -377,6 +381,40 @@ def adjust_caption_timing(self, offset=0, rate_skew=1.0):
                     out_captions.append(caption)
             self.set_captions(lang, out_captions)
 
+    def strip_html_tags(self):
+        for lang in self.get_languages():
+            captions = self.get_captions(lang)
+            out_captions = CaptionList()
+            for caption in captions:
+                for node in caption.nodes:
+                    if node.type_ == CaptionNode.TEXT:
+                        node.content = self.RE_HTML_STRIP.sub("", node.content)
+                out_captions.append(caption)
+            self.set_captions(lang, out_captions)
+
+    def strip_ass_tags(self):
+        for lang in self.get_languages():
+            captions = self.get_captions(lang)
+            out_captions = CaptionList()
+            for caption in captions:
+                for node in caption.nodes:
+                    if node.type_ == CaptionNode.TEXT:
+                        node.content = self.RE_ASS_STRIP.sub("", node.content)
+                out_captions.append(caption)
+            self.set_captions(lang, out_captions)
+
+    def remove_empty_captions(self):
+        for lang in self.get_languages():
+            captions = self.get_captions(lang)
+            out_captions = CaptionList()
+            for caption in captions:
+                valid_text_nodes = [
+                    node for node in caption.nodes if node.type_ == CaptionNode.TEXT and node.content.strip()
+                ]
+                if valid_text_nodes:
+                    out_captions.append(caption)
+            self.set_captions(lang, out_captions)
+
 
 # Functions
 def merge_concurrent_captions(caption_set):

diff --git a/pycaption/srt.py b/pycaption/srt.py
@@ -8,7 +8,9 @@
 
 import re
 from PIL import Image, ImageFont, ImageDraw
+import warnings
 
+warnings.simplefilter('once', DeprecationWarning)  # NOTE(review): runs at import time and edits the global warning filters; confirm this is intended for a library module
 
 class SRTReader(BaseReader):
     RE_HTML = re.compile(r'<[^>]+>')
@@ -22,6 +24,14 @@ def detect(self, content):
             return False
 
     def read(self, content, lang='en-US', strip_html=False, strip_ass_tags=False):
+        if strip_html:
+            warnings.warn("Using strip_html in the read function is deprecated. "
+                          "Use CaptionSet.strip_html_tags() instead", DeprecationWarning)
+
+        if strip_ass_tags:
+            warnings.warn("Using strip_ass_tags in the read function is deprecated. "
+                          "Use CaptionSet.strip_ass_tags() instead", DeprecationWarning)
+
         if not isinstance(content, str):
             raise InvalidInputError('The content is not a unicode string.')