From 767f1cb1bb2e4aeba844de3ff57923103648539d Mon Sep 17 00:00:00 2001
From: Andrej Peterka <andrej.peterka@castlabs.com>
Date: Thu, 12 Jun 2025 10:27:12 +0200
Subject: [PATCH 1/3] Move stripping html and ASS tags to CaptionSet class

---
 pycaption/base.py | 26 ++++++++++++++++++++++++++
 pycaption/srt.py  | 10 ++++++++++
 2 files changed, 36 insertions(+)

diff --git a/pycaption/base.py b/pycaption/base.py
index 1fa77895..eed2c37e 100644
--- a/pycaption/base.py
+++ b/pycaption/base.py
@@ -1,4 +1,5 @@
 import os
+import re
 from collections import defaultdict
 from datetime import timedelta
 from numbers import Number
@@ -306,6 +307,9 @@ class CaptionSet:
     by all the children.
     """
 
+    RE_HTML_STRIP = re.compile(r"<[^>]+>")
+    RE_ASS_STRIP = re.compile(r"{[^}]+}")
+
     def __init__(self, captions, styles={}, layout_info=None):
         """
         :param captions: A dictionary of the format {'language': CaptionList}
@@ -377,6 +381,28 @@ def adjust_caption_timing(self, offset=0, rate_skew=1.0):
                     out_captions.append(caption)
             self.set_captions(lang, out_captions)
 
+    def strip_html_tags(self):
+        """Remove HTML-style tags (``<...>``) from every text node, in place."""
+        for lang in self.get_languages():
+            stripped = CaptionList()
+            for caption in self.get_captions(lang):
+                for node in caption.nodes:
+                    if node.type_ == CaptionNode.TEXT:
+                        node.content = self.RE_HTML_STRIP.sub("", node.content)
+                stripped.append(caption)
+            self.set_captions(lang, stripped)
+
+    def strip_ass_tags(self):
+        """Remove ASS-style ``{...}`` tags from every text node, in place."""
+        for lang in self.get_languages():
+            out_captions = CaptionList()
+            for caption in self.get_captions(lang):
+                text_nodes = (n for n in caption.nodes if n.type_ == CaptionNode.TEXT)
+                for node in text_nodes:
+                    node.content = self.RE_ASS_STRIP.sub("", node.content)
+                out_captions.append(caption)
+            self.set_captions(lang, out_captions)
+
 
 # Functions
 def merge_concurrent_captions(caption_set):
diff --git a/pycaption/srt.py b/pycaption/srt.py
index 359978f2..b1770e79 100644
--- a/pycaption/srt.py
+++ b/pycaption/srt.py
@@ -8,7 +8,9 @@
 
 import re
 from PIL import Image, ImageFont, ImageDraw
+import warnings
 
+warnings.simplefilter('once', DeprecationWarning)  # NOTE(review): changes the global warning filters at import time - confirm intended
 
 class SRTReader(BaseReader):
     RE_HTML = re.compile(r'<[^>]+>')
@@ -22,6 +24,14 @@ def detect(self, content):
             return False
 
     def read(self, content, lang='en-US', strip_html=False, strip_ass_tags=False):
+        # stacklevel=2 attributes each warning to the caller of read(), not to this module.
+        if strip_html:
+            warnings.warn("Using strip_html in the read function is deprecated. "
+                          "Use CaptionSet.strip_html_tags() instead", DeprecationWarning, stacklevel=2)
+        if strip_ass_tags:
+            warnings.warn("Using strip_ass_tags in the read function is deprecated. "
+                          "Use CaptionSet.strip_ass_tags() instead", DeprecationWarning, stacklevel=2)
+
         if not isinstance(content, str):
             raise InvalidInputError('The content is not a unicode string.')
 

From dda2c73b6540313aa9e1d231000aafda3d17ac8f Mon Sep 17 00:00:00 2001
From: Andrej Peterka <andrej.peterka@castlabs.com>
Date: Thu, 12 Jun 2025 10:36:28 +0200
Subject: [PATCH 2/3] Add function to CaptionSet to remove empty captions

---
 pycaption/base.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/pycaption/base.py b/pycaption/base.py
index eed2c37e..189dcff0 100644
--- a/pycaption/base.py
+++ b/pycaption/base.py
@@ -403,6 +403,25 @@ def strip_ass_tags(self):
                 out_captions.append(caption)
             self.set_captions(lang, out_captions)
 
+    def remove_empty_captions(self):  # keep only captions that have at least one non-blank TEXT node
+        for lang in self.get_languages():
+            captions = self.get_captions(lang)
+            out_captions = CaptionList()
+            for caption in captions:
+                valid_nodes = []
+                for node in caption.nodes:
+                    if node.type_ == CaptionNode.TEXT:
+                        if node.content.strip():
+                            valid_nodes.append(node)
+                    else:
+                        valid_nodes.append(node)
+
+                valid_text_nodes = [node for node in valid_nodes if node.type_ == CaptionNode.TEXT]
+                if valid_text_nodes:
+                    out_captions.append(caption)
+
+            self.set_captions(lang, out_captions)
+
 
 # Functions
 def merge_concurrent_captions(caption_set):

From 496be9112d87fab7f3c9e418c6b29a60cc6b7537 Mon Sep 17 00:00:00 2001
From: Andrej Peterka <andrej.peterka@castlabs.com>
Date: Thu, 12 Jun 2025 10:39:21 +0200
Subject: [PATCH 3/3] Refactor remove_empty_captions to use a single comprehension

---
 pycaption/base.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/pycaption/base.py b/pycaption/base.py
index 189dcff0..e55b39b4 100644
--- a/pycaption/base.py
+++ b/pycaption/base.py
@@ -408,18 +408,11 @@ def remove_empty_captions(self):
             captions = self.get_captions(lang)
             out_captions = CaptionList()
             for caption in captions:
-                valid_nodes = []
-                for node in caption.nodes:
-                    if node.type_ == CaptionNode.TEXT:
-                        if node.content.strip():
-                            valid_nodes.append(node)
-                    else:
-                        valid_nodes.append(node)
-
-                valid_text_nodes = [node for node in valid_nodes if node.type_ == CaptionNode.TEXT]
+                valid_text_nodes = [
+                    node for node in caption.nodes if node.type_ == CaptionNode.TEXT and node.content.strip()
+                ]
                 if valid_text_nodes:
                     out_captions.append(caption)
-
             self.set_captions(lang, out_captions)