From 1bebb7970ff74c3baf5e3f39b0d914e08178475f Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Wed, 11 Mar 2026 09:40:03 +0000
Subject: [PATCH 01/19] integrating xray and stopwords

---
 redactor/core/redaction/file_processor.py | 35 +++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index 8a811c3c..c950c7fa 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -1,5 +1,6 @@
 import json
 import pymupdf
+import os
 
 from typing import Set, Type, List, Any, Dict, Tuple
 from abc import ABC, abstractmethod
@@ -7,6 +8,7 @@
 from PIL import Image
 from pydantic import BaseModel
 from itertools import chain
+from yaml import safe_load
 
 from core.redaction.redactor import (
     Redactor,
@@ -31,6 +33,7 @@
 from core.util.logging_util import LoggingUtil, log_to_appins
 from core.util.types import PydanticImage
 import dataclasses
+import xray
 
 
 class FileProcessor(ABC):
@@ -195,6 +198,28 @@ def _extract_pdf_text(self, file_bytes: BytesIO) -> str:
             return None
         return "\n".join(page for page in pages)
 
+    def _find_bad_redactions(self, file_bytes: BytesIO):
+        """
+        Return a list of bad redactions in the give PDF
+
+        :param BytesIO file_bytes: Bytes stream for the PDF
+        :return List[]: the bad redaction strings
+        """
+        pdf = pymupdf.open(stream=file_bytes)
+        bad_redactions = xray.inspect(pdf)
+        bad_redactions_list = [item["text"] for items in bad_redactions.values() for item in items]
+        return bad_redactions_list
+    
+    def _load_stopwords(self): 
+        """
+        Check the text_to_redact list against the list in the stopwords yaml
+
+        :return List[]: the bad redaction strings
+        """
+        stopwords = safe_load(open(os.path.join("config", "stopwords.yaml"), "r"))
+        stopword_list = stopwords["stopwords"]
+        return stopword_list
+
     def _extract_pdf_images(self, file_bytes: BytesIO):
         """
         Return the images of the given PDF as a list of PDFImageMetadata objects
@@ -1014,6 +1039,16 @@ def redact(
             for result in text_redaction_results
             for redaction_string in result.redaction_strings
         ]
+        # Add bad redactions to the text redaction list
+        pdf = pymupdf.open(stream=file_bytes)
+        bad_redactions = xray.inspect(pdf)
+        bad_redactions_list = [item["text"] for items in bad_redactions.values() for item in items]
+        text_redactions = text_redactions + bad_redactions_list
+        # Remove stopwords from text redaction list
+        stopwords = safe_load(open(os.path.join("config", "stopwords.yaml"), "r"))
+        stopword_list = stopwords["stopwords"]
+        text_redactions = text_redactions - stopword_list
+
         image_redaction_results: List[ImageRedactionResult] = [
             x
             for x in redaction_results

From 6663c907e3c90df1e1bd7f62d81b7e4a2d4eab8b Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Fri, 13 Mar 2026 13:10:24 +0000
Subject: [PATCH 02/19] xray + redactions functions

---
 redactor/core/redaction/file_processor.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index c950c7fa..36d28b74 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -1040,13 +1040,10 @@ def redact(
             for redaction_string in result.redaction_strings
         ]
         # Add bad redactions to the text redaction list
-        pdf = pymupdf.open(stream=file_bytes)
-        bad_redactions = xray.inspect(pdf)
-        bad_redactions_list = [item["text"] for items in bad_redactions.values() for item in items]
+        bad_redactions_list = self._find_bad_redactions(file_bytes)
         text_redactions = text_redactions + bad_redactions_list
         # Remove stopwords from text redaction list
-        stopwords = safe_load(open(os.path.join("config", "stopwords.yaml"), "r"))
-        stopword_list = stopwords["stopwords"]
+        stopword_list = self._load_stopwords()
         text_redactions = text_redactions - stopword_list
 
         image_redaction_results: List[ImageRedactionResult] = [

From 64013e774e08e26e6bbcfea9532633fcf49ccbc2 Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Fri, 13 Mar 2026 13:23:53 +0000
Subject: [PATCH 03/19] requirements updated

---
 redactor/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/redactor/requirements.txt b/redactor/requirements.txt
index 20a84b2b..eabf86fd 100644
--- a/redactor/requirements.txt
+++ b/redactor/requirements.txt
@@ -29,4 +29,5 @@ PyYAML==6.0.3
 ruff==0.14.7
 tiktoken==0.12.0
 unidecode==1.4.0
-StrEnum==0.4.15  # Not ideal, but this is needed due to compatibility issues between ADO agents and the Function App
\ No newline at end of file
+StrEnum==0.4.15  # Not ideal, but this is needed due to compatibility issues between ADO agents and the Function App
+xray==0.7.0
\ No newline at end of file

From a059cd68c7a22eb2d0ef621e6c47aad8f247a4d8 Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Mon, 16 Mar 2026 08:48:18 +0000
Subject: [PATCH 04/19] Unit tests added

---
 .../file_processor/test_pdf_processor.py      | 41 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py b/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py
index 64edb06d..a8563046 100644
--- a/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py
+++ b/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py
@@ -3,7 +3,8 @@
 
 from PIL import Image
 from io import BytesIO
-from mock import patch, Mock
+import mock
+from mock import patch, Mock, MagicMock
 
 from core.redaction.file_processor import (
     PDFProcessor,
@@ -960,3 +961,41 @@ def test__pdf_processor__apply():
     assert expected_image == actual_image, (
         "Expected the image in the pdf to be redacted, but it did not match the redacted sample"
     )
+
+def test_find_bad_redactions():
+    """
+    - Given i have a pdf file with some content
+    - When i call PDFProcessor._find_bad_redactions
+    - The content is returned as a list
+    """
+    file_bytes = BytesIO(b"fake pdf bytes")
+    mock_pdf = MagicMock()
+    mock_inspect_result = {
+        "page1": [{"text": "secret"}, {"text": "password"}],
+        "page2": [{"text": "token"}],
+    }
+    with patch("pymupdf.open", return_value=mock_pdf) as mock_open:
+        with patch("xray.inspect", return_value=mock_inspect_result) as mock_inspect:
+            obj = PDFProcessor()
+            result = obj._find_bad_redactions(file_bytes)
+
+    assert result == ["secret", "password", "token"]
+    mock_open.assert_called_once_with(stream=file_bytes)
+    mock_inspect.assert_called_once_with(mock_pdf)
+
+def test_load_stopwords():
+    """
+    - Given i have a yaml file with some content
+    - When i call PDFProcessor._load_stopwords
+    - The yaml content is returned as a list
+    """
+    mock_config_file_content = """
+    stopwords:
+    - the
+    - test
+    """
+    expected_output = ["the","test"]
+    with mock.patch(
+        "builtins.open", mock.mock_open(read_data=mock_config_file_content)
+    ):
+        assert PDFProcessor._load_stopwords("some_file") == expected_output
\ No newline at end of file

From f68ac0def2426a53ae86bb436de37a221fe6b947 Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Mon, 16 Mar 2026 10:22:42 +0000
Subject: [PATCH 05/19] Adding collections.abc to reqs to attempt import error
 resolution

---
 redactor/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/redactor/requirements.txt b/redactor/requirements.txt
index eabf86fd..cba09064 100644
--- a/redactor/requirements.txt
+++ b/redactor/requirements.txt
@@ -30,4 +30,5 @@ ruff==0.14.7
 tiktoken==0.12.0
 unidecode==1.4.0
 StrEnum==0.4.15  # Not ideal, but this is needed due to compatibility issues between ADO agents and the Function App
-xray==0.7.0
\ No newline at end of file
+xray==0.7.0
+pycopy-collections.abc==0.0.0 # Dummy as this should be included in standard packages
\ No newline at end of file

From 4b7ae806a3863d404dcf5b3cdd79ac2bdc27b3cf Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Mon, 16 Mar 2026 16:00:35 +0000
Subject: [PATCH 06/19] Stopwords yaml

---
 redactor/config/stopwords.yaml | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 redactor/config/stopwords.yaml

diff --git a/redactor/config/stopwords.yaml b/redactor/config/stopwords.yaml
new file mode 100644
index 00000000..076bba37
--- /dev/null
+++ b/redactor/config/stopwords.yaml
@@ -0,0 +1,3 @@
+stopwords:
+  - "the"
+  - "my"
\ No newline at end of file

From 5bf767028d74eb4b740d7120f7feecd4d4288449 Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Tue, 17 Mar 2026 14:47:35 +0000
Subject: [PATCH 07/19] removed unused import

---
 redactor/core/redaction/file_processor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index 09efe604..828ae2e6 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -35,7 +35,6 @@
 from core.util.text_util import is_english_text, get_normalised_words, normalise_text
 from core.util.logging_util import LoggingUtil, log_to_appins
 from core.util.types import PydanticImage
-import dataclasses
 import xray
 from core.util.metric_util import MetricUtil
 

From b184c2e53dcd1bc0d4dac252cbab28753e1dc4b7 Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Wed, 18 Mar 2026 15:18:45 +0000
Subject: [PATCH 08/19] x-ray req file updated

---
 redactor/core/redaction/file_processor.py | 1 +
 redactor/requirements.txt                 | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index 828ae2e6..a34c8349 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -245,6 +245,7 @@ def _find_bad_redactions(self, file_bytes: BytesIO):
         pdf = pymupdf.open(stream=file_bytes)
         bad_redactions = xray.inspect(pdf)
         bad_redactions_list = [item["text"] for items in bad_redactions.values() for item in items]
+        print(bad_redactions_list)
         return bad_redactions_list
     
     def _load_stopwords(self): 
diff --git a/redactor/requirements.txt b/redactor/requirements.txt
index cba09064..fccf053d 100644
--- a/redactor/requirements.txt
+++ b/redactor/requirements.txt
@@ -30,5 +30,5 @@ ruff==0.14.7
 tiktoken==0.12.0
 unidecode==1.4.0
 StrEnum==0.4.15  # Not ideal, but this is needed due to compatibility issues between ADO agents and the Function App
-xray==0.7.0
+x-ray==0.3.6
 pycopy-collections.abc==0.0.0 # Dummy as this should be included in standard packages
\ No newline at end of file

From d19632a104d0df4b5ad43f9cde07aa002c18efbb Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Wed, 18 Mar 2026 15:33:44 +0000
Subject: [PATCH 09/19] Amend to add seek command to function - should work
 correctly now

---
 redactor/core/redaction/file_processor.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index a34c8349..2f0b10ff 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -238,14 +238,12 @@ def _extract_pdf_text(self, file_bytes: BytesIO) -> str:
     def _find_bad_redactions(self, file_bytes: BytesIO):
         """
         Return a list of bad redactions in the give PDF
-
         :param BytesIO file_bytes: Bytes stream for the PDF
         :return List[]: the bad redaction strings
         """
-        pdf = pymupdf.open(stream=file_bytes)
-        bad_redactions = xray.inspect(pdf)
+        file_bytes.seek(0)
+        bad_redactions = xray.inspect(file_bytes.read())
         bad_redactions_list = [item["text"] for items in bad_redactions.values() for item in items]
-        print(bad_redactions_list)
         return bad_redactions_list
     
     def _load_stopwords(self): 

From 96696dea555134f4172afd2aec1f3fadd64762df Mon Sep 17 00:00:00 2001
From: Shannon Williams <shannon.williams.f4@planninginspectorate.gov.uk>
Date: Fri, 13 Mar 2026 13:51:32 +0000
Subject: [PATCH 10/19] Correct comparison and output analytics (#74)

Switch "apply" and "redact" names in analytics output file. Make sure annotations are treated as positive predictions if `isRedactionCandidate` is `True`. Remove `pandas` to make unit tests pass and improve robustness
---
 redactor/core/redaction_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/redactor/core/redaction_manager.py b/redactor/core/redaction_manager.py
index c17904fb..c2567631 100644
--- a/redactor/core/redaction_manager.py
+++ b/redactor/core/redaction_manager.py
@@ -353,7 +353,7 @@ def _compare_redactions(
             proposed_candidates = [
                 {k: v for k, v in ann.items() if k in attrs_to_compare}
                 for ann in proposed_annots_on_page
-                if ann.get("isRedactionCandidate", False)
+                if ann.get("isRedactionCandidate", True)
             ]
             n_proposed_redactions += len(proposed_candidates)
 

From e9948171cc5870a5b05b5acf559ed0dbd19296cf Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Wed, 18 Mar 2026 15:18:45 +0000
Subject: [PATCH 11/19] x-ray req file updated

---
 redactor/core/redaction/file_processor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index db0f5189..0435208f 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -271,6 +271,7 @@ def _find_bad_redactions(self, file_bytes: BytesIO):
         file_bytes.seek(0)
         bad_redactions = xray.inspect(file_bytes.read())
         bad_redactions_list = [item["text"] for items in bad_redactions.values() for item in items]
+        print(bad_redactions_list)
         return bad_redactions_list
     
     def _load_stopwords(self): 

From cd5df3ceee0d088c04282b515c79506b3a1a332f Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Wed, 18 Mar 2026 15:33:44 +0000
Subject: [PATCH 12/19] Amend to add seek command to function - should work
 correctly now

---
 redactor/core/redaction/file_processor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index 0435208f..db0f5189 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -271,7 +271,6 @@ def _find_bad_redactions(self, file_bytes: BytesIO):
         file_bytes.seek(0)
         bad_redactions = xray.inspect(file_bytes.read())
         bad_redactions_list = [item["text"] for items in bad_redactions.values() for item in items]
-        print(bad_redactions_list)
         return bad_redactions_list
     
     def _load_stopwords(self): 

From 5c1e4ea1867b3cd5d7628199fb677342832f0e34 Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Tue, 24 Mar 2026 15:58:44 +0000
Subject: [PATCH 13/19] ruff formatting

---
 redactor/core/redaction/file_processor.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index db0f5189..5c637d57 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -270,10 +270,12 @@ def _find_bad_redactions(self, file_bytes: BytesIO):
         """
         file_bytes.seek(0)
         bad_redactions = xray.inspect(file_bytes.read())
-        bad_redactions_list = [item["text"] for items in bad_redactions.values() for item in items]
+        bad_redactions_list = [
+            item["text"] for items in bad_redactions.values() for item in items
+        ]
         return bad_redactions_list
-    
-    def _load_stopwords(self): 
+
+    def _load_stopwords(self):
         """
         Check the text_to_redact list against the list in the stopwords yaml
 

From 2bc837f42ed04937de290876f12392f5996ae017 Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Tue, 24 Mar 2026 16:05:39 +0000
Subject: [PATCH 14/19] Attempt without pycopy-collections

---
 redactor/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/redactor/requirements.txt b/redactor/requirements.txt
index 6ea1b3d1..e933c7c7 100644
--- a/redactor/requirements.txt
+++ b/redactor/requirements.txt
@@ -31,5 +31,5 @@ tiktoken==0.12.0
 unidecode==1.4.0
 StrEnum==0.4.15  # Not ideal, but this is needed due to compatibility issues between ADO agents and the Function App
 x-ray==0.3.6
-pycopy-collections.abc==0.0.0 # Dummy as this should be included in standard packages
+# pycopy-collections.abc==0.0.0 # Dummy as this should be included in standard packages
 numpy==2.2.6

From 311283e43334f0f622788686f821346a57bf5a5f Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Tue, 24 Mar 2026 16:16:04 +0000
Subject: [PATCH 15/19] ruff fix

---
 redactor/core/redaction/file_processor.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index 5c637d57..353be7ca 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -10,9 +10,8 @@
 from io import BytesIO
 from PIL import Image
 from pydantic import BaseModel
-from itertools import chain
 from yaml import safe_load
-from pydantic import BaseModel, Field
+from pydantic import Field
 from time import time
 from datetime import datetime
 

From 5761e50b77d66120ecaa9d0155626a2e118ac747 Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Tue, 24 Mar 2026 16:27:43 +0000
Subject: [PATCH 16/19] Removing bad redactions, leaving stopwords

---
 redactor/core/redaction/file_processor.py     | 17 -------------
 redactor/requirements.txt                     |  2 --
 .../file_processor/test_pdf_processor.py      | 24 ++-----------------
 3 files changed, 2 insertions(+), 41 deletions(-)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index 353be7ca..5cb611b2 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -37,7 +37,6 @@
 from core.util.text_util import is_english_text, get_normalised_words, normalise_text
 from core.util.logging_util import LoggingUtil, log_to_appins
 from core.util.types import PydanticImage
-import xray
 from core.util.metric_util import MetricUtil
 
 
@@ -261,19 +260,6 @@ def _extract_pdf_text(self, file_bytes: BytesIO) -> str:
             return None
         return "\n".join(page for page in pages)
 
-    def _find_bad_redactions(self, file_bytes: BytesIO):
-        """
-        Return a list of bad redactions in the give PDF
-        :param BytesIO file_bytes: Bytes stream for the PDF
-        :return List[]: the bad redaction strings
-        """
-        file_bytes.seek(0)
-        bad_redactions = xray.inspect(file_bytes.read())
-        bad_redactions_list = [
-            item["text"] for items in bad_redactions.values() for item in items
-        ]
-        return bad_redactions_list
-
     def _load_stopwords(self):
         """
         Check the text_to_redact list against the list in the stopwords yaml
@@ -1255,9 +1241,6 @@ def redact(
             for result in text_redaction_results
             for redaction_string in result.redaction_strings
         ]
-        # Add bad redactions to the text redaction list
-        bad_redactions_list = self._find_bad_redactions(file_bytes)
-        text_redactions = text_redactions + bad_redactions_list
         # Remove stopwords from text redaction list
         stopword_list = self._load_stopwords()
         text_redactions = text_redactions - stopword_list
diff --git a/redactor/requirements.txt b/redactor/requirements.txt
index e933c7c7..69672134 100644
--- a/redactor/requirements.txt
+++ b/redactor/requirements.txt
@@ -30,6 +30,4 @@ ruff==0.14.7
 tiktoken==0.12.0
 unidecode==1.4.0
 StrEnum==0.4.15  # Not ideal, but this is needed due to compatibility issues between ADO agents and the Function App
-x-ray==0.3.6
-# pycopy-collections.abc==0.0.0 # Dummy as this should be included in standard packages
 numpy==2.2.6
diff --git a/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py b/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py
index 9c6f3f53..7261ae58 100644
--- a/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py
+++ b/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py
@@ -1231,26 +1231,6 @@ def test__pdf_processor__apply():
         "Expected the image in the pdf to be redacted, but it did not match the redacted sample"
     )
 
-def test_find_bad_redactions():
-    """
-    - Given i have a pdf file with some content
-    - When i call PDFProcessor._find_bad_redactions
-    - The content is returned as a list
-    """
-    file_bytes = BytesIO(b"fake pdf bytes")
-    mock_pdf = MagicMock()
-    mock_inspect_result = {
-        "page1": [{"text": "secret"}, {"text": "password"}],
-        "page2": [{"text": "token"}],
-    }
-    with patch("pymupdf.open", return_value=mock_pdf) as mock_open:
-        with patch("xray.inspect", return_value=mock_inspect_result) as mock_inspect:
-            obj = PDFProcessor()
-            result = obj._find_bad_redactions(file_bytes)
-
-    assert result == ["secret", "password", "token"]
-    mock_open.assert_called_once_with(stream=file_bytes)
-    mock_inspect.assert_called_once_with(mock_pdf)
 
 def test_load_stopwords():
     """
@@ -1263,8 +1243,8 @@ def test_load_stopwords():
     - the
     - test
     """
-    expected_output = ["the","test"]
+    expected_output = ["the", "test"]
     with mock.patch(
         "builtins.open", mock.mock_open(read_data=mock_config_file_content)
     ):
-        assert PDFProcessor._load_stopwords("some_file") == expected_output
\ No newline at end of file
+        assert PDFProcessor._load_stopwords("some_file") == expected_output

From a30aaf68d80a43cedc34f0dd17324b2bfee52e1b Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Tue, 24 Mar 2026 16:48:57 +0000
Subject: [PATCH 17/19] Unit test failed, fix applied

---
 redactor/core/redaction/file_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index 5cb611b2..1b55c934 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -1243,7 +1243,7 @@ def redact(
         ]
         # Remove stopwords from text redaction list
         stopword_list = self._load_stopwords()
-        text_redactions = text_redactions - stopword_list
+        text_redactions = np.array(text_redactions) - np.array(stopword_list)
 
         image_redaction_results: List[ImageRedactionResult] = [
             x

From fbf8f024559ea0f416c42ba50a7b570bdf671fd1 Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Tue, 24 Mar 2026 16:54:38 +0000
Subject: [PATCH 18/19] fix attempt 2

---
 redactor/core/redaction/file_processor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
index 1b55c934..b85d7683 100644
--- a/redactor/core/redaction/file_processor.py
+++ b/redactor/core/redaction/file_processor.py
@@ -1243,7 +1243,7 @@ def redact(
         ]
         # Remove stopwords from text redaction list
         stopword_list = self._load_stopwords()
-        text_redactions = np.array(text_redactions) - np.array(stopword_list)
+        text_redactions = list(set(text_redactions) - set(stopword_list))
 
         image_redaction_results: List[ImageRedactionResult] = [
             x

From 7b3e72d24b8f3f71e6a09b0d9238b270df22f35c Mon Sep 17 00:00:00 2001
From: hnikolov <hristo.nikolov@solirus.com>
Date: Wed, 25 Mar 2026 12:12:03 +0000
Subject: [PATCH 19/19] Reverting unwanted change

---
 redactor/core/redaction_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/redactor/core/redaction_manager.py b/redactor/core/redaction_manager.py
index c2567631..c17904fb 100644
--- a/redactor/core/redaction_manager.py
+++ b/redactor/core/redaction_manager.py
@@ -353,7 +353,7 @@ def _compare_redactions(
             proposed_candidates = [
                 {k: v for k, v in ann.items() if k in attrs_to_compare}
                 for ann in proposed_annots_on_page
-                if ann.get("isRedactionCandidate", True)
+                if ann.get("isRedactionCandidate", False)
             ]
             n_proposed_redactions += len(proposed_candidates)