Planning-Inspectorate · hnikolov-solirius · Mar 11, 2026 · Mar 13, 2026 · Mar 13, 2026 · Mar 16, 2026
diff --git a/redactor/config/stopwords.yaml b/redactor/config/stopwords.yaml
@@ -0,0 +1,3 @@
+stopwords:
+  - "the"
+  - "my"
diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
@@ -1,5 +1,6 @@
 import json
 import pymupdf
+import os
 import dataclasses
 import numpy as np
 
@@ -8,7 +9,9 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from PIL import Image
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
+from yaml import safe_load
+from pydantic import Field
 from time import time
 from datetime import datetime
 
@@ -34,6 +37,7 @@
 from core.util.text_util import is_english_text, get_normalised_words, normalise_text
 from core.util.logging_util import LoggingUtil, log_to_appins
 from core.util.types import PydanticImage
+import xray
 from core.util.metric_util import MetricUtil
 
 
@@ -257,6 +261,29 @@ def _extract_pdf_text(self, file_bytes: BytesIO) -> str:
             return None
         return "\n".join(page for page in pages)
 
+    def _find_bad_redactions(self, file_bytes: BytesIO):
+        """
+        Return the text of every bad redaction xray.inspect reports for the given PDF
+        :param BytesIO file_bytes: Bytes stream for the PDF
+        :return List[]: the bad redaction strings
+        """
+        file_bytes.seek(0)
+        inspection_report = xray.inspect(file_bytes.read())
+        flagged_text = []
+        for findings in inspection_report.values():
+            flagged_text.extend(finding["text"] for finding in findings)
+        return flagged_text
+
+    def _load_stopwords(self):
+        """
+        Load the list stored under the "stopwords" key of config/stopwords.yaml
+
+        :return List[]: the stopwords, or an empty list when none are defined
+        """
+        with open(os.path.join("config", "stopwords.yaml"), "r") as config_file:
+            stopwords = safe_load(config_file) or {}  # empty file parses to None
+        return stopwords.get("stopwords") or []
+
     def _extract_pdf_images(self, file_bytes: BytesIO):
         """
         Return the images of the given PDF as a list of PDFImageMetadata objects
@@ -1228,6 +1255,13 @@ def redact(
             for result in text_redaction_results
             for redaction_string in result.redaction_strings
         ]
+        # Add bad redactions to the text redaction list
+        bad_redactions_list = self._find_bad_redactions(file_bytes)
+        text_redactions = text_redactions + bad_redactions_list
+        # Remove stopwords; lists do not support "-", so filter them out instead
+        stopword_list = self._load_stopwords()
+        text_redactions = [t for t in text_redactions if t not in stopword_list]
+
         image_redaction_results: List[ImageRedactionResult] = [
             x
             for x in redaction_results

diff --git a/redactor/core/redaction_manager.py b/redactor/core/redaction_manager.py
@@ -353,7 +353,7 @@ def _compare_redactions(
             proposed_candidates = [
                 {k: v for k, v in ann.items() if k in attrs_to_compare}
                 for ann in proposed_annots_on_page
-                if ann.get("isRedactionCandidate", False)
+                if ann.get("isRedactionCandidate", True)
             ]
             n_proposed_redactions += len(proposed_candidates)
 

diff --git a/redactor/requirements.txt b/redactor/requirements.txt
@@ -30,4 +30,6 @@ ruff==0.14.7
 tiktoken==0.12.0
 unidecode==1.4.0
 StrEnum==0.4.15  # Not ideal, but this is needed due to compatibility issues between ADO agents and the Function App
-numpy==2.2.6
+x-ray==0.3.6
+# pycopy-collections.abc==0.0.0 # Dummy as this should be included in standard packages
+numpy==2.2.6
diff --git a/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py b/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py
@@ -5,6 +5,7 @@
 
 from PIL import Image
 from io import BytesIO
+import mock
 from mock import patch, Mock, MagicMock
 from datetime import datetime
 
@@ -1229,3 +1230,41 @@ def test__pdf_processor__apply():
     assert expected_image == actual_image, (
         "Expected the image in the pdf to be redacted, but it did not match the redacted sample"
     )
+
+def test_find_bad_redactions():
+    """
+    - Given i have a pdf file with some content
+    - When i call PDFProcessor._find_bad_redactions
+    - Then xray.inspect gets the pdf bytes and the flagged text is returned as a list
+    """
+    pdf_content = b"fake pdf bytes"
+    file_bytes = BytesIO(pdf_content)
+    mock_inspect_result = {
+        "page1": [{"text": "secret"}, {"text": "password"}],
+        "page2": [{"text": "token"}],
+    }
+    with patch("xray.inspect", return_value=mock_inspect_result) as mock_inspect:
+        obj = PDFProcessor()
+        result = obj._find_bad_redactions(file_bytes)
+
+    assert result == ["secret", "password", "token"]
+    # The method reads the stream itself and never opens it with pymupdf, so
+    # xray.inspect must receive the raw bytes rather than a document object
+    mock_inspect.assert_called_once_with(pdf_content)
+
+def test_load_stopwords():
+    """
+    - Given i have a yaml file with a "stopwords" list
+    - When i call PDFProcessor._load_stopwords
+    - Then the entries of that list are returned
+    """
+    yaml_text = """
+    stopwords:
+    - the
+    - test
+    """
+    fake_open = mock.mock_open(read_data=yaml_text)
+    with patch("builtins.open", fake_open):
+        loaded = PDFProcessor._load_stopwords("some_file")
+    # "some_file" only fills the self slot, which the method never reads
+    assert loaded == ["the", "test"]