Planning-Inspectorate · hnikolov-solirius · Mar 11, 2026 · Mar 13, 2026 · Mar 13, 2026 · Mar 16, 2026
diff --git a/redactor/config/stopwords.yaml b/redactor/config/stopwords.yaml
@@ -0,0 +1,3 @@
+stopwords:
+  - "the"
+  - "my"
diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py
@@ -1,5 +1,6 @@
 import json
 import pymupdf
+import os
 import dataclasses
 import numpy as np
 
@@ -8,7 +9,9 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from PIL import Image
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
+from yaml import safe_load
+from pydantic import Field
 from time import time
 from datetime import datetime
 
@@ -257,6 +260,16 @@ def _extract_pdf_text(self, file_bytes: BytesIO) -> str:
             return None
         return "\n".join(page for page in pages)
 
+    def _load_stopwords(self):
+        """
+        Load the stopwords from config/stopwords.yaml (relative to the cwd)
+
+        :return List[str]: the strings listed under the "stopwords" yaml key
+        """
+        with open(os.path.join("config", "stopwords.yaml"), "r") as config_file:
+            stopwords = safe_load(config_file)
+        return stopwords["stopwords"]
+
     def _extract_pdf_images(self, file_bytes: BytesIO):
         """
         Return the images of the given PDF as a list of PDFImageMetadata objects
@@ -1228,6 +1241,10 @@ def redact(
             for result in text_redaction_results
             for redaction_string in result.redaction_strings
         ]
+        # Remove stopwords from text redaction list
+        stopword_list = self._load_stopwords()
+        text_redactions = list(set(text_redactions) - set(stopword_list))
+
         image_redaction_results: List[ImageRedactionResult] = [
             x
             for x in redaction_results

diff --git a/redactor/requirements.txt b/redactor/requirements.txt
@@ -30,4 +30,4 @@ ruff==0.14.7
 tiktoken==0.12.0
 unidecode==1.4.0
 StrEnum==0.4.15  # Not ideal, but this is needed due to compatibility issues between ADO agents and the Function App
-numpy==2.2.6
+numpy==2.2.6
diff --git a/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py b/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py
@@ -5,6 +5,7 @@
 
 from PIL import Image
 from io import BytesIO
+import mock
 from mock import patch, Mock, MagicMock
 from datetime import datetime
 
@@ -1229,3 +1230,21 @@ def test__pdf_processor__apply():
     assert expected_image == actual_image, (
         "Expected the image in the pdf to be redacted, but it did not match the redacted sample"
     )
+
+
+def test_load_stopwords():
+    """
+    - Given a yaml config file that contains a "stopwords" list
+    - When PDFProcessor._load_stopwords is called
+    - Then the entries under the "stopwords" key are returned as a list
+    """
+    yaml_text = """
+    stopwords:
+    - the
+    - test
+    """
+    expected_stopwords = ["the", "test"]
+    fake_open = mock.mock_open(read_data=yaml_text)
+    with mock.patch("builtins.open", fake_open):
+        loaded_stopwords = PDFProcessor._load_stopwords("some_file")
+    assert loaded_stopwords == expected_stopwords