def _load_stopwords(self):
    """
    Load the stopword list from the ``config/stopwords.yaml`` file

    Stopwords are strings that are too generic to be redacted on their own
    (for example "the"). The file is expected to contain a top-level
    ``stopwords`` key holding a list of strings.

    The file is located relative to this module rather than the current
    working directory, so loading does not depend on where the process was
    started from.

    :return List[str]: the configured stopwords. An empty list is returned
        if the file is empty or its ``stopwords`` entry is missing or null
    :raises FileNotFoundError: if the stopwords file does not exist
    """
    # This module lives in <root>/core/redaction/, the config in <root>/config/
    config_path = os.path.normpath(
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            os.pardir,
            "config",
            "stopwords.yaml",
        )
    )
    # Use a context manager so the file handle is always released
    with open(config_path, "r", encoding="utf-8") as config_file:
        config = safe_load(config_file)
    # safe_load returns None for an empty document, and a bare "stopwords:"
    # key parses as None; treat both as "no stopwords configured"
    stopword_list = (config or {}).get("stopwords") or []
    return list(stopword_list)
def test_load_stopwords():
    """
    - Given a stopwords yaml file containing a list of stopwords
    - When PDFProcessor._load_stopwords is called
    - Then the stopwords yaml file is opened once
    - And its content is returned as a list
    """
    import os
    from textwrap import dedent
    from mock import mock_open, patch

    # dedent keeps the fixture readable without relying on YAML tolerating
    # the indentation of the surrounding Python source
    mock_config_file_content = dedent(
        """
        stopwords:
          - the
          - test
        """
    )
    expected_output = ["the", "test"]
    with patch(
        "builtins.open", mock_open(read_data=mock_config_file_content)
    ) as mocked_open:
        # _load_stopwords takes no arguments other than self and does not
        # read any instance state, so no real processor instance is needed
        actual_output = PDFProcessor._load_stopwords(None)
    assert actual_output == expected_output
    # The stopwords config file must be the only file that is opened
    mocked_open.assert_called_once()
    opened_path = mocked_open.call_args[0][0]
    assert opened_path.endswith(os.path.join("config", "stopwords.yaml"))