Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions redactor/config/stopwords.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
stopwords:
- "the"
- "my"
36 changes: 35 additions & 1 deletion redactor/core/redaction/file_processor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import pymupdf
import os
import dataclasses
import numpy as np

Expand All @@ -8,7 +9,9 @@
from abc import ABC, abstractmethod
from io import BytesIO
from PIL import Image
from pydantic import BaseModel, Field
from pydantic import BaseModel
from yaml import safe_load
from pydantic import Field
from time import time
from datetime import datetime

Expand All @@ -34,6 +37,7 @@
from core.util.text_util import is_english_text, get_normalised_words, normalise_text
from core.util.logging_util import LoggingUtil, log_to_appins
from core.util.types import PydanticImage
import xray
from core.util.metric_util import MetricUtil


Expand Down Expand Up @@ -257,6 +261,29 @@ def _extract_pdf_text(self, file_bytes: BytesIO) -> str:
return None
return "\n".join(page for page in pages)

def _find_bad_redactions(self, file_bytes: BytesIO):
    """
    Collect the text of every bad redaction that xray reports for the given PDF

    :param BytesIO file_bytes: Bytes stream for the PDF
    :return List[]: the bad redaction strings
    """
    # Rewind first so the whole document is read whatever the stream's current position is
    file_bytes.seek(0)
    pdf_content = file_bytes.read()
    findings = xray.inspect(pdf_content)

    # Flatten the grouped findings into a single list of their "text" values
    bad_redactions_list = []
    for group in findings.values():
        for finding in group:
            bad_redactions_list.append(finding["text"])
    return bad_redactions_list

def _load_stopwords(self):
    """
    Load the list of stopwords from the stopwords yaml config file

    The file is read from ``config/stopwords.yaml``, which is a relative path and is
    therefore resolved against the current working directory.

    :return List[]: the entries listed under the ``stopwords`` key of the yaml file
    :raises KeyError: if the parsed yaml mapping has no ``stopwords`` key
    """
    stopwords_path = os.path.join("config", "stopwords.yaml")
    # Open through a context manager so the file handle is always closed,
    # even if the yaml parser raises
    with open(stopwords_path, "r") as stopwords_file:
        stopwords = safe_load(stopwords_file)
    return stopwords["stopwords"]

def _extract_pdf_images(self, file_bytes: BytesIO):
"""
Return the images of the given PDF as a list of PDFImageMetadata objects
Expand Down Expand Up @@ -1228,6 +1255,13 @@ def redact(
for result in text_redaction_results
for redaction_string in result.redaction_strings
]
# Add bad redactions to the text redaction list
bad_redactions_list = self._find_bad_redactions(file_bytes)
text_redactions = text_redactions + bad_redactions_list
# Remove stopwords from text redaction list
stopword_list = self._load_stopwords()
text_redactions = text_redactions - stopword_list

image_redaction_results: List[ImageRedactionResult] = [
x
for x in redaction_results
Expand Down
2 changes: 1 addition & 1 deletion redactor/core/redaction_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def _compare_redactions(
proposed_candidates = [
{k: v for k, v in ann.items() if k in attrs_to_compare}
for ann in proposed_annots_on_page
if ann.get("isRedactionCandidate", False)
if ann.get("isRedactionCandidate", True)
]
n_proposed_redactions += len(proposed_candidates)

Expand Down
4 changes: 3 additions & 1 deletion redactor/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,6 @@ ruff==0.14.7
tiktoken==0.12.0
unidecode==1.4.0
StrEnum==0.4.15 # Not ideal, but this is needed due to compatibility issues between ADO agents and the Function App
numpy==2.2.6
x-ray==0.3.6
# pycopy-collections.abc==0.0.0 # Dummy as this should be included in standard packages
numpy==2.2.6
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from PIL import Image
from io import BytesIO
import mock
from mock import patch, Mock, MagicMock
from datetime import datetime

Expand Down Expand Up @@ -1229,3 +1230,41 @@ def test__pdf_processor__apply():
assert expected_image == actual_image, (
"Expected the image in the pdf to be redacted, but it did not match the redacted sample"
)

def test_find_bad_redactions():
    """
    - Given I have a PDF byte stream for which xray reports some bad redactions
    - When I call PDFProcessor._find_bad_redactions
    - Then the text of each reported bad redaction is returned as a flat list
    """
    pdf_content = b"fake pdf bytes"
    file_bytes = BytesIO(pdf_content)
    mock_inspect_result = {
        "page1": [{"text": "secret"}, {"text": "password"}],
        "page2": [{"text": "token"}],
    }
    with patch("xray.inspect", return_value=mock_inspect_result) as mock_inspect:
        obj = PDFProcessor()
        result = obj._find_bad_redactions(file_bytes)

    assert result == ["secret", "password", "token"]
    # _find_bad_redactions passes the raw PDF bytes straight to xray.inspect and
    # does not open the document with pymupdf, so only the xray call is asserted
    mock_inspect.assert_called_once_with(pdf_content)

def test_load_stopwords():
    """
    - Given I have a yaml config file that lists some stopwords
    - When I call PDFProcessor._load_stopwords
    - Then the stopwords from the yaml file are returned as a list
    """
    yaml_text = (
        "\n"
        "stopwords:\n"
        "- the\n"
        "- test\n"
    )
    wanted = ["the", "test"]
    # Replace the builtin open so the method reads the yaml text above instead of a real file
    fake_open = mock.mock_open(read_data=yaml_text)
    with mock.patch("builtins.open", fake_open):
        loaded = PDFProcessor._load_stopwords("some_file")
        assert loaded == wanted
Loading