Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions redactor/config/stopwords.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
stopwords:
- "the"
- "my"
19 changes: 18 additions & 1 deletion redactor/core/redaction/file_processor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import pymupdf
import os
import dataclasses
import numpy as np

Expand All @@ -8,7 +9,9 @@
from abc import ABC, abstractmethod
from io import BytesIO
from PIL import Image
from pydantic import BaseModel, Field
from pydantic import BaseModel
from yaml import safe_load
from pydantic import Field
from time import time
from datetime import datetime

Expand Down Expand Up @@ -257,6 +260,16 @@ def _extract_pdf_text(self, file_bytes: BytesIO) -> str:
return None
return "\n".join(page for page in pages)

def _load_stopwords(self):
    """
    Load the stopword list from the stopwords yaml config file

    Reads ``config/stopwords.yaml`` and returns the entries listed under its
    top-level ``stopwords`` key. This method only loads the list; comparing it
    against the strings to redact is left to the caller.

    :return List[str]: the strings listed as stopwords, or an empty list
        when the file is empty or the ``stopwords`` key has no entries
    :raises FileNotFoundError: if the config file cannot be found
    :raises KeyError: if the file has content but no ``stopwords`` key
    """
    # NOTE(review): this path is resolved against the current working
    # directory, not this module's location - confirm the process is always
    # started from the directory that contains ``config/``.
    config_path = os.path.join("config", "stopwords.yaml")

    # Use a context manager so the file handle is closed even if parsing fails
    with open(config_path, "r", encoding="utf-8") as config_file:
        config = safe_load(config_file)

    # safe_load returns None for an empty document
    if not config:
        return []

    # A bare ``stopwords:`` key parses to None; normalise that to an empty
    # list so callers can always iterate over the result
    return config["stopwords"] or []

def _extract_pdf_images(self, file_bytes: BytesIO):
"""
Return the images of the given PDF as a list of PDFImageMetadata objects
Expand Down Expand Up @@ -1228,6 +1241,10 @@ def redact(
for result in text_redaction_results
for redaction_string in result.redaction_strings
]
# Remove stopwords from text redaction list
stopword_list = self._load_stopwords()
text_redactions = list(set(text_redactions) - set(stopword_list))

image_redaction_results: List[ImageRedactionResult] = [
x
for x in redaction_results
Expand Down
2 changes: 1 addition & 1 deletion redactor/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,4 @@ ruff==0.14.7
tiktoken==0.12.0
unidecode==1.4.0
StrEnum==0.4.15 # Not ideal, but this is needed due to compatibility issues between ADO agents and the Function App
numpy==2.2.6
numpy==2.2.6
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from PIL import Image
from io import BytesIO
import mock
from mock import patch, Mock, MagicMock
from datetime import datetime

Expand Down Expand Up @@ -1229,3 +1230,21 @@ def test__pdf_processor__apply():
assert expected_image == actual_image, (
"Expected the image in the pdf to be redacted, but it did not match the redacted sample"
)


def test_load_stopwords():
    """
    - Given a stopwords yaml file that lists two entries
    - When PDFProcessor._load_stopwords is called
    - Then those entries are returned as a list
    """
    yaml_text = """
stopwords:
- the
- test
"""
    # Replace the builtin open so no real config file is needed on disk
    fake_open = mock.mock_open(read_data=yaml_text)
    with mock.patch("builtins.open", fake_open):
        loaded = PDFProcessor._load_stopwords("some_file")
    assert loaded == ["the", "test"]
Loading