Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
bd72d1a
Adding remove_duplicates to llm_util
Feb 3, 2026
b914e51
Back to redactor.py, no config_processor dependency
Feb 3, 2026
002fba8
Tests with Shannon
Feb 5, 2026
ed2b1a3
_
Feb 5, 2026
0c11acd
Fixing error re positional argument
Feb 5, 2026
fc492df
patching in LLMTextRedactor to remove_stopwords test
Feb 5, 2026
d470922
Objectifying LLM_text_redactor
Feb 5, 2026
1e8d67f
rebased
Feb 11, 2026
b546cb9
Changed to capture case variance
Feb 12, 2026
7a699d8
Ruff formatting
Feb 12, 2026
66c3979
failing integration test - moving stopwords timing
Feb 12, 2026
ca97d4b
Addressing failed imports
Feb 12, 2026
699708d
removed redundant reference in ImageLLMTextRedaction
Feb 13, 2026
116f364
Alternative PDF redaction highlighting method (#49)
shannon-wms Feb 12, 2026
dd79f67
Combined prompt (#53)
hnikolov-solirius Feb 12, 2026
dea081a
Add more comprehensive logging. Update and improve test coverage (#48)
HarrisonBoyleThomas Feb 12, 2026
33ce118
Adding remove_duplicates to llm_util
Feb 3, 2026
c6a10b9
Back to redactor.py, no config_processor dependency
Feb 3, 2026
2bb7109
Addressing failed imports
Feb 12, 2026
2b35c70
Alternative PDF redaction highlighting method (#49)
shannon-wms Feb 12, 2026
967d490
missing imports
Feb 13, 2026
d1b0d91
Add more comprehensive logging. Update and improve test coverage (#48)
HarrisonBoyleThomas Feb 12, 2026
8ebc1b7
Adding remove_duplicates to llm_util
Feb 3, 2026
2586365
Addressing failed imports
Feb 12, 2026
cdf226b
Alternative PDF redaction highlighting method (#49)
shannon-wms Feb 12, 2026
08e5d90
Add more comprehensive logging. Update and improve test coverage (#48)
HarrisonBoyleThomas Feb 12, 2026
7457815
Alternative PDF redaction highlighting method (#49)
shannon-wms Feb 12, 2026
1cd5cca
.
Mar 2, 2026
ad94b5c
.
Mar 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 6 additions & 31 deletions redactor/config/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,39 +6,23 @@ redactors:
model: "gpt-4.1"
system_prompt: "You are a thorough assistant that extracts all of the requested terms from a given text."
redaction_terms:
- "People's names. List each part of the name separately."
constraints:
- "Do not include locations or organisations"
- "Do not include names of anything which is not a person"
- "Do not list the author of the text"
- "Do not include those on whose behalf the text was written"

- name: "Text_Redactor_02"
model: "gpt-4.1"
system_prompt: "You are a thorough assistant that extracts all of the requested terms from a given text."
redaction_terms:
- "Personal addresses and postcodes"
- "People's names"
- "Personal addresses and postcodes"
- "Personal email addresses, unless it's a Planning Inspectorate email"
- "Telephone numbers, unless it's a Planning Inspectorate customer service team telephone number"
- "National Insurance Numbers, e.g. AB 12 34 56 C"
- "Hyperlinks, except those that are .gov.uk, .org, .gov.wales"

- name: "Text_Redactor_03"
model: "gpt-4.1"
system_prompt: "You are a thorough assistant that extracts all of the requested terms from a given text."
redaction_terms:
- "Personal health information, e.g. illnesses or concerning a person's sex life. List each term as it appears in the text."
- "Personal data revealing ethnic origin, political opinions, philosophical beliefs, or trade union membership"
- "Criminal offence data, e.g. allegations, investigations, proceedings, penalties"
- "Any defamatory (libellous) or inflammatory information"
- "Specific financial information such as bank accounts, salary details, house valuations, bonuses, or shares"

- name: "Text_Redactor_04"
model: "gpt-4.1"
system_prompt: "You are a thorough assistant that extracts all of the requested terms from a given text."
redaction_terms:
- "Dates of birth, and ages of people"
- "The location of any of the following: badger sett, bat maternity roost, bird nest"
constraints:
- "Do not include names of anything which is not a person"
- "Do not include the name of the author of the text"
- "Do not include the names of those on whose behalf the text was written"

- redactor_type: "ImageRedaction"

Expand All @@ -55,14 +39,5 @@ redactors:
redaction_rules:
- name: "Image_Text_Redactor_01"
text_redaction_rule: "Text_Redactor_01"

- name: "Image_Text_Redactor_02"
text_redaction_rule: "Text_Redactor_02"

- name: "Image_Text_Redactor_03"
text_redaction_rule: "Text_Redactor_03"

- name: "Image_Text_Redactor_04"
text_redaction_rule: "Text_Redactor_04"

provisional_redactions:
3 changes: 3 additions & 0 deletions redactor/config/stopwords.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
stopwords:
- "the"
- "my"
16 changes: 15 additions & 1 deletion redactor/core/io/azure_blob_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
ChainedTokenCredential,
)
from azure.storage.blob import BlobServiceClient
from azure.core.exceptions import ResourceExistsError
from core.util.logging_util import LoggingUtil
from io import BytesIO
from typing import Any
from .storage_io import StorageIO
Expand Down Expand Up @@ -35,6 +37,9 @@ def get_kind(cls):
return "AzureBlob"

def read(self, container_name: str, blob_path: str, **kwargs) -> BytesIO:
LoggingUtil().log_info(
f"Reading blob '{blob_path}' from container '{container_name}' in storage account '{self.storage_endpoint}'"
)
blob_service_client = BlobServiceClient(
self.storage_endpoint, credential=self.credential
)
Expand All @@ -45,10 +50,19 @@ def read(self, container_name: str, blob_path: str, **kwargs) -> BytesIO:
return byte_stream

def write(self, data_bytes: BytesIO, container_name: str, blob_path: str, **kwargs):
    """
    Upload a byte stream to Azure Blob Storage as a block blob.

    :param data_bytes: The bytes to upload
    :param container_name: The container to write the blob into
    :param blob_path: The path of the blob within the container
    :param kwargs: Accepted for interface compatibility; not used here
    :raises ResourceExistsError: If the upload is rejected because the blob
        already exists. Re-raised with the full blob location in the message
        and chained to the original Azure error.
    """
    LoggingUtil().log_info(
        f"Writing blob '{blob_path}' to container '{container_name}' in storage account '{self.storage_endpoint}'"
    )
    blob_service_client = BlobServiceClient(
        self.storage_endpoint, credential=self.credential
    )
    blob_client = blob_service_client.get_blob_client(
        container=container_name, blob=blob_path
    )
    # Upload exactly once, inside the guard, so that a conflict is reported
    # with the improved message below
    try:
        blob_client.upload_blob(data_bytes, blob_type="BlockBlob")
    except ResourceExistsError as err:
        # Improve the base Azure error, which does not include helpful info.
        # Chain explicitly so the original Azure details stay in the traceback.
        raise ResourceExistsError(
            f"The specified blob {self.storage_endpoint}/{container_name}/{blob_path} already exists"
        ) from err
Loading