diff --git a/redactor/config/default.yaml b/redactor/config/default.yaml index 7559eb4f..0c1d26e6 100644 --- a/redactor/config/default.yaml +++ b/redactor/config/default.yaml @@ -6,39 +6,23 @@ redactors: model: "gpt-4.1" system_prompt: "You are a thorough assistant that extracts all of the requested terms from a given text." redaction_terms: - - "People's names. List each part of the name separately." - constraints: - - "Do not include locations or organisations" - - "Do not include names of anything which is not a person" - - "Do not list the author of the text" - - "Do not include those on whose behalf the text was written" - - - name: "Text_Redactor_02" - model: "gpt-4.1" - system_prompt: "You are a thorough assistant that extracts all of the requested terms from a given text." - redaction_terms: - - "Personal addresses and postcodes" + - "People's names" + - "Personal addresses and postcodes" - "Personal email addresses, unless it's a Planning Inspectorate email" - "Telephone numbers, unless it's a Planning Inspectorate customer service team telephone number" - "National Insurance Numbers, e.g. AB 12 34 56 C" - "Hyperlinks, except those that are .gov.uk, .org, .gov.wales" - - - name: "Text_Redactor_03" - model: "gpt-4.1" - system_prompt: "You are a thorough assistant that extracts all of the requested terms from a given text." - redaction_terms: - "Personal health information, e.g. illnesses or concerning a person's sex life. List each term as it appears in the text." - "Personal data revealing ethnic origin, political opinions, philosophical beliefs, or trade union membership" - "Criminal offence data, e.g. allegations, investigations, proceedings, penalties" - "Any defamatory (libellous) or inflammatory information" - "Specific financial information such as bank accounts, salary details, house valuations, bonuses, or shares" - - - name: "Text_Redactor_04" - model: "gpt-4.1" - system_prompt: "You are a thorough assistant that extracts all of the requested terms from a given text." - redaction_terms: - "Dates of birth, and ages of people" - "The location of any of the following: badger sett, bat maternity roost, bird nest" + constraints: + - "Do not include names of anything which is not a person" + - "Do not include the name of the author of the text" + - "Do not include the names of those on whose behalf the text was written" - redactor_type: "ImageRedaction" @@ -55,14 +39,5 @@ redactors: redaction_rules: - name: "Image_Text_Redactor_01" text_redaction_rule: "Text_Redactor_01" - - - name: "Image_Text_Redactor_02" - text_redaction_rule: "Text_Redactor_02" - - - name: "Image_Text_Redactor_03" - text_redaction_rule: "Text_Redactor_03" - - - name: "Image_Text_Redactor_04" - text_redaction_rule: "Text_Redactor_04" provisional_redactions: \ No newline at end of file diff --git a/redactor/config/stopwords.yaml b/redactor/config/stopwords.yaml new file mode 100644 index 00000000..076bba37 --- /dev/null +++ b/redactor/config/stopwords.yaml @@ -0,0 +1,3 @@ +stopwords: + - "the" + - "my" \ No newline at end of file diff --git a/redactor/core/io/azure_blob_io.py b/redactor/core/io/azure_blob_io.py index 4f897386..72e542fd 100644 --- a/redactor/core/io/azure_blob_io.py +++ b/redactor/core/io/azure_blob_io.py @@ -4,6 +4,8 @@ ChainedTokenCredential, ) from azure.storage.blob import BlobServiceClient +from azure.core.exceptions import ResourceExistsError +from core.util.logging_util import LoggingUtil from io import BytesIO from typing import Any from .storage_io import StorageIO @@ -35,6 +37,9 @@ def get_kind(cls): return "AzureBlob" def read(self, container_name: str, blob_path: str, **kwargs) -> BytesIO: + LoggingUtil().log_info( + f"Reading blob '{blob_path}' from container '{container_name}' in storage account '{self.storage_endpoint}'" + ) blob_service_client = BlobServiceClient( self.storage_endpoint, credential=self.credential ) @@ -45,10 +50,19 @@ def read(self, container_name: str, blob_path: str, **kwargs) -> BytesIO: return byte_stream def write(self, data_bytes: BytesIO, container_name: str, blob_path: str, **kwargs): + LoggingUtil().log_info( + f"Writing blob '{blob_path}' from container '{container_name}' in storage account '{self.storage_endpoint}'" + ) blob_service_client = BlobServiceClient( self.storage_endpoint, credential=self.credential ) blob_client = blob_service_client.get_blob_client( container=container_name, blob=blob_path ) - blob_client.upload_blob(data_bytes, blob_type="BlockBlob") + try: + blob_client.upload_blob(data_bytes, blob_type="BlockBlob") + except ResourceExistsError: + # Improve the base Azure error, which does not include helpful info + raise ResourceExistsError( + f"The specified blob {self.storage_endpoint}/{container_name}/{blob_path} already exists" + ) diff --git a/redactor/core/redaction/file_processor.py b/redactor/core/redaction/file_processor.py index e5710a67..7c68d6c2 100644 --- a/redactor/core/redaction/file_processor.py +++ b/redactor/core/redaction/file_processor.py @@ -7,14 +7,11 @@ from PIL import Image from pydantic import BaseModel -from concurrent.futures import ProcessPoolExecutor, as_completed -import multiprocessing as mp - from core.redaction.redactor import ( Redactor, TextRedactor, ImageRedactor, - RedactorFactory, + RedactorFactory ) from core.redaction.exceptions import ( DuplicateFileProcessorNameException, @@ -28,11 +25,13 @@ TextRedactionResult, ImageRedactionResult, ) -from core.util.text_util import is_english_text, get_normalised_words +from core.util.text_util import is_english_text, get_normalised_words, normalise_text, normalise_text from core.util.logging_util import LoggingUtil, log_to_appins from core.util.types import PydanticImage -from core.util.multiprocessing_util import get_max_workers +from yaml import safe_load +import os +import dataclasses class FileProcessor(ABC): """ @@ -96,6 +95,29 @@ class PDFImageMetadata(BaseModel): """The transform of the instance of the image in the PDF, represented as a pymupdf.Matrix""" +class PDFLineMetadata(BaseModel): + line_number: int + """The line number on the page (0-indexed)""" + words: Tuple[str, ...] = [] + """The words in the line""" + y0: float = None + """The y0 coordinate of the line's bounding box""" + y1: float = None + """The y1 coordinate of the line's bounding box""" + x0: Tuple[float, ...] = () + """The x0 coordinates of the words in the line""" + x1: Tuple[float, ...] = () + """The x1 coordinates of the words in the line""" + + +class PDFPageMetadata(BaseModel): + page_number: int + """The page the image belongs to (0-indexed)""" + """The text content of the page""" + lines: List[PDFLineMetadata] = [] + """The metadata for the text content of the page""" + + class PDFProcessor(FileProcessor): """ Class for managing the redaction of PDF documents @@ -105,11 +127,59 @@ class PDFProcessor(FileProcessor): def get_name(cls) -> str: return "pdf" - def _extract_page_text(self, page: pymupdf.Page) -> str: - text = page.get_text().strip() - if text == "": # No text found on the page - return None - return text + def _create_line_metadata(self, line_text, line_rects, line_no): + """ + Helper function to create PDFLineMetadata for PDFPageMetadata + """ + line_y0 = min(rect[1] for rect in line_rects) if line_rects else 0 + line_y1 = max(rect[3] for rect in line_rects) if line_rects else 0 + return PDFLineMetadata( + line_number=line_no, + words=tuple(normalise_text(word) for word in line_text), + y0=line_y0, + y1=line_y1, + x0=tuple(rect[0] for rect in line_rects), + x1=tuple(rect[2] for rect in line_rects), + ) + + def _extract_page_text(self, page: pymupdf.Page) -> PDFPageMetadata: + """ + Extract text content and metadata from a PDF page. + + :param pymupdf.Page page: The PDF page to extract text from + + :return PDFPageMetadata: The metadata for the text content of the page, + including for each line the list of words and bounding box coordinates as + a PDFLineMetadata object. + """ + page_text = page.get_text("words", sort=True) + lines = [] + current_line = 0 + current_block = 1 + line_text = [] + line_rects = [] + n_lines = 0 + + for word in page_text: + x0, y0, x1, y1, word_text, block_no, line_no, _ = word + if line_no != current_line or block_no != current_block: + if line_text: # Don't add empty lines + lines.append( + self._create_line_metadata(line_text, line_rects, n_lines) + ) + n_lines += 1 + line_text = [] + line_rects = [] + current_line = line_no + current_block = block_no + + line_text.append(word_text) + line_rects.append((x0, y0, x1, y1)) + + if line_text: + lines.append(self._create_line_metadata(line_text, line_rects, n_lines)) + + return PDFPageMetadata(page_number=page.number, lines=lines) def _extract_pdf_text(self, file_bytes: BytesIO) -> str: """ @@ -119,10 +189,11 @@ def _extract_pdf_text(self, file_bytes: BytesIO) -> str: :return str: The text content of the PDF """ pdf = pymupdf.open(stream=file_bytes) - page_text = [self._extract_page_text(page) for page in pdf] - if all(text is None for text in page_text): # No text found on any page + pages = [page.get_text().strip() for page in pdf] + + if all(page == "" for page in pages): # No text found on any page return None - return "\n".join(text for text in page_text if text is not None) + return "\n".join(page for page in pages) def _extract_pdf_images(self, file_bytes: BytesIO): """ @@ -159,6 +230,12 @@ def _extract_pdf_images(self, file_bytes: BytesIO): transform.f, ), ) + LoggingUtil().log_info( + f"Loaded image with the following metadata {image_metadata}" + ) + LoggingUtil().log_info( + f"Loaded image with the following metadata {image_metadata}" + ) image_metadata_list.append(image_metadata) return image_metadata_list @@ -180,8 +257,8 @@ def _extract_unique_pdf_images(self, image_metadata: List[PDFImageMetadata]): @classmethod def _is_full_text_being_redacted( - cls, page: pymupdf.Page, term: str, rect: pymupdf.Rect - ) -> Tuple[bool, str]: + cls, term_to_redact: str, line_to_check: PDFLineMetadata + ) -> List[Tuple[str, int, int]]: """ Check whether the text found at the given bounding box on the provided page is an exact match for the given redaction text candidate, i.e., @@ -189,56 +266,89 @@ def _is_full_text_being_redacted( by approximately half a character on each side, to identify partial matches. :param pymupdf.Page page: The page containing the text to redact - :param str term: The redaction text candidate - :param pymupdf.Rect rect: The redaction candidate's bounding box (on the page) - :return tuple(bool, str): Whether the text found at rect on the page is - an exact match for the given redaction text candidate (i.e., the text - should be redacted), and the actual text found at the expanded bounding box. - """ - - # Normalise term: remove punctuation and extra whitespace, convert to lowercase - text_to_redact_normalised = (" ").join(get_normalised_words(term)) - - # Expand rect slightly by approximately a quarter of a character on each side - char_width = rect.width / len(term) - quarter_char_width = char_width / 4 - actual_text_at_rect = page.get_textbox( - rect + (-quarter_char_width, 0, quarter_char_width, 0) - ).strip() # Remove trailing/leading whitespace - words_at_rect_normalised = get_normalised_words(actual_text_at_rect) - - # If the text found contains multiple words, split to the same number of words - # as the term to redact - candidate_words = [] - n_words_to_redact = len(text_to_redact_normalised.split(" ")) - if len(words_at_rect_normalised) >= n_words_to_redact and n_words_to_redact > 1: - for i in range(len(words_at_rect_normalised) - n_words_to_redact + 1): - candidate_words.append( - " ".join(words_at_rect_normalised[i : i + n_words_to_redact]) + :param str term_to_redact: The redaction text candidate + :param PDFLineMetadata line_to_check: The line metadata to check + :return List[Tuple[str, int, int]]: A list of matches found. Each tuple + contains the text found, and the start and end index of the match in the line, + For single-word redaction candidates, this will be a list of all exact matches + found in the line. For multi-word redaction candidates, this will be a list of + potential matches for the first word in the term to redact, with the end index + set to -1 if there is no exact match for the full term. + """ + # Find first word in line that matches the first word in the term to redact + normalised_words_to_redact = get_normalised_words(term_to_redact) + + if len(normalised_words_to_redact) == 1: + # Single term redaction: check for exact match with words in line + matching_indices = [ + i + for i, word in enumerate(line_to_check.words) + if any( + x == normalise_text(term_to_redact) + for x in [word, word[:-2] if word.endswith("'s") else ""] ) - else: - candidate_words = words_at_rect_normalised + ] + return [ + (line_to_check.words[index], index, index) for index in matching_indices + ] + else: # Multi-word redaction + # Find matches for the first word + matching_indices = [ + i + for i, word in enumerate(line_to_check.words) + if word == normalised_words_to_redact[0] + ] + + # Check subsequent words to redact for each first matching index + matches = [] + for index in matching_indices: + candidate_words = [] + end_index = index + words_to_redact = normalised_words_to_redact.copy() + while words_to_redact and end_index < len(line_to_check.words): + word = line_to_check.words[end_index] + word_to_redact = words_to_redact.pop(0) + # Check for exact match or ignore possessive markers + if word == word_to_redact or ( + word.endswith("'s") and word[:-2] == word_to_redact + ): + # Potential match, move to next word + candidate_words.append(word) + end_index += 1 + else: + # No match found + end_index = -1 + break + matches.append((" ".join(candidate_words), index, end_index - 1)) + + return matches - # Check for exact match among candidate words - match_result = False - for word in candidate_words: - match_result = text_to_redact_normalised == word + @classmethod + def _construct_pdf_rect( + cls, line: PDFLineMetadata, start_index: int, end_index: int + ) -> pymupdf.Rect: + """ + Construct the bouding box for the words in the line between the start and + and indices. - # Try to match by ignoring possessive markers - if word.endswith("'s"): - word = word[:-2] - match_result = match_result or text_to_redact_normalised == word + :param PDFLineMetadata line: The line metadata containing the words to redact + :param int start_index: The index of the first word + :param int end_index: The index of the last word - if match_result: - # Once a match is found, no need to check further - return True, actual_text_at_rect - return match_result, actual_text_at_rect + :return pymupdf.Rect: The bounding box + """ + return pymupdf.Rect( + line.x0[start_index], + line.y0, + line.x1[end_index], + line.y1, + ) @classmethod def _add_provisional_redaction(cls, page: pymupdf.Page, rect: pymupdf.Rect): - highlight_annotation = page.add_highlight_annot(rect) # Add the original rect in the subject, since highlight annotations may not have the same rect once created # i.e. this is needed to ensure the final redactions are in the correct location + highlight_annotation = page.add_highlight_annot(rect) highlight_annotation.set_info( { "content": "REDACTION CANDIDATE", @@ -246,50 +356,101 @@ def _add_provisional_redaction(cls, page: pymupdf.Page, rect: pymupdf.Rect): } ) - @classmethod - def _is_partial_redaction_across_line_breaks( - cls, - term: str, - actual_text_at_rect: str, - next_page: pymupdf.Page, - next_rect: pymupdf.Rect, - ) -> bool: + def _check_partial_redaction_across_line_breaks( + self, + normalised_words_to_redact: List[str], + partial_term_found: str, + line_checked: PDFLineMetadata, + page_metadata: PDFPageMetadata, + next_page_metadata: PDFPageMetadata = None, + ) -> Tuple[int, PDFLineMetadata, int]: """ Check if the given term is partially redacted in the current rect, and the remaining part is in the next rect (i.e. redaction across line breaks) - :param str term: The redaction text candidate - :param str actual_text_at_rect: The actual text found at the current rect - :param pymupdf.Page next_page: The next page containing the next redaction instance - :param pymupdf.Rect next_rect: The next redaction candidate's bounding box (on the page) - :return bool: True if the full term is found across the two rects, else False - """ - partial_term_in_rect = "" - words_to_redact = term.split(" ") - # Try to find the largest partial match in the current rect - while True: - if not words_to_redact: - break - partial_term_in_rect = " ".join(words_to_redact) - if partial_term_in_rect in actual_text_at_rect: - break - words_to_redact = words_to_redact[:-1] + :param List[str] normalised_words_to_redact: The redaction text candidate + :param str partial_term_found: The text found on the current line + :param PDFLineMetadata line_checked: The line containing the partial redaction instance + :param PDFPageMetadata page_metadata: The page containing the partial redaction + instance + :param PDFPageMetadata next_page_metadata: The next page containing the next redaction instance + + :return Tuple[int, PDFLineMetadata, int]: If a partial redaction across line + breaks is found, return a tuple containing the page number, line metadata, + and end index of the redaction instance on the next line. Otherwise, return None. + """ + words_to_redact = normalised_words_to_redact.copy() + term_to_redact = " ".join(words_to_redact) # Check next redaction instance for the remaining words - if partial_term_in_rect and partial_term_in_rect != term: + if partial_term_found and partial_term_found != term_to_redact: # Remove the part already found in the current rect - remaining_words_to_redact = term.replace(partial_term_in_rect, "").strip() - # Check if the next rect contains the remaining words to redact - match_result, _ = cls._is_full_text_being_redacted( - next_page, remaining_words_to_redact, next_rect + remaining_words_to_redact = ( + term_to_redact.replace(partial_term_found, "").strip().split(" ") ) - return match_result - return False + # Check if the next line contains the remaining words to redact + next_line = next( + ( + line + for line in page_metadata.lines + if line.line_number == line_checked.line_number + 1 + ), + None, + ) + + if not next_line: + # Check the next page for remaining words to redact + if next_page_metadata: + next_line = next( + line + for line in next_page_metadata.lines + if line.line_number == 0 + ) + page_number = next_page_metadata.page_number + else: + return None + else: + page_number = page_metadata.page_number + + if next_line: + end_index = 0 + while remaining_words_to_redact and end_index < len(next_line.words): + word = next_line.words[end_index] + word_to_redact = remaining_words_to_redact[0] + if word == word_to_redact or ( + word.endswith("'s") and word[:-2] == word_to_redact + ): + remaining_words_to_redact.pop(0) + end_index += 1 + else: + break + if not remaining_words_to_redact: + LoggingUtil().log_info( + f"Partial redaction found across line break for term '{term_to_redact}'." + ) + return page_number, next_line, end_index - 1 + return None + + def _get_next_page_metadata(self, pdf, page_number): + """ + Helper function to get the metadata for the next page if it exists + + :param pdf: The PDF document + :param page_number: The current page number + + :return PDFPageMetadata: The metadata for the next page, or None if there + is no next page + """ + return ( + self._extract_page_text(pdf[page_number + 1]) + if page_number + 1 < len(pdf) + else None + ) @log_to_appins def _apply_provisional_text_redactions( - self, file_bytes: BytesIO, text_to_redact: List[str], n_workers: int = None + self, file_bytes: BytesIO, text_to_redact: List[str] ): """ Redact the given list of redaction strings as provisional redactions in @@ -301,10 +462,8 @@ def _apply_provisional_text_redactions( :return BytesIO: Bytes stream for the PDF with provisional text redactions applied """ - self.file_bytes = file_bytes self.redaction_candidates: List[List[Tuple[pymupdf.Rect, str]]] = [] - - pdf = pymupdf.open(stream=self.file_bytes) + pdf = pymupdf.open(stream=file_bytes) # Find location of each redaction candidate by page for page in pdf: @@ -332,52 +491,34 @@ def _apply_provisional_text_redactions( # Examine redaction candidates: only apply exact matches and partial matches # across line breaks - n_highlights = 0 - with ProcessPoolExecutor( - max_workers=get_max_workers(n_workers), - # Use fork if available to avoid overhead of copying pdf for each process, - # but fall back to spawn if fork is not available (e.g. on Windows) - mp_context=mp.get_context("fork") - if mp.get_start_method() == "fork" - else None, - ) as executor: - # Submit task to the executor - futures_to_page = { - executor.submit( - self._examine_provisional_redactions_on_page, - i, - candidates_on_page, - ): i - for i, candidates_on_page in enumerate(self.redaction_candidates) - } - for future in as_completed(futures_to_page): - try: - page_num = futures_to_page[future] - instances_to_redact_on_page = future.result() - - n_highlights_on_page = len(instances_to_redact_on_page) - n_highlights += n_highlights_on_page - LoggingUtil().log_info( - f"Found {n_highlights_on_page} redaction instances to apply" - f" on page {page_num}." - ) + redaction_instances = [] + for i, page in enumerate(pdf): + if i == 0: + page_metadata = self._extract_page_text(page) + next_page_metadata = self._get_next_page_metadata(pdf, page.number) + else: + page_metadata = next_page_metadata + next_page_metadata = self._get_next_page_metadata(pdf, page.number) + redaction_instances.extend( + self._examine_provisional_redactions_on_page( + self.redaction_candidates[page.number], + page_metadata, + next_page_metadata, + ) + ) - # Apply accepted provisional redaction highlights for the human-in-the-loop to review - for redaction_inst in instances_to_redact_on_page: - page_number, rect, term = redaction_inst - page = pdf[page_number] - self._add_provisional_redaction(page, rect) - LoggingUtil().log_info( - f" Applying highlight for term '{term}' at location" - f" '{rect}' on page {page.number}." - ) - except Exception as e: - LoggingUtil().log_exception( - f"An error occurred while examining provisional text" - f" redactions on page {page_num}: {e}" - ) + n_highlights = 0 + for page_to_redact, rect, term in redaction_instances: + self._add_provisional_redaction(pdf[page_to_redact], rect) + LoggingUtil().log_info( + f" Applied provisional redaction for term '{term}'" + f" at location '{rect}' on page {page_to_redact}." + ) + n_highlights += 1 - LoggingUtil().log_info(f"Applied {n_highlights} redaction highlights.") + LoggingUtil().log_info( + f"Applied {n_highlights} provisional redactions in total." + ) new_file_bytes = BytesIO() pdf.save(new_file_bytes, deflate=True) @@ -386,58 +527,55 @@ def _apply_provisional_text_redactions( @log_to_appins def _examine_provisional_redactions_on_page( - self, page_number: int, candidates_on_page: List[Tuple[pymupdf.Rect, str]] + self, + candidates_on_page: List[Tuple[pymupdf.Rect, str]], + page_metadata: PDFPageMetadata, + next_page_metadata: PDFPageMetadata = None, ) -> List[Tuple[int, pymupdf.Rect, str]]: """ Check whether the provisional redaction candidates on the given page are valid redactions (i.e. full matches or partial matches across line breaks). - :param int page_number: The page number to examine + :param PDFPageMetadata page_metadata: The metadata of the page to examine + :param PDFPageMetadata next_page_metadata: The metadata of the next page to + examine, in case of a line break on the next page :param int candidates_on_page: The list of provisional redaction candidates on the page - :return List[Tuple[int, pymupdf.Rect, str]]: The list of valid - redaction instances to apply on the page. Each tuple contains the page number + :return List[Tuple[PDFPageMetadata, pymupdf.Rect, str]]: The list of valid + redaction instances to apply on the page. Each tuple contains the page metadata (which may be the following page for partial redactions across line breaks), the bounding box to redact, and the full term being redacted. """ - page = pymupdf.open(stream=self.file_bytes)[page_number] - instances_to_redact: List[Tuple[int, pymupdf.Rect, str]] = [] - - for i, (rect, term) in enumerate(candidates_on_page): + redaction_instances = [] + for rect, term_to_redact in candidates_on_page: LoggingUtil().log_info( - f" Validating redaction instance {i} on page {page.number} at" - f" location '{rect}' for term '{term}' ." + f" Examining redaction candidate for term '{term_to_redact}'" ) - try: - instances_to_redact.extend( - self._examine_provisional_text_redaction(page, term, rect, i) - ) - - except Exception as e: - LoggingUtil().log_exception( - f" Failed to validate redaction instance {i} on page" - f" {page.number} for term '{term}', at location '{rect}':" - f" {str(e)}" + redaction_instances.extend( + self._examine_provisional_text_redaction( + term_to_redact, rect, page_metadata, next_page_metadata ) - return instances_to_redact + ) + return redaction_instances @log_to_appins def _examine_provisional_text_redaction( self, - page, - term, - rect, - i, + term_to_redact: str, + rect: pymupdf.Rect, + page_metadata: PDFPageMetadata, + next_page_metadata: PDFPageMetadata = None, ) -> List[Tuple[int, pymupdf.Rect, str]]: """ Check whether the provisional redaction candidate is valid, i.e., a full match or a partial match across line breaks. - :param pymupdf.Page page: The page where the redaction candidate is found :param str term: The redaction text candidate :param pymupdf.Rect rect: The bounding box of the redaction candidate - :param int i: The index of the redaction candidate in the list of candidates - on the page + :param PDFPageMetadata page_metadata: The metadata of the page where the + redaction candidate is found + :param PDFPageMetadata next_page_metadata: The metadata of the next page + to examine, in case of a line break on the next page :return List[Tuple[int, pymupdf.Rect, str]]: The list of valid redaction candidates to apply. Each tuple contains the page number, the bounding box @@ -445,91 +583,97 @@ def _examine_provisional_text_redaction( full matches, a two entry list for partial redactions across line breaks, or an empty list if no valid redaction is found. """ - # Only redact text that is fully matched - do not apply partial redactions - match_result, actual_text_at_rect = self._is_full_text_being_redacted( - page, term, rect + LoggingUtil().log_info( + f" Examining redaction candidate for term '{term_to_redact}'" ) - if match_result: - return [(page.number, rect, term)] - elif ( - len(term.split(" ")) > 1 - ): # Check for line breaks causing partial redactions - candidates_on_page = self.redaction_candidates[page.number] - next_redaction_inst = self._find_next_redaction_instance( - candidates_on_page, i, page + # Find line corresponding to the redaction candidate + lines_on_page = page_metadata.lines + try: + line_to_check = next( + line for line in lines_on_page if line.y0 <= rect.y0 <= line.y1 ) - if next_redaction_inst is None: - return [] - next_page, next_rect, next_term = next_redaction_inst - - # Check this is for the same term - if next_term == term: - # Check whether the remaining part of the term is in the next rect - next_match_result = self._is_partial_redaction_across_line_breaks( - term, - actual_text_at_rect, - next_page, - next_rect, + except StopIteration: + LoggingUtil().log_info( + f" No line found for redaction candidate at location '{rect}'. " + " Skipping this candidate." + ) + return [] + + redaction_instances = [] + matches = self._is_full_text_being_redacted(term_to_redact, line_to_check) + + words_to_redact = get_normalised_words(term_to_redact) + if len(words_to_redact) == 1: + # Single term redaction: check for exact match with words in line + if matches: + # Apply each highlight for match found + for _, start, end in matches: + # Calculate the rect for the individual word to redact + rect = self._construct_pdf_rect(line_to_check, start, end) + redaction_instances.append( + (page_metadata.page_number, rect, term_to_redact) + ) + else: + LoggingUtil().log_info( + f" No exact match found for single-word term '{term_to_redact}'" + f" in line '{line_to_check.words}'. Skipping this candidate." ) - if next_match_result: + else: # Multi-word redaction candidate + if not matches: + return redaction_instances + # Find first word in line that matches the first word in the term to redact + for term_found, start, end in matches: + # No match found + if end == -1: LoggingUtil().log_info( - f" Partial redaction across line break" - f" for term '{term}'." + f" No exact match found for term '{term_to_redact}'" + f" in line '{line_to_check.words}'. Skipping this candidate." + ) + continue + + # Exact match found - apply highlight + elif end - start == len(words_to_redact) - 1: + # Calculate the rect for the term to redact + rect = self._construct_pdf_rect(line_to_check, start, end) + redaction_instances.append( + (page_metadata.page_number, rect, term_to_redact) ) - return [ - (page.number, rect, term), - (next_page.number, next_rect, next_term), - ] - - # No full match found, skip redaction - LoggingUtil().log_info( - " Partial match found when attempting to redact " - f"'{term}'. The surrounding box contains " - f"'{actual_text_at_rect}'. Skipping this candidate." - ) - return [] - def _find_next_redaction_instance( - self, - candidates_on_page: List[Tuple[pymupdf.Rect, str]], - i: int, - page: pymupdf.Page, - ) -> Tuple[pymupdf.Page, pymupdf.Rect, str]: - """ - Find the next redaction instance after the current one. This may be on the - same page or the next page. - - :param List[Tuple[pymupdf.Rect, str]] candidates_on_page: The list of - provisional redaction candidates on the current page - :param int i: The index of the current redaction candidate in the list - :param pymupdf.Page page: The current page - :return Tuple[pymupdf.Page, pymupdf.Rect, str]: The next page - and the next redaction candidate (bounding box and term). If there is no - next redaction candidate, returns (None, (None, None)). - """ - # Check next redaction on the same page - next_redaction_inst = ( - candidates_on_page[i + 1] if i + 1 < len(candidates_on_page) else None - ) - if next_redaction_inst: - next_page = page - next_rect, next_term = next_redaction_inst - else: - # If next page is different, check first redaction on next page - if page.number + 1 < len(self.redaction_candidates): - next_page_redaction_candidates = self.redaction_candidates[ - page.number + 1 - ] - next_page = pymupdf.open(stream=self.file_bytes)[page.number + 1] - next_rect, next_term = ( - next_page_redaction_candidates[0] - if next_page_redaction_candidates - else (None, None) - ) - else: - return None + # Partial match found - check for partial redaction across line breaks + elif end == len(line_to_check.words) - 1: + # Check for partial redaction across line break + result = self._check_partial_redaction_across_line_breaks( + words_to_redact, + term_found, + line_to_check, + page_metadata, + next_page_metadata, + ) + if result: + next_page_number, next_line, next_line_end_index = result + # Remaining part of the term to redact + redaction_instances.extend( + [ + ( + page_metadata.page_number, + self._construct_pdf_rect( + line_to_check, + start, + len(line_to_check.words) - 1, + ), + term_to_redact, + ), + ( + next_page_number, + self._construct_pdf_rect( + next_line, 0, next_line_end_index + ), + term_to_redact, + ), + ] + ) - return next_page, next_rect, next_term + return redaction_instances def _apply_provisional_image_redactions( self, file_bytes: BytesIO, redactions: List[ImageRedactionResult] @@ -579,7 +723,24 @@ def _apply_provisional_image_redactions( pymupdf.Matrix(image_transform), ) ) - self._add_provisional_redaction(page, rect_in_global_space) + LoggingUtil().log_info( + f"Transformed the rect {untransformed_bounding_box} in image-space " + f"for an image with dimensions {(pdf_image.width, pdf_image.height)} " + f"to the new rect {rect_in_global_space} in page-space, using the transform {image_transform}" + ) + try: + self._add_provisional_redaction(page, rect_in_global_space) + LoggingUtil().log_info( + f"Applied image redaction highlight for rect {rect_in_global_space} on page {page.number}" + ) + except Exception as e: + LoggingUtil().log_exception_with_message( + ( + f"Failed to apply image redaction highlight for rect '{rect_in_global_space}' on page " + f"'{page.number}' with dimensions '{page.rect}'" + ), + e, + ) new_file_bytes = BytesIO() pdf.save(new_file_bytes, deflate=True) new_file_bytes.seek(0) @@ -627,7 +788,6 @@ def redact( self, file_bytes: BytesIO, redaction_config: Dict[str, Any], - n_workers: int = None, ) -> BytesIO: """ Redact the given PDF file bytes according to the redaction configuration. @@ -635,20 +795,20 @@ def redact( :param file_bytes: File bytes of the PDF to redact. :param redaction_config: Dictionary of RedactionConfig objects specifying the redaction rules to apply. - :param n_workers: The number of worker processes to use for parallel processing - in applying text redactions. :return: The redacted PDF file bytes. """ # Extract text from PDF pdf_text = self._extract_pdf_text(file_bytes) + LoggingUtil().log_info( + f"The following text was extracted from the PDF:\n'{pdf_text}'" + ) if pdf_text and not is_english_text(pdf_text): - LoggingUtil().log_exception( + exception = NonEnglishContentException( "Language check: non-English or insufficient English content " "detected; skipping provisional redactions." ) - raise NonEnglishContentException - - # Extract images from PDF + LoggingUtil().log_exception(exception) + raise exception pdf_images = self._extract_pdf_images(file_bytes) # Generate list of redaction rules from config @@ -671,20 +831,36 @@ def redact( # Generate redactions # TODO convert back to a set redaction_results: List[RedactionResult] = [] - # Apply each redaction rule + LoggingUtil().log_info("Analysing PDF to identify redactions") + LoggingUtil().log_info("Analysing PDF to identify redactions") for rule_to_apply in redaction_rules_to_apply: - redaction_results.append(rule_to_apply.redact()) + LoggingUtil().log_info(f"Running redaction rule {rule_to_apply}") + redaction_result = rule_to_apply.redact() + LoggingUtil().log_info( + f"The redactor {rule_to_apply} yielded the following result: " + f"{json.dumps(dataclasses.asdict(redaction_result), indent=4, default=str)}" + ) + redaction_results.append(redaction_result) + LoggingUtil().log_info("PDF analysis complete") + # Ingest Stopword list + def _remove_stopwords(self, text_to_redact: List[str]): + stopwords = safe_load(open(os.path.join("config", "stopwords.yaml"), "r")) + stopwords_list = stopwords["stopwords"] + text_to_redact = [x for x in text_to_redact if x.lower() not in stopwords_list] + return text_to_redact # Separate out text and image redaction results text_redaction_results: List[TextRedactionResult] = [ x for x in redaction_results if issubclass(x.__class__, TextRedactionResult) ] - text_redactions = [ - redaction_string - for result in text_redaction_results - for redaction_string in result.redaction_strings - ] + text_redactions = self.remove_stopwords( + [ + redaction_string + for result in text_redaction_results + for redaction_string in result.redaction_strings + ] + ) image_redaction_results: List[ImageRedactionResult] = [ x for x in redaction_results @@ -705,10 +881,10 @@ def redact( ) as e: LoggingUtil().log_exception(e) raise e - + LoggingUtil().log_info("Applying proposed redactions") # Apply text redactions by highlighting text to redact new_file_bytes = self._apply_provisional_text_redactions( - file_bytes, text_redactions, n_workers=n_workers + file_bytes, text_redactions ) # Apply image redactions diff --git a/redactor/core/redaction/redactor.py b/redactor/core/redaction/redactor.py index a32c4f21..746f563c 100644 --- a/redactor/core/redaction/redactor.py +++ b/redactor/core/redaction/redactor.py @@ -31,6 +31,9 @@ from core.util.logging_util import LoggingUtil, log_to_appins from core.util.text_util import get_normalised_words +from yaml import safe_load +import os + class Redactor(ABC): """ @@ -44,6 +47,9 @@ def __init__(self, config: RedactionConfig): self._validate_redaction_config(config) self.config = config + def __str__(self): + return f"{self.__class__.__name__}({self.config.name})" + @classmethod @abstractmethod def get_name(cls) -> str: @@ -129,6 +135,9 @@ def _analyse_text(self, text_to_analyse: str, **kwargs) -> LLMTextRedactionResul # The user's prompt will just be the raw text text_chunks = self.TEXT_SPLITTER.split_text(text_to_analyse) + LoggingUtil().log_info( + f"The text has been broken down into {len(text_chunks)} chunks" + ) # Initialise LLM interface llm_util = LLMUtil(self.config) @@ -140,11 +149,20 @@ def _analyse_text(self, text_to_analyse: str, **kwargs) -> LLMTextRedactionResul ) return llm_redaction_result + def _remove_stopwords_(self,text_to_redact: List[str]): + """ + Check the text_to_redact list against the list in the stopwords yaml + """ + stopwords = safe_load(open(os.path.join("config", "stopwords.yaml"), "r")) + stopwords_list = stopwords["stopwords"] + text_to_redact = list(set(text_to_redact) - set(stopwords_list)) + return text_to_redact + def redact(self) -> LLMTextRedactionResult: self.config: LLMTextRedactionConfig + self.LLMTextRedactionResult.redaction_strings = self._remove_stopwords_(self.LLMTextRedactionResult.redaction_strings) return self._analyse_text(self.config.text) - class ImageRedactor(Redactor): # pragma: no cover """ Class that performs image redaction @@ -357,7 +375,13 @@ def redact(self) -> ImageRedactionResult: try: vision_util = AzureVisionUtil() text_rect_map = vision_util.detect_text(image_to_redact) + LoggingUtil().log_info( + f"The following text analysis was returned by AzureVisionUtil.detect_text: {text_rect_map}" + ) text_content = " ".join([x[0] for x in text_rect_map]) + LoggingUtil().log_info( + f"The following text was extracted from an image in the PDF:\n'{text_content}'" + ) # Analyse detected text with LLM redaction_strings = self._analyse_text(text_content).redaction_strings diff --git a/redactor/core/redaction_manager.py b/redactor/core/redaction_manager.py index 9d3bcb82..8e50485d 100644 --- a/redactor/core/redaction_manager.py +++ b/redactor/core/redaction_manager.py @@ -1,5 +1,5 @@ # import magic # Cannot use magic in the Azure function yet due to needing to build via ACR. This will be added in the future -from typing import Dict, Any, Optional +from typing import Dict, Any, List, Optional from core.redaction.file_processor import ( FileProcessorFactory, ) @@ -14,6 +14,7 @@ import traceback from dotenv import load_dotenv import os +import json load_dotenv(verbose=True, override=True) @@ -51,7 +52,8 @@ def __init__(self, job_id: str): raise RuntimeError( "An 'ENV' environment variable has not been set - please ensure this is set wherever RedactionManager is running" ) - # Ensure the job id is set to the job id + self.runtime_errors: List[str] = [] + # Ensure the logger's job id is set to the job id LoggingUtil(job_id=self.job_id) def convert_kwargs_for_io(self, some_parameters: Dict[str, Any]): @@ -71,6 +73,9 @@ def redact(self, params: Dict[str, Any]): """ Perform a redaction using the supplied parameters """ + LoggingUtil().log_info( + f"Starting the redaction process with params '{json.dumps(params, indent=4)}'" + ) config_name = params.get("configName", "default") file_kind = params.get("fileKind") read_details: Dict[str, Any] = params.get("readDetails") @@ -92,6 +97,7 @@ def redact(self, params: Dict[str, Any]): ) # Load the data + LoggingUtil().log_info("Reading the raw file to redact") read_io_inst = IOFactory.get(read_torage_kind)(**read_storage_properties) file_data = read_io_inst.read(**read_storage_properties) file_data.seek(0) @@ -99,6 +105,7 @@ def redact(self, params: Dict[str, Any]): file_processor_class = FileProcessorFactory.get(file_kind) # Load redaction config + LoggingUtil().log_info(f"Loading the redaction config '{config_name}'") config = ConfigProcessor.load_config(config_name) # Cannot use magic in the Azure function yet due to needing to build via ACR. This will be added in the future # file_format = magic.from_buffer(file_data.read(), mime=True) @@ -111,6 +118,7 @@ def redact(self, params: Dict[str, Any]): ) # Store a copy of the raw data in redaction storage before processing begins + LoggingUtil().log_info("Saving a copy of the raw file to redact") redaction_storage_io_inst.write( file_data, container_name="redactiondata", @@ -119,18 +127,24 @@ def redact(self, params: Dict[str, Any]): # Process the data if skip_redaction: + LoggingUtil().log_info( + "skip_redaction=True, so the redaction process is being skipped" + ) # Allow the process to skip redaction and just return the read data # this should be used just for testing, as a way of quickly verifying the # end to end process for connectivity proposed_redaction_file_data = file_data proposed_redaction_file_data.seek(0) else: + LoggingUtil().log_info("Starting the redaction process") file_processor_inst = file_processor_class() proposed_redaction_file_data = file_processor_inst.redact( file_data, config_cleaned ) + LoggingUtil().log_info("Redaction process complete") # Store a copy of the proposed redactions in redaction storage + LoggingUtil().log_info("Saving a copy of the proposed redactions") redaction_storage_io_inst.write( proposed_redaction_file_data, container_name="redactiondata", @@ -139,23 +153,52 @@ def redact(self, params: Dict[str, Any]): proposed_redaction_file_data.seek(0) # Write the data back to the sender's desired location + LoggingUtil().log_info( + "Sending a copy of the proposed redactions to the caller" + ) write_io_inst = IOFactory.get(write_storage_kind)(**write_storage_properties) write_io_inst.write(proposed_redaction_file_data, **write_storage_properties) + def save_logs(self): + """ + Write a log file locally and in Azure + """ + log_bytes = LoggingUtil().get_log_bytes() + # Dump in Azure + AzureBlobIO( + storage_name=f"pinsstredaction{self.env}uks", + ).write( + data_bytes=log_bytes, + container_name="redactiondata", + blob_path=f"{self.job_id}/log.txt", + ) + def log_exception(self, exception: Exception): """ - Store an exception log in the redaction storage account + Store an exception log """ LoggingUtil().log_exception(exception) error_trace = "".join( traceback.TracebackException.from_exception(exception).format() ) - AzureBlobIO( + self.runtime_errors.append(error_trace) + + def save_exception_log(self): + """ + Save any logged exceptions to the redaction storage account. If there are no exceptions, then nothing is written + Note: This should only be called once - overwrites are not permitted + """ + if not self.runtime_errors: + return + blob_io = AzureBlobIO( storage_name=f"pinsstredaction{self.env}uks", - ).write( - data_bytes=error_trace.encode("utf-8"), + ) + text_encoding = "utf-8" + data_to_write = "\n\n\n".join(self.runtime_errors) + blob_io.write( + data_bytes=data_to_write.encode(text_encoding), container_name="redactiondata", - blob_path=f"{self.job_id}/exception.txt", + blob_path=f"{self.job_id}/exceptions.txt", ) def send_service_bus_completion_message( @@ -205,6 +248,8 @@ def try_redact(self, params: Dict[str, Any]): } ``` """ + fatal_error = None + non_fatal_errors = [] base_response = { "parameters": params, "id": self.job_id, @@ -218,10 +263,40 @@ def try_redact(self, params: Dict[str, Any]): self.log_exception(e) status = "FAIL" message = f"Redaction process failed with the following error: {e}" + fatal_error = message final_output = base_response | {"status": status, "message": message} try: self.send_service_bus_completion_message(params, final_output) except Exception as e: self.log_exception(e) - message = f"Redaction process completed successfully, but failed to submit a service bus message with the following error: {e}" + non_fatal_errors.append( + f"Failed to submit a service bus message with the following error: {e}" + ) + try: + self.save_logs() + except Exception as e: + self.log_exception(e) + non_fatal_errors.append( + f"Failed to write logs with the following error: {e}" + ) + try: + self.save_exception_log() + except Exception as e: + non_fatal_errors.append( + f"Failed to write an exception log with the following error: {e}" + ) + # Return any non-fatal errors to the caller + if non_fatal_errors: + if fatal_error: + message = ( + message + + "\nAdditionally, the following non-fatal errors occurred:\n" + + "\n".join(non_fatal_errors) + ) + else: + message = ( + "Redaction process completed successfully, but had some non-fatal errors:\n" + + "\n".join(non_fatal_errors) + ) + final_output = base_response | {"status": status, "message": message} return final_output diff --git a/redactor/core/util/azure_vision_util.py b/redactor/core/util/azure_vision_util.py index e38d774a..90f79b74 100644 --- a/redactor/core/util/azure_vision_util.py +++ b/redactor/core/util/azure_vision_util.py @@ -7,6 +7,7 @@ from azure.ai.vision.imageanalysis import ImageAnalysisClient from azure.ai.vision.imageanalysis.models import VisualFeatures +from core.util.logging_util import LoggingUtil, log_to_appins from azure.identity import ( ChainedTokenCredential, @@ -14,8 +15,6 @@ AzureCliCredential, ) -from core.util.logging_util import LoggingUtil, log_to_appins - load_dotenv(verbose=True) @@ -29,6 +28,9 @@ def __init__(self): credential = ChainedTokenCredential( ManagedIdentityCredential(), AzureCliCredential() ) + LoggingUtil().log_info( + f"Establishing connection to Azure Computer Vision at {self.azure_endpoint}" + ) self.vision_client = ImageAnalysisClient( endpoint=self.azure_endpoint, credential=credential ) @@ -42,7 +44,7 @@ def detect_faces( :param Image.Image image: The image to analyse :param floatconfidence_threshold: Confidence threshold between 0 and 1 - :returns: Bounding boxes of faces as a 4-tuple of the form (top left corner x, top left corner y, box width, box height), for boxes + :returns: Bounding boxes of faces as a 4-tuple of the form (top left corner x, top left corner y, bottom right corner x, bottom right corner y), for boxes with confidence above the threshold """ try: @@ -74,8 +76,8 @@ def detect_faces( "box": ( person.bounding_box.x, person.bounding_box.y, - person.bounding_box.width, - person.bounding_box.height, + person.bounding_box.x + person.bounding_box.width, + person.bounding_box.y + person.bounding_box.height, ), "confidence": person.confidence, } diff --git a/redactor/core/util/llm_util.py b/redactor/core/util/llm_util.py index b27ac482..e93a1c4e 100644 --- a/redactor/core/util/llm_util.py +++ b/redactor/core/util/llm_util.py @@ -2,6 +2,7 @@ import time from typing import List +from tenacity.retry import retry_if_exception_type from tenacity import retry, wait_random_exponential, stop_after_attempt from pydantic import BaseModel from dotenv import load_dotenv @@ -12,7 +13,7 @@ ) from langchain_core.prompts import PromptTemplate from concurrent.futures import ThreadPoolExecutor, as_completed -from openai import AzureOpenAI +from openai import AzureOpenAI, RateLimitError from openai.types.chat.parsed_chat_completion import ParsedChatCompletion from tiktoken import get_encoding from threading import Semaphore @@ -24,6 +25,9 @@ ) from core.util.logging_util import log_to_appins, LoggingUtil from core.util.multiprocessing_util import TokenSemaphore, get_max_workers +import json + +from core.ConfigProcessor import load_config load_dotenv(verbose=True) @@ -79,6 +83,9 @@ def __init__( self.token = credential.get_token( "https://cognitiveservices.azure.com/.default" ).token + LoggingUtil().log_info( + f"Establishing connection to the LLM at {self.azure_endpoint}" + ) self.llm = AzureOpenAI( azure_endpoint=self.azure_endpoint, api_version="2024-12-01-preview", @@ -174,7 +181,9 @@ def _num_tokens_consumed( total_tokens = n_tokens + completion_tokens return total_tokens except Exception as e: - LoggingUtil().log_exception(e) + LoggingUtil().log_exception_with_message( + "An error occurred while counting tokens:", e + ) return 0 def create_api_message( @@ -199,7 +208,9 @@ def invoke_chain(self, api_messages: str, response_format: BaseModel): @log_to_appins # exponential backoff to increase wait time between retries https://platform.openai.com/docs/guides/rate-limits + # Only retry if there is a rate limit exception. All other errors are logged and skipped @retry( + retry=retry_if_exception_type((RateLimitError, TimeoutError)), wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10), before_sleep=lambda retry_state: LoggingUtil().log_info("Retrying..."), @@ -211,6 +222,8 @@ def _analyse_text_chunk( user_prompt: str, ) -> tuple[ParsedChatCompletion, List[str]]: """Redact a single chunk of text using the LLM.""" + # Chunk hash to distinguish between messages when multithreading + chunk_hash_string = f"(chunk ID {hash(user_prompt)})" # Estimate tokens for the request api_messages = self.create_api_message(system_prompt, user_prompt) estimated_tokens = self._num_tokens_consumed(api_messages) @@ -220,9 +233,10 @@ def _analyse_text_chunk( timeout=self.config.request_timeout ) # returns True if acquired, False on timeout if not thread_available: - LoggingUtil().log_exception( - "Timeout while waiting for request semaphore to be available." + exception = TimeoutError( + f"{chunk_hash_string} Timeout while waiting for request semaphore to be available." ) + LoggingUtil().log_exception(exception) raise TimeoutError try: @@ -230,24 +244,35 @@ def _analyse_text_chunk( try: self.token_semaphore.acquire(estimated_tokens) except TimeoutError as te: - LoggingUtil().log_exception( - f"Timeout while waiting for tokens to be released: {te}" + LoggingUtil().log_exception_with_message( + f"{chunk_hash_string} Timeout while waiting for tokens to be released :", + te, ) raise te try: + LoggingUtil().log_info( + f"{chunk_hash_string} The following messages were sent to the LLM: {api_messages}" + ) response = self.invoke_chain(api_messages, LLMRedactionResultFormat) + LoggingUtil().log_info( + f"{chunk_hash_string} The following raw message were received by the LLM: {api_messages}" + ) response_cleaned: LLMRedactionResultFormat = response.choices[ 0 ].message.parsed redaction_strings = response_cleaned.redaction_strings + LoggingUtil().log_info( + f"{chunk_hash_string} The following redaction_strings were generated" + ) # Update token counts and costs self._compute_costs(response) return response, redaction_strings except Exception as e: - LoggingUtil().log_exception( - f"An error occurred while processing the chunk: {e}" + LoggingUtil().log_exception_with_message( + f"{chunk_hash_string} An error occurred while processing the chunk:", + e, ) raise e finally: @@ -280,6 +305,10 @@ def analyse_text( Based on https://github.com/mahmoudhage21/Parallel-LLM-API-Requester/blob/main/src/Parallel_LLM_API_Requester.py """ + chunk_hashes = [{"chunk": chunk, "hash": hash(chunk)} for chunk in text_chunks] + LoggingUtil().log_info( + f"The following text chunks will be processed: {json.dumps(chunk_hashes, indent=4)}" + ) # Initialise LLM interface request_counter = 0 @@ -320,8 +349,9 @@ def analyse_text( responses.append(response) text_to_redact.extend(redaction_strings) except Exception as e: - LoggingUtil().log_exception( - f"Function call with chunk {chunk} generated an exception: {e}" + LoggingUtil().log_exception_with_message( + f"Function call with chunk ID ({hash(chunk)}) generated an exception:", + e, ) # Check budget after each request @@ -332,6 +362,12 @@ def analyse_text( ) break + + # Remove stopwords + stopwords = ConfigProcessor.load_config(str = "stopwords") + stopwords_list = stopwords["stopwords"] + text_to_redact = list(set(text_to_redact) - set(stopwords_list)) + # Remove duplicates text_to_redact_cleaned = tuple(dict.fromkeys(text_to_redact)) diff --git a/redactor/core/util/logging_util.py b/redactor/core/util/logging_util.py index dc66406b..f113821e 100644 --- a/redactor/core/util/logging_util.py +++ b/redactor/core/util/logging_util.py @@ -7,6 +7,7 @@ from dotenv import load_dotenv from azure.monitor.opentelemetry import configure_azure_monitor +import traceback load_dotenv(verbose=True) @@ -63,7 +64,8 @@ def __init__(self, *args, **kwargs): self.job_id = kwargs.pop("job_id", uuid4()) self.namespace = kwargs.pop("namespace", "redactor_logs") self.log_file = kwargs.pop("log_file", None) - self.log_level = kwargs.pop("log_level", logging.INFO) + self.log_level = kwargs.pop("log_level", logging.DEBUG) + self.raw_logs = "" app_insights_connection_string = os.environ.get( "APP_INSIGHTS_CONNECTION_STRING", None @@ -101,13 +103,38 @@ def log_info(self, msg: str): """ Log an information message """ - self.logger.info(f"{self.job_id}: {msg}") + message = f"{self.job_id}: {msg}" + self.raw_logs += f"INFO: {message}\n" + self.logger.info(message) def log_exception(self, ex: Exception): """ Log an exception """ - self.logger.exception(f"{self.job_id}: {ex}") + stack_trace = "".join(traceback.TracebackException.from_exception(ex).format()) + message = f"{self.job_id}: {ex}\n\n The Exception stack trace is below:\n\n{stack_trace}\n" + self.raw_logs += f"ERROR: {message}\n" + self.logger.exception(message) + + def log_exception_with_message(self, message: str, ex: Exception): + """ + Log an exception + """ + stack_trace = "".join(traceback.TracebackException.from_exception(ex).format()) + message = f"{self.job_id}: {message}: {ex}\n\n The Exception stack trace is below:\n\n{stack_trace}\n" + self.raw_logs += f"ERROR: {message}\n" + self.logger.exception(message) + + def log_warning(self, msg: str): + """ + Log a warning message + """ + message = f"{self.job_id}: {msg}" + self.raw_logs += f"WARNING: {message}\n" + self.logger.warning(message) + + def get_log_bytes(self) -> bytes: + return self.raw_logs.encode("utf-8") def log_to_appins(_func=None, *args, **kwargs): @@ -145,8 +172,8 @@ def wrapper(*args, **kwargs): try: return func(*args, **kwargs) except Exception as e: - logger.log_exception( - f"Exception raised in function {func.__name__}: {e}" + logger.log_exception_with_message( + f"Exception raised in function {func.__name__}:", e ) raise e diff --git a/redactor/core/util/text_util.py b/redactor/core/util/text_util.py index 53ceb960..b72a2a49 100644 --- a/redactor/core/util/text_util.py +++ b/redactor/core/util/text_util.py @@ -60,6 +60,16 @@ def normalise_punctuation_unidecode(text: str) -> str: ) +def normalise_text(text: str) -> str: + """ + Normalise text by normalising punctuation and converting to lowercase. + + :param str text: The text to normalise + :return str: The normalised text + """ + return normalise_punctuation_unidecode(text).lower().strip().strip(punctuation) + + def get_normalised_words(text: str) -> List[str]: """ Normalise the given text into a list of words for redaction matching @@ -67,11 +77,7 @@ def get_normalised_words(text: str) -> List[str]: :param str text: The text to normalise :return List[str]: The list of normalised words """ - text_normalised = ( - normalise_punctuation_unidecode(text) # Normalise punctuation to ASCII - .lower() - .split(" ") - ) + text_normalised = normalise_text(text).split(" ") words_normalised = [ word.strip().strip( # Remove leading/trailing whitespace punctuation # Remove punctuation around the word diff --git a/redactor/test/integration_test/redaction/file_processor/test_config_processor.py b/redactor/test/integration_test/redaction/file_processor/test_config_processor.py index da602e8d..13177a3c 100644 --- a/redactor/test/integration_test/redaction/file_processor/test_config_processor.py +++ b/redactor/test/integration_test/redaction/file_processor/test_config_processor.py @@ -17,12 +17,25 @@ def test__config_processor__process_config(): llm_text_redaction_attributes = { "model": "gpt-4.1", "system_prompt": "You are a thorough assistant that extracts all of the requested terms from a given text.", - "redaction_terms": ["People's names. List each part of the name separately."], + "redaction_terms": [ + "People's names", + "Personal addresses and postcodes", + "Personal email addresses, unless it's a Planning Inspectorate email", + "Telephone numbers, unless it's a Planning Inspectorate customer service team telephone number", + "National Insurance Numbers, e.g. AB 12 34 56 C", + "Hyperlinks, except those that are .gov.uk, .org, .gov.wales", + "Personal health information, e.g. illnesses or concerning a person's sex life. List each term as it appears in the text.", + "Personal data revealing ethnic origin, political opinions, philosophical beliefs, or trade union membership", + "Criminal offence data, e.g. allegations, investigations, proceedings, penalties", + "Any defamatory (libellous) or inflammatory information", + "Specific financial information such as bank accounts, salary details, house valuations, bonuses, or shares", + "Dates of birth, and ages of people", + "The location of any of the following: badger sett, bat maternity roost, bird nest", + ], "constraints": [ - "Do not include locations or organisations", "Do not include names of anything which is not a person", - "Do not list the author of the text", - "Do not include those on whose behalf the text was written", + "Do not include the name of the author of the text", + "Do not include the names of those on whose behalf the text was written", ], } diff --git a/redactor/test/integration_test/redaction/file_processor/test_pdf_processor.py b/redactor/test/integration_test/redaction/file_processor/test_pdf_processor.py index 01453ece..0aa8d221 100644 --- a/redactor/test/integration_test/redaction/file_processor/test_pdf_processor.py +++ b/redactor/test/integration_test/redaction/file_processor/test_pdf_processor.py @@ -24,7 +24,7 @@ def test__pdf_processor__examine_provisional_text_redaction(): [ ( # Single word redaction pymupdf.Rect( - 72.0, 101.452392578125, 97.65274810791016, 113.741455078125 + 72.0, 101.452392578125, 101.31322479248047, 113.741455078125 ), "Riker", ), @@ -32,7 +32,7 @@ def test__pdf_processor__examine_provisional_text_redaction(): pymupdf.Rect( 180.76254272460938, 145.0911865234375, - 267.51654052734375, + 270.5718994140625, 157.3802490234375, ), "Commander Data", @@ -42,29 +42,32 @@ def test__pdf_processor__examine_provisional_text_redaction(): pymupdf.Rect( 470.42864990234375, 101.452392578125, - 495.6957702636719, + 492.6402282714844, 113.741455078125, ), "Your Honour", # "Your" on first line ), ( pymupdf.Rect( - 72.0, 115.9986572265625, 108.05452728271484, 128.2877197265625 + 72.0, 115.9986572265625, 110.5030746459961, 128.2877197265625 ), "Your Honour", # "Honour" on second line ), ] ] - page = pymupdf.open(stream=document_bytes)[0] pdf_processor = PDFProcessor() - pdf_processor.file_bytes = document_bytes pdf_processor.redaction_candidates = redaction_candidates + pdf = pymupdf.open(stream=document_bytes) instances_to_redact = [] for i, (rect, term) in enumerate(redaction_candidates[0]): instances_to_redact.append( - pdf_processor._examine_provisional_text_redaction(page, term, rect, i) + pdf_processor._examine_provisional_text_redaction( + term, + rect, + pdf_processor._extract_page_text(pdf[0]), + ) ) expected_result = [ @@ -72,7 +75,7 @@ def test__pdf_processor__examine_provisional_text_redaction(): ( 0, pymupdf.Rect( - 72.0, 101.452392578125, 97.65274810791016, 113.741455078125 + 72.0, 101.452392578125, 101.31322479248047, 113.741455078125 ), "Riker", ), @@ -83,7 +86,7 @@ def test__pdf_processor__examine_provisional_text_redaction(): pymupdf.Rect( 180.76254272460938, 145.0911865234375, - 267.51654052734375, + 270.5718994140625, 157.3802490234375, ), "Commander Data", @@ -95,7 +98,7 @@ def test__pdf_processor__examine_provisional_text_redaction(): pymupdf.Rect( 470.42864990234375, 101.452392578125, - 495.6957702636719, + 492.6402282714844, 113.741455078125, ), "Your Honour", @@ -103,7 +106,7 @@ def test__pdf_processor__examine_provisional_text_redaction(): ( 0, pymupdf.Rect( - 72.0, 115.9986572265625, 108.05452728271484, 128.2877197265625 + 72.0, 115.9986572265625, 110.5030746459961, 128.2877197265625 ), "Your Honour", ), @@ -124,14 +127,14 @@ def test__pdf_processor__examine_provisional_redactions_on_page(): document_bytes = BytesIO(f.read()) redaction_candidates = [ ( - pymupdf.Rect(72.0, 101.452392578125, 97.65274810791016, 113.741455078125), + pymupdf.Rect(72.0, 101.452392578125, 101.31322479248047, 113.741455078125), "Riker", ), ( pymupdf.Rect( 164.2420654296875, 101.452392578125, - 199.68487548828125, + 203.34519958496094, 113.741455078125, ), "Phillipa", @@ -140,52 +143,25 @@ def test__pdf_processor__examine_provisional_redactions_on_page(): pymupdf.Rect( 180.76254272460938, 145.0911865234375, - 267.51654052734375, + 270.5718994140625, 157.3802490234375, ), "Commander Data", ), ] pdf_processor = PDFProcessor() - pdf_processor.file_bytes = document_bytes + pdf_processor.redaction_candidates = [redaction_candidates] + pdf = pymupdf.open(stream=document_bytes) + instances_to_redact = pdf_processor._examine_provisional_redactions_on_page( - 0, redaction_candidates + redaction_candidates, + pdf_processor._extract_page_text(pdf[0]), ) assert instances_to_redact == [ (0, rect, term) for rect, term in redaction_candidates ] -def test__find_next_redaction_instance__on_page(): - """ - Given a text redaction candidate list for a PDF - I want to find the next instance of a redaction on the same page - """ - with open("test/resources/pdf/test__pdf_processor__source.pdf", "rb") as f: - document_bytes = BytesIO(f.read()) - redaction_candidates = [ - ( - pymupdf.Rect(72.0, 101.452392578125, 97.65274810791016, 113.741455078125), - "Riker", - ), - ( - pymupdf.Rect( - 164.2420654296875, - 101.452392578125, - 199.68487548828125, - 113.741455078125, - ), - "Phillipa", - ), - ] - pdf_processor = PDFProcessor() - page = pymupdf.open(stream=document_bytes)[0] - next_instance = pdf_processor._find_next_redaction_instance( - redaction_candidates, 0, page - ) - assert next_instance == (page, *redaction_candidates[1]) - - def test__pdf_processor__apply_provisional_text_redactions(): """ - Given I have a PDF with some provisional redactions diff --git a/redactor/test/integration_test/test_redaction_manager.py b/redactor/test/integration_test/test_redaction_manager.py index d053c489..366e2b0b 100644 --- a/redactor/test/integration_test/test_redaction_manager.py +++ b/redactor/test/integration_test/test_redaction_manager.py @@ -167,6 +167,11 @@ def test__redaction__manager__try_redact__skip_redaction(self): blob_bytes = blob_client.download_blob().read() assert pdf_bytes == blob_bytes self.validate_service_bus_message_sent(guid) + log_container_client = blob_service_client.get_container_client("redactiondata") + log_blob_client = log_container_client.get_blob_client(f"{guid}/log.txt") + assert log_blob_client.exists(), ( + f"Expected {guid}/log.txt to be in the redactiondata container, but was missing" + ) def test__redaction__manager__try_redact(self): """ @@ -235,6 +240,11 @@ def test__redaction__manager__try_redact(self): "The uploaded PDF should have some of its content marked for redaction" ) self.validate_service_bus_message_sent(guid) + log_container_client = blob_service_client.get_container_client("redactiondata") + log_blob_client = log_container_client.get_blob_client(f"{guid}/log.txt") + assert log_blob_client.exists(), ( + f"Expected {guid}/log.txt to be in the redactiondata container, but was missing" + ) def test__redaction_manager__try_redact__failure(self): """ @@ -268,5 +278,11 @@ def test__redaction_manager__try_redact__failure(self): response = manager.try_redact(params) assert response["status"] == "FAIL" container_client = blob_service_client.get_container_client("redactiondata") - blob_client = container_client.get_blob_client(f"{guid}/exception.txt") - assert blob_client.exists() + exception_blob_client = container_client.get_blob_client( + f"{guid}/exceptions.txt" + ) + assert exception_blob_client.exists() + log_blob_client = container_client.get_blob_client(f"{guid}/log.txt") + assert log_blob_client.exists(), ( + f"Expected {guid}/log.txt to be in the redactiondata container, but was missing" + ) diff --git a/redactor/test/integration_test/util/test_azure_vision_util.py b/redactor/test/integration_test/util/test_azure_vision_util.py index 686eb6d5..a1ef5e9b 100644 --- a/redactor/test/integration_test/util/test_azure_vision_util.py +++ b/redactor/test/integration_test/util/test_azure_vision_util.py @@ -20,7 +20,7 @@ def test__azure_vision_util__detect_faces(): response = AzureVisionUtil().detect_faces(image, confidence_threshold=0.5) # Azure Vision seems to be deterministic from testing - expected_response = ((0, 4, 409, 427), (360, 7, 407, 424)) + expected_response = ((0, 4, 409, 431), (360, 7, 767, 431)) assert expected_response == response @@ -34,7 +34,7 @@ def test__azure_vision_util__detect_faces__use_cached_result(): # Azure Vision seems to be deterministic from testing new_response = AzureVisionUtil().detect_faces(image, confidence_threshold=0.5) - expected_response = ((0, 4, 409, 427), (360, 7, 407, 424)) + expected_response = ((0, 4, 409, 431), (360, 7, 767, 431)) assert expected_response == new_response assert response == new_response diff --git a/redactor/test/resources/pdf/pas_01.pdf b/redactor/test/resources/pdf/pas_01.pdf new file mode 100644 index 00000000..28e90efa Binary files /dev/null and b/redactor/test/resources/pdf/pas_01.pdf differ diff --git a/redactor/test/resources/pdf/pas_01_PROVISIONAL_edited.pdf b/redactor/test/resources/pdf/pas_01_PROVISIONAL_edited.pdf new file mode 100644 index 00000000..e1d67d5b Binary files /dev/null and b/redactor/test/resources/pdf/pas_01_PROVISIONAL_edited.pdf differ diff --git a/redactor/test/resources/pdf/pas_02.pdf b/redactor/test/resources/pdf/pas_02.pdf new file mode 100644 index 00000000..dda1290e Binary files /dev/null and b/redactor/test/resources/pdf/pas_02.pdf differ diff --git a/redactor/test/resources/pdf/pas_03.pdf b/redactor/test/resources/pdf/pas_03.pdf new file mode 100644 index 00000000..7253c313 Binary files /dev/null and b/redactor/test/resources/pdf/pas_03.pdf differ diff --git a/redactor/test/resources/pdf/pas_04.pdf b/redactor/test/resources/pdf/pas_04.pdf new file mode 100644 index 00000000..016ffe28 Binary files /dev/null and b/redactor/test/resources/pdf/pas_04.pdf differ diff --git a/redactor/test/resources/pdf/pas_05.pdf b/redactor/test/resources/pdf/pas_05.pdf new file mode 100644 index 00000000..727b80de Binary files /dev/null and b/redactor/test/resources/pdf/pas_05.pdf differ diff --git a/redactor/test/resources/pdf/pas_06.pdf b/redactor/test/resources/pdf/pas_06.pdf new file mode 100644 index 00000000..090fe2d2 Binary files /dev/null and b/redactor/test/resources/pdf/pas_06.pdf differ diff --git a/redactor/test/resources/pdf/pas_07.pdf b/redactor/test/resources/pdf/pas_07.pdf new file mode 100644 index 00000000..0e37646f Binary files /dev/null and b/redactor/test/resources/pdf/pas_07.pdf differ diff --git a/redactor/test/resources/pdf/pas_08.pdf b/redactor/test/resources/pdf/pas_08.pdf new file mode 100644 index 00000000..1b2e7e78 Binary files /dev/null and b/redactor/test/resources/pdf/pas_08.pdf differ diff --git a/redactor/test/resources/pdf/pas_09.pdf b/redactor/test/resources/pdf/pas_09.pdf new file mode 100644 index 00000000..cd21064a Binary files /dev/null and b/redactor/test/resources/pdf/pas_09.pdf differ diff --git a/redactor/test/resources/pdf/pas_10.pdf b/redactor/test/resources/pdf/pas_10.pdf new file mode 100644 index 00000000..1ab1d008 Binary files /dev/null and b/redactor/test/resources/pdf/pas_10.pdf differ diff --git a/redactor/test/resources/pdf/pas_11.pdf b/redactor/test/resources/pdf/pas_11.pdf new file mode 100644 index 00000000..0754e035 Binary files /dev/null and b/redactor/test/resources/pdf/pas_11.pdf differ diff --git a/redactor/test/resources/pdf/pas_12.pdf b/redactor/test/resources/pdf/pas_12.pdf new file mode 100644 index 00000000..09b56fdf Binary files /dev/null and b/redactor/test/resources/pdf/pas_12.pdf differ diff --git a/redactor/test/resources/pdf/pas_14.pdf b/redactor/test/resources/pdf/pas_14.pdf new file mode 100644 index 00000000..2bc60e45 Binary files /dev/null and b/redactor/test/resources/pdf/pas_14.pdf differ diff --git a/redactor/test/resources/pdf/pas_15.pdf b/redactor/test/resources/pdf/pas_15.pdf new file mode 100644 index 00000000..b10be061 Binary files /dev/null and b/redactor/test/resources/pdf/pas_15.pdf differ diff --git a/redactor/test/resources/pdf/pas_16.pdf b/redactor/test/resources/pdf/pas_16.pdf new file mode 100644 index 00000000..2d05dca9 Binary files /dev/null and b/redactor/test/resources/pdf/pas_16.pdf differ diff --git a/redactor/test/resources/pdf/pas_18.pdf b/redactor/test/resources/pdf/pas_18.pdf new file mode 100644 index 00000000..88cc54c5 Binary files /dev/null and b/redactor/test/resources/pdf/pas_18.pdf differ diff --git a/redactor/test/resources/pdf/pas_19.pdf b/redactor/test/resources/pdf/pas_19.pdf new file mode 100644 index 00000000..bfc2c03d Binary files /dev/null and b/redactor/test/resources/pdf/pas_19.pdf differ diff --git a/redactor/test/resources/pdf/pas_20.pdf b/redactor/test/resources/pdf/pas_20.pdf new file mode 100644 index 00000000..ac110910 Binary files /dev/null and b/redactor/test/resources/pdf/pas_20.pdf differ diff --git a/redactor/test/resources/pdf/pas_21.pdf b/redactor/test/resources/pdf/pas_21.pdf new file mode 100644 index 00000000..46e11c2a Binary files /dev/null and b/redactor/test/resources/pdf/pas_21.pdf differ diff --git a/redactor/test/resources/pdf/pas_22.pdf b/redactor/test/resources/pdf/pas_22.pdf new file mode 100644 index 00000000..ebdce5a5 Binary files /dev/null and b/redactor/test/resources/pdf/pas_22.pdf differ diff --git a/redactor/test/resources/pdf/pas_23.pdf b/redactor/test/resources/pdf/pas_23.pdf new file mode 100644 index 00000000..63449a84 Binary files /dev/null and b/redactor/test/resources/pdf/pas_23.pdf differ diff --git a/redactor/test/resources/pdf/pas_24.pdf b/redactor/test/resources/pdf/pas_24.pdf new file mode 100644 index 00000000..bf826aa2 Binary files /dev/null and b/redactor/test/resources/pdf/pas_24.pdf differ diff --git a/redactor/test/resources/pdf/pas_25.pdf b/redactor/test/resources/pdf/pas_25.pdf new file mode 100644 index 00000000..2b0fd99c Binary files /dev/null and b/redactor/test/resources/pdf/pas_25.pdf differ diff --git a/redactor/test/resources/pdf/pas_26.pdf b/redactor/test/resources/pdf/pas_26.pdf new file mode 100644 index 00000000..eaf44071 Binary files /dev/null and b/redactor/test/resources/pdf/pas_26.pdf differ diff --git a/redactor/test/resources/pdf/pas_27.pdf b/redactor/test/resources/pdf/pas_27.pdf new file mode 100644 index 00000000..8c198e1d Binary files /dev/null and b/redactor/test/resources/pdf/pas_27.pdf differ diff --git a/redactor/test/resources/pdf/pas_28.pdf b/redactor/test/resources/pdf/pas_28.pdf new file mode 100644 index 00000000..b62b9a42 Binary files /dev/null and b/redactor/test/resources/pdf/pas_28.pdf differ diff --git a/redactor/test/resources/pdf/pins_anon_samples_nosigs.pdf b/redactor/test/resources/pdf/pins_anon_samples_nosigs.pdf new file mode 100644 index 00000000..5616d7e5 Binary files /dev/null and b/redactor/test/resources/pdf/pins_anon_samples_nosigs.pdf differ diff --git a/redactor/test/resources/pdf/sample1.pdf b/redactor/test/resources/pdf/sample1.pdf new file mode 100644 index 00000000..fa6a1f95 Binary files /dev/null and b/redactor/test/resources/pdf/sample1.pdf differ diff --git a/redactor/test/resources/pdf/sample6.pdf b/redactor/test/resources/pdf/sample6.pdf new file mode 100644 index 00000000..bd2aa4d7 Binary files /dev/null and b/redactor/test/resources/pdf/sample6.pdf differ diff --git a/redactor/test/unit_test/conftest.py b/redactor/test/unit_test/conftest.py index 6ed2f752..4277e4d7 100644 --- a/redactor/test/unit_test/conftest.py +++ b/redactor/test/unit_test/conftest.py @@ -13,7 +13,13 @@ def mock_logging_util(request): with patch.object(LoggingUtil, "__init__", return_value=None): with patch.object(LoggingUtil, "log_info", return_value=None): with patch.object(LoggingUtil, "log_exception", return_value=None): - yield + with patch.object( + LoggingUtil, "log_exception_with_message", return_value=None + ): + with patch.object( + LoggingUtil, "log_warning", return_value=None + ): + yield def pytest_configure(): diff --git a/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py b/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py index 56a1aab0..32d372ce 100644 --- a/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py +++ b/redactor/test/unit_test/redaction/file_processor/test_pdf_processor.py @@ -4,16 +4,17 @@ from PIL import Image from io import BytesIO from mock import patch, Mock -from concurrent.futures import ProcessPoolExecutor from core.redaction.file_processor import ( PDFProcessor, PDFImageMetadata, + PDFPageMetadata, + PDFLineMetadata, ) from core.redaction.result import ( ImageRedactionResult, ) -from core.util.text_util import is_english_text +from core.util.text_util import is_english_text, get_normalised_words from core.redaction.exceptions import NonEnglishContentException @@ -234,155 +235,259 @@ def test__pdf_processor__transform_bounding_box_to_global_space__scale_non_unifo assert expected_transformed_bounding_box == actual_transformed_bounding_box -def test__pdf_processor__find_next_redaction_instance__same_page(): - candidates_on_page = [ - (pymupdf.Rect(0, 0, 10, 10), "term1"), - (pymupdf.Rect(20, 20, 30, 30), "term2"), - ] - current_index = 0 # Start at first candidate - page = pymupdf.open().new_page() - - with patch.object(PDFProcessor, "__init__", return_value=None): - next_page, next_rect, next_term = PDFProcessor()._find_next_redaction_instance( - candidates_on_page, current_index, page - ) - - assert next_page == page - assert next_rect == candidates_on_page[1][0] - assert next_term == candidates_on_page[1][1] +def test__pdf_processor__create_line_metadata(): + """ + - Given I have a line of text with some metadata, and a bounding box that partially overlaps with that line + - When I call _update_line_info with the bounding box and the line metadata + - Then the line metadata should be updated to reflect the redaction of the text within the bounding box + """ + expected_line_metadata = PDFLineMetadata( + line_number=0, + words=("hello", "world"), + y0=0, + y1=10, + x0=(0, 15), + x1=(10, 25), + ) + line_metadata = PDFProcessor()._create_line_metadata( + ["Hello", "World!"], + [ + pymupdf.Rect( + 0, + 0, + 10, + 10, + ), + pymupdf.Rect(15, 0, 25, 10), + ], + 0, + ) + assert expected_line_metadata == line_metadata -def test__pdf_processor__find_next_redaction_instance__next_page(): - candidates_on_first_page = [ - (pymupdf.Rect(0, 0, 10, 10), "term1"), - (pymupdf.Rect(20, 20, 30, 30), "term2"), - ] - first_page = pymupdf.open().new_page() +def test__pdf_processor__extract_page_text(): + page = pymupdf.open().new_page() + with patch.object( + pymupdf.Page, + "get_text", + return_value=[ + (0, 0, 10, 10, "Hello", 0, 0, None), + (5, 0, 15, 10, "World", 0, 0, None), + (0, 10, 10, 20, "Hey", 0, 1, None), + (5, 10, 15, 20, "there", 0, 1, None), + ], + ): + page_metadata = PDFProcessor()._extract_page_text(page) + + expected_page_metadata = PDFPageMetadata( + page_number=page.number, + lines=[ + PDFLineMetadata( + line_number=0, + words=("hello", "world"), + y0=0, + y1=10, + x0=(0, 5), + x1=(10, 15), + ), + PDFLineMetadata( + line_number=1, + words=("hey", "there"), + y0=10, + y1=20, + x0=(0, 5), + x1=(10, 15), + ), + ], + ) - candidates_on_second_page = [ - (pymupdf.Rect(0, 0, 10, 10), "term3"), - (pymupdf.Rect(20, 20, 30, 30), "term4"), - ] - second_page = pymupdf.open().new_page() + assert expected_page_metadata == page_metadata - with patch.object(PDFProcessor, "__init__", return_value=None): - pdf_processor = PDFProcessor() - pdf_processor.redaction_candidates = [ - candidates_on_first_page, - candidates_on_second_page, - ] - pdf_processor.file_bytes = BytesIO() # Dummy value for file_bytes - with patch.object(pymupdf, "open", return_value=[first_page, second_page]): - next_page, next_rect, next_term = ( - pdf_processor._find_next_redaction_instance( - candidates_on_first_page, 1, first_page +def create_mock_page_metadata( + page_number, lines=None, y0=None, y1=None, x0=None, x1=None +): + line_metadata = [] + + if lines: + for i, line in enumerate(lines): + line_metadata.append( + PDFLineMetadata( + line_number=i, + words=tuple(get_normalised_words(line)), + y0=y0[i], + y1=y1[i], + x0=tuple(x0[i]), + x1=tuple(x1[i]), ) ) - pymupdf.open.assert_called_with( - stream=pdf_processor.file_bytes - ) # Ensure file_bytes used - - assert next_page == second_page - assert next_rect == candidates_on_second_page[0][0] - assert next_term == candidates_on_second_page[0][1] + return PDFPageMetadata(page_number=page_number, lines=line_metadata) -def test__pdf_processor__is_partial_redaction_across_line_breaks(): +def test__pdf_processor__check_partial_redaction_across_line_breaks(): + page_metadata = create_mock_page_metadata( + page_number=0, + lines=["Hello", "World"], + y0=[0, 20], + y1=[10, 30], + x0=[[0], [0]], + x1=[[10], [10]], + ) term = "Hello World" - next_rect = pymupdf.Rect(0, 0, 10, 20) - next_page = pymupdf.open().new_page() + normalised_words_to_redact = get_normalised_words(term) - with patch.object( - PDFProcessor, "_is_full_text_being_redacted", return_value=(True, "World") - ): - match_result = PDFProcessor()._is_partial_redaction_across_line_breaks( - term, "Hello", next_page, next_rect - ) - assert match_result + with patch.object(PDFProcessor, "__init__", return_value=None): + with patch.object( + PDFProcessor, + "_is_full_text_being_redacted", + side_effect=[([("hello", 0, 0)]), [("world", 0, 0)]], + ): + match_result = PDFProcessor()._check_partial_redaction_across_line_breaks( + normalised_words_to_redact, + "hello", + page_metadata.lines[0], + page_metadata, + ) + expected_result = (0, page_metadata.lines[1], 0) + + assert match_result == expected_result -def test__pdf_processor__is_partial_redaction_across_line_breaks__no_match(): +def test__pdf_processor__check_partial_redaction_across_line_breaks__no_match(): + page_metadata = create_mock_page_metadata( + page_number=0, + lines=["Hello", "You"], + y0=[0, 20], + y1=[10, 30], + x0=[[0], [0]], + x1=[[10], [10]], + ) term = "Hello World" - next_rect = pymupdf.Rect(0, 0, 10, 20) - next_page = pymupdf.open().new_page() + normalised_words_to_redact = get_normalised_words(term) - with patch.object( - PDFProcessor, "_is_full_text_being_redacted", return_value=(False, "You") - ): - match_result = PDFProcessor()._is_partial_redaction_across_line_breaks( - term, "Hello", next_page, next_rect - ) - assert not match_result + with patch.object(PDFProcessor, "__init__", return_value=None): + with patch.object( + PDFProcessor, + "_is_full_text_being_redacted", + side_effect=[([("hello", 0, 0)]), []], + ): + result = PDFProcessor()._check_partial_redaction_across_line_breaks( + normalised_words_to_redact, + "hello", + page_metadata.lines[0], + page_metadata, + ) + + assert result is None def test__pdf_processor__examine_provisional_text_redaction(): - page = pymupdf.open().new_page() - term = "test term" + page_metadata = create_mock_page_metadata( + page_number=0, + lines=["Hello World"], + y0=[0], + y1=[10], + x0=[[0, 6]], + x1=[[10, 11]], + ) + term = "Hello" rect = pymupdf.Rect(0, 0, 10, 10) - with patch.object( - PDFProcessor, "_is_full_text_being_redacted", return_value=(True, term) - ): - result = PDFProcessor()._examine_provisional_text_redaction(page, term, rect, 0) + with patch.object(PDFProcessor, "__init__", return_value=None): + with patch.object( + PDFProcessor, + "_is_full_text_being_redacted", + return_value=([("hello world", 0, 0)]), + ): + result = PDFProcessor()._examine_provisional_text_redaction( + "Hello", + rect, + page_metadata, + ) - assert result == [(page.number, rect, term)] + assert result == [(page_metadata.page_number, rect, term)] -@patch.object(PDFProcessor, "_is_full_text_being_redacted", return_value=(False, "")) -@patch.object(PDFProcessor, "_find_next_redaction_instance", return_value=None) +@patch.object(PDFProcessor, "_is_full_text_being_redacted", return_value=[]) def test__pdf_processor__examine_provisional_text_redaction__no_matches( - mock_full_redaction, mock_find_next + mock_full_redaction, ): - page = pymupdf.open().new_page() - term = "test term" + page_metadata = create_mock_page_metadata( + page_number=0, + lines=["Hello World"], + y0=[0], + y1=[10], + x0=[[0, 6]], + x1=[[10, 11]], + ) + term = "test" rect = pymupdf.Rect(0, 0, 10, 10) with patch.object(PDFProcessor, "__init__", return_value=None): pdf_processor = PDFProcessor() pdf_processor.redaction_candidates = [[(rect, term)]] - result = pdf_processor._examine_provisional_text_redaction(page, term, rect, 0) + result = pdf_processor._examine_provisional_text_redaction( + term, + rect, + page_metadata, + ) assert result == [] -@patch.object( - PDFProcessor, "_is_full_text_being_redacted", return_value=(False, "test") -) -@patch.object( - PDFProcessor, "_is_partial_redaction_across_line_breaks", return_value=True -) -def test__pdf_processor__examine_provisional_text_redaction__line_break( - mock_full_redaction, mock_find_next -): - page = pymupdf.open().new_page() - term = "test term" +def test__pdf_processor__examine_provisional_text_redaction__line_break(): + page_metadata = create_mock_page_metadata( + page_number=0, + lines=["Hello", "World"], + y0=[0, 20], + y1=[10, 30], + x0=[[0], [0]], + x1=[[10], [10]], + ) + term = "Hello World" rect = pymupdf.Rect(0, 0, 10, 10) - next_rect = pymupdf.Rect(20, 20, 30, 30) + next_rect = pymupdf.Rect(0, 20, 10, 30) candidates_on_page = [(rect, term), (next_rect, term)] with patch.object(PDFProcessor, "__init__", return_value=None): - with patch.object( - PDFProcessor, - "_find_next_redaction_instance", - return_value=(page, next_rect, term), + with ( + patch.object( + PDFProcessor, + "_check_partial_redaction_across_line_breaks", + return_value=(0, page_metadata.lines[1], 0), + ), + patch.object( + PDFProcessor, + "_is_full_text_being_redacted", + side_effect=[[("hello", 0, 0)], [("world", 0, 0)]], + ), ): pdf_processor = PDFProcessor() pdf_processor.redaction_candidates = [candidates_on_page] result = pdf_processor._examine_provisional_text_redaction( - page, term, rect, 0 + term, + rect, + page_metadata, ) - assert result == [(page.number, rect, term), (page.number, next_rect, term)] + assert result == [ + (page_metadata.page_number, rect, term), + (page_metadata.page_number, next_rect, term), + ] -@patch("pymupdf.open", return_value=[pymupdf.open().new_page()]) @patch.object(PDFProcessor, "__init__", return_value=None) -def test__pdf_processor__examine_provisional_redactions_on_page( - mock_page_open, mock_init -): +def test__pdf_processor__examine_provisional_redactions_on_page(mock_init): + page_metadata = create_mock_page_metadata( + page_number=0, + lines=["Hello", "World"], + y0=[0, 20], + y1=[10, 30], + x0=[[0], [0]], + x1=[[10], [10]], + ) + term = "Hello" rect = pymupdf.Rect(0, 0, 10, 10) - term = "term1" candidates_on_page = [(rect, term)] expected_result = [(0, rect, term)] @@ -393,22 +498,28 @@ def test__pdf_processor__examine_provisional_redactions_on_page( ): pdf_processor = PDFProcessor() pdf_processor.file_bytes = BytesIO() # Dummy value for file_bytes + pdf_processor.redaction_candidates = [candidates_on_page] result = pdf_processor._examine_provisional_redactions_on_page( - 0, candidates_on_page + candidates_on_page, page_metadata ) assert result == expected_result -@patch("pymupdf.open", return_value=[pymupdf.open().new_page()]) @patch.object(PDFProcessor, "__init__", return_value=None) -def test__pdf_processor__examine_provisional_redactions_on_page__line_break( - mock_page_open, mock_init -): +def test__pdf_processor__examine_provisional_redactions_on_page__line_break(mock_init): + page_metadata = create_mock_page_metadata( + page_number=0, + lines=["Hello", "World"], + y0=[0, 20], + y1=[10, 30], + x0=[[0], [0]], + x1=[[10], [10]], + ) + term = "Hello World" rect = pymupdf.Rect(0, 0, 10, 10) - next_rect = pymupdf.Rect(20, 20, 30, 30) - term = "term1" + next_rect = pymupdf.Rect(0, 20, 10, 30) candidates_on_page = [(rect, term), (next_rect, term)] expected_result = [(0, rect, term), (0, next_rect, term)] @@ -416,17 +527,18 @@ def test__pdf_processor__examine_provisional_redactions_on_page__line_break( [(0, rect, term), (0, next_rect, term)], [], ] - with patch.object( - PDFProcessor, - "_examine_provisional_text_redaction", - side_effect=side_effects, - ): - pdf_processor = PDFProcessor() - pdf_processor.file_bytes = BytesIO() # Dummy value for file_bytes - - result = pdf_processor._examine_provisional_redactions_on_page( - 0, candidates_on_page - ) + with patch.object(PDFProcessor, "__init__", return_value=None): + with patch.object( + PDFProcessor, + "_examine_provisional_text_redaction", + side_effect=side_effects, + ): + pdf_processor = PDFProcessor() + pdf_processor.redaction_candidates = [candidates_on_page] + result = pdf_processor._examine_provisional_redactions_on_page( + candidates_on_page, + page_metadata, + ) assert result == expected_result @@ -493,86 +605,36 @@ def test__pdf_processor__is_full_text_being_redacted(test_case): actual_text_at_rect = test_case[0] text_to_redact = test_case[1] - expected_result = test_case[2] + truth = test_case[2] error_message = ( - f"Expected _is_full_text_being_redacted to return {expected_result} when trying " + f"Expected _is_full_text_being_redacted to return {truth} when trying " f"to redact '{text_to_redact}' within the word '{actual_text_at_rect}'" ) - page = pymupdf.open().new_page() - rect = Mock() rect.width = 100 # Dummy value rect.__add__ = Mock(return_value=rect) - with patch.object(pymupdf.Page, "get_textbox", return_value=actual_text_at_rect): - result = PDFProcessor._is_full_text_being_redacted(page, text_to_redact, rect) - assert result[0] == expected_result, error_message - - -class MockPDFPPage: - def __init__(self, number: int): - self.number = number - - def annots(self, annot_type): - return [] - - def search_for(self, text): - return [] - - -class MockPDFDocument: - def __init__(self, num_pages: int): - self.num_pages = num_pages - self.pages = [MockPDFPPage(i) for i in range(num_pages)] - - def __iter__(self): - return iter(self.pages) - - def __len__(self): - return self.num_pages - - def save(self, stream: BytesIO, **kwargs): - pass + line_to_check = PDFLineMetadata( + line_number=0, + words=tuple(get_normalised_words(actual_text_at_rect)), + y0=0, + y1=10, + x0=(0,) * len(actual_text_at_rect), + x1=tuple(range(1, len(actual_text_at_rect) + 1)), + ) + result = PDFProcessor._is_full_text_being_redacted(text_to_redact, line_to_check) -@patch.object(pymupdf, "open", return_value=MockPDFDocument(2)) -def test__apply_provisional_text_redactions__check_pool_size(mock_pymupdf_open): - with ( - patch.object( - ProcessPoolExecutor, "submit", return_value=None - ) as mock_executor_submit, - patch( - "core.redaction.file_processor.as_completed", return_value=[] - ) as mock_as_completed, - patch.object( - ProcessPoolExecutor, "__init__", return_value=None - ) as mock_executor_init, - patch.object(ProcessPoolExecutor, "__exit__", return_value=None), - patch.object( - PDFProcessor, "_examine_provisional_redactions_on_page", return_value=[] - ), - ): - PDFProcessor()._apply_provisional_text_redactions( - BytesIO(), ["redaction1", "redaction2"], n_workers=4 + if truth: + expected_result = ( + " ".join(get_normalised_words(actual_text_at_rect)), + 0, + len(get_normalised_words(text_to_redact)) - 1, ) - - mock_executor_init.assert_called_once() - _, kwargs = mock_executor_init.call_args - - assert kwargs["max_workers"] == 4 - - assert "mp_context" in kwargs - assert ( - kwargs["mp_context"].get_start_method() == "fork" - if hasattr( - kwargs["mp_context"], "get_start_method" - ) # To allow local testing on Windows where 'fork' is not available - else True - ) - - assert mock_executor_submit.call_count == 2 - mock_as_completed.assert_called_once() + assert result[0] == expected_result, error_message + else: + assert result == [] def _make_pdf_with_text(text: str) -> BytesIO: diff --git a/redactor/test/unit_test/test_redaction_manager.py b/redactor/test/unit_test/test_redaction_manager.py index b218473f..314100d9 100644 --- a/redactor/test/unit_test/test_redaction_manager.py +++ b/redactor/test/unit_test/test_redaction_manager.py @@ -201,8 +201,21 @@ def test__redaction_manager__redact( ) -def check__log_exception__azure_blob_write__single_call( - job_id, expected_exception_message, exception +@mock.patch.object(RedactionManager, "__init__", return_value=None) +def test__redaction_manager__log_exception(mock_init): + expected_exception_message = "An exception with a message" + inst = RedactionManager("job_id") + inst.job_id = "inst" + inst.env = "dev" + inst.runtime_errors = [] + some_exception = Exception(expected_exception_message) + inst.log_exception(some_exception) + LoggingUtil.log_exception.assert_called_once_with(some_exception) + assert any(expected_exception_message in x for x in inst.runtime_errors) + + +def check__save_exception_log__azure_blob_write__single_call( + job_id, expected_exception_message ): calls = AzureBlobIO.write.call_args_list assert len(calls) == 1, ( @@ -210,8 +223,8 @@ def check__log_exception__azure_blob_write__single_call( ) -def check__log_exception__azure_blob_write__data_bytes( - job_id, expected_exception_message, exception +def check__save_exception_log__azure_blob_write__data_bytes( + job_id, expected_exception_message ): calls = AzureBlobIO.write.call_args_list if calls: @@ -223,8 +236,8 @@ def check__log_exception__azure_blob_write__data_bytes( ) -def check__log_exception__azure_blob_write__container_name( - job_id, expected_exception_message, exception +def check__save_exception_log__azure_blob_write__container_name( + job_id, expected_exception_message ): calls = AzureBlobIO.write.call_args_list if calls: @@ -232,47 +245,59 @@ def check__log_exception__azure_blob_write__container_name( assert call[1].get("container_name", None) == "redactiondata" -def check__log_exception__azure_blob_write__blob_path( - job_id, expected_exception_message, exception +def check__save_exception_log__azure_blob_write__blob_path( + job_id, expected_exception_message ): calls = AzureBlobIO.write.call_args_list if calls: call = calls[0] - assert call[1].get("blob_path", None) == f"{job_id}/exception.txt" - - -def check__log_exception__azure_blob_write__logging_util_called( - job_id, expected_exception_message, exception -): - LoggingUtil.log_exception.assert_called_once_with(exception) + assert call[1].get("blob_path", None) == f"{job_id}/exceptions.txt" @pytest.mark.parametrize( "test_case", [ - check__log_exception__azure_blob_write__single_call, - check__log_exception__azure_blob_write__data_bytes, - check__log_exception__azure_blob_write__container_name, - check__log_exception__azure_blob_write__blob_path, - check__log_exception__azure_blob_write__logging_util_called, + check__save_exception_log__azure_blob_write__single_call, + check__save_exception_log__azure_blob_write__data_bytes, + check__save_exception_log__azure_blob_write__container_name, + check__save_exception_log__azure_blob_write__blob_path, ], ) @mock.patch.object(RedactionManager, "__init__", return_value=None) @mock.patch.object(AzureBlobIO, "__init__", return_value=None) @mock.patch.object(AzureBlobIO, "write", return_value=None) -def test__redaction_manager__log_exception( +def test__redaction_manager__save_exception_log( mock_blob_write, mock_blob_init, mock_init, test_case, ): - expected_exception_message = "An exception with a message" inst = RedactionManager("job_id") inst.job_id = "inst" inst.env = "dev" - some_exception = Exception(expected_exception_message) - inst.log_exception(some_exception) - test_case(inst.job_id, expected_exception_message, some_exception) + inst.runtime_errors = ["some exception A", "some exception B"] + expected_exception_message = "\n\n\n".join(inst.runtime_errors) + inst.save_exception_log() + test_case(inst.job_id, expected_exception_message) + + +@mock.patch.object(RedactionManager, "__init__", return_value=None) +@mock.patch.object(AzureBlobIO, "__init__", return_value=None) +@mock.patch.object(AzureBlobIO, "write", return_value=None) +def test__redaction_manager__save_exception_log__with_no_exception( + mock_blob_write, + mock_blob_init, + mock_init, +): + inst = RedactionManager("job_id") + inst.job_id = "inst" + inst.env = "dev" + inst.runtime_errors = [] + inst.save_exception_log() + calls = AzureBlobIO.write.call_args_list + assert len(calls) == 0, ( + f"Expected AzureBlobIO.write to be not have been called, but was called {len(calls)} times" + ) def check__try_redact__successful_output( @@ -284,6 +309,8 @@ def check__try_redact__successful_output( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, ): expected_response = { "parameters": {"some_payload", ""}, @@ -303,6 +330,8 @@ def check__try_redact__failed_output( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, ): expected_response = { "parameters": {"some_payload", ""}, @@ -322,6 +351,8 @@ def check__try_redact__validate_json_payload__called( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, ): mock_validate_json.assert_called_once_with(params) @@ -335,6 +366,8 @@ def check__try_redact__validate_json_payload__not_called( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, ): not mock_validate_json.called @@ -348,6 +381,8 @@ def check__try_redact__redact__called( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, ): mock_redact.assert_called_once_with(params) @@ -361,6 +396,8 @@ def check__try_redact__redact__not_called( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, ): not mock_redact.called @@ -374,6 +411,8 @@ def check__try_redact__log_exception__called( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, ): mock_log_exception.assert_called_once_with(exception) @@ -387,6 +426,8 @@ def check__try_redact__log_exception__not_called( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, ): not mock_log_exception.called @@ -400,6 +441,8 @@ def check__try_redact__log_exception__not_called( check__try_redact__log_exception__not_called, ], ) +@mock.patch.object(RedactionManager, "save_exception_log") +@mock.patch.object(RedactionManager, "save_logs") @mock.patch.object(RedactionManager, "send_service_bus_completion_message") @mock.patch.object(RedactionManager, "__init__", return_value=None) @mock.patch.object(RedactionManager, "validate_json_payload") @@ -411,6 +454,8 @@ def test__try_redact__successful( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, test_case, ): inst = RedactionManager("job_id") @@ -427,6 +472,8 @@ def test__try_redact__successful( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, ) @@ -439,6 +486,8 @@ def test__try_redact__successful( check__try_redact__log_exception__called, ], ) +@mock.patch.object(RedactionManager, "save_exception_log") +@mock.patch.object(RedactionManager, "save_logs") @mock.patch.object(RedactionManager, "send_service_bus_completion_message") @mock.patch.object(RedactionManager, "__init__", return_value=None) @mock.patch.object(RedactionManager, "validate_json_payload") @@ -450,6 +499,8 @@ def test__try_redact__param_validation_failure( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, test_case, ): exception = Exception("Some exception") @@ -468,6 +519,8 @@ def test__try_redact__param_validation_failure( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, ) @@ -480,6 +533,8 @@ def test__try_redact__param_validation_failure( check__try_redact__log_exception__called, ], ) +@mock.patch.object(RedactionManager, "save_exception_log") +@mock.patch.object(RedactionManager, "save_logs") @mock.patch.object(RedactionManager, "send_service_bus_completion_message") @mock.patch.object(RedactionManager, "__init__", return_value=None) @mock.patch.object(RedactionManager, "validate_json_payload") @@ -491,6 +546,8 @@ def test__try_redact__redaction_failure( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, test_case, ): exception = Exception("Some exception") @@ -509,6 +566,129 @@ def test__try_redact__redaction_failure( mock_validate_json, mock_init, mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, + ) + + +@mock.patch.object( + RedactionManager, + "save_exception_log", + side_effect=Exception("save_exception_log exception"), +) +@mock.patch.object( + RedactionManager, "save_logs", side_effect=Exception("save_logs exception") +) +@mock.patch.object( + RedactionManager, + "send_service_bus_completion_message", + side_effect=Exception("send_service_bus_completion_message exception"), +) +@mock.patch.object(RedactionManager, "__init__", return_value=None) +@mock.patch.object(RedactionManager, "validate_json_payload") +@mock.patch.object(RedactionManager, "redact") +@mock.patch.object(RedactionManager, "log_exception") +def test__try_redact__success_with_non_fatal_error( + mock_log_exception, + mock_redact, + mock_validate_json, + mock_init, + mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, +): + """ + - Given the redaction process is successful + - When there are non-fatal errors + - Then the redaction process should succeed with any non-fatal errors reported as a warning to the caller + """ + inst = RedactionManager("job_id") + inst.job_id = "test__try_redact__non_fatal_error" + inst.env = "dev" + params = {"some_payload", ""} + response = inst.try_redact(params) + expected_response = { + "parameters": params, + "id": inst.job_id, + "status": "SUCCESS", + "message": ( + "Redaction process completed successfully, but had some non-fatal errors:\n" + "Failed to submit a service bus message with the following error: send_service_bus_completion_message exception\n" + "Failed to write logs with the following error: save_logs exception\nFailed to write an exception log with the " + "following error: save_exception_log exception" + ), + } + assert response == expected_response + + +@mock.patch.object( + RedactionManager, + "save_exception_log", + side_effect=Exception("save_exception_log exception"), +) +@mock.patch.object( + RedactionManager, "save_logs", side_effect=Exception("save_logs exception") +) +@mock.patch.object( + RedactionManager, + "send_service_bus_completion_message", + side_effect=Exception("send_service_bus_completion_message exception"), +) +@mock.patch.object(RedactionManager, "__init__", return_value=None) +@mock.patch.object(RedactionManager, "validate_json_payload") +@mock.patch.object(RedactionManager, "redact") +@mock.patch.object(RedactionManager, "log_exception") +def test__try_redact__fail_with_extra_non_fatal_error( + mock_log_exception, + mock_redact, + mock_validate_json, + mock_init, + mock_send_service_bus_message, + mock_save_logs, + mock_save_exception, +): + """ + - Given the redaction process is not successful + - When there are also non-fatal errors + - Then the redaction process should fail with all fatal and non-fatal errors reported to the caller + """ + exception = Exception("Some exception") + inst = RedactionManager("job_id") + inst.job_id = "test__try_redact__non_fatal_error" + inst.env = "dev" + mock_redact.side_effect = exception + params = {"some_payload", ""} + response = inst.try_redact(params) + expected_response = { + "parameters": params, + "id": inst.job_id, + "status": "FAIL", + "message": ( + f"Redaction process failed with the following error: {exception}" + "\nAdditionally, the following non-fatal errors occurred:\n" + "Failed to submit a service bus message with the following error: send_service_bus_completion_message exception\n" + "Failed to write logs with the following error: save_logs exception\nFailed to write an exception log with the " + "following error: save_exception_log exception" + ), + } + assert response == expected_response + + +@mock.patch.object(RedactionManager, "__init__", return_value=None) +@mock.patch.object(AzureBlobIO, "__init__", return_value=None) +@mock.patch.object(AzureBlobIO, "write", return_value=None) +@mock.patch.object(LoggingUtil, "get_log_bytes", return_value=b"xyz") +def test__redaction_manager__save_logs( + mock_get_log_bytes, mock_blob_write, mock_blob_init, mock_init +): + inst = RedactionManager() + inst.job_id = "test__redaction_manager__save_logs" + inst.env = "dev" + inst.save_logs() + AzureBlobIO.write.assert_called_once_with( + data_bytes=b"xyz", + container_name="redactiondata", + blob_path=f"{inst.job_id}/log.txt", ) diff --git a/redactor/test/unit_test/util/test_azure_vision_util.py b/redactor/test/unit_test/util/test_azure_vision_util.py index cb5f7e71..cf61f056 100644 --- a/redactor/test/unit_test/util/test_azure_vision_util.py +++ b/redactor/test/unit_test/util/test_azure_vision_util.py @@ -34,7 +34,7 @@ def test__azure_vision_util__detect_faces(mock_bytes_io): mock_analyze.return_value = MockImageAnalysisClientResult(people_list) result = azure_vision_util.detect_faces(image, confidence_threshold=0.5) - assert result == ((10, 20, 30, 40),) + assert result == ((10, 20, 40, 60),) # Verify caching assert azure_vision_util._IMAGE_FACE_CACHE == [ @@ -42,11 +42,11 @@ def test__azure_vision_util__detect_faces(mock_bytes_io): "image": image, "faces": ( { - "box": (10, 20, 30, 40), + "box": (10, 20, 40, 60), "confidence": 0.9, }, { - "box": (10, 20, 30, 40), + "box": (10, 20, 40, 60), "confidence": 0.4, }, ), @@ -58,14 +58,14 @@ def test__azure_vision_util__detect_faces(mock_bytes_io): def test__azure_vision_util__detect_faces__use_cached_result(mock_bytes_io): azure_vision_util = AzureVisionUtil() image = Mock() - image_rects = ((10, 20, 30, 40),) + image_rects = ((10, 20, 40, 60),) azure_vision_util._IMAGE_FACE_CACHE = [ { "image": image, "faces": ( { - "box": (10, 20, 30, 40), + "box": (10, 20, 40, 60), "confidence": 0.9, }, ), diff --git a/redactor/test/unit_test/util/test_llm_util.py b/redactor/test/unit_test/util/test_llm_util.py index 33d5da20..70e7b8d7 100644 --- a/redactor/test/unit_test/util/test_llm_util.py +++ b/redactor/test/unit_test/util/test_llm_util.py @@ -13,6 +13,7 @@ ) from core.util.llm_util import LLMUtil, handle_last_retry_error from core.util.logging_util import LoggingUtil +from openai import RateLimitError class MockLLMChatCompletion: @@ -37,6 +38,12 @@ def __init__(self, prompt_tokens, completion_tokens): self.completion_tokens = completion_tokens +class MockOpenAIAPIResponse: + request = None + status_code = None + headers = dict() + + TOKEN_RATE_LIMIT = 1000000 REQUEST_RATE_LIMIT = 1000 @@ -334,6 +341,8 @@ def test__llm_util___analyse_text_chunk__timeout_on_request_semaphore( llm_util.request_semaphore.acquire.return_value = False + user_prompt = "" + result = llm_util._analyse_text_chunk(system_prompt="system prompt", user_prompt="") assert result is None # Timeout occurred, so None is returned @@ -344,8 +353,15 @@ def test__llm_util___analyse_text_chunk__timeout_on_request_semaphore( llm_util.token_semaphore.acquire.assert_not_called() llm_util.token_semaphore.release.assert_not_called() - LoggingUtil.log_exception.assert_called_with( - "Timeout while waiting for request semaphore to be available." + logging_util_calls = LoggingUtil.log_exception.call_args_list + logging_util_calls_as_string = [str(x) for x in logging_util_calls] + timeout_error = str( + TimeoutError( + f"(chunk ID {hash(user_prompt)}) Timeout while waiting for request semaphore to be available." + ) + ) + assert any(timeout_error in x for x in logging_util_calls_as_string), ( + f"Expected {timeout_error} to be called. Called list was {logging_util_calls}" ) @@ -360,15 +376,21 @@ def test__llm_util___analyse_text_chunk__exception(mock_invoke_chain): llm_util._analyse_text_chunk.retry.stop = stop_after_attempt(1) - result = llm_util._analyse_text_chunk(system_prompt="system prompt", user_prompt="") - - assert result is None # Exception occurred, so None is returned - LoggingUtil.log_exception.assert_called_with( - "An error occurred while processing the chunk: Some LLM invocation error" - ) + with pytest.raises(Exception): + llm_util._analyse_text_chunk(system_prompt="system prompt", user_prompt="") + LoggingUtil.log_exception.assert_called_with( + "An error occurred while processing the chunk: Some LLM invocation error" + ) -def test__llm_util___analyse_text_chunk__retry_on_exception(): +@pytest.mark.parametrize( + "exception", + [ + RateLimitError("message", response=MockOpenAIAPIResponse(), body="body"), + TimeoutError("Some LLM invocation error"), + ], +) +def test__llm_util___analyse_text_chunk__retry_on_exception(exception): mock_chat_completion = create_mock_chat_completion() redaction_strings = mock_chat_completion.choices[0].message.parsed.redaction_strings @@ -379,7 +401,7 @@ def test__llm_util___analyse_text_chunk__retry_on_exception(): LLMUtil, "invoke_chain", side_effect=[ - Exception("Some LLM invocation error"), + exception, create_mock_chat_completion(["string A", "string B"]), ], ): diff --git a/redactor/test/unit_test/util/test_logging_util.py b/redactor/test/unit_test/util/test_logging_util.py index 15b874da..21452a7e 100644 --- a/redactor/test/unit_test/util/test_logging_util.py +++ b/redactor/test/unit_test/util/test_logging_util.py @@ -6,6 +6,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from core.util.logging_util import LoggingUtil, Singleton, log_to_appins +import traceback @pytest.mark.nologgerfixt @@ -94,6 +95,11 @@ def test_logging_util__log_info(mock_logger_info): call(f"some_guid: {info_message}"), ] ) + assert ( + "INFO: some_guid: Logging initialised for redactor_logs.\n" + in logging_util_inst.raw_logs + ) + assert f"INFO: some_guid: {info_message}" @pytest.mark.nologgerfixt @@ -102,10 +108,35 @@ def test_logging_util__log_exception(mock_logger_exception): logging_util_inst = get_new_logging_instance() logging_util_inst.logger = getLogger() - error_message = "some_exception_message" + error_message = Exception("some_exception_message") logging_util_inst.log_exception(error_message) - Logger.exception.assert_called_once_with(f"some_guid: {error_message}") + stack_trace = "".join( + traceback.TracebackException.from_exception(error_message).format() + ) + base_message = f"some_guid: {error_message}\n\n The Exception stack trace is below:\n\n{stack_trace}\n" + + Logger.exception.assert_called_once_with(base_message) + assert f"ERROR: {base_message}\n" in logging_util_inst.raw_logs + + +@pytest.mark.nologgerfixt +@patch.object(Logger, "exception", return_value=None) +def test_logging_util__log_exception_with_message(mock_logger_exception): + logging_util_inst = get_new_logging_instance() + logging_util_inst.logger = getLogger() + + error_message = Exception("some_exception_message") + message = "some preceding message" + logging_util_inst.log_exception_with_message(message, error_message) + + stack_trace = "".join( + traceback.TracebackException.from_exception(error_message).format() + ) + base_message = f"some_guid: {message}: {error_message}\n\n The Exception stack trace is below:\n\n{stack_trace}\n" + + Logger.exception.assert_called_once_with(base_message) + assert f"ERROR: {base_message}\n" in logging_util_inst.raw_logs @pytest.mark.nologgerfixt @@ -148,7 +179,7 @@ def my_function_with_args(a, b, c): @pytest.mark.nologgerfixt @patch.object(LoggingUtil, "__init__", return_value=None) @patch.object(LoggingUtil, "log_info", return_value=None) -@patch.object(LoggingUtil, "log_exception", return_value=None) +@patch.object(LoggingUtil, "log_exception_with_message", return_value=None) def test_log_to_appins__with_exception(mock_init, mock_log_info, mock_log_exception): exception = Exception("Some exception") @@ -166,8 +197,8 @@ def my_function_with_exception(): f"Function my_function_with_exception called with args: " f"{', '.join(args_repr + kwargs_repr)}" ) - LoggingUtil.log_exception.assert_called_once_with( - f"Exception raised in function my_function_with_exception: {exception}" + LoggingUtil.log_exception_with_message.assert_called_once_with( + "Exception raised in function my_function_with_exception:", exception )