From 9b008de99d4ea97ffcd1231b48b64e81266f82b3 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Mon, 12 Jan 2026 14:30:09 +0100 Subject: [PATCH] Properly encode binary objects --- src/parxy_core/drivers/landingai.py | 5 +- src/parxy_core/drivers/llamaparse.py | 3 +- src/parxy_core/drivers/llmwhisperer.py | 3 +- src/parxy_core/drivers/pdfact.py | 3 +- src/parxy_core/drivers/pymupdf.py | 3 +- src/parxy_core/drivers/unstructured_local.py | 3 +- src/parxy_core/utils/__init__.py | 5 ++ src/parxy_core/utils/json_helpers.py | 66 ++++++++++++++++++++ 8 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 src/parxy_core/utils/__init__.py create mode 100644 src/parxy_core/utils/json_helpers.py diff --git a/src/parxy_core/drivers/landingai.py b/src/parxy_core/drivers/landingai.py index 270cd24..8655186 100644 --- a/src/parxy_core/drivers/landingai.py +++ b/src/parxy_core/drivers/landingai.py @@ -16,6 +16,7 @@ from parxy_core.drivers import Driver from parxy_core.models import Document, Metadata, TextBlock, Page, BoundingBox +from parxy_core.utils import safe_json_dumps class LandingAIADEDriver(Driver): @@ -47,7 +48,9 @@ def _handle( filename, stream = self.handle_file_input(file) with self._trace_parse(filename, stream, **kwargs) as span: parse_response = self.__client.parse(document=Path(file), **kwargs) - span.set_attribute('output.document', parse_response.model_dump_json()) + span.set_attribute( + 'output.document', safe_json_dumps(parse_response.model_dump()) + ) except AuthenticationError as aex: raise AuthenticationException( diff --git a/src/parxy_core/drivers/llamaparse.py b/src/parxy_core/drivers/llamaparse.py index f9510c0..26ee7ac 100644 --- a/src/parxy_core/drivers/llamaparse.py +++ b/src/parxy_core/drivers/llamaparse.py @@ -19,6 +19,7 @@ from parxy_core.drivers import Driver from parxy_core.models import Document, Page, BoundingBox, TextBlock, HierarchyLevel +from parxy_core.utils import safe_json_dumps from parxy_core.exceptions import ( ParsingException, AuthenticationException, @@ -206,7 +207,7 @@ def _handle( extra_info = {'file_name': filename if len(filename) > 0 else 'default'} with self._trace_parse(filename, stream, **kwargs) as span: res = self.__client.parse(stream, extra_info=extra_info) - span.set_attribute('output.document', res.model_dump_json()) + span.set_attribute('output.document', safe_json_dumps(res.model_dump())) except FileNotFoundError as fex: raise FileNotFoundException(fex, self.__class__) from fex except Exception as ex: diff --git a/src/parxy_core/drivers/llmwhisperer.py b/src/parxy_core/drivers/llmwhisperer.py index adc96df..fed5d41 100644 --- a/src/parxy_core/drivers/llmwhisperer.py +++ b/src/parxy_core/drivers/llmwhisperer.py @@ -23,6 +23,7 @@ AuthenticationException, ) from parxy_core.models import Document, Page +from parxy_core.utils import safe_json_dumps _credits_per_parsing_mode_per_page = { @@ -145,7 +146,7 @@ def _handle( # wait_timeout=kwargs.get("wait_timeout", 200), # **kwargs, ) - span.set_attribute('output.document', json.dumps(res)) + span.set_attribute('output.document', safe_json_dumps(res)) except FileNotFoundError as fex: raise FileNotFoundException(fex, self.SERVICE_NAME) from fex except LLMWhispererClientException as wex: diff --git a/src/parxy_core/drivers/pdfact.py b/src/parxy_core/drivers/pdfact.py index 7887653..30e8f2d 100644 --- a/src/parxy_core/drivers/pdfact.py +++ b/src/parxy_core/drivers/pdfact.py @@ -8,6 +8,7 @@ from urllib.parse import urljoin from parxy_core.drivers import Driver +from parxy_core.utils import safe_json_dumps from parxy_core.models import ( BoundingBox, Style, @@ -70,7 +71,7 @@ def _handle( filename, stream = self.handle_file_input(file) with self._trace_parse(filename, stream, **kwargs) as span: res = self._request(file, level) - span.set_attribute('output.document', json.dumps(res)) + span.set_attribute('output.document', safe_json_dumps(res)) return pdfact_to_parxy(doc=res, level=level, filename=file) diff --git a/src/parxy_core/drivers/pymupdf.py b/src/parxy_core/drivers/pymupdf.py index a25f0e2..8055b49 100644 --- a/src/parxy_core/drivers/pymupdf.py +++ b/src/parxy_core/drivers/pymupdf.py @@ -7,6 +7,7 @@ from parxy_core.drivers import Driver from parxy_core.exceptions import FileNotFoundException +from parxy_core.utils import safe_json_dumps from parxy_core.models import ( BoundingBox, Style, @@ -52,7 +53,7 @@ def _handle( with self._trace_parse(filename, stream, **kwargs) as span: doc = pymupdf.open(stream=stream) doc_pages = [page.get_text(page_key_to_extract) for page in doc.pages()] - span.set_attribute('output.document', json.dumps(doc_pages)) + span.set_attribute('output.document', safe_json_dumps(doc_pages)) span.set_attribute('output.pages', len(doc_pages)) res = pymupdf_to_parxy(doc=doc, pages=doc_pages, level=level) diff --git a/src/parxy_core/drivers/unstructured_local.py b/src/parxy_core/drivers/unstructured_local.py index 6f78f06..6315419 100644 --- a/src/parxy_core/drivers/unstructured_local.py +++ b/src/parxy_core/drivers/unstructured_local.py @@ -16,6 +16,7 @@ from parxy_core.drivers import Driver from parxy_core.exceptions import FileNotFoundException, ParsingException from parxy_core.models import Document, TextBlock, BoundingBox, Page, HierarchyLevel +from parxy_core.utils import safe_json_dumps from parxy_core.models.config import UnstructuredLocalConfig from parxy_core.tracing.utils import trace_with_output @@ -85,7 +86,7 @@ def _handle( file=io.BytesIO(stream), languages=['eng'], **kwargs ) span.set_attribute( - 'output.document', json.dumps([el.to_dict() for el in res]) + 'output.document', safe_json_dumps([el.to_dict() for el in res]) ) except FileNotFoundError as fex: raise FileNotFoundException(fex, self.__class__) from fex diff --git a/src/parxy_core/utils/__init__.py b/src/parxy_core/utils/__init__.py new file mode 100644 index 0000000..e7fd25e --- /dev/null +++ b/src/parxy_core/utils/__init__.py @@ -0,0 +1,5 @@ +"""Utility functions for parxy_core.""" + +from parxy_core.utils.json_helpers import safe_json_dumps, BytesJSONEncoder + +__all__ = ['safe_json_dumps', 'BytesJSONEncoder'] diff --git a/src/parxy_core/utils/json_helpers.py b/src/parxy_core/utils/json_helpers.py new file mode 100644 index 0000000..11b7371 --- /dev/null +++ b/src/parxy_core/utils/json_helpers.py @@ -0,0 +1,66 @@ +"""JSON serialization helpers for handling binary content.""" + +import base64 +import json +from typing import Any + + +class BytesJSONEncoder(json.JSONEncoder): + """Custom JSON encoder that handles bytes by encoding to base64. + + This encoder converts bytes objects to base64-encoded strings prefixed + with 'base64:' to indicate the encoding format. This allows binary data + like images to be safely serialized to JSON. + + Example + ------- + >>> data = {'image': b'\\x89PNG...', 'text': 'hello'} + >>> json.dumps(data, cls=BytesJSONEncoder) + '{"image": "base64:iVBORw0KG...", "text": "hello"}' + """ + + def default(self, obj: Any) -> Any: + """Convert bytes to base64-encoded string. + + Parameters + ---------- + obj : Any + Object to encode. + + Returns + ------- + Any + Encoded object. Bytes are converted to base64 strings, + other types are handled by the parent encoder. + """ + if isinstance(obj, bytes): + return f'base64:{base64.b64encode(obj).decode("ascii")}' + return super().default(obj) + + +def safe_json_dumps(obj: Any, **kwargs) -> str: + """Safely serialize object to JSON string, handling bytes objects. + + This is a convenience wrapper around json.dumps that uses BytesJSONEncoder + to handle binary data. Any bytes objects in the data structure will be + automatically converted to base64-encoded strings. + + Parameters + ---------- + obj : Any + Object to serialize to JSON. + **kwargs + Additional arguments passed to json.dumps (e.g., indent, sort_keys). + + Returns + ------- + str + JSON string representation of the object. + + Example + ------- + >>> data = {'image': b'\\x89PNG...', 'text': 'hello'} + >>> safe_json_dumps(data, indent=2) + '{\\n "image": "base64:iVBORw0KG...",\\n "text": "hello"\\n}' + """ + return json.dumps(obj, cls=BytesJSONEncoder, **kwargs)