Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/parxy_core/drivers/landingai.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from parxy_core.drivers import Driver
from parxy_core.models import Document, Metadata, TextBlock, Page, BoundingBox
from parxy_core.utils import safe_json_dumps


class LandingAIADEDriver(Driver):
Expand Down Expand Up @@ -47,7 +48,9 @@ def _handle(
filename, stream = self.handle_file_input(file)
with self._trace_parse(filename, stream, **kwargs) as span:
parse_response = self.__client.parse(document=Path(file), **kwargs)
span.set_attribute('output.document', parse_response.model_dump_json())
span.set_attribute(
'output.document', safe_json_dumps(parse_response.model_dump())
)

except AuthenticationError as aex:
raise AuthenticationException(
Expand Down
3 changes: 2 additions & 1 deletion src/parxy_core/drivers/llamaparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from parxy_core.drivers import Driver
from parxy_core.models import Document, Page, BoundingBox, TextBlock, HierarchyLevel
from parxy_core.utils import safe_json_dumps
from parxy_core.exceptions import (
ParsingException,
AuthenticationException,
Expand Down Expand Up @@ -206,7 +207,7 @@ def _handle(
extra_info = {'file_name': filename if len(filename) > 0 else 'default'}
with self._trace_parse(filename, stream, **kwargs) as span:
res = self.__client.parse(stream, extra_info=extra_info)
span.set_attribute('output.document', res.model_dump_json())
span.set_attribute('output.document', safe_json_dumps(res.model_dump()))
except FileNotFoundError as fex:
raise FileNotFoundException(fex, self.__class__) from fex
except Exception as ex:
Expand Down
3 changes: 2 additions & 1 deletion src/parxy_core/drivers/llmwhisperer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
AuthenticationException,
)
from parxy_core.models import Document, Page
from parxy_core.utils import safe_json_dumps


_credits_per_parsing_mode_per_page = {
Expand Down Expand Up @@ -145,7 +146,7 @@ def _handle(
# wait_timeout=kwargs.get("wait_timeout", 200),
# **kwargs,
)
span.set_attribute('output.document', json.dumps(res))
span.set_attribute('output.document', safe_json_dumps(res))
except FileNotFoundError as fex:
raise FileNotFoundException(fex, self.SERVICE_NAME) from fex
except LLMWhispererClientException as wex:
Expand Down
3 changes: 2 additions & 1 deletion src/parxy_core/drivers/pdfact.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from urllib.parse import urljoin

from parxy_core.drivers import Driver
from parxy_core.utils import safe_json_dumps
from parxy_core.models import (
BoundingBox,
Style,
Expand Down Expand Up @@ -70,7 +71,7 @@ def _handle(
filename, stream = self.handle_file_input(file)
with self._trace_parse(filename, stream, **kwargs) as span:
res = self._request(file, level)
span.set_attribute('output.document', json.dumps(res))
span.set_attribute('output.document', safe_json_dumps(res))

return pdfact_to_parxy(doc=res, level=level, filename=file)

Expand Down
3 changes: 2 additions & 1 deletion src/parxy_core/drivers/pymupdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from parxy_core.drivers import Driver
from parxy_core.exceptions import FileNotFoundException
from parxy_core.utils import safe_json_dumps
from parxy_core.models import (
BoundingBox,
Style,
Expand Down Expand Up @@ -52,7 +53,7 @@ def _handle(
with self._trace_parse(filename, stream, **kwargs) as span:
doc = pymupdf.open(stream=stream)
doc_pages = [page.get_text(page_key_to_extract) for page in doc.pages()]
span.set_attribute('output.document', json.dumps(doc_pages))
span.set_attribute('output.document', safe_json_dumps(doc_pages))
span.set_attribute('output.pages', len(doc_pages))

res = pymupdf_to_parxy(doc=doc, pages=doc_pages, level=level)
Expand Down
3 changes: 2 additions & 1 deletion src/parxy_core/drivers/unstructured_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from parxy_core.drivers import Driver
from parxy_core.exceptions import FileNotFoundException, ParsingException
from parxy_core.models import Document, TextBlock, BoundingBox, Page, HierarchyLevel
from parxy_core.utils import safe_json_dumps
from parxy_core.models.config import UnstructuredLocalConfig
from parxy_core.tracing.utils import trace_with_output

Expand Down Expand Up @@ -85,7 +86,7 @@ def _handle(
file=io.BytesIO(stream), languages=['eng'], **kwargs
)
span.set_attribute(
'output.document', json.dumps([el.to_dict() for el in res])
'output.document', safe_json_dumps([el.to_dict() for el in res])
)
except FileNotFoundError as fex:
raise FileNotFoundException(fex, self.__class__) from fex
Expand Down
5 changes: 5 additions & 0 deletions src/parxy_core/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Utility functions for parxy_core."""

from parxy_core.utils.json_helpers import safe_json_dumps, BytesJSONEncoder

# Public names of the package: the JSON helpers are re-exported here so that
# callers can write ``from parxy_core.utils import safe_json_dumps``.
__all__ = ['safe_json_dumps', 'BytesJSONEncoder']
66 changes: 66 additions & 0 deletions src/parxy_core/utils/json_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""JSON serialization helpers for handling binary content."""

import base64
import json
from typing import Any


class BytesJSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes binary data as base64 strings.

    ``bytes``, ``bytearray`` and ``memoryview`` values are emitted as strings
    of the form ``'base64:<payload>'``, where the prefix marks the encoding
    used. This allows binary data like images to be safely serialized to
    JSON.

    A ``default`` callable supplied by the caller (for example through
    ``json.dumps(..., cls=BytesJSONEncoder, default=...)``) is used as a
    fallback for every other unsupported type; it does not replace the
    binary handling.

    Example
    -------
    >>> json.dumps({'image': b'hi', 'text': 'hello'}, cls=BytesJSONEncoder)
    '{"image": "base64:aGk=", "text": "hello"}'
    """

    def __init__(self, *, default: Any = None, **kwargs: Any) -> None:
        """Create the encoder.

        Parameters
        ----------
        default : callable, optional
            Fallback invoked for objects that are neither natively
            serializable nor binary data.
        **kwargs
            Remaining keyword arguments accepted by ``json.JSONEncoder``.
        """
        # json.JSONEncoder.__init__ assigns a caller-supplied ``default`` to
        # ``self.default``, which would shadow the method below and silently
        # disable the binary handling. Keep it aside and chain to it instead.
        super().__init__(**kwargs)
        self._fallback_default = default

    def default(self, obj: Any) -> Any:
        """Convert binary data to a base64-encoded string.

        Parameters
        ----------
        obj : Any
            Object the standard encoder could not serialize.

        Returns
        -------
        Any
            ``'base64:<payload>'`` for binary data; otherwise the result of
            the caller-supplied fallback, if one was given.

        Raises
        ------
        TypeError
            If the object is not binary data and no fallback was supplied
            (raised by the parent encoder).
        """
        if isinstance(obj, (bytes, bytearray, memoryview)):
            # bytes() normalizes bytearray/memoryview input before encoding.
            payload = base64.b64encode(bytes(obj)).decode('ascii')
            return f'base64:{payload}'
        if self._fallback_default is not None:
            return self._fallback_default(obj)
        return super().default(obj)


def safe_json_dumps(obj: Any, **kwargs) -> str:
    """Serialize an object to a JSON string, handling bytes objects.

    This is a convenience wrapper around ``json.dumps`` that uses
    ``BytesJSONEncoder`` unless the caller selects another encoder class.
    With the default encoder, any bytes objects in the data structure are
    converted to base64-encoded strings prefixed with ``'base64:'``.

    Parameters
    ----------
    obj : Any
        Object to serialize to JSON.
    **kwargs
        Additional arguments passed to ``json.dumps`` (e.g., indent,
        sort_keys). A ``cls`` given here replaces ``BytesJSONEncoder``; bytes
        handling is then up to that encoder.

    Returns
    -------
    str
        JSON string representation of the object.

    Raises
    ------
    TypeError
        If the object contains a value the selected encoder cannot
        serialize.

    Example
    -------
    >>> safe_json_dumps({'image': b'hi', 'text': 'hello'})
    '{"image": "base64:aGk=", "text": "hello"}'
    """
    # Only fill in the encoder when the caller did not choose one: passing
    # ``cls`` both explicitly and through **kwargs would raise
    # "got multiple values for keyword argument 'cls'".
    if 'cls' not in kwargs:
        kwargs['cls'] = BytesJSONEncoder
    return json.dumps(obj, **kwargs)