From 85b28183935fabe713ed9408e5c927aeb023f4d4 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Fri, 3 Oct 2025 12:53:04 +1300 Subject: [PATCH 01/32] Update evo-objects dependencies --- packages/evo-objects/pyproject.toml | 6 +++--- uv.lock | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/evo-objects/pyproject.toml b/packages/evo-objects/pyproject.toml index 335223de..2b41d335 100644 --- a/packages/evo-objects/pyproject.toml +++ b/packages/evo-objects/pyproject.toml @@ -10,7 +10,7 @@ authors = [ ] dependencies = [ - "evo-sdk-common>=0.1.0", + "evo-sdk-common[jmespath]>=0.5.4", "pydantic>=2,<3", ] @@ -21,8 +21,8 @@ Homepage = "https://www.seequent.com/" Documentation = "https://developer.seequent.com/" [project.optional-dependencies] -aiohttp = ["evo-sdk-common[aiohttp]>=0.1.0"] -notebooks = ["evo-sdk-common[notebooks]>=0.1.0"] +aiohttp = ["evo-sdk-common[aiohttp]"] +notebooks = ["evo-sdk-common[notebooks]"] utils = ["pyarrow", "pandas"] [dependency-groups] diff --git a/uv.lock b/uv.lock index 9a573231..e4d08791 100644 --- a/uv.lock +++ b/uv.lock @@ -719,7 +719,7 @@ name = "evo-objects" version = "0.2.3" source = { editable = "packages/evo-objects" } dependencies = [ - { name = "evo-sdk-common" }, + { name = "evo-sdk-common", extra = ["jmespath"] }, { name = "pydantic" }, ] @@ -758,8 +758,8 @@ test = [ [package.metadata] requires-dist = [ - { name = "evo-sdk-common", editable = "packages/evo-sdk-common" }, { name = "evo-sdk-common", extras = ["aiohttp"], marker = "extra == 'aiohttp'", editable = "packages/evo-sdk-common" }, + { name = "evo-sdk-common", extras = ["jmespath"], editable = "packages/evo-sdk-common" }, { name = "evo-sdk-common", extras = ["notebooks"], marker = "extra == 'notebooks'", editable = "packages/evo-sdk-common" }, { name = "pandas", marker = "extra == 'utils'" }, { name = "pyarrow", marker = "extra == 'utils'" }, From ab90ae9ca3a1d3e2248a4bc11bccb1bb5c2332b0 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Fri, 3 Oct 2025 14:05:10 +1300 Subject: [PATCH 02/32] Refactor parsing object API responses --- .../evo-objects/src/evo/objects/client.py | 132 ++----------- packages/evo-objects/src/evo/objects/parse.py | 187 ++++++++++++++++++ 2 files changed, 198 insertions(+), 121 deletions(-) create mode 100644 packages/evo-objects/src/evo/objects/parse.py diff --git a/packages/evo-objects/src/evo/objects/client.py b/packages/evo-objects/src/evo/objects/client.py index 93152122..05b20313 100644 --- a/packages/evo-objects/src/evo/objects/client.py +++ b/packages/evo-objects/src/evo/objects/client.py @@ -12,7 +12,6 @@ from __future__ import annotations from collections.abc import AsyncIterator, Iterator, Sequence -from pathlib import PurePosixPath from uuid import UUID from evo import logging @@ -20,18 +19,14 @@ from evo.common.data import EmptyResponse, Environment, OrderByOperatorEnum from evo.common.io.exceptions import DataNotFoundError from evo.common.utils import get_service_health, parse_order_by -from evo.workspaces import ServiceUser +from . 
import parse from .data import ObjectMetadata, ObjectOrderByEnum, ObjectSchema, ObjectVersion, OrgObjectMetadata, Stage from .endpoints import MetadataApi, ObjectsApi, StagesApi from .endpoints.models import ( GeoscienceObject, - GeoscienceObjectVersion, GetObjectResponse, - ListedObject, MetadataUpdateBody, - OrgListedObject, - PostObjectResponse, UpdateGeoscienceObject, ) from .exceptions import ObjectUUIDError @@ -46,23 +41,6 @@ ] -def _version_from_listed_version(model: GeoscienceObjectVersion) -> ObjectVersion: - """Create an ObjectVersion instance from a generated ListedObject model. - - :param model: The model to create the ObjectVersion instance from. - - :return: An ObjectVersion instance. - """ - created_by = None if model.created_by is None else ServiceUser.from_model(model.created_by) # type: ignore - stage = None if model.stage is None else Stage.from_model(model.stage) - return ObjectVersion( - version_id=model.version_id, - created_at=model.created_at, - created_by=created_by, - stage=stage, - ) - - class DownloadedObject: """A downloaded geoscience object.""" @@ -128,79 +106,6 @@ async def get_service_health(self, check_type: HealthCheckType = HealthCheckType """ return await get_service_health(self._connector, "geoscience-object", check_type=check_type) - def _metadata_from_listed_object(self, model: ListedObject) -> ObjectMetadata: - """Create an ObjectMetadata instance from a generated ListedObject model. - - :param model: The model to create the ObjectMetadata instance from. - - :return: An ObjectMetadata instance. - """ - created_by = None if model.created_by is None else ServiceUser.from_model(model.created_by) - modified_by = None if model.modified_by is None else ServiceUser.from_model(model.modified_by) - stage = None if model.stage is None else Stage.from_model(model.stage) - return ObjectMetadata( - environment=self._environment, - id=model.object_id, - name=model.name, - created_at=model.created_at, - created_by=created_by, - modified_at=model.modified_at, - modified_by=modified_by, - parent=model.path.rstrip("/"), - schema_id=ObjectSchema.from_id(model.schema_), - version_id=model.version_id, - stage=stage, - ) - - def _metadata_from_org_listed_object(self, model: OrgListedObject) -> OrgObjectMetadata: - """Create an OrgObjectMetadata instance from a generated OrgListedObject model. - - :param model: The model to create the OrgObjectMetadata instance from. - - :return: An OrgObjectMetadata instance. - """ - created_by = None if model.created_by is None else ServiceUser.from_model(model.created_by) - modified_by = None if model.modified_by is None else ServiceUser.from_model(model.modified_by) - stage = None if model.stage is None else Stage.from_model(model.stage) - return OrgObjectMetadata( - environment=self._environment, - workspace_id=model.workspace_id, - workspace_name=model.workspace_name, - id=model.object_id, - name=model.name, - created_at=model.created_at, - created_by=created_by, - modified_at=model.modified_at, - modified_by=modified_by, - schema_id=ObjectSchema.from_id(model.schema_), - stage=stage, - ) - - def _metadata_from_endpoint_model(self, model: GetObjectResponse | PostObjectResponse) -> ObjectMetadata: - """Create an ObjectMetadata instance from a generated GetObjectResponse or PostObjectResponse model. - - :param model: The model to create the ObjectMetadata instance from. - - :return: An ObjectMetadata instance. 
- """ - object_path = PurePosixPath(model.object_path) - created_by = None if model.created_by is None else ServiceUser.from_model(model.created_by) - modified_by = None if model.modified_by is None else ServiceUser.from_model(model.modified_by) - stage = None if model.stage is None else Stage.from_model(model.stage) - return ObjectMetadata( - environment=self._environment, - id=model.object_id, - name=object_path.name, - created_at=model.created_at, - created_by=created_by, - modified_at=model.modified_at, - modified_by=modified_by, - parent=str(object_path.parent), - schema_id=ObjectSchema.from_id(model.object.schema_), - version_id=model.version_id, - stage=stage, - ) - async def list_objects( self, offset: int = 0, @@ -238,12 +143,7 @@ async def list_objects( request_timeout=request_timeout, deleted=deleted, ) - return Page( - offset=offset, - limit=limit, - total=response.total, - items=[self._metadata_from_listed_object(model) for model in response.objects], - ) + return parse.page_of_metadata(response, self._environment) async def list_all_objects( self, @@ -320,17 +220,7 @@ async def list_objects_for_instance( permitted_workspaces_only=True, deleted=deleted, ) - return Page( - offset=offset, - limit=limit, - total=response.total, - items=[self._metadata_from_org_listed_object(model) for model in response.objects], - ) - - @staticmethod - def _get_object_versions(response: GetObjectResponse) -> list[ObjectVersion]: - object_versions = [_version_from_listed_version(model) for model in response.versions] - return sorted(object_versions, key=lambda v: v.created_at, reverse=True) + return parse.page_of_metadata(response, self._environment) async def list_versions_by_path( self, path: str, request_timeout: int | float | tuple[int | float, int | float] | None = None @@ -350,7 +240,7 @@ async def list_versions_by_path( include_versions=True, request_timeout=request_timeout, ) - return self._get_object_versions(response) + return parse.versions(response) async def list_versions_by_id( self, object_id: UUID, request_timeout: int | float | tuple[int | float, int | float] | None = None @@ -370,7 +260,7 @@ async def list_versions_by_id( include_versions=True, request_timeout=request_timeout, ) - return self._get_object_versions(response) + return parse.versions(response) async def prepare_data_upload(self, data_identifiers: Sequence[str | UUID]) -> AsyncIterator[ObjectDataUpload]: """Prepare to upload multiple data files to the geoscience object service. 
@@ -461,7 +351,7 @@ async def create_geoscience_object( request_timeout=request_timeout, ) object_dict["uuid"] = result.object_id - return self._metadata_from_endpoint_model(result) + return parse.object_metadata(result, self._environment) async def move_geoscience_object( self, path: str, object_dict: dict, request_timeout: int | float | tuple[int | float, int | float] | None = None @@ -488,7 +378,7 @@ async def move_geoscience_object( geoscience_object=object_for_upload, request_timeout=request_timeout, ) - return self._metadata_from_endpoint_model(result) + return parse.object_metadata(result, self._environment) async def update_geoscience_object( self, object_dict: dict, request_timeout: int | float | tuple[int | float, int | float] | None = None @@ -514,7 +404,7 @@ async def update_geoscience_object( update_geoscience_object=object_for_upload, request_timeout=request_timeout, ) - return self._metadata_from_endpoint_model(result) + return parse.object_metadata(result, self._environment) def _downloaded_object_from_response(self, response: GetObjectResponse) -> DownloadedObject: """Parse object metadata and a geoscience object model instance from a get object response @@ -523,7 +413,7 @@ def _downloaded_object_from_response(self, response: GetObjectResponse) -> Downl :return: A tuple containing the object metadata and a data model of the requested geoscience object. """ - metadata = self._metadata_from_endpoint_model(response) + metadata = parse.object_metadata(response, self._environment) urls_by_name = {getattr(link, "name", link.id): link.download_url for link in response.links.data} return DownloadedObject(response.object, metadata, urls_by_name, self._connector) @@ -668,14 +558,14 @@ async def restore_geoscience_object( # If the restore happened with a rename, the response will be the metadata of the restored object if isinstance(result, EmptyResponse): return None - return self._metadata_from_endpoint_model(result) + return parse.object_metadata(result, self._environment) async def list_stages(self) -> list[Stage]: """List all available stages in the organisation. :return: A list of all available stages.""" response = await self._stages_api.list_stages(org_id=str(self._environment.org_id)) - return [Stage.from_model(model) for model in response.stages] + return [parse.stage(model) for model in response.stages] async def set_stage(self, object_id: UUID, version_id: int, stage_id: UUID) -> None: """Set the stage of a specific version of a geoscience object. diff --git a/packages/evo-objects/src/evo/objects/parse.py b/packages/evo-objects/src/evo/objects/parse.py new file mode 100644 index 00000000..173bb1ef --- /dev/null +++ b/packages/evo-objects/src/evo/objects/parse.py @@ -0,0 +1,187 @@ +from pathlib import PurePosixPath +from typing import overload + +from evo.common import Environment, Page, ServiceUser + +from .data import ObjectMetadata, ObjectSchema, ObjectVersion, OrgObjectMetadata, Stage +from .endpoints import models + +__all__ = [ + "object_metadata", + "org_object_metadata", + "page_of_metadata", + "schema", + "stage", + "stage_or_none", + "user_or_none", + "version", + "versions", +] + + +def user_or_none(model: models.User | None) -> ServiceUser | None: + """Parse a ServiceUser or None value from the generated model. + + :param model: The model returned by the generated code, or None. + + :return: The parsed ServiceUser, or None if the input model is None. 
+ """ + return None if model is None else ServiceUser.from_model(model) + + +def stage(model: models.StageResponse) -> Stage: + """Parse a Stage from the generated model. + + :param model: The model returned by the generated code. + + :return: A Stage instance. + """ + return Stage.from_model(model) + + +def stage_or_none(model: models.StageResponse | None) -> Stage | None: + """Parse a Stage or None value from the generated model. + + :param model: The model returned by the generated code, or None. + + :return: The parsed Stage, or None if the input model is None. + """ + return None if model is None else stage(model) + + +def version(model: models.GeoscienceObjectVersion) -> ObjectVersion: + """Parse an ObjectVersion from the generated model. + + :param model: The model returned by the generated code. + + :return: An ObjectVersion instance. + """ + return ObjectVersion( + version_id=model.version_id, + created_at=model.created_at, + created_by=user_or_none(model.created_by), + stage=stage_or_none(model.stage), + ) + + +def versions(model: models.GetObjectResponse) -> list[ObjectVersion]: + """Parse a list of ObjectVersion from the generated model. + + :param model: The model returned by the generated code. + + :return: A list of ObjectVersion instances, sorted by created_at in descending order. + """ + object_versions = [version(model) for model in model.versions] + return sorted(object_versions, key=lambda v: v.created_at, reverse=True) + + +def schema(schema_id: str) -> ObjectSchema: + """Parse an ObjectSchema from the schema ID. + + :param schema_id: The schema ID string. + + :return: An ObjectSchema instance. + """ + return ObjectSchema.from_id(schema_id) + + +def object_metadata( + model: models.ListedObject | models.GetObjectResponse | models.PostObjectResponse, environment: Environment +) -> ObjectMetadata: + """Parse an ObjectMetadata from the generated model. + + :param model: The model returned by the generated code. + :param environment: The Evo environment associated with the object. + + :return: An ObjectMetadata instance. + """ + # There appears to be a schema defect where object_id may possibly be None, even though it shouldn't be. + assert model.object_id is not None + + # Parse name, parent, and schema_id from the appropriate fields depending on the model type. + if isinstance(model, models.ListedObject): + name = model.name + parent = model.path.rstrip("/") + schema_id = model.schema_ + elif model.object_path is not None: + path = PurePosixPath(model.object_path) + name = path.name + parent = str(path.parent) + schema_id = model.object.schema_ + else: + # There appears to be _another_ schema defect where object_path may be None in + # GetObjectResponse or PostObjectResponse, even though this never happens in practice. + raise ValueError("Model must be a ListedObject or have an object_path") + + return ObjectMetadata( + environment=environment, + id=model.object_id, + name=name, + created_at=model.created_at, + created_by=user_or_none(model.created_by), + modified_at=model.modified_at, + modified_by=user_or_none(model.modified_by), + parent=parent, + schema_id=schema(schema_id), + version_id=model.version_id, + stage=stage_or_none(model.stage), + ) + + +def org_object_metadata(model: models.OrgListedObject, environment: Environment) -> ObjectMetadata: + """Parse an OrgObjectMetadata from the generated model. + + :param model: The model returned by the generated code. + :param environment: The Evo environment associated with the object. 
+
+    :return: An OrgObjectMetadata instance.
+    """
+    return OrgObjectMetadata(
+        environment=environment,
+        workspace_id=model.workspace_id,
+        workspace_name=model.workspace_name,
+        id=model.object_id,
+        name=model.name,
+        created_at=model.created_at,
+        created_by=user_or_none(model.created_by),
+        modified_at=model.modified_at,
+        modified_by=user_or_none(model.modified_by),
+        schema_id=schema(model.schema_),
+        stage=stage_or_none(model.stage),
+    )
+
+
+@overload
+def page_of_metadata(model: models.ListObjectsResponse, environment: Environment) -> Page[ObjectMetadata]: ...
+
+
+@overload
+def page_of_metadata(model: models.ListOrgObjectsResponse, environment: Environment) -> Page[OrgObjectMetadata]: ...
+
+
+def page_of_metadata(
+    model: models.ListObjectsResponse | models.ListOrgObjectsResponse, environment: Environment
+) -> Page[ObjectMetadata] | Page[OrgObjectMetadata]:
+    """Parse a Page of ObjectMetadata or OrgObjectMetadata from the generated model.
+
+    :param model: The model returned by the generated code.
+    :param environment: The Evo environment associated with the objects.
+
+    :return: A Page of ObjectMetadata or OrgObjectMetadata instances.
+
+    :raises TypeError: If the model type is unsupported.
+    """
+    match model:
+        case models.ListObjectsResponse():
+            parse_metadata = object_metadata
+        case models.ListOrgObjectsResponse():
+            parse_metadata = org_object_metadata
+        case _:
+            raise TypeError(f"Unsupported model type: {type(model)}")
+
+    return Page(
+        offset=model.offset,
+        limit=model.limit,
+        total=model.total,
+        items=[parse_metadata(item, environment) for item in model.objects],
+    )

From eff1ba690738d387a5b038f07c4d991054a707ba Mon Sep 17 00:00:00 2001
From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com>
Date: Fri, 3 Oct 2025 14:16:17 +1300
Subject: [PATCH 03/32] Fix workspace_id mismatch in list_objects_for_instance()

---
 packages/evo-objects/src/evo/objects/parse.py            | 3 ++-
 packages/evo-objects/tests/test_object_service_client.py | 9 ++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/packages/evo-objects/src/evo/objects/parse.py b/packages/evo-objects/src/evo/objects/parse.py
index 173bb1ef..218adb56 100644
--- a/packages/evo-objects/src/evo/objects/parse.py
+++ b/packages/evo-objects/src/evo/objects/parse.py
@@ -1,3 +1,4 @@
+import dataclasses
 from pathlib import PurePosixPath
 from typing import overload
@@ -137,7 +138,7 @@ def org_object_metadata(model: models.OrgListedObject, environment: Environment)
     :return: An OrgObjectMetadata instance.
     """
     return OrgObjectMetadata(
-        environment=environment,
+        environment=dataclasses.replace(environment, workspace_id=model.workspace_id),
         workspace_id=model.workspace_id,
         workspace_name=model.workspace_name,
         id=model.object_id,
         name=model.name,
diff --git a/packages/evo-objects/tests/test_object_service_client.py b/packages/evo-objects/tests/test_object_service_client.py
index 6cf3d1df..385780e9 100644
--- a/packages/evo-objects/tests/test_object_service_client.py
+++ b/packages/evo-objects/tests/test_object_service_client.py
@@ -9,6 +9,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import dataclasses import datetime import json from unittest import mock @@ -243,6 +244,8 @@ async def test_list_objects_for_instance(self) -> None: ), ] self.assertIsInstance(page_one, Page) + for item in page_one: + self.assertEqual(item.environment.workspace_id, item.workspace_id, "workspace_id should match environment") self.assertEqual(expected_items_page_one, page_one.items()) self.assertEqual(0, page_one.offset) self.assertEqual(2, page_one.limit) @@ -257,7 +260,9 @@ async def test_list_objects_for_instance(self) -> None: page_two = await self.object_client.list_objects_for_instance(offset=page_one.next_offset, limit=page_one.limit) expected_items_page_two = [ OrgObjectMetadata( - environment=self.environment, + environment=dataclasses.replace( + self.environment, workspace_id=UUID("00000000-0000-0000-0000-0000000004d2") + ), workspace_id=UUID("00000000-0000-0000-0000-0000000004d2"), workspace_name="Test Workspace 2", id=UUID("00000000-0000-0000-0000-000000000002"), @@ -279,6 +284,8 @@ async def test_list_objects_for_instance(self) -> None: ), ] self.assertIsInstance(page_two, Page) + for item in page_two: + self.assertEqual(item.environment.workspace_id, item.workspace_id, "workspace_id should match environment") self.assertEqual(expected_items_page_two, page_two.items()) self.assertEqual(2, page_two.offset) self.assertEqual(2, page_two.limit) From 5716ff0dffacaee6e802de1ef71198ae01292aeb Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Fri, 3 Oct 2025 14:20:32 +1300 Subject: [PATCH 04/32] move ObjectAPIClient into client sub-package --- .../evo-objects/src/evo/objects/client/__init__.py | 3 +++ .../objects/{client.py => client/api_client.py} | 14 +++++++------- .../tests/test_object_service_client.py | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) create mode 100644 packages/evo-objects/src/evo/objects/client/__init__.py rename packages/evo-objects/src/evo/objects/{client.py => client/api_client.py} (98%) diff --git a/packages/evo-objects/src/evo/objects/client/__init__.py b/packages/evo-objects/src/evo/objects/client/__init__.py new file mode 100644 index 00000000..5bceb2f7 --- /dev/null +++ b/packages/evo-objects/src/evo/objects/client/__init__.py @@ -0,0 +1,3 @@ +from .api_client import DownloadedObject, ObjectAPIClient + +__all__ = ["DownloadedObject", "ObjectAPIClient"] diff --git a/packages/evo-objects/src/evo/objects/client.py b/packages/evo-objects/src/evo/objects/client/api_client.py similarity index 98% rename from packages/evo-objects/src/evo/objects/client.py rename to packages/evo-objects/src/evo/objects/client/api_client.py index 05b20313..1d905253 100644 --- a/packages/evo-objects/src/evo/objects/client.py +++ b/packages/evo-objects/src/evo/objects/client/api_client.py @@ -20,18 +20,18 @@ from evo.common.io.exceptions import DataNotFoundError from evo.common.utils import get_service_health, parse_order_by -from . import parse -from .data import ObjectMetadata, ObjectOrderByEnum, ObjectSchema, ObjectVersion, OrgObjectMetadata, Stage -from .endpoints import MetadataApi, ObjectsApi, StagesApi -from .endpoints.models import ( +from .. 
import parse +from ..data import ObjectMetadata, ObjectOrderByEnum, ObjectSchema, ObjectVersion, OrgObjectMetadata, Stage +from ..endpoints import MetadataApi, ObjectsApi, StagesApi +from ..endpoints.models import ( GeoscienceObject, GetObjectResponse, MetadataUpdateBody, UpdateGeoscienceObject, ) -from .exceptions import ObjectUUIDError -from .io import ObjectDataDownload, ObjectDataUpload -from .utils import ObjectDataClient +from ..exceptions import ObjectUUIDError +from ..io import ObjectDataDownload, ObjectDataUpload +from ..utils import ObjectDataClient logger = logging.getLogger("object.client") diff --git a/packages/evo-objects/tests/test_object_service_client.py b/packages/evo-objects/tests/test_object_service_client.py index 385780e9..d44a0c5d 100644 --- a/packages/evo-objects/tests/test_object_service_client.py +++ b/packages/evo-objects/tests/test_object_service_client.py @@ -58,7 +58,7 @@ def base_path(self) -> str: async def test_check_service_health(self) -> None: """Test service health check implementation""" - with mock.patch("evo.objects.client.get_service_health", spec_set=True) as mock_get_service_health: + with mock.patch("evo.objects.client.api_client.get_service_health", spec_set=True) as mock_get_service_health: await self.object_client.get_service_health() mock_get_service_health.assert_called_once_with( self.connector, "geoscience-object", check_type=HealthCheckType.FULL From 7f71cc70388bbe41c90ec0ec1fac88f9db40810a Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Fri, 3 Oct 2025 14:21:30 +1300 Subject: [PATCH 05/32] Add license header --- .../evo-objects/src/evo/objects/client/__init__.py | 11 +++++++++++ packages/evo-objects/src/evo/objects/parse.py | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/packages/evo-objects/src/evo/objects/client/__init__.py b/packages/evo-objects/src/evo/objects/client/__init__.py index 5bceb2f7..b453756c 100644 --- a/packages/evo-objects/src/evo/objects/client/__init__.py +++ b/packages/evo-objects/src/evo/objects/client/__init__.py @@ -1,3 +1,14 @@ +# Copyright © 2025 Bentley Systems, Incorporated +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .api_client import DownloadedObject, ObjectAPIClient __all__ = ["DownloadedObject", "ObjectAPIClient"] diff --git a/packages/evo-objects/src/evo/objects/parse.py b/packages/evo-objects/src/evo/objects/parse.py index 218adb56..2b6c0759 100644 --- a/packages/evo-objects/src/evo/objects/parse.py +++ b/packages/evo-objects/src/evo/objects/parse.py @@ -1,3 +1,14 @@ +# Copyright © 2025 Bentley Systems, Incorporated +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import dataclasses from pathlib import PurePosixPath from typing import overload From b82c3679292673a05dd683fc59ea73b19749045f Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Fri, 3 Oct 2025 14:26:31 +1300 Subject: [PATCH 06/32] Move DownloadedObject into new file --- .../src/evo/objects/client/__init__.py | 3 +- .../src/evo/objects/client/api_client.py | 57 +-------------- .../src/evo/objects/client/object_client.py | 73 +++++++++++++++++++ 3 files changed, 79 insertions(+), 54 deletions(-) create mode 100644 packages/evo-objects/src/evo/objects/client/object_client.py diff --git a/packages/evo-objects/src/evo/objects/client/__init__.py b/packages/evo-objects/src/evo/objects/client/__init__.py index b453756c..65670d4c 100644 --- a/packages/evo-objects/src/evo/objects/client/__init__.py +++ b/packages/evo-objects/src/evo/objects/client/__init__.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .api_client import DownloadedObject, ObjectAPIClient +from .api_client import ObjectAPIClient +from .object_client import DownloadedObject __all__ = ["DownloadedObject", "ObjectAPIClient"] diff --git a/packages/evo-objects/src/evo/objects/client/api_client.py b/packages/evo-objects/src/evo/objects/client/api_client.py index 1d905253..ee7cf01d 100644 --- a/packages/evo-objects/src/evo/objects/client/api_client.py +++ b/packages/evo-objects/src/evo/objects/client/api_client.py @@ -11,17 +11,16 @@ from __future__ import annotations -from collections.abc import AsyncIterator, Iterator, Sequence +from collections.abc import AsyncIterator, Sequence from uuid import UUID from evo import logging from evo.common import APIConnector, BaseAPIClient, HealthCheckType, ICache, Page, ServiceHealth from evo.common.data import EmptyResponse, Environment, OrderByOperatorEnum -from evo.common.io.exceptions import DataNotFoundError from evo.common.utils import get_service_health, parse_order_by from .. 
import parse -from ..data import ObjectMetadata, ObjectOrderByEnum, ObjectSchema, ObjectVersion, OrgObjectMetadata, Stage +from ..data import ObjectMetadata, ObjectOrderByEnum, ObjectVersion, OrgObjectMetadata, Stage from ..endpoints import MetadataApi, ObjectsApi, StagesApi from ..endpoints.models import ( GeoscienceObject, @@ -32,59 +31,11 @@ from ..exceptions import ObjectUUIDError from ..io import ObjectDataDownload, ObjectDataUpload from ..utils import ObjectDataClient +from .object_client import DownloadedObject logger = logging.getLogger("object.client") -__all__ = [ - "DownloadedObject", - "ObjectAPIClient", -] - - -class DownloadedObject: - """A downloaded geoscience object.""" - - def __init__( - self, object_: GeoscienceObject, metadata: ObjectMetadata, urls_by_name: dict[str, str], connector: APIConnector - ) -> None: - self._object = object_ - self._metadata = metadata - self._urls_by_name = urls_by_name - self._connector = connector - - @property - def schema(self) -> ObjectSchema: - """The schema of the object.""" - return self._metadata.schema_id - - @property - def metadata(self) -> ObjectMetadata: - """The metadata of the object.""" - return self._metadata - - def as_dict(self) -> dict: - """Get this object as a dictionary.""" - return self._object.model_dump(mode="python", by_alias=True) - - def prepare_data_download(self, data_identifiers: Sequence[str | UUID]) -> Iterator[ObjectDataDownload]: - """Prepare to download multiple data files from the geoscience object service, for this object. - - Any data IDs that are not associated with the requested object will raise a DataNotFoundError. - - :param data_identifiers: A list of sha256 digests or UUIDs for the data to be downloaded. - - :return: An iterator of data download contexts that can be used to download the data. - - :raises DataNotFoundError: If any requested data ID is not associated with this object. - """ - try: - filtered_urls_by_name = {str(name): self._urls_by_name[str(name)] for name in data_identifiers} - except KeyError as exc: - raise DataNotFoundError(f"Unable to find the requested data: {exc.args[0]}") from exc - for ctx in ObjectDataDownload._create_multiple( - connector=self._connector, metadata=self._metadata, urls_by_name=filtered_urls_by_name - ): - yield ctx +__all__ = ["ObjectAPIClient"] class ObjectAPIClient(BaseAPIClient): diff --git a/packages/evo-objects/src/evo/objects/client/object_client.py b/packages/evo-objects/src/evo/objects/client/object_client.py new file mode 100644 index 00000000..bc3995c9 --- /dev/null +++ b/packages/evo-objects/src/evo/objects/client/object_client.py @@ -0,0 +1,73 @@ +# Copyright © 2025 Bentley Systems, Incorporated +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from collections.abc import Iterator, Sequence +from uuid import UUID + +from evo import logging +from evo.common import APIConnector +from evo.common.io.exceptions import DataNotFoundError + +from ..data import ObjectMetadata, ObjectSchema +from ..endpoints.models import GeoscienceObject +from ..io import ObjectDataDownload + +__all__ = ["DownloadedObject"] + +logger = logging.getLogger("object.client") + + +class DownloadedObject: + """A downloaded geoscience object.""" + + def __init__( + self, object_: GeoscienceObject, metadata: ObjectMetadata, urls_by_name: dict[str, str], connector: APIConnector + ) -> None: + self._object = object_ + self._metadata = metadata + self._urls_by_name = urls_by_name + self._connector = connector + + @property + def schema(self) -> ObjectSchema: + """The schema of the object.""" + return self._metadata.schema_id + + @property + def metadata(self) -> ObjectMetadata: + """The metadata of the object.""" + return self._metadata + + def as_dict(self) -> dict: + """Get this object as a dictionary.""" + return self._object.model_dump(mode="python", by_alias=True) + + def prepare_data_download(self, data_identifiers: Sequence[str | UUID]) -> Iterator[ObjectDataDownload]: + """Prepare to download multiple data files from the geoscience object service, for this object. + + Any data IDs that are not associated with the requested object will raise a DataNotFoundError. + + :param data_identifiers: A list of sha256 digests or UUIDs for the data to be downloaded. + + :return: An iterator of data download contexts that can be used to download the data. + + :raises DataNotFoundError: If any requested data ID is not associated with this object. + """ + try: + filtered_urls_by_name = {str(name): self._urls_by_name[str(name)] for name in data_identifiers} + except KeyError as exc: + raise DataNotFoundError(f"Unable to find the requested data: {exc.args[0]}") from exc + for ctx in ObjectDataDownload._create_multiple( + connector=self._connector, metadata=self._metadata, urls_by_name=filtered_urls_by_name + ): + yield ctx From 47de1247c5d7601dfe60582b435637e02ef3b3bd Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:01:07 +1300 Subject: [PATCH 07/32] add numpy and pyarrow-stubs to evo-objects utils dependencies --- packages/evo-objects/pyproject.toml | 2 +- uv.lock | 93 +++++++++++++++++------------ 2 files changed, 56 insertions(+), 39 deletions(-) diff --git a/packages/evo-objects/pyproject.toml b/packages/evo-objects/pyproject.toml index 2b41d335..ad81aa6e 100644 --- a/packages/evo-objects/pyproject.toml +++ b/packages/evo-objects/pyproject.toml @@ -23,7 +23,7 @@ Documentation = "https://developer.seequent.com/" [project.optional-dependencies] aiohttp = ["evo-sdk-common[aiohttp]"] notebooks = ["evo-sdk-common[notebooks]"] -utils = ["pyarrow", "pandas"] +utils = ["pyarrow", "pyarrow-stubs", "pandas", "numpy"] [dependency-groups] # Dev dependencies. The version is left unspecified so the latest is installed. 
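The utils extra now carries pyarrow, pyarrow-stubs, pandas, and numpy as a single group, installed via `pip install evo-objects[utils]` (pyarrow-stubs is type-checking only and never imported at runtime). A minimal sketch of the runtime guard this grouping enables, mirroring the _DATA_CLIENT_AVAILABLE pattern PATCH 08 introduces later; the HAVE_UTILS name is an assumption, not part of the patch:

try:
    import numpy  # noqa: F401  # the runtime packages arrive together via the extra
    import pandas  # noqa: F401
    import pyarrow  # noqa: F401
except ImportError:
    HAVE_UTILS = False  # running without evo-objects[utils]
else:
    HAVE_UTILS = True
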
diff --git a/uv.lock b/uv.lock index e4d08791..2e5e1e9d 100644 --- a/uv.lock +++ b/uv.lock @@ -731,8 +731,10 @@ notebooks = [ { name = "evo-sdk-common", extra = ["notebooks"] }, ] utils = [ + { name = "numpy" }, { name = "pandas" }, { name = "pyarrow" }, + { name = "pyarrow-stubs" }, ] [package.dev-dependencies] @@ -761,8 +763,10 @@ requires-dist = [ { name = "evo-sdk-common", extras = ["aiohttp"], marker = "extra == 'aiohttp'", editable = "packages/evo-sdk-common" }, { name = "evo-sdk-common", extras = ["jmespath"], editable = "packages/evo-sdk-common" }, { name = "evo-sdk-common", extras = ["notebooks"], marker = "extra == 'notebooks'", editable = "packages/evo-sdk-common" }, + { name = "numpy", marker = "extra == 'utils'" }, { name = "pandas", marker = "extra == 'utils'" }, { name = "pyarrow", marker = "extra == 'utils'" }, + { name = "pyarrow-stubs", marker = "extra == 'utils'" }, { name = "pydantic", specifier = ">=2,<3" }, ] provides-extras = ["aiohttp", "notebooks", "utils"] @@ -2133,44 +2137,57 @@ wheels = [ [[package]] name = "pyarrow" -version = "19.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7f/09/a9046344212690f0632b9c709f9bf18506522feb333c894d0de81d62341a/pyarrow-19.0.1.tar.gz", hash = "sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e", size = 1129437, upload-time = "2025-02-18T18:55:57.027Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/36/01/b23b514d86b839956238d3f8ef206fd2728eee87ff1b8ce150a5678d9721/pyarrow-19.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69", size = 30688914, upload-time = "2025-02-18T18:51:37.575Z" }, - { url = "https://files.pythonhosted.org/packages/c6/68/218ff7cf4a0652a933e5f2ed11274f724dd43b9813cb18dd72c0a35226a2/pyarrow-19.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec", size = 32102866, upload-time = "2025-02-18T18:51:44.358Z" }, - { url = "https://files.pythonhosted.org/packages/98/01/c295050d183014f4a2eb796d7d2bbfa04b6cccde7258bb68aacf6f18779b/pyarrow-19.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad76aef7f5f7e4a757fddcdcf010a8290958f09e3470ea458c80d26f4316ae89", size = 41147682, upload-time = "2025-02-18T18:51:49.481Z" }, - { url = "https://files.pythonhosted.org/packages/40/17/a6c3db0b5f3678f33bbb552d2acbc16def67f89a72955b67b0109af23eb0/pyarrow-19.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d03c9d6f2a3dffbd62671ca070f13fc527bb1867b4ec2b98c7eeed381d4f389a", size = 42179192, upload-time = "2025-02-18T18:51:56.265Z" }, - { url = "https://files.pythonhosted.org/packages/cf/75/c7c8e599300d8cebb6cb339014800e1c720c9db2a3fcb66aa64ec84bac72/pyarrow-19.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:65cf9feebab489b19cdfcfe4aa82f62147218558d8d3f0fc1e9dea0ab8e7905a", size = 40517272, upload-time = "2025-02-18T18:52:02.969Z" }, - { url = "https://files.pythonhosted.org/packages/ef/c9/68ab123ee1528699c4d5055f645ecd1dd68ff93e4699527249d02f55afeb/pyarrow-19.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:41f9706fbe505e0abc10e84bf3a906a1338905cbbcf1177b71486b03e6ea6608", size = 42069036, upload-time = "2025-02-18T18:52:10.173Z" }, - { url = "https://files.pythonhosted.org/packages/54/e3/d5cfd7654084e6c0d9c3ce949e5d9e0ccad569ae1e2d5a68a3ec03b2be89/pyarrow-19.0.1-cp310-cp310-win_amd64.whl", hash = 
"sha256:c6cb2335a411b713fdf1e82a752162f72d4a7b5dbc588e32aa18383318b05866", size = 25277951, upload-time = "2025-02-18T18:52:15.459Z" }, - { url = "https://files.pythonhosted.org/packages/a0/55/f1a8d838ec07fe3ca53edbe76f782df7b9aafd4417080eebf0b42aab0c52/pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90", size = 30713987, upload-time = "2025-02-18T18:52:20.463Z" }, - { url = "https://files.pythonhosted.org/packages/13/12/428861540bb54c98a140ae858a11f71d041ef9e501e6b7eb965ca7909505/pyarrow-19.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00", size = 32135613, upload-time = "2025-02-18T18:52:25.29Z" }, - { url = "https://files.pythonhosted.org/packages/2f/8a/23d7cc5ae2066c6c736bce1db8ea7bc9ac3ef97ac7e1c1667706c764d2d9/pyarrow-19.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0148bb4fc158bfbc3d6dfe5001d93ebeed253793fff4435167f6ce1dc4bddeae", size = 41149147, upload-time = "2025-02-18T18:52:30.975Z" }, - { url = "https://files.pythonhosted.org/packages/a2/7a/845d151bb81a892dfb368bf11db584cf8b216963ccce40a5cf50a2492a18/pyarrow-19.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f24faab6ed18f216a37870d8c5623f9c044566d75ec586ef884e13a02a9d62c5", size = 42178045, upload-time = "2025-02-18T18:52:36.859Z" }, - { url = "https://files.pythonhosted.org/packages/a7/31/e7282d79a70816132cf6cae7e378adfccce9ae10352d21c2fecf9d9756dd/pyarrow-19.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:4982f8e2b7afd6dae8608d70ba5bd91699077323f812a0448d8b7abdff6cb5d3", size = 40532998, upload-time = "2025-02-18T18:52:42.578Z" }, - { url = "https://files.pythonhosted.org/packages/b8/82/20f3c290d6e705e2ee9c1fa1d5a0869365ee477e1788073d8b548da8b64c/pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6", size = 42084055, upload-time = "2025-02-18T18:52:48.749Z" }, - { url = "https://files.pythonhosted.org/packages/ff/77/e62aebd343238863f2c9f080ad2ef6ace25c919c6ab383436b5b81cbeef7/pyarrow-19.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466", size = 25283133, upload-time = "2025-02-18T18:52:54.549Z" }, - { url = "https://files.pythonhosted.org/packages/78/b4/94e828704b050e723f67d67c3535cf7076c7432cd4cf046e4bb3b96a9c9d/pyarrow-19.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:80b2ad2b193e7d19e81008a96e313fbd53157945c7be9ac65f44f8937a55427b", size = 30670749, upload-time = "2025-02-18T18:53:00.062Z" }, - { url = "https://files.pythonhosted.org/packages/7e/3b/4692965e04bb1df55e2c314c4296f1eb12b4f3052d4cf43d29e076aedf66/pyarrow-19.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:ee8dec072569f43835932a3b10c55973593abc00936c202707a4ad06af7cb294", size = 32128007, upload-time = "2025-02-18T18:53:06.581Z" }, - { url = "https://files.pythonhosted.org/packages/22/f7/2239af706252c6582a5635c35caa17cb4d401cd74a87821ef702e3888957/pyarrow-19.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5d1ec7ec5324b98887bdc006f4d2ce534e10e60f7ad995e7875ffa0ff9cb14", size = 41144566, upload-time = "2025-02-18T18:53:11.958Z" }, - { url = "https://files.pythonhosted.org/packages/fb/e3/c9661b2b2849cfefddd9fd65b64e093594b231b472de08ff658f76c732b2/pyarrow-19.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f3ad4c0eb4e2a9aeb990af6c09e6fa0b195c8c0e7b272ecc8d4d2b6574809d34", size = 42202991, upload-time = "2025-02-18T18:53:17.678Z" }, - { url = "https://files.pythonhosted.org/packages/fe/4f/a2c0ed309167ef436674782dfee4a124570ba64299c551e38d3fdaf0a17b/pyarrow-19.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d383591f3dcbe545f6cc62daaef9c7cdfe0dff0fb9e1c8121101cabe9098cfa6", size = 40507986, upload-time = "2025-02-18T18:53:26.263Z" }, - { url = "https://files.pythonhosted.org/packages/27/2e/29bb28a7102a6f71026a9d70d1d61df926887e36ec797f2e6acfd2dd3867/pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b4c4156a625f1e35d6c0b2132635a237708944eb41df5fbe7d50f20d20c17832", size = 42087026, upload-time = "2025-02-18T18:53:33.063Z" }, - { url = "https://files.pythonhosted.org/packages/16/33/2a67c0f783251106aeeee516f4806161e7b481f7d744d0d643d2f30230a5/pyarrow-19.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:5bd1618ae5e5476b7654c7b55a6364ae87686d4724538c24185bbb2952679960", size = 25250108, upload-time = "2025-02-18T18:53:38.462Z" }, - { url = "https://files.pythonhosted.org/packages/2b/8d/275c58d4b00781bd36579501a259eacc5c6dfb369be4ddeb672ceb551d2d/pyarrow-19.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e45274b20e524ae5c39d7fc1ca2aa923aab494776d2d4b316b49ec7572ca324c", size = 30653552, upload-time = "2025-02-18T18:53:44.357Z" }, - { url = "https://files.pythonhosted.org/packages/a0/9e/e6aca5cc4ef0c7aec5f8db93feb0bde08dbad8c56b9014216205d271101b/pyarrow-19.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d9dedeaf19097a143ed6da37f04f4051aba353c95ef507764d344229b2b740ae", size = 32103413, upload-time = "2025-02-18T18:53:52.971Z" }, - { url = "https://files.pythonhosted.org/packages/6a/fa/a7033f66e5d4f1308c7eb0dfcd2ccd70f881724eb6fd1776657fdf65458f/pyarrow-19.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ebfb5171bb5f4a52319344ebbbecc731af3f021e49318c74f33d520d31ae0c4", size = 41134869, upload-time = "2025-02-18T18:53:59.471Z" }, - { url = "https://files.pythonhosted.org/packages/2d/92/34d2569be8e7abdc9d145c98dc410db0071ac579b92ebc30da35f500d630/pyarrow-19.0.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a21d39fbdb948857f67eacb5bbaaf36802de044ec36fbef7a1c8f0dd3a4ab2", size = 42192626, upload-time = "2025-02-18T18:54:06.062Z" }, - { url = "https://files.pythonhosted.org/packages/0a/1f/80c617b1084fc833804dc3309aa9d8daacd46f9ec8d736df733f15aebe2c/pyarrow-19.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:99bc1bec6d234359743b01e70d4310d0ab240c3d6b0da7e2a93663b0158616f6", size = 40496708, upload-time = "2025-02-18T18:54:12.347Z" }, - { url = "https://files.pythonhosted.org/packages/e6/90/83698fcecf939a611c8d9a78e38e7fed7792dcc4317e29e72cf8135526fb/pyarrow-19.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1b93ef2c93e77c442c979b0d596af45e4665d8b96da598db145b0fec014b9136", size = 42075728, upload-time = "2025-02-18T18:54:19.364Z" }, - { url = "https://files.pythonhosted.org/packages/40/49/2325f5c9e7a1c125c01ba0c509d400b152c972a47958768e4e35e04d13d8/pyarrow-19.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:d9d46e06846a41ba906ab25302cf0fd522f81aa2a85a71021826f34639ad31ef", size = 25242568, upload-time = "2025-02-18T18:54:25.846Z" }, - { url = "https://files.pythonhosted.org/packages/3f/72/135088d995a759d4d916ec4824cb19e066585b4909ebad4ab196177aa825/pyarrow-19.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = 
"sha256:c0fe3dbbf054a00d1f162fda94ce236a899ca01123a798c561ba307ca38af5f0", size = 30702371, upload-time = "2025-02-18T18:54:30.665Z" }, - { url = "https://files.pythonhosted.org/packages/2e/01/00beeebd33d6bac701f20816a29d2018eba463616bbc07397fdf99ac4ce3/pyarrow-19.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:96606c3ba57944d128e8a8399da4812f56c7f61de8c647e3470b417f795d0ef9", size = 32116046, upload-time = "2025-02-18T18:54:35.995Z" }, - { url = "https://files.pythonhosted.org/packages/1f/c9/23b1ea718dfe967cbd986d16cf2a31fe59d015874258baae16d7ea0ccabc/pyarrow-19.0.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f04d49a6b64cf24719c080b3c2029a3a5b16417fd5fd7c4041f94233af732f3", size = 41091183, upload-time = "2025-02-18T18:54:42.662Z" }, - { url = "https://files.pythonhosted.org/packages/3a/d4/b4a3aa781a2c715520aa8ab4fe2e7fa49d33a1d4e71c8fc6ab7b5de7a3f8/pyarrow-19.0.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a9137cf7e1640dce4c190551ee69d478f7121b5c6f323553b319cac936395f6", size = 42171896, upload-time = "2025-02-18T18:54:49.808Z" }, - { url = "https://files.pythonhosted.org/packages/23/1b/716d4cd5a3cbc387c6e6745d2704c4b46654ba2668260d25c402626c5ddb/pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:7c1bca1897c28013db5e4c83944a2ab53231f541b9e0c3f4791206d0c0de389a", size = 40464851, upload-time = "2025-02-18T18:54:57.073Z" }, - { url = "https://files.pythonhosted.org/packages/ed/bd/54907846383dcc7ee28772d7e646f6c34276a17da740002a5cefe90f04f7/pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:58d9397b2e273ef76264b45531e9d552d8ec8a6688b7390b5be44c02a37aade8", size = 42085744, upload-time = "2025-02-18T18:55:08.562Z" }, +version = "21.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ef/c2/ea068b8f00905c06329a3dfcd40d0fcc2b7d0f2e355bdb25b65e0a0e4cd4/pyarrow-21.0.0.tar.gz", hash = "sha256:5051f2dccf0e283ff56335760cbc8622cf52264d67e359d5569541ac11b6d5bc", size = 1133487, upload-time = "2025-07-18T00:57:31.761Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/d9/110de31880016e2afc52d8580b397dbe47615defbf09ca8cf55f56c62165/pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e563271e2c5ff4d4a4cbeb2c83d5cf0d4938b891518e676025f7268c6fe5fe26", size = 31196837, upload-time = "2025-07-18T00:54:34.755Z" }, + { url = "https://files.pythonhosted.org/packages/df/5f/c1c1997613abf24fceb087e79432d24c19bc6f7259cab57c2c8e5e545fab/pyarrow-21.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fee33b0ca46f4c85443d6c450357101e47d53e6c3f008d658c27a2d020d44c79", size = 32659470, upload-time = "2025-07-18T00:54:38.329Z" }, + { url = "https://files.pythonhosted.org/packages/3e/ed/b1589a777816ee33ba123ba1e4f8f02243a844fed0deec97bde9fb21a5cf/pyarrow-21.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:7be45519b830f7c24b21d630a31d48bcebfd5d4d7f9d3bdb49da9cdf6d764edb", size = 41055619, upload-time = "2025-07-18T00:54:42.172Z" }, + { url = "https://files.pythonhosted.org/packages/44/28/b6672962639e85dc0ac36f71ab3a8f5f38e01b51343d7aa372a6b56fa3f3/pyarrow-21.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:26bfd95f6bff443ceae63c65dc7e048670b7e98bc892210acba7e4995d3d4b51", size = 42733488, upload-time = "2025-07-18T00:54:47.132Z" }, + { url = 
"https://files.pythonhosted.org/packages/f8/cc/de02c3614874b9089c94eac093f90ca5dfa6d5afe45de3ba847fd950fdf1/pyarrow-21.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bd04ec08f7f8bd113c55868bd3fc442a9db67c27af098c5f814a3091e71cc61a", size = 43329159, upload-time = "2025-07-18T00:54:51.686Z" }, + { url = "https://files.pythonhosted.org/packages/a6/3e/99473332ac40278f196e105ce30b79ab8affab12f6194802f2593d6b0be2/pyarrow-21.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9b0b14b49ac10654332a805aedfc0147fb3469cbf8ea951b3d040dab12372594", size = 45050567, upload-time = "2025-07-18T00:54:56.679Z" }, + { url = "https://files.pythonhosted.org/packages/7b/f5/c372ef60593d713e8bfbb7e0c743501605f0ad00719146dc075faf11172b/pyarrow-21.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:9d9f8bcb4c3be7738add259738abdeddc363de1b80e3310e04067aa1ca596634", size = 26217959, upload-time = "2025-07-18T00:55:00.482Z" }, + { url = "https://files.pythonhosted.org/packages/94/dc/80564a3071a57c20b7c32575e4a0120e8a330ef487c319b122942d665960/pyarrow-21.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c077f48aab61738c237802836fc3844f85409a46015635198761b0d6a688f87b", size = 31243234, upload-time = "2025-07-18T00:55:03.812Z" }, + { url = "https://files.pythonhosted.org/packages/ea/cc/3b51cb2db26fe535d14f74cab4c79b191ed9a8cd4cbba45e2379b5ca2746/pyarrow-21.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:689f448066781856237eca8d1975b98cace19b8dd2ab6145bf49475478bcaa10", size = 32714370, upload-time = "2025-07-18T00:55:07.495Z" }, + { url = "https://files.pythonhosted.org/packages/24/11/a4431f36d5ad7d83b87146f515c063e4d07ef0b7240876ddb885e6b44f2e/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:479ee41399fcddc46159a551705b89c05f11e8b8cb8e968f7fec64f62d91985e", size = 41135424, upload-time = "2025-07-18T00:55:11.461Z" }, + { url = "https://files.pythonhosted.org/packages/74/dc/035d54638fc5d2971cbf1e987ccd45f1091c83bcf747281cf6cc25e72c88/pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:40ebfcb54a4f11bcde86bc586cbd0272bac0d516cfa539c799c2453768477569", size = 42823810, upload-time = "2025-07-18T00:55:16.301Z" }, + { url = "https://files.pythonhosted.org/packages/2e/3b/89fced102448a9e3e0d4dded1f37fa3ce4700f02cdb8665457fcc8015f5b/pyarrow-21.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8d58d8497814274d3d20214fbb24abcad2f7e351474357d552a8d53bce70c70e", size = 43391538, upload-time = "2025-07-18T00:55:23.82Z" }, + { url = "https://files.pythonhosted.org/packages/fb/bb/ea7f1bd08978d39debd3b23611c293f64a642557e8141c80635d501e6d53/pyarrow-21.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:585e7224f21124dd57836b1530ac8f2df2afc43c861d7bf3d58a4870c42ae36c", size = 45120056, upload-time = "2025-07-18T00:55:28.231Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0b/77ea0600009842b30ceebc3337639a7380cd946061b620ac1a2f3cb541e2/pyarrow-21.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:555ca6935b2cbca2c0e932bedd853e9bc523098c39636de9ad4693b5b1df86d6", size = 26220568, upload-time = "2025-07-18T00:55:32.122Z" }, + { url = "https://files.pythonhosted.org/packages/ca/d4/d4f817b21aacc30195cf6a46ba041dd1be827efa4a623cc8bf39a1c2a0c0/pyarrow-21.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:3a302f0e0963db37e0a24a70c56cf91a4faa0bca51c23812279ca2e23481fccd", size = 31160305, upload-time = "2025-07-18T00:55:35.373Z" }, + { url = 
"https://files.pythonhosted.org/packages/a2/9c/dcd38ce6e4b4d9a19e1d36914cb8e2b1da4e6003dd075474c4cfcdfe0601/pyarrow-21.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:b6b27cf01e243871390474a211a7922bfbe3bda21e39bc9160daf0da3fe48876", size = 32684264, upload-time = "2025-07-18T00:55:39.303Z" }, + { url = "https://files.pythonhosted.org/packages/4f/74/2a2d9f8d7a59b639523454bec12dba35ae3d0a07d8ab529dc0809f74b23c/pyarrow-21.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e72a8ec6b868e258a2cd2672d91f2860ad532d590ce94cdf7d5e7ec674ccf03d", size = 41108099, upload-time = "2025-07-18T00:55:42.889Z" }, + { url = "https://files.pythonhosted.org/packages/ad/90/2660332eeb31303c13b653ea566a9918484b6e4d6b9d2d46879a33ab0622/pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b7ae0bbdc8c6674259b25bef5d2a1d6af5d39d7200c819cf99e07f7dfef1c51e", size = 42829529, upload-time = "2025-07-18T00:55:47.069Z" }, + { url = "https://files.pythonhosted.org/packages/33/27/1a93a25c92717f6aa0fca06eb4700860577d016cd3ae51aad0e0488ac899/pyarrow-21.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:58c30a1729f82d201627c173d91bd431db88ea74dcaa3885855bc6203e433b82", size = 43367883, upload-time = "2025-07-18T00:55:53.069Z" }, + { url = "https://files.pythonhosted.org/packages/05/d9/4d09d919f35d599bc05c6950095e358c3e15148ead26292dfca1fb659b0c/pyarrow-21.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:072116f65604b822a7f22945a7a6e581cfa28e3454fdcc6939d4ff6090126623", size = 45133802, upload-time = "2025-07-18T00:55:57.714Z" }, + { url = "https://files.pythonhosted.org/packages/71/30/f3795b6e192c3ab881325ffe172e526499eb3780e306a15103a2764916a2/pyarrow-21.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:cf56ec8b0a5c8c9d7021d6fd754e688104f9ebebf1bf4449613c9531f5346a18", size = 26203175, upload-time = "2025-07-18T00:56:01.364Z" }, + { url = "https://files.pythonhosted.org/packages/16/ca/c7eaa8e62db8fb37ce942b1ea0c6d7abfe3786ca193957afa25e71b81b66/pyarrow-21.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e99310a4ebd4479bcd1964dff9e14af33746300cb014aa4a3781738ac63baf4a", size = 31154306, upload-time = "2025-07-18T00:56:04.42Z" }, + { url = "https://files.pythonhosted.org/packages/ce/e8/e87d9e3b2489302b3a1aea709aaca4b781c5252fcb812a17ab6275a9a484/pyarrow-21.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d2fe8e7f3ce329a71b7ddd7498b3cfac0eeb200c2789bd840234f0dc271a8efe", size = 32680622, upload-time = "2025-07-18T00:56:07.505Z" }, + { url = "https://files.pythonhosted.org/packages/84/52/79095d73a742aa0aba370c7942b1b655f598069489ab387fe47261a849e1/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:f522e5709379d72fb3da7785aa489ff0bb87448a9dc5a75f45763a795a089ebd", size = 41104094, upload-time = "2025-07-18T00:56:10.994Z" }, + { url = "https://files.pythonhosted.org/packages/89/4b/7782438b551dbb0468892a276b8c789b8bbdb25ea5c5eb27faadd753e037/pyarrow-21.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:69cbbdf0631396e9925e048cfa5bce4e8c3d3b41562bbd70c685a8eb53a91e61", size = 42825576, upload-time = "2025-07-18T00:56:15.569Z" }, + { url = "https://files.pythonhosted.org/packages/b3/62/0f29de6e0a1e33518dec92c65be0351d32d7ca351e51ec5f4f837a9aab91/pyarrow-21.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:731c7022587006b755d0bdb27626a1a3bb004bb56b11fb30d98b6c1b4718579d", size = 43368342, upload-time = "2025-07-18T00:56:19.531Z" }, + { url = 
"https://files.pythonhosted.org/packages/90/c7/0fa1f3f29cf75f339768cc698c8ad4ddd2481c1742e9741459911c9ac477/pyarrow-21.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:dc56bc708f2d8ac71bd1dcb927e458c93cec10b98eb4120206a4091db7b67b99", size = 45131218, upload-time = "2025-07-18T00:56:23.347Z" }, + { url = "https://files.pythonhosted.org/packages/01/63/581f2076465e67b23bc5a37d4a2abff8362d389d29d8105832e82c9c811c/pyarrow-21.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:186aa00bca62139f75b7de8420f745f2af12941595bbbfa7ed3870ff63e25636", size = 26087551, upload-time = "2025-07-18T00:56:26.758Z" }, + { url = "https://files.pythonhosted.org/packages/c9/ab/357d0d9648bb8241ee7348e564f2479d206ebe6e1c47ac5027c2e31ecd39/pyarrow-21.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:a7a102574faa3f421141a64c10216e078df467ab9576684d5cd696952546e2da", size = 31290064, upload-time = "2025-07-18T00:56:30.214Z" }, + { url = "https://files.pythonhosted.org/packages/3f/8a/5685d62a990e4cac2043fc76b4661bf38d06efed55cf45a334b455bd2759/pyarrow-21.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:1e005378c4a2c6db3ada3ad4c217b381f6c886f0a80d6a316fe586b90f77efd7", size = 32727837, upload-time = "2025-07-18T00:56:33.935Z" }, + { url = "https://files.pythonhosted.org/packages/fc/de/c0828ee09525c2bafefd3e736a248ebe764d07d0fd762d4f0929dbc516c9/pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:65f8e85f79031449ec8706b74504a316805217b35b6099155dd7e227eef0d4b6", size = 41014158, upload-time = "2025-07-18T00:56:37.528Z" }, + { url = "https://files.pythonhosted.org/packages/6e/26/a2865c420c50b7a3748320b614f3484bfcde8347b2639b2b903b21ce6a72/pyarrow-21.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:3a81486adc665c7eb1a2bde0224cfca6ceaba344a82a971ef059678417880eb8", size = 42667885, upload-time = "2025-07-18T00:56:41.483Z" }, + { url = "https://files.pythonhosted.org/packages/0a/f9/4ee798dc902533159250fb4321267730bc0a107d8c6889e07c3add4fe3a5/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fc0d2f88b81dcf3ccf9a6ae17f89183762c8a94a5bdcfa09e05cfe413acf0503", size = 43276625, upload-time = "2025-07-18T00:56:48.002Z" }, + { url = "https://files.pythonhosted.org/packages/5a/da/e02544d6997037a4b0d22d8e5f66bc9315c3671371a8b18c79ade1cefe14/pyarrow-21.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6299449adf89df38537837487a4f8d3bd91ec94354fdd2a7d30bc11c48ef6e79", size = 44951890, upload-time = "2025-07-18T00:56:52.568Z" }, + { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" }, +] + +[[package]] +name = "pyarrow-stubs" +version = "20.0.0.20250928" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyarrow" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/18/5f/9520b0a5cd42b95a945b8ca3bc47f723fc7ec906b7a7de76f2d075d69911/pyarrow_stubs-20.0.0.20250928.tar.gz", hash = "sha256:e802b18e8e5fdf0a78afa05fae78f1456d861fcb1f95ec0234be5d6a5ecdcde2", size = 236588, upload-time = "2025-09-28T02:50:04.839Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/13/75c86a8ef61ea2c758c924318cf894dced2436b0f7aeb3c5f0fe9e4305b4/pyarrow_stubs-20.0.0.20250928-py3-none-any.whl", hash = "sha256:5389057a55db3c2662c05f22685a52e15e5effaf4345f41f12fb9b6b348647b9", size = 
235745, upload-time = "2025-09-28T02:50:03.205Z" },
+]
 
 [[package]]

From e75913b37ac553edb64c2ac66a795d2373d9e098 Mon Sep 17 00:00:00 2001
From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com>
Date: Fri, 3 Oct 2025 17:02:32 +1300
Subject: [PATCH 08/32] Refactor to use ParquetLoader for downloading parquet
 data
---
 .../src/evo/objects/client/api_client.py      |  31 +--
 .../src/evo/objects/utils/__init__.py         |  48 +++--
 .../src/evo/objects/utils/_types.py           |  82 --------
 .../evo-objects/src/evo/objects/utils/data.py | 185 +++++++++---------
 .../src/evo/objects/utils/parquet_loader.py   | 151 ++++++++++++++
 .../src/evo/objects/utils/table_formats.py    |   3 +-
 .../src/evo/objects/utils/tables.py           |  25 +--
 .../src/evo/objects/utils/types.py            |  42 ++++
 8 files changed, 348 insertions(+), 219 deletions(-)
 delete mode 100644 packages/evo-objects/src/evo/objects/utils/_types.py
 create mode 100644 packages/evo-objects/src/evo/objects/utils/parquet_loader.py
 create mode 100644 packages/evo-objects/src/evo/objects/utils/types.py

diff --git a/packages/evo-objects/src/evo/objects/client/api_client.py b/packages/evo-objects/src/evo/objects/client/api_client.py
index ee7cf01d..df8c8bdf 100644
--- a/packages/evo-objects/src/evo/objects/client/api_client.py
+++ b/packages/evo-objects/src/evo/objects/client/api_client.py
@@ -30,9 +30,15 @@
 )
 from ..exceptions import ObjectUUIDError
 from ..io import ObjectDataDownload, ObjectDataUpload
-from ..utils import ObjectDataClient
 from .object_client import DownloadedObject
 
+try:
+    from ..utils import ObjectDataClient
+except ImportError:
+    _DATA_CLIENT_AVAILABLE = False
+else:
+    _DATA_CLIENT_AVAILABLE = True
+
 logger = logging.getLogger("object.client")
 
 __all__ = ["ObjectAPIClient"]
@@ -257,20 +263,23 @@ async def prepare_data_download(
         for ctx in downloaded_object.prepare_data_download(data_identifiers):
             yield ctx
 
-    def get_data_client(self, cache: ICache) -> ObjectDataClient:
-        """Get a data client for the geoscience object service.
+    if _DATA_CLIENT_AVAILABLE:
+        # Optional data client functionality, enabled if the data client dependencies are installed.
 
-        The data client provides a high-level interface for uploading and downloading data that is referenced in
-        geoscience objects, and caching the data locally. It depends on the optional dependency `pyarrow`, which is
-        not installed by default. This dependency can be installed with `pip install evo-objects[utils]`.
+        def get_data_client(self, cache: ICache) -> ObjectDataClient:
+            """Get a data client for the geoscience object service.
 
-        :param cache: The cache to use for data downloads.
+            The data client provides a high-level interface for uploading and downloading data that is referenced in
+            geoscience objects, and caching the data locally. It depends on the optional dependency `pyarrow`, which is
+            not installed by default. This dependency can be installed with `pip install evo-objects[utils]`.
 
-        :return: An ObjectDataClient instance.
+            :param cache: The cache to use for data downloads.
 
-        :raises RuntimeError: If the `pyarrow` package is not installed.
+            :return: An ObjectDataClient instance.
+
+            This method is only defined when the optional `pyarrow` dependency is installed.
+ """ + return ObjectDataClient(environment=self._environment, connector=self._connector, cache=cache) async def create_geoscience_object( self, path: str, object_dict: dict, request_timeout: int | float | tuple[int | float, int | float] | None = None diff --git a/packages/evo-objects/src/evo/objects/utils/__init__.py b/packages/evo-objects/src/evo/objects/utils/__init__.py index c250b0b0..5ce0a58b 100644 --- a/packages/evo-objects/src/evo/objects/utils/__init__.py +++ b/packages/evo-objects/src/evo/objects/utils/__init__.py @@ -9,27 +9,37 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ._types import DataFrame, Table -from .data import ObjectDataClient -__all__ = [ - "DataFrame", - "ObjectDataClient", - "Table", -] +try: + # Import the table type for backwards compatibility. This should be removed in a future release. + from pyarrow import Table # noqa: F401 +except ImportError: + raise ImportError("pyarrow is required to use the utils package in evo-objects") try: - import pyarrow # noqa: F401 + # Import the dataframe type for backwards compatibility. This should be removed in a future release. + from pandas import DataFrame # noqa: F401 except ImportError: - pass # Omit the following imports if pyarrow is not installed. -else: - from .table_formats import all_known_formats, get_known_format - from .tables import ArrowTableFormat, BaseTableFormat, KnownTableFormat + DataFrame = None # type: ignore - __all__ += [ - "ArrowTableFormat", - "BaseTableFormat", - "KnownTableFormat", - "all_known_formats", - "get_known_format", - ] +from .data import ObjectDataClient +from .table_formats import all_known_formats, get_known_format +from .tables import ArrowTableFormat, BaseTableFormat, KnownTableFormat +from .types import ArrayTableInfo, LookupTableInfo, TableInfo + +# We _used_ to export Table and DataFrame from this package as custom protocols, but we are using the actual +# pyarrow.Table and pandas.DataFrame types now. We are importing these types here from pyarrow and pandas +# for backwards compatibility, but they are no longer explicitly exported as the exports should be +# removed in a future release. + +__all__ = [ + "ArrayTableInfo", + "ArrowTableFormat", + "BaseTableFormat", + "KnownTableFormat", + "LookupTableInfo", + "ObjectDataClient", + "TableInfo", + "all_known_formats", + "get_known_format", +] diff --git a/packages/evo-objects/src/evo/objects/utils/_types.py b/packages/evo-objects/src/evo/objects/utils/_types.py deleted file mode 100644 index 93fd012a..00000000 --- a/packages/evo-objects/src/evo/objects/utils/_types.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright © 2025 Bentley Systems, Incorporated -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -from typing import Protocol - -# `evo-objects` uses protocols for annotating some pyarrow types, because: -# - pyarrow is optional, but type annotations are not. -# - pyarrow has poor type checker support. 
-# -# These protocols should be treated as aliases for the corresponding pyarrow types. -# Any required interfaces from the corresponding pyarrow types should be added to these protocols as needed. - - -class DataType(Protocol): - """Pyarrow data type. - - https://arrow.apache.org/docs/python/generated/pyarrow.DataType.html - """ - - ... - - -class Schema(Protocol): - """Pyarrow schema. - - https://arrow.apache.org/docs/python/generated/pyarrow.Schema.html - """ - - @property - def names(self) -> list[str]: - """The schema's field names.""" - ... - - @property - def types(self) -> list[DataType]: - """The schema's field types.""" - ... - - -class Table(Protocol): - """Pyarrow table. - - https://arrow.apache.org/docs/python/generated/pyarrow.Table.html - """ - - @property - def schema() -> Schema: - """Schema of the table and its columns.""" - ... - - @property - def num_columns(self) -> int: - """Number of columns in this table.""" - ... - - @property - def num_rows(self) -> int: - """Number of rows in this table. - - Due to the definition of a table, all columns have the same number of rows. - """ - ... - - def to_pandas(self) -> DataFrame: - """Convert to a pandas-compatible NumPy array or DataFrame, as appropriate""" - - -class DataFrame(Protocol): - """Pandas DataFrame. - - https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html - """ diff --git a/packages/evo-objects/src/evo/objects/utils/data.py b/packages/evo-objects/src/evo/objects/utils/data.py index 25491441..19d0e60b 100644 --- a/packages/evo-objects/src/evo/objects/utils/data.py +++ b/packages/evo-objects/src/evo/objects/utils/data.py @@ -21,11 +21,21 @@ from evo.common.utils import NoFeedback, PartialFeedback from ..io import _CACHE_SCOPE, ObjectDataUpload -from ._types import DataFrame, Table +from .types import TableInfo -__all__ = [ - "ObjectDataClient", -] +try: + import pyarrow as pa +except ImportError: + raise ImportError("ObjectDataClient requires the `pyarrow` package to be installed") + +try: + import pandas as pd +except ImportError: + _PD_AVAILABLE = False +else: + _PD_AVAILABLE = True + +__all__ = ["ObjectDataClient"] logger = logging.getLogger("object.data") @@ -51,13 +61,6 @@ def _iter_refs(target: Any, _key: str | None = None) -> Iterator[str]: yield str(value) -def _as_table(dataframe: DataFrame) -> Table: - """Wrapper around pyarrow.Table.from_pandas() with a local import.""" - import pyarrow - - return pyarrow.Table.from_pandas(dataframe) - - class ObjectDataClient: """An optional wrapper around data upload and download functionality for geoscience objects. @@ -72,11 +75,6 @@ def __init__(self, environment: Environment, connector: APIConnector, cache: ICa :param connector: The API connector to use for uploading and downloading data. :param cache: The cache to use for storing data locally. """ - try: - import pyarrow # noqa: F401 - except ImportError: - raise RuntimeError("Unable to create ObjectDataClient because the `pyarrow` package is not installed") - self._environment = environment self._connector = connector self._cache = cache @@ -90,34 +88,6 @@ def clear_cache(self) -> None: """Clear the cache used by this client.""" self._cache.clear_cache(environment=self._environment, scope=_CACHE_SCOPE) - def save_table(self, table: Table) -> dict: - """Save a pyarrow table to a file, returning the table info as a dictionary. - - :param table: The pyarrow table to save. - - :return: Information about the saved table. 
- - :raises TableFormatError: If the provided table does not match this format. - :raises StorageFileNotFoundError: If the destination does not exist or is not a directory. - """ - from .table_formats import get_known_format - - known_format = get_known_format(table) - table_info = known_format.save_table(table=table, destination=self.cache_location) - return table_info - - def save_dataframe(self, dataframe: DataFrame) -> dict: - """Save a pandas dataframe to a file, returning the table info as a dictionary. - - :param dataframe: The pandas dataframe to save. - - :return: Information about the saved table. - - :raises TableFormatError: If the provided table does not match this format. - :raises StorageFileNotFoundError: If the destination does not exist or is not a directory. - """ - return self.save_table(_as_table(dataframe)) - async def upload_referenced_data(self, object_model: dict, fb: IFeedback = NoFeedback) -> None: """Upload all data referenced by a geoscience object. @@ -155,7 +125,23 @@ async def upload_referenced_data(self, object_model: dict, fb: IFeedback = NoFee ) fb.progress(1) - async def upload_table(self, table: Table, fb: IFeedback = NoFeedback) -> dict: + def save_table(self, table: pa.Table) -> TableInfo: + """Save a pyarrow table to a file, returning the table info as a dictionary. + + :param table: The pyarrow table to save. + + :return: Information about the saved table. + + :raises TableFormatError: If the provided table does not match this format. + :raises StorageFileNotFoundError: If the destination does not exist or is not a directory. + """ + from .table_formats import get_known_format + + known_format = get_known_format(table) + table_info = known_format.save_table(table=table, destination=self.cache_location) + return table_info + + async def upload_table(self, table: pa.Table, fb: IFeedback = NoFeedback) -> TableInfo: """Upload pyarrow table to the geoscience object service, returning a GO model of the uploaded data. :param table: The table to be uploaded. @@ -174,22 +160,9 @@ async def upload_table(self, table: Table, fb: IFeedback = NoFeedback) -> dict: fb.progress(1) return table_info - async def upload_dataframe(self, dataframe: DataFrame, fb: IFeedback = NoFeedback) -> dict: - """Upload pandas dataframe to the geoscience object service, returning a GO model of the uploaded data. - - :param dataframe: The pandas dataframe to be uploaded. - :param fb: A feedback object for tracking upload progress. - - :return: A description of the uploaded data. - - :raises TableFormatError: If the table does not match a known format. - """ - table_info = await self.upload_table(_as_table(dataframe), fb=fb) - return table_info - async def download_table( - self, object_id: UUID, version_id: str, table_info: dict, fb: IFeedback = NoFeedback - ) -> Table: + self, object_id: UUID, version_id: str, table_info: TableInfo, fb: IFeedback = NoFeedback + ) -> pa.Table: """Download pyarrow table from the geoscience object service. The parquet metadata will be used to make sure the file contents matches the expected format before the table @@ -208,43 +181,67 @@ async def download_table( :raises SchemaValidationError: If the data has a different number of rows than expected. """ from ..client import ObjectAPIClient # Import here to avoid circular import. - from .tables import KnownTableFormat # Import here to avoid import error if pyarrow is not installed. 
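
For orientation, the replacement implementation that follows hands caching, schema validation, and reading over to the new ParquetLoader. A minimal sketch of the resulting call path, assuming an already-configured ObjectDataClient named data_client; the identifiers and table_info values below are illustrative, not taken from this patch:

    from uuid import UUID

    async def fetch_table(data_client) -> None:
        # table_info mirrors what a geoscience object stores for tabular data:
        # a content digest naming the parquet blob, plus the expected shape
        # that is validated against the parquet metadata after download.
        table_info = {
            "data": "0" * 64,   # hypothetical sha256 digest
            "length": 100,      # expected row count
            "width": 3,
            "data_type": "float64",
        }
        table = await data_client.download_table(UUID(int=2), "<version-id>", table_info)
        print(table.num_rows, table.num_columns)
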
- - parquet_file = self.cache_location / str(table_info["data"]) - if not parquet_file.exists(): # Only download it if it isn't already there. - # Reusing the implementation for preparing a download from ObjectAPIClient to avoid code duplication. - client = ObjectAPIClient(self._environment, self._connector) - (download,) = [d async for d in client.prepare_data_download(object_id, version_id, [table_info["data"]])] - await download.download_to_cache(cache=self._cache, transport=self._connector.transport, fb=fb) - else: - fb.progress(1) - logger.debug(f"Data not downloaded because it already exists locally (label: {table_info['data']})") + from .parquet_loader import ParquetLoader - # Load the table from the cache. - return KnownTableFormat.load_table(table_info, self.cache_location) + client = ObjectAPIClient(self._environment, self._connector) + (download,) = [d async for d in client.prepare_data_download(object_id, version_id, [table_info["data"]])] - async def download_dataframe( - self, object_id: UUID, version_id: str, table_info: dict, fb: IFeedback = NoFeedback - ) -> DataFrame: - """Download pandas dataframe data from the geoscience object service. + # Defer downloading the table to the new ParquetLoader class. + loader = ParquetLoader( + download=download, table_info=table_info, transport=self._connector.transport, cache=self._cache + ) + return await loader.load_as_table(fb=fb) - The parquet metadata will be used to make sure the file contents matches the expected format before the table - is read into memory. + if _PD_AVAILABLE: + # Optional support for pandas dataframes. Depends on both pyarrow and pandas. - :param object_id: The object ID to download the data from. - :param version_id: The version ID to download the data from. - :param table_info: The table info that defines the expected format. The model's `data` will be downloaded from - the service. - :param fb: A feedback object for tracking download progress. + def save_dataframe(self, dataframe: pd.DataFrame) -> TableInfo: + """Save a pandas dataframe to a file, returning the table info as a dictionary. - :return: A pandas dataframe loaded directly from the parquet file. + :param dataframe: The pandas dataframe to save. - :raises DataNotFoundError: If the data does not exist or is not associated with this object version. - :raises TableFormatError: If the data does not match the expected format. - :raises SchemaValidationError: If the data has a different number of rows than expected. - """ - table = await self.download_table(object_id, version_id, table_info, fb) - try: - return table.to_pandas() - except ModuleNotFoundError: - raise RuntimeError("Unable to download dataframe because the `pandas` package is not installed") + :return: Information about the saved table. + + :raises TableFormatError: If the provided table does not match this format. + :raises StorageFileNotFoundError: If the destination does not exist or is not a directory. + """ + return self.save_table(pa.Table.from_pandas(dataframe)) + + async def upload_dataframe(self, dataframe: pd.DataFrame, fb: IFeedback = NoFeedback) -> TableInfo: + """Upload pandas dataframe to the geoscience object service, returning a GO model of the uploaded data. + + :param dataframe: The pandas dataframe to be uploaded. + :param fb: A feedback object for tracking upload progress. + + :return: A description of the uploaded data. + + :raises TableFormatError: If the table does not match a known format. 
+            """
+            table_info = await self.upload_table(pa.Table.from_pandas(dataframe), fb=fb)
+            return table_info
+
+        async def download_dataframe(
+            self, object_id: UUID, version_id: str, table_info: TableInfo, fb: IFeedback = NoFeedback
+        ) -> pd.DataFrame:
+            """Download pandas dataframe data from the geoscience object service.
+
+            The parquet metadata will be used to make sure the file contents match the expected format before the table
+            is read into memory.
+
+            :param object_id: The object ID to download the data from.
+            :param version_id: The version ID to download the data from.
+            :param table_info: The table info that defines the expected format. The model's `data` will be downloaded from
+                the service.
+            :param fb: A feedback object for tracking download progress.
+
+            :return: A pandas dataframe loaded directly from the parquet file.
+
+            :raises DataNotFoundError: If the data does not exist or is not associated with this object version.
+            :raises TableFormatError: If the data does not match the expected format.
+            :raises SchemaValidationError: If the data has a different number of rows than expected.
+            """
+            table = await self.download_table(object_id, version_id, table_info, fb)
+            # pandas is guaranteed to be importable here: this method is only defined
+            # when the module-level pandas import succeeded, so the table can be
+            # converted directly without guarding against a missing install.
+            return table.to_pandas()
diff --git a/packages/evo-objects/src/evo/objects/utils/parquet_loader.py b/packages/evo-objects/src/evo/objects/utils/parquet_loader.py
new file mode 100644
index 00000000..2162ab6a
--- /dev/null
+++ b/packages/evo-objects/src/evo/objects/utils/parquet_loader.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+from io import BytesIO
+from logging import getLogger
+from typing import cast
+
+from pydantic import TypeAdapter
+
+from evo.common import ICache, IFeedback, ITransport
+from evo.common.io import BytesDestination, ChunkedIOManager, Download, HTTPSource
+from evo.common.utils import NoFeedback
+
+from ..exceptions import SchemaValidationError
+from . import ArrowTableFormat, KnownTableFormat
+from .types import TableInfo
+
+try:
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+except ImportError:
+    raise ImportError("The 'pyarrow' package is required to use ParquetLoader") from None
+
+try:
+    import pandas as pd
+except ImportError:
+    _PD_AVAILABLE = False
+else:
+    _PD_AVAILABLE = True
+
+try:
+    import numpy as np
+except ImportError:
+    _NP_AVAILABLE = False
+else:
+    _NP_AVAILABLE = True
+
+__all__ = ["ParquetLoader"]
+
+logger = getLogger(__name__)
+
+_TABLE_INFO_ADAPTER: TypeAdapter[TableInfo] = TypeAdapter(TableInfo)
+
+
+class ParquetLoader:
+    """A loader for Parquet data from a geoscience object."""
+
+    def __init__(
+        self, download: Download, table_info: TableInfo, transport: ITransport, cache: ICache | None = None
+    ) -> None:
+        """
+        :param download: The download information for the Parquet data.
+        :param table_info: The expected table information for validation.
+        :param transport: The transport to use for data downloads.
+        :param cache: An optional cache to use for data downloads.
+ """ + self._download = download + validated_table_info = _TABLE_INFO_ADAPTER.validate_python(table_info) + self._expected_format = KnownTableFormat.from_table_info(validated_table_info) + self._expected_length = table_info["length"] + self._transport = transport + self._cache = cache + + async def _reader_from_cache(self, fb: IFeedback) -> pa.NativeFile: + cached = await self._download.download_to_cache(self._cache, self._transport, fb=fb) + return pa.OSFile(str(cached), "r") + + async def _reader_from_memory(self, fb: IFeedback) -> pa.NativeFile: + # Initialize a buffer to store the downloaded data in memory + memory = BytesIO() + + # Use ChunkedIOManager to download the data into the memory buffer + manager = ChunkedIOManager() + async with HTTPSource(self._download.get_download_url, self._transport) as source: + destination = BytesDestination(memory) + await manager.run(source, destination, fb=fb) + + # Reset the buffer's position to the beginning + memory.seek(0) + return pa.BufferReader(memory.getbuffer()) + + async def _reader(self, fb: IFeedback) -> pa.NativeFile: + if self._cache is not None: + return await self._reader_from_cache(fb) + else: + return await self._reader_from_memory(fb) + + def _validate_data(self, data: pq.ParquetFile) -> None: + logger.debug("Checking parquet data format") + actual_format = ArrowTableFormat.from_schema(data.schema_arrow) + KnownTableFormat._check_format(self._expected_format, actual_format) + + logger.debug("Checking parquet data length") + actual_length = data.metadata.num_rows + if self._expected_length != actual_length: + raise SchemaValidationError( + f"Row count ({actual_length}) does not match expectation ({self._expected_length})" + ) + + logger.debug("Parquet metadata checks succeeded") + + async def load_as_table(self, fb: IFeedback = NoFeedback) -> pa.Table: + """Load the Parquet data as a PyArrow Table. + + :param fb: An optional feedback interface to report progress. + + :raises SchemaValidationError: If the data does not match the expected schema. + """ + with await self._reader(fb) as reader: + data = pq.ParquetFile(reader) + self._validate_data(data) + return data.read() + + if _PD_AVAILABLE: + # Optional support for pandas dataframes + + async def load_as_dataframe(self, fb: IFeedback = NoFeedback) -> pd.DataFrame: + """Load the Parquet data as a Pandas DataFrame. + + :param fb: An optional feedback interface to report progress. + + :raises SchemaValidationError: If the data does not match the expected schema. + """ + table = await self.load_as_table(fb) + return table.to_pandas() + + if _NP_AVAILABLE: + # Optional support for numpy arrays + + async def load_as_array(self, fb: IFeedback = NoFeedback) -> np.ndarray: + """Load the Parquet data as a NumPy array. + + The array will have a shape of (N,) for single-column data or (N, M) for multi-column data, + where N is the number of rows and M is the number of columns. The target data _must_ have a uniform dtype. + + :param fb: An optional feedback interface to report progress. + + :raises SchemaValidationError: If the data does not match the expected schema. 
+ """ + try: + dtype = np.dtype(self._expected_format.data_type) + except TypeError: + raise SchemaValidationError( + f"Unsupported data type '{self._expected_format.data_type}' cannot be loaded as a numpy array" + ) + + table = await self.load_as_table(fb) + columns = cast(list[np.ndarray], [col.combine_chunks().to_numpy() for col in table.itercolumns()]) + if len(columns) == 1: + return columns[0].astype(dtype) + else: + return np.column_stack(columns).astype(dtype) diff --git a/packages/evo-objects/src/evo/objects/utils/table_formats.py b/packages/evo-objects/src/evo/objects/utils/table_formats.py index 57eee26d..27265792 100644 --- a/packages/evo-objects/src/evo/objects/utils/table_formats.py +++ b/packages/evo-objects/src/evo/objects/utils/table_formats.py @@ -14,7 +14,6 @@ from evo import logging from ..exceptions import TableFormatError -from ._types import Table from .tables import ArrowTableFormat, KnownTableFormat __all__ = [ @@ -135,7 +134,7 @@ ] -def get_known_format(table: Table) -> KnownTableFormat: +def get_known_format(table: pa.Table) -> KnownTableFormat: """Get the known table format that best matches the provided table. If both a multidimensional format and a format with fixed dimensions would match, the format with fixed dimensions diff --git a/packages/evo-objects/src/evo/objects/utils/tables.py b/packages/evo-objects/src/evo/objects/utils/tables.py index 0397a658..5ff336de 100644 --- a/packages/evo-objects/src/evo/objects/utils/tables.py +++ b/packages/evo-objects/src/evo/objects/utils/tables.py @@ -19,12 +19,12 @@ import pyarrow as pa import pyarrow.parquet as pq +from typing_extensions import deprecated import evo.logging from evo.common.exceptions import StorageFileNotFoundError from ..exceptions import SchemaValidationError, TableFormatError -from ._types import DataType, Schema, Table logger = evo.logging.getLogger("object.tables") @@ -39,7 +39,7 @@ class _ColumnFormat: - def __init__(self, format_spec: DataType | str): + def __init__(self, format_spec: pa.DataType | str): if isinstance(format_spec, str): self._type = self._get_data_type(format_spec) self._format_id = format_spec @@ -48,7 +48,7 @@ def __init__(self, format_spec: DataType | str): self._format_id = self._get_format_id(format_spec) @staticmethod - def _get_data_type(format_id: str) -> DataType: + def _get_data_type(format_id: str) -> pa.DataType: match format_id: case "float64": return pa.float64() @@ -72,7 +72,7 @@ def _get_data_type(format_id: str) -> DataType: raise TypeError(f"Unsupported column type '{unknown_format}'") @staticmethod - def _get_format_id(data_type: DataType) -> str: + def _get_format_id(data_type: pa.DataType) -> str: match str(data_type): case "double": return "float64" @@ -100,14 +100,14 @@ def id(self) -> str: return self._format_id @property - def type(self) -> DataType: + def type(self) -> pa.DataType: return self._type class BaseTableFormat: """Base type for comparing table formats""" - def __init__(self, name: str, columns: list[DataType | str | EllipsisType]) -> None: + def __init__(self, name: str, columns: list[pa.DataType | str | EllipsisType]) -> None: """ :param name: The display name for this format. :param columns: A list of column data types in this format. 
A single column data type followed by Ellipsis
@@ -157,7 +157,7 @@ class ArrowTableFormat(BaseTableFormat):
     """Specialised table format type that can be generated from a pyarrow table"""
 
     @classmethod
-    def from_schema(cls, pa_schema: Schema) -> ArrowTableFormat:
+    def from_schema(cls, pa_schema: pa.Schema) -> ArrowTableFormat:
         """Generate an ArrowTableFormat instance that represents the structure of the provided table schema.
 
         :param pa_schema: Table schema to generate a format representation for.
@@ -170,7 +170,7 @@ class ArrowTableFormat(BaseTableFormat):
 class KnownTableFormat(BaseTableFormat):
     """A definition of a known table format that matches a Geoscience Object Schema model type"""
 
-    def __init__(self, name: str, columns: list[DataType | EllipsisType], field_names: list[str] | None) -> None:
+    def __init__(self, name: str, columns: list[pa.DataType | EllipsisType], field_names: list[str] | None) -> None:
         """
         :param name: The display name for this format.
         :param columns: A list of column data types in this format. A single column data type followed by Ellipsis
@@ -231,7 +231,7 @@ class KnownTableFormat(BaseTableFormat):
         return sha256_digest.hexdigest()
 
     @classmethod
-    def _save_table_as_parquet(cls, table: Table, destination: Path) -> str:
+    def _save_table_as_parquet(cls, table: pa.Table, destination: Path) -> str:
         """Save a table in parquet format.
 
         :param table: The table to save to parquet file.
@@ -270,7 +270,7 @@ class KnownTableFormat(BaseTableFormat):
 
         return data_ref
 
-    def save_table(self, table: Table, destination: Path) -> dict:
+    def save_table(self, table: pa.Table, destination: Path) -> dict:
         """Save a pyarrow table in parquet format and return a GO model of the table metadata.
 
         :param table: The table to save in parquet format.
@@ -346,7 +346,10 @@ class KnownTableFormat(BaseTableFormat):
         return KnownTableFormat(name=type_name, columns=columns, field_names=table_info.get("field_names"))
 
     @classmethod
-    def load_table(cls, table_info: dict, source: Path) -> Table:
+    @deprecated(
+        "KnownTableFormat.load_table is deprecated, use evo.objects.utils.parquet_loader.ParquetLoader instead"
+    )
+    def load_table(cls, table_info: dict, source: Path) -> pa.Table:
         """Load parquet data as a pyarrow.Table and verify the format against the provided table info.
 
         The parquet metadata will be used to make sure the file contents matches the expected format before the table
diff --git a/packages/evo-objects/src/evo/objects/utils/types.py b/packages/evo-objects/src/evo/objects/utils/types.py
new file mode 100644
index 00000000..8ef092dc
--- /dev/null
+++ b/packages/evo-objects/src/evo/objects/utils/types.py
@@ -0,0 +1,42 @@
+# Copyright © 2025 Bentley Systems, Incorporated
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
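
With load_table deprecated in favour of ParquetLoader, a rough migration sketch for downstream callers; the download handle, transport, and cache parameters stand in for whatever the caller already holds, and the function name is illustrative:

    from evo.objects.utils.parquet_loader import ParquetLoader

    # Before: table = KnownTableFormat.load_table(table_info, cache_location)
    # After: ParquetLoader fetches the blob (or reuses the cache), validates the
    # parquet schema and row count against table_info, then reads the table.
    async def load_table(download, table_info, transport, cache):
        loader = ParquetLoader(download=download, table_info=table_info, transport=transport, cache=cache)
        return await loader.load_as_table()
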
+ +import sys +from typing import TypeAlias + +if sys.version_info >= (3, 11): + from typing import NotRequired, TypedDict +else: + from typing_extensions import NotRequired, TypedDict + +__all__ = [ + "ArrayTableInfo", + "LookupTableInfo", + "TableInfo", +] + + +class _BaseTableInfo(TypedDict): + data: str + length: int + + +class ArrayTableInfo(_BaseTableInfo): + data_type: str + width: NotRequired[int] + + +class LookupTableInfo(_BaseTableInfo): + keys_data_type: str + values_data_type: str + + +TableInfo: TypeAlias = ArrayTableInfo | LookupTableInfo From 17016670b3e60b92730d1dc58d606856dd68a147 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Fri, 3 Oct 2025 17:16:25 +1300 Subject: [PATCH 09/32] WIP: Update unit tests --- packages/evo-objects/tests/helpers.py | 124 ++++++++++++++++++ .../tests/test_object_service_client.py | 17 ++- packages/evo-objects/tests/test_tables.py | 110 +--------------- 3 files changed, 144 insertions(+), 107 deletions(-) diff --git a/packages/evo-objects/tests/helpers.py b/packages/evo-objects/tests/helpers.py index af7c43b4..384cbb8d 100644 --- a/packages/evo-objects/tests/helpers.py +++ b/packages/evo-objects/tests/helpers.py @@ -9,7 +9,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import random import sys +from collections.abc import Iterator +from datetime import datetime, timezone + +import numpy +import pyarrow as pa + +from evo.objects.utils.tables import BaseTableFormat, _ColumnFormat class NoImport: @@ -30,3 +38,119 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> bool: for name in self._names: # Remove the module from sys.modules to clean up. del sys.modules[name] + + +class UnloadModule: + """Simple context manager to unload one or more named modules on entry and restore on exit.""" + + def __init__(self, *names: str) -> None: + """ + :param names: The names of the modules to unload on entry and restore on exit. + """ + self._names = names + self._unloaded_modules = {} + + def _unload_module(self, name: str) -> None: + if name in sys.modules: + self._unloaded_modules[name] = sys.modules[name] + del sys.modules[name] + + parent, *_ = name.rpartition(".") + if parent: + self._unload_module(parent) + + def __enter__(self) -> None: + for name in self._names: + self._unload_module(name) + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + # Restore the unloaded modules. 
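+        # Reinstating the exact module objects that were saved (including any
+        # parent packages unloaded recursively on entry) means later imports see
+        # the original, fully initialised modules instead of re-executing them.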
+ for name, module in self._unloaded_modules.items(): + sys.modules[name] = module + + +def _generate_float64_data(n_samples: int) -> Iterator[float]: + max_ = numpy.finfo("float64").max + for _ in range(n_samples): + yield max_ * random.uniform(-1.0, 1.0) + + +def _generate_int_data(int_type: str, n_samples: int) -> Iterator[int]: + min_, max_ = numpy.iinfo(int_type).min, numpy.iinfo(int_type).max + for _ in range(n_samples): + yield random.randint(min_, max_) + + +def _generate_bool_data(n_samples: int) -> Iterator[bool]: + for _ in range(n_samples): + yield random.choice((True, False)) + + +def _generate_string_data(n_samples: int) -> Iterator[str]: + str_sample = "0123456789ABCDEF " + for _ in range(n_samples): + length = random.randint(10, 10000) + yield "".join(random.choices(str_sample, k=length)) + + +def _generate_timestamp_data(n_samples: int) -> Iterator[datetime]: + min_ = datetime(1970, 1, 1, tzinfo=timezone.utc).timestamp() + max_ = datetime(2038, 12, 31, 23, 59, 59, 999999, tzinfo=timezone.utc).timestamp() + for _ in range(n_samples): + yield datetime.utcfromtimestamp(random.uniform(min_, max_)) + + +def _generate_data(format_id: str, n_samples: int) -> Iterator: + match format_id: + case "float64": + yield from _generate_float64_data(n_samples) + case "uint8" | "uint32" | "uint64" | "int32" | "int64" as int_type: + yield from _generate_int_data(int_type, n_samples) + case "bool": + yield from _generate_bool_data(n_samples) + case "string": + yield from _generate_string_data(n_samples) + case "timestamp": + yield from _generate_timestamp_data(n_samples) + case unknown_format: + raise TypeError(f"Unsupported format '{unknown_format}'") + + +def _change_format(current_format: _ColumnFormat) -> _ColumnFormat: + match current_format.id: + case "float64": + return _ColumnFormat("int64") + case "uint8" | "uint32" | "uint64" | "int32" | "int64": + return _ColumnFormat("float64") + case "bool" | "timestamp": + return _ColumnFormat("string") + case "string": + return _ColumnFormat("bool") + case unknown_format: + raise TypeError(f"Unsupported format '{unknown_format}'") + + +def get_sample_table( + table_format: BaseTableFormat, n_rows: int, add_column: bool = False, change_types: bool = False +) -> pa.Table: + column_formats = [column for column in table_format._columns] + + if add_column: + column_formats.append(_ColumnFormat(column_formats[-1].type)) + + if change_types: + column_formats = [_change_format(column) for column in column_formats] + + if table_format._multi_dimensional: + # Test multidimensional tables with an arbitrary number of columns. If the number of columns matches a more + # specific GO type (one with a fixed number of columns), the more specific type would be instantiated. 
+ column_formats *= 20 + + sample_schema = pa.schema( + [pa.field(f"{column.id}[{i}]", column.type, nullable=False) for i, column in enumerate(column_formats)] + ) + sample_data = [ + pa.array(_generate_data(column_format.id, n_rows), type=column_format.type, size=n_rows) + for column_format in column_formats + ] + return pa.table(sample_data, names=sample_schema.names).cast(sample_schema) diff --git a/packages/evo-objects/tests/test_object_service_client.py b/packages/evo-objects/tests/test_object_service_client.py index d44a0c5d..eaacee1d 100644 --- a/packages/evo-objects/tests/test_object_service_client.py +++ b/packages/evo-objects/tests/test_object_service_client.py @@ -35,7 +35,7 @@ from evo.objects.data import ObjectOrderByEnum, OrgObjectMetadata, Stage from evo.objects.exceptions import ObjectAlreadyExistsError, ObjectUUIDError from evo.objects.utils import ObjectDataClient -from helpers import NoImport +from helpers import NoImport, UnloadModule EMPTY_CONTENT = '{"objects": [], "links": {"next": null, "prev": null}}' MOCK_VERSION_CONTENT = json.dumps(load_test_data("list_versions.json")) @@ -669,8 +669,19 @@ def test_get_data_client(self) -> None: def test_get_data_client_missing_dependencies(self) -> None: """Test getting a data client with missing dependencies.""" - with NoImport("pyarrow"), self.assertRaises(RuntimeError): - self.object_client.get_data_client(self.cache) + with UnloadModule("evo.objects.client.api_client", "evo.objects.utils.data"), NoImport("pyarrow"): + from evo.objects.client import ObjectAPIClient + + client = ObjectAPIClient(self.environment, self.connector) + self.assertFalse( + any( + ( + hasattr(ObjectAPIClient, "get_data_client"), + hasattr(client, "get_data_client"), + ) + ), + "get_data_client should not be available if pyarrow is missing", + ) async def test_get_latest_object_versions(self) -> None: content = json.dumps( diff --git a/packages/evo-objects/tests/test_tables.py b/packages/evo-objects/tests/test_tables.py index 91f3f6ca..9a79d529 100644 --- a/packages/evo-objects/tests/test_tables.py +++ b/packages/evo-objects/tests/test_tables.py @@ -10,18 +10,15 @@ # limitations under the License. 
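
One detail worth calling out in the optional-dependency test above: a module already cached in sys.modules would never re-run its import-time pyarrow check, which is why UnloadModule evicts the cached modules before NoImport blocks pyarrow. The pattern distilled, using the module paths as they appear in these tests:

    from helpers import NoImport, UnloadModule

    # Evict the cached modules so the import below re-executes them, then block
    # pyarrow so the optional data client machinery is skipped at import time.
    with UnloadModule("evo.objects.client.api_client", "evo.objects.utils.data"), NoImport("pyarrow"):
        from evo.objects.client import ObjectAPIClient

        assert not hasattr(ObjectAPIClient, "get_data_client")
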
import hashlib -import random import unittest import uuid from collections.abc import Iterator -from datetime import datetime, timezone from io import BytesIO from pathlib import Path from typing import BinaryIO from unittest import mock from uuid import UUID -import numpy import pyarrow as pa import pyarrow.parquet as pq from parameterized import parameterized, parameterized_class @@ -37,7 +34,7 @@ all_known_formats, get_known_format, ) -from evo.objects.utils.tables import _ColumnFormat +from helpers import get_sample_table SAMPLE_DATA_LENGTH = 10 ENVIRONMENT = Environment(hub_url=BASE_URL, org_id=UUID(int=0), workspace_id=UUID(int=0)) @@ -66,101 +63,6 @@ def _all_known_formats_for_testing() -> Iterator[dict]: yield {"data_format": known_format, "expected_field_names": known_format._field_names} -def _generate_float64_data(n_samples: int) -> Iterator[float]: - max_ = numpy.finfo("float64").max - for _ in range(n_samples): - yield max_ * random.uniform(-1.0, 1.0) - - -def _generate_int_data(int_type: str, n_samples: int) -> Iterator[int]: - min_, max_ = numpy.iinfo(int_type).min, numpy.iinfo(int_type).max - for _ in range(n_samples): - yield random.randint(min_, max_) - - -def _generate_bool_data(n_samples: int) -> Iterator[bool]: - for _ in range(n_samples): - yield random.choice((True, False)) - - -def _generate_string_data(n_samples: int) -> Iterator[str]: - str_sample = "0123456789ABCDEF " - for _ in range(n_samples): - length = random.randint(10, 10000) - yield "".join(random.choices(str_sample, k=length)) - - -def _generate_timestamp_data(n_samples: int) -> Iterator[datetime]: - min_ = datetime(1970, 1, 1, tzinfo=timezone.utc).timestamp() - max_ = datetime(2038, 12, 31, 23, 59, 59, 999999, tzinfo=timezone.utc).timestamp() - for _ in range(n_samples): - yield datetime.utcfromtimestamp(random.uniform(min_, max_)) - - -def _generate_data(format_id: str, n_samples: int) -> Iterator: - match format_id: - case "float64": - yield from _generate_float64_data(n_samples) - case "uint8" | "uint32" | "uint64" | "int32" | "int64" as int_type: - yield from _generate_int_data(int_type, n_samples) - case "bool": - yield from _generate_bool_data(n_samples) - case "string": - yield from _generate_string_data(n_samples) - case "timestamp": - yield from _generate_timestamp_data(n_samples) - case unknown_format: - raise TypeError(f"Unsupported format '{unknown_format}'") - - -def _get_sample_column(column_format: _ColumnFormat, n_samples: int) -> pa.Array: - return pa.array(_generate_data(column_format.id, n_samples), type=column_format.type, size=n_samples) - - -def _get_table_schema(columns: list[_ColumnFormat]) -> pa.Schema: - return pa.schema([pa.field(f"{column.id}[{i}]", column.type, nullable=False) for i, column in enumerate(columns)]) - - -def _change_format(current_format: _ColumnFormat) -> _ColumnFormat: - match current_format.id: - case "float64": - return _ColumnFormat("int64") - case "uint8" | "uint32" | "uint64" | "int32" | "int64": - return _ColumnFormat("float64") - case "bool" | "timestamp": - return _ColumnFormat("string") - case "string": - return _ColumnFormat("bool") - case unknown_format: - raise TypeError(f"Unsupported format '{unknown_format}'") - - -def _get_sample_table( - table_format: BaseTableFormat, n_rows: int, add_column: bool = False, change_types: bool = False -) -> pa.Table: - column_formats = [column for column in table_format._columns] - - if add_column: - column_formats.append(_ColumnFormat(column_formats[-1].type)) - - if change_types: - column_formats = 
[_change_format(column) for column in column_formats] - - if table_format._multi_dimensional: - # Test multidimensional tables with an arbitrary number of columns. If the number of columns matches a more - # specific GO type (one with a fixed number of columns), the more specific type would be instantiated. - column_formats *= 20 - - sample_schema = pa.schema( - [pa.field(f"{column.id}[{i}]", column.type, nullable=False) for i, column in enumerate(column_formats)] - ) - sample_data = [ - pa.array(_generate_data(column_format.id, n_rows), type=column_format.type, size=n_rows) - for column_format in column_formats - ] - return pa.table(sample_data, names=sample_schema.names).cast(sample_schema) - - def _get_buffer_digest(buffer: BinaryIO) -> str: """Return a sha256 digest of a binary buffer""" buffer.seek(0) @@ -189,7 +91,7 @@ class TestKnownFormat(unittest.TestCase): expected_field_names: str def setUp(self) -> None: - self.sample_table = _get_sample_table(table_format=self.data_format, n_rows=SAMPLE_DATA_LENGTH) + self.sample_table = get_sample_table(table_format=self.data_format, n_rows=SAMPLE_DATA_LENGTH) self.expected_parquet_digest = _get_table_digest(self.sample_table) self.data_dir = CACHE.get_location(ENVIRONMENT, self.__class__.__name__) self.parquet_file = self.data_dir / self.expected_parquet_digest @@ -251,7 +153,7 @@ def _save_parquet_file(self, add_column: bool = False, add_row: bool = False, ch sample_length = SAMPLE_DATA_LENGTH if add_row: sample_length += 1 - self.sample_table = _get_sample_table( + self.sample_table = get_sample_table( table_format=self.data_format, n_rows=sample_length, add_column=add_column, change_types=change_type ) @@ -400,7 +302,7 @@ class TestComplexFormats(unittest.TestCase): expect_extra_column_fails: bool def setUp(self) -> None: - self.sample_table = _get_sample_table(table_format=self.data_format, n_rows=SAMPLE_DATA_LENGTH) + self.sample_table = get_sample_table(table_format=self.data_format, n_rows=SAMPLE_DATA_LENGTH) self.expected_parquet_digest = _get_table_digest(self.sample_table) self.data_dir = CACHE.get_location(ENVIRONMENT, self.__class__.__name__) self.parquet_file = self.data_dir / self.expected_parquet_digest @@ -422,7 +324,7 @@ def test_save_table(self) -> None: def test_save_table_extra_column_fails(self) -> None: self.assertFalse(self.parquet_file.is_file()) - sample_table = _get_sample_table(self.data_format, n_rows=SAMPLE_DATA_LENGTH, add_column=True) + sample_table = get_sample_table(self.data_format, n_rows=SAMPLE_DATA_LENGTH, add_column=True) if self.expect_extra_column_fails: with self.assertRaises(TableFormatError): @@ -435,7 +337,7 @@ def test_save_table_extra_column_fails(self) -> None: def test_save_table_different_column_types_fails(self) -> None: self.assertFalse(self.parquet_file.is_file()) - sample_table = _get_sample_table(self.data_format, n_rows=SAMPLE_DATA_LENGTH, change_types=True) + sample_table = get_sample_table(self.data_format, n_rows=SAMPLE_DATA_LENGTH, change_types=True) known_format = get_known_format(sample_table) with self.assertRaises(TableFormatError): From 6897d864a1dd91438ca7ecb0025f2933f5cf43c0 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 10:35:56 +1300 Subject: [PATCH 10/32] Update data client unit tests --- .../evo-objects/tests/test_data_client.py | 208 +++++++++++------- 1 file changed, 127 insertions(+), 81 deletions(-) diff --git a/packages/evo-objects/tests/test_data_client.py 
b/packages/evo-objects/tests/test_data_client.py index 71616a9d..afdd17c9 100644 --- a/packages/evo-objects/tests/test_data_client.py +++ b/packages/evo-objects/tests/test_data_client.py @@ -10,15 +10,28 @@ # limitations under the License. import json +from io import BytesIO from unittest import mock from uuid import UUID +import pyarrow as pa +import pyarrow.parquet as pq +from pandas.testing import assert_frame_equal + from data import load_test_data from evo.common import IFeedback, RequestMethod from evo.common.io.exceptions import DataExistsError from evo.common.test_tools import TestWithConnector, TestWithStorage from evo.common.utils import NoFeedback, PartialFeedback -from evo.objects.utils import KnownTableFormat, ObjectDataClient +from evo.objects.utils import BaseTableFormat, KnownTableFormat, ObjectDataClient +from helpers import NoImport, UnloadModule, get_sample_table + + +def _get_sample_table_and_bytes(table_format: BaseTableFormat, n_rows: int) -> tuple[pa.Table, bytes]: + memory = BytesIO() + table = get_sample_table(table_format, n_rows) + pq.write_table(table, where=memory, version="2.4", compression="gzip") + return table, memory.getvalue() class TestObjectDataClient(TestWithConnector, TestWithStorage): @@ -27,6 +40,10 @@ def setUp(self) -> None: TestWithStorage.setUp(self) self.data_client = ObjectDataClient(environment=self.environment, connector=self.connector, cache=self.cache) + def tearDown(self) -> None: + # Clear cache between tests to avoid cached files interfering with subsequent tests. + self.cache.clear_cache() + @property def base_path(self) -> str: return f"geoscience-object/orgs/{self.environment.org_id}/workspaces/{self.environment.workspace_id}" @@ -302,21 +319,28 @@ async def test_download_table(self) -> None: object_id = UUID(int=2) with ( self.transport.set_http_response(status_code=200, content=json.dumps(get_object_response)), - mock.patch("evo.objects.utils.tables.KnownTableFormat") as mock_known_table_format, mock.patch("evo.common.io.download.HTTPSource", autospec=True) as mock_source, ): - mock_table_info = {} - mock_table_info["data"] = mock_data_id = "0000000000000000000000000000000000000000000000000000000000000001" - mock_known_table_format.load_table = mock_load_table = mock.Mock() + mock_table_info = { + "data": "0000000000000000000000000000000000000000000000000000000000000001", + "length": 1, + "width": 3, + "data_type": "float64", + } + mock_data_id: str = mock_table_info["data"] + expected_filename = self.data_client.cache_location / mock_data_id + sample_table, payload_bytes = _get_sample_table_and_bytes( + KnownTableFormat.from_table_info(mock_table_info), 1 + ) async def _mock_download_file_side_effect(*args, **kwargs): - expected_filename = self.data_client.cache_location / mock_data_id expected_download_url = get_object_response["links"]["data"][1]["download_url"] actual_download_url = await kwargs["url_generator"]() self.assertEqual(expected_filename, kwargs["filename"]) self.assertEqual(expected_download_url, actual_download_url) self.assertIs(self.transport, kwargs["transport"]) self.assertIs(NoFeedback, kwargs["fb"]) + expected_filename.write_bytes(payload_bytes) mock_source.download_file.side_effect = _mock_download_file_side_effect actual_table = await self.data_client.download_table(object_id, None, mock_table_info) @@ -327,8 +351,7 @@ async def _mock_download_file_side_effect(*args, **kwargs): path=f"{self.base_path}/objects/{object_id}", headers={"Accept": "application/json", "Accept-Encoding": "gzip"}, ) - 
mock_load_table.assert_called_once_with(mock_table_info, self.data_client.cache_location) - self.assertIs(mock_load_table.return_value, actual_table) + self.assertEqual(sample_table, actual_table) async def test_download_dataframe(self) -> None: """Test downloading tabular data using pandas.""" @@ -336,25 +359,31 @@ async def test_download_dataframe(self) -> None: object_id = UUID(int=2) with ( self.transport.set_http_response(status_code=200, content=json.dumps(get_object_response)), - mock.patch("evo.objects.utils.tables.KnownTableFormat") as mock_known_table_format, mock.patch("evo.common.io.download.HTTPSource", autospec=True) as mock_source, ): - mock_table_info = {} - mock_table_info["data"] = mock_data_id = "0000000000000000000000000000000000000000000000000000000000000001" - mock_known_table_format.load_table = mock_load_table = mock.Mock() + mock_table_info = { + "data": "0000000000000000000000000000000000000000000000000000000000000001", + "length": 1, + "width": 3, + "data_type": "float64", + } + mock_data_id: str = mock_table_info["data"] + expected_filename = self.data_client.cache_location / mock_data_id + sample_table, payload_bytes = _get_sample_table_and_bytes( + KnownTableFormat.from_table_info(mock_table_info), 1 + ) async def _mock_download_file_side_effect(*args, **kwargs): - expected_filename = self.data_client.cache_location / mock_data_id expected_download_url = get_object_response["links"]["data"][1]["download_url"] actual_download_url = await kwargs["url_generator"]() self.assertEqual(expected_filename, kwargs["filename"]) self.assertEqual(expected_download_url, actual_download_url) self.assertIs(self.transport, kwargs["transport"]) self.assertIs(NoFeedback, kwargs["fb"]) + expected_filename.write_bytes(payload_bytes) mock_source.download_file.side_effect = _mock_download_file_side_effect - - _actual_dataframe = await self.data_client.download_dataframe(object_id, None, mock_table_info) + actual_dataframe = await self.data_client.download_dataframe(object_id, None, mock_table_info) mock_source.download_file.assert_called_once() self.assert_request_made( @@ -362,28 +391,23 @@ async def _mock_download_file_side_effect(*args, **kwargs): path=f"{self.base_path}/objects/{object_id}", headers={"Accept": "application/json", "Accept-Encoding": "gzip"}, ) - mock_load_table.assert_called_once_with(mock_table_info, self.data_client.cache_location) - - async def test_download_dataframe_error(self) -> None: - """Test error when trying to download dataframe without pandas installed.""" - get_object_response = load_test_data("get_object.json") - object_id = UUID(int=2) - with ( - self.transport.set_http_response(status_code=200, content=json.dumps(get_object_response)), - mock.patch("evo.objects.utils.tables.KnownTableFormat") as mock_known_table_format, - mock.patch("evo.common.io.download.HTTPSource", autospec=True), - ): - mock_table_info = {} - mock_table_info["data"] = "0000000000000000000000000000000000000000000000000000000000000001" - - mock_known_table_format.load_table.return_value = mock_table = mock.Mock() - # This is the error that a non-mocked `pyarrow.Table.to_pandas()` would raise. 
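
The rewritten download tests above stop mocking KnownTableFormat.load_table and instead write real parquet bytes into the cache, so format and row-count validation run for real. A self-contained sketch of the round-trip idea behind the _get_sample_table_and_bytes helper; the single-column table here is made up:

    from io import BytesIO

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Build a tiny table, serialise it the way the helper does, then read it back.
    table = pa.table({"x": pa.array([1.0, 2.0, 3.0], type=pa.float64())})
    buffer = BytesIO()
    pq.write_table(table, where=buffer, version="2.4", compression="gzip")
    restored = pq.read_table(BytesIO(buffer.getvalue()))
    assert restored.equals(table)
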
- mock_table.to_pandas.side_effect = ModuleNotFoundError("No module named 'pandas'") - - with self.assertRaisesRegex( - RuntimeError, "Unable to download dataframe because the `pandas` package is not installed" - ): - _ = await self.data_client.download_dataframe(object_id, None, mock_table_info) + assert_frame_equal(sample_table.to_pandas(), actual_dataframe) + + async def test_download_dataframe_optional(self) -> None: + """Test download dataframe is not available if pandas is not installed.""" + with UnloadModule("evo.objects.utils.data"), NoImport("pandas"): + from evo.objects.utils.data import ObjectDataClient + + client = ObjectDataClient(environment=self.environment, connector=self.connector, cache=self.cache) + self.assertFalse( + any( + ( + hasattr(ObjectDataClient, "download_dataframe"), + hasattr(client, "download_dataframe"), + ) + ), + "download_dataframe should not be available if pandas is missing", + ) async def test_download_table_confusable(self) -> None: """Test downloading tabular data using pyarrow that includes confusable types.""" @@ -391,23 +415,28 @@ async def test_download_table_confusable(self) -> None: object_id = UUID(int=2) with ( self.transport.set_http_response(status_code=200, content=json.dumps(get_object_response)), - mock.patch("evo.objects.utils.tables.KnownTableFormat") as mock_known_table_format, mock.patch("evo.common.io.download.HTTPSource", autospec=True) as mock_source, ): - mock_table_info = {} - mock_table_info["data"] = mock_data_name = ( - "995f2e6cab5ad17147d9c5fddf371189bef4b623f657dde91f175a0734ed17dc" + mock_table_info = { + "data": "995f2e6cab5ad17147d9c5fddf371189bef4b623f657dde91f175a0734ed17dc", + "length": 1, + "width": 3, + "data_type": "float64", + } + mock_data_id: str = mock_table_info["data"] + expected_filename = self.data_client.cache_location / mock_data_id + sample_table, payload_bytes = _get_sample_table_and_bytes( + KnownTableFormat.from_table_info(mock_table_info), 1 ) - mock_known_table_format.load_table = mock_load_table = mock.Mock() async def _mock_download_file_side_effect(*args, **kwargs): - expected_filename = self.data_client.cache_location / str(mock_data_name) expected_download_url = get_object_response["links"]["data"][0]["download_url"] actual_download_url = await kwargs["url_generator"]() self.assertEqual(expected_filename, kwargs["filename"]) self.assertEqual(expected_download_url, actual_download_url) self.assertIs(self.transport, kwargs["transport"]) self.assertIs(NoFeedback, kwargs["fb"]) + expected_filename.write_bytes(payload_bytes) mock_source.download_file.side_effect = _mock_download_file_side_effect actual_table = await self.data_client.download_table(object_id, None, mock_table_info) @@ -418,8 +447,7 @@ async def _mock_download_file_side_effect(*args, **kwargs): path=f"{self.base_path}/objects/{object_id}", headers={"Accept": "application/json", "Accept-Encoding": "gzip"}, ) - mock_load_table.assert_called_once_with(mock_table_info, self.data_client.cache_location) - self.assertIs(mock_load_table.return_value, actual_table) + self.assertEqual(sample_table, actual_table) async def test_download_dataframe_confusable(self) -> None: """Test downloading tabular data using pandas that includes confusable types.""" @@ -427,26 +455,31 @@ async def test_download_dataframe_confusable(self) -> None: object_id = UUID(int=2) with ( self.transport.set_http_response(status_code=200, content=json.dumps(get_object_response)), - mock.patch("evo.objects.utils.tables.KnownTableFormat") as mock_known_table_format, 
mock.patch("evo.common.io.download.HTTPSource", autospec=True) as mock_source, ): - mock_table_info = {} - mock_table_info["data"] = mock_data_name = ( - "995f2e6cab5ad17147d9c5fddf371189bef4b623f657dde91f175a0734ed17dc" + mock_table_info = { + "data": "995f2e6cab5ad17147d9c5fddf371189bef4b623f657dde91f175a0734ed17dc", + "length": 1, + "width": 3, + "data_type": "float64", + } + mock_data_id: str = mock_table_info["data"] + expected_filename = self.data_client.cache_location / mock_data_id + sample_table, payload_bytes = _get_sample_table_and_bytes( + KnownTableFormat.from_table_info(mock_table_info), 1 ) - mock_known_table_format.load_table = mock_load_table = mock.Mock() async def _mock_download_file_side_effect(*args, **kwargs): - expected_filename = self.data_client.cache_location / str(mock_data_name) expected_download_url = get_object_response["links"]["data"][0]["download_url"] actual_download_url = await kwargs["url_generator"]() self.assertEqual(expected_filename, kwargs["filename"]) self.assertEqual(expected_download_url, actual_download_url) self.assertIs(self.transport, kwargs["transport"]) self.assertIs(NoFeedback, kwargs["fb"]) + expected_filename.write_bytes(payload_bytes) mock_source.download_file.side_effect = _mock_download_file_side_effect - _actual_dataframe = await self.data_client.download_dataframe(object_id, None, mock_table_info) + actual_dataframe = await self.data_client.download_dataframe(object_id, None, mock_table_info) mock_source.download_file.assert_called_once() self.assert_request_made( @@ -454,7 +487,7 @@ async def _mock_download_file_side_effect(*args, **kwargs): path=f"{self.base_path}/objects/{object_id}", headers={"Accept": "application/json", "Accept-Encoding": "gzip"}, ) - mock_load_table.assert_called_once_with(mock_table_info, self.data_client.cache_location) + assert_frame_equal(sample_table.to_pandas(), actual_dataframe) async def test_download_table_already_downloaded(self) -> None: """Test downloading tabular data using pyarrow or pandas when the table is already downloaded.""" @@ -462,16 +495,21 @@ async def test_download_table_already_downloaded(self) -> None: object_id = UUID(int=2) with ( self.transport.set_http_response(status_code=200, content=json.dumps(get_object_response)), - mock.patch("evo.objects.utils.tables.KnownTableFormat") as mock_known_table_format, mock.patch("evo.common.io.download.HTTPSource", autospec=True) as mock_source, ): - mock_table_info = {} - mock_table_info["data"] = mock_data_id = "0000000000000000000000000000000000000000000000000000000000000001" - mock_known_table_format.load_table = mock_load_table = mock.Mock() - expected_file = self.data_client.cache_location / mock_data_id + mock_table_info = { + "data": "0000000000000000000000000000000000000000000000000000000000000001", + "length": 1, + "width": 3, + "data_type": "float64", + } + mock_data_id: str = mock_table_info["data"] + expected_filename = self.data_client.cache_location / mock_data_id + sample_table, payload_bytes = _get_sample_table_and_bytes( + KnownTableFormat.from_table_info(mock_table_info), 1 + ) async def _mock_download_file_side_effect(*args, **kwargs): - expected_filename = self.data_client.cache_location / mock_data_id expected_download_url = get_object_response["links"]["data"][1]["download_url"] actual_download_url = await kwargs["url_generator"]() self.assertEqual(expected_filename, kwargs["filename"]) @@ -481,17 +519,18 @@ async def _mock_download_file_side_effect(*args, **kwargs): mock_source.download_file.side_effect = 
_mock_download_file_side_effect - expected_file.touch() + expected_filename.write_bytes(payload_bytes) actual_table = await self.data_client.download_table(object_id, None, mock_table_info) mock_source.download_file.assert_not_called() - self.transport.assert_no_requests() - mock_load_table.assert_called_once_with(mock_table_info, self.data_client.cache_location) - self.assertIs(mock_load_table.return_value, actual_table) - - # Otherwise this will interfere with the other "already_download" test, since cache cleanup in TestWithStorage - # is in class setup, not individual test setup. - expected_file.unlink() + # the object metadata is still requested to get the initial download URL and check permissions. + self.assert_request_made( + method=RequestMethod.GET, + path=f"{self.base_path}/objects/{object_id}", + headers={"Accept": "application/json", "Accept-Encoding": "gzip"}, + ) + self.transport.request.assert_called_once() # Ensure no other requests were made. + self.assertEqual(sample_table, actual_table) async def test_download_dataframe_already_downloaded(self) -> None: """Test downloading tabular data using pandas when the table is already downloaded.""" @@ -499,16 +538,21 @@ async def test_download_dataframe_already_downloaded(self) -> None: object_id = UUID(int=2) with ( self.transport.set_http_response(status_code=200, content=json.dumps(get_object_response)), - mock.patch("evo.objects.utils.tables.KnownTableFormat") as mock_known_table_format, mock.patch("evo.common.io.download.HTTPSource", autospec=True) as mock_source, ): - mock_table_info = {} - mock_table_info["data"] = mock_data_id = "0000000000000000000000000000000000000000000000000000000000000001" - mock_known_table_format.load_table = mock_load_table = mock.Mock() - expected_file = self.data_client.cache_location / mock_data_id + mock_table_info = { + "data": "0000000000000000000000000000000000000000000000000000000000000001", + "length": 1, + "width": 3, + "data_type": "float64", + } + mock_data_id: str = mock_table_info["data"] + expected_filename = self.data_client.cache_location / mock_data_id + sample_table, payload_bytes = _get_sample_table_and_bytes( + KnownTableFormat.from_table_info(mock_table_info), 1 + ) async def _mock_download_file_side_effect(*args, **kwargs): - expected_filename = self.data_client.cache_location / mock_data_id expected_download_url = get_object_response["links"]["data"][1]["download_url"] actual_download_url = await kwargs["url_generator"]() self.assertEqual(expected_filename, kwargs["filename"]) @@ -518,13 +562,15 @@ async def _mock_download_file_side_effect(*args, **kwargs): mock_source.download_file.side_effect = _mock_download_file_side_effect - expected_file.touch() - _actual_dataframe = await self.data_client.download_dataframe(object_id, None, mock_table_info) + expected_filename.write_bytes(payload_bytes) + actual_dataframe = await self.data_client.download_dataframe(object_id, None, mock_table_info) mock_source.download_file.assert_not_called() - self.transport.assert_no_requests() - mock_load_table.assert_called_once_with(mock_table_info, self.data_client.cache_location) - - # Otherwise this will interfere with the other "already_download" test, since cache cleanup in TestWithStorage - # is in class setup, not individual test setup. - expected_file.unlink() + # the object metadata is still requested to get the initial download URL and check permissions. 
+ self.assert_request_made( + method=RequestMethod.GET, + path=f"{self.base_path}/objects/{object_id}", + headers={"Accept": "application/json", "Accept-Encoding": "gzip"}, + ) + self.transport.request.assert_called_once() # Ensure no other requests were made. + assert_frame_equal(sample_table.to_pandas(), actual_dataframe) From a021eb3ab0c3f91731f85c81523dfc6232ae7ebe Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 10:55:47 +1300 Subject: [PATCH 11/32] Remove outdated data client tests --- .../evo-objects/tests/test_data_client.py | 80 ------------------- 1 file changed, 80 deletions(-) diff --git a/packages/evo-objects/tests/test_data_client.py b/packages/evo-objects/tests/test_data_client.py index afdd17c9..51839098 100644 --- a/packages/evo-objects/tests/test_data_client.py +++ b/packages/evo-objects/tests/test_data_client.py @@ -409,86 +409,6 @@ async def test_download_dataframe_optional(self) -> None: "download_dataframe should not be available if pandas is missing", ) - async def test_download_table_confusable(self) -> None: - """Test downloading tabular data using pyarrow that includes confusable types.""" - get_object_response = load_test_data("get_object_validator_check.json") - object_id = UUID(int=2) - with ( - self.transport.set_http_response(status_code=200, content=json.dumps(get_object_response)), - mock.patch("evo.common.io.download.HTTPSource", autospec=True) as mock_source, - ): - mock_table_info = { - "data": "995f2e6cab5ad17147d9c5fddf371189bef4b623f657dde91f175a0734ed17dc", - "length": 1, - "width": 3, - "data_type": "float64", - } - mock_data_id: str = mock_table_info["data"] - expected_filename = self.data_client.cache_location / mock_data_id - sample_table, payload_bytes = _get_sample_table_and_bytes( - KnownTableFormat.from_table_info(mock_table_info), 1 - ) - - async def _mock_download_file_side_effect(*args, **kwargs): - expected_download_url = get_object_response["links"]["data"][0]["download_url"] - actual_download_url = await kwargs["url_generator"]() - self.assertEqual(expected_filename, kwargs["filename"]) - self.assertEqual(expected_download_url, actual_download_url) - self.assertIs(self.transport, kwargs["transport"]) - self.assertIs(NoFeedback, kwargs["fb"]) - expected_filename.write_bytes(payload_bytes) - - mock_source.download_file.side_effect = _mock_download_file_side_effect - actual_table = await self.data_client.download_table(object_id, None, mock_table_info) - - mock_source.download_file.assert_called_once() - self.assert_request_made( - method=RequestMethod.GET, - path=f"{self.base_path}/objects/{object_id}", - headers={"Accept": "application/json", "Accept-Encoding": "gzip"}, - ) - self.assertEqual(sample_table, actual_table) - - async def test_download_dataframe_confusable(self) -> None: - """Test downloading tabular data using pandas that includes confusable types.""" - get_object_response = load_test_data("get_object_validator_check.json") - object_id = UUID(int=2) - with ( - self.transport.set_http_response(status_code=200, content=json.dumps(get_object_response)), - mock.patch("evo.common.io.download.HTTPSource", autospec=True) as mock_source, - ): - mock_table_info = { - "data": "995f2e6cab5ad17147d9c5fddf371189bef4b623f657dde91f175a0734ed17dc", - "length": 1, - "width": 3, - "data_type": "float64", - } - mock_data_id: str = mock_table_info["data"] - expected_filename = self.data_client.cache_location / mock_data_id - sample_table, payload_bytes = 
_get_sample_table_and_bytes( - KnownTableFormat.from_table_info(mock_table_info), 1 - ) - - async def _mock_download_file_side_effect(*args, **kwargs): - expected_download_url = get_object_response["links"]["data"][0]["download_url"] - actual_download_url = await kwargs["url_generator"]() - self.assertEqual(expected_filename, kwargs["filename"]) - self.assertEqual(expected_download_url, actual_download_url) - self.assertIs(self.transport, kwargs["transport"]) - self.assertIs(NoFeedback, kwargs["fb"]) - expected_filename.write_bytes(payload_bytes) - - mock_source.download_file.side_effect = _mock_download_file_side_effect - actual_dataframe = await self.data_client.download_dataframe(object_id, None, mock_table_info) - - mock_source.download_file.assert_called_once() - self.assert_request_made( - method=RequestMethod.GET, - path=f"{self.base_path}/objects/{object_id}", - headers={"Accept": "application/json", "Accept-Encoding": "gzip"}, - ) - assert_frame_equal(sample_table.to_pandas(), actual_dataframe) - async def test_download_table_already_downloaded(self) -> None: """Test downloading tabular data using pyarrow or pandas when the table is already downloaded.""" get_object_response = load_test_data("get_object.json") From d4796bb266759ed2eeddd8f111b18e2c520c3cb2 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 11:34:23 +1300 Subject: [PATCH 12/32] Fix type annotation --- packages/evo-objects/src/evo/objects/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/evo-objects/src/evo/objects/parse.py b/packages/evo-objects/src/evo/objects/parse.py index 2b6c0759..a234b127 100644 --- a/packages/evo-objects/src/evo/objects/parse.py +++ b/packages/evo-objects/src/evo/objects/parse.py @@ -140,7 +140,7 @@ def object_metadata( ) -def org_object_metadata(model: models.OrgListedObject, environment: Environment) -> ObjectMetadata: +def org_object_metadata(model: models.OrgListedObject, environment: Environment) -> OrgObjectMetadata: """Parse an OrgObjectMetadata from the generated model. :param model: The model returned by the generated code. From 82a1145fedc3c4d896bd0ee4695d80e148eade2b Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 11:35:21 +1300 Subject: [PATCH 13/32] Add object reference type --- packages/evo-objects/src/evo/objects/data.py | 128 +++++++++++++++++-- 1 file changed, 118 insertions(+), 10 deletions(-) diff --git a/packages/evo-objects/src/evo/objects/data.py b/packages/evo-objects/src/evo/objects/data.py index 2dde2f4e..9b305720 100644 --- a/packages/evo-objects/src/evo/objects/data.py +++ b/packages/evo-objects/src/evo/objects/data.py @@ -16,9 +16,10 @@ from dataclasses import dataclass from datetime import datetime from typing import Protocol +from urllib.parse import parse_qs, urlparse from uuid import UUID -from evo.common import ResourceMetadata +from evo.common import Environment, ResourceMetadata from evo.workspaces import ServiceUser from .exceptions import SchemaIDFormatError @@ -26,6 +27,7 @@ __all__ = [ "ObjectMetadata", "ObjectOrderByEnum", + "ObjectReference", "ObjectSchema", "ObjectVersion", "SchemaVersion", @@ -43,6 +45,116 @@ class ObjectOrderByEnum(str, enum.Enum): object_name = "object_name" +class ObjectReference(str): + """A structured URL reference to a geoscience object, optionally including a version ID. 
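+
+    Two reference shapes are accepted (placeholders in braces are illustrative):
+
+        https://{hub}/geoscience-object/orgs/{org_id}/workspaces/{workspace_id}/objects/{object_id}
+        https://{hub}/geoscience-object/orgs/{org_id}/workspaces/{workspace_id}/objects/path/{object_path}
+
+    Either form may also carry a ``?version={version_id}`` query parameter.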
+
+    Geoscience Object URL references are the fully qualified HTTPS URLs used to access objects in the
+    Geoscience Object API. The URL may follow the path or UUID format, and may optionally include a version ID.
+
+    In most cases, UUID-based references are preferred, as they are immutable and unambiguous. However, path-based references
+    can be useful in scenarios where the object ID is not known, such as when creating new objects or when working with
+    objects in a more human-readable way.
+    """
+
+    _RE_PATH = re.compile(
+        r"""
+        ^/geoscience-object
+        /orgs/(?P<org_id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})
+        /workspaces/(?P<workspace_id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})
+        /objects
+        (?:
+            /path/(?P<object_path>[^?]+) | /(?P<object_id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})
+        )$
+        """,
+        re.IGNORECASE | re.VERBOSE,
+    )
+
+    hub_url: str
+    """The base URL of the Evo Hub."""
+
+    org_id: UUID
+    """The ID of the Evo Organization the object belongs to."""
+
+    workspace_id: UUID
+    """The ID of the Evo Workspace the object belongs to."""
+
+    object_id: UUID | None
+    """The UUID of the object, if specified in the URL."""
+
+    object_path: str | None
+    """The path of the object, if specified in the URL."""
+
+    version_id: str | None
+    """The version ID of the object, if specified in the URL."""
+
+    def __new__(cls, value: str) -> ObjectReference:
+        inst = str.__new__(cls, value)
+
+        parsed = urlparse(value)
+        if parsed.scheme != "https":
+            raise ValueError("Reference must be a valid HTTPS URL")
+
+        inst.hub_url = f"{parsed.scheme}://{parsed.netloc}"
+
+        if match := cls._RE_PATH.fullmatch(parsed.path):
+            inst.org_id = UUID(match.group("org_id"))
+            inst.workspace_id = UUID(match.group("workspace_id"))
+
+            if match.group("object_id"):
+                inst.object_id = UUID(match.group("object_id"))
+                inst.object_path = None
+            else:
+                inst.object_id = None
+                inst.object_path = "/" + match.group("object_path").lstrip("/")
+        else:
+            raise ValueError("Reference path is not valid")
+
+        query_params = parse_qs(parsed.query)
+        inst.version_id = query_params.get("version", [None])[0]
+        return inst
+
+    @property
+    def environment(self) -> Environment:
+        return Environment(hub_url=self.hub_url, org_id=self.org_id, workspace_id=self.workspace_id)
+
+    @staticmethod
+    def new(
+        environment: Environment,
+        object_id: UUID | None = None,
+        object_path: str | None = None,
+        version_id: str | None = None,
+    ) -> ObjectReference:
+        """Create a new ObjectReference from its components.
+
+        Either object_id or object_path must be provided, but not both.
+
+        :param environment: The Evo environment the object belongs to.
+        :param object_id: The UUID of the object, if known.
+        :param object_path: The path of the object, if known.
+        :param version_id: The version ID of the object, if known.
+
+        :returns: A new ObjectReference instance.
+
+        :raises ValueError: If neither or both of object_id and object_path are provided.
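+
+        For example (illustrative values; the resulting URL follows the shapes documented on the class):
+
+            ref = ObjectReference.new(environment, object_id=UUID(int=1), version_id="2")
+            # -> "https://{hub}/geoscience-object/orgs/{org_id}/workspaces/{workspace_id}
+            #     /objects/00000000-0000-0000-0000-000000000001?version=2"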
+ """ + if object_id is None and object_path is None: + raise ValueError("Either object_id or object_path must be provided") + if object_id is not None and object_path is not None: + raise ValueError("Only one of object_id or object_path can be provided") + + if object_id is not None: + path = ( + f"geoscience-object/orgs/{environment.org_id}/workspaces/{environment.workspace_id}/objects/{object_id}" + ) + else: + path = f"geoscience-object/orgs/{environment.org_id}/workspaces/{environment.workspace_id}/objects/path/{object_path.lstrip('/')}" + + if version_id is not None: + path += f"?version={version_id}" + + return ObjectReference(f"{environment.hub_url.rstrip('/')}/{path}") + + @dataclass(frozen=True, kw_only=True) class ObjectMetadata(ResourceMetadata): """Metadata about a geoscience object.""" @@ -71,12 +183,10 @@ def path(self) -> str: return f"{self.parent}/{self.name}" @property - def url(self) -> str: + def url(self) -> ObjectReference: """The url of the object.""" - return "{hub_url}/geoscience-object/orgs/{org_id}/workspaces/{workspace_id}/objects/{object_id}?version={version_id}".format( - hub_url=self.environment.hub_url.rstrip("/"), - org_id=self.environment.org_id, - workspace_id=self.environment.workspace_id, + return ObjectReference.new( + environment=self.environment, object_id=self.id, version_id=self.version_id, ) @@ -107,10 +217,8 @@ class OrgObjectMetadata(ResourceMetadata): @property def url(self) -> str: """The url of the object.""" - return "{hub_url}/geoscience-object/orgs/{org_id}/workspaces/{workspace_id}/objects/{object_id}".format( - hub_url=self.environment.hub_url.rstrip("/"), - org_id=self.environment.org_id, - workspace_id=self.workspace_id, + return ObjectReference.new( + environment=self.environment, object_id=self.id, ) From 8b7ef6102db39e1bd4520f88fd892fbdc1159b1d Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 12:26:31 +1300 Subject: [PATCH 14/32] Use HTTPS test URLs --- .../tests/data/category_colormap_response.json | 2 +- .../colormap_association_collection_response.json | 12 ++++++------ .../tests/data/colormap_association_response.json | 4 ++-- .../tests/data/colormap_collection_response.json | 6 +++--- .../tests/data/continuous_colormap_response.json | 2 +- .../tests/data/discrete_colormap_response.json | 2 +- packages/evo-files/tests/data/get_file.json | 6 +++--- .../evo-files/tests/data/get_file_long_name.json | 6 +++--- packages/evo-files/tests/data/list_files_0.json | 6 +++--- packages/evo-files/tests/data/list_files_1.json | 4 ++-- packages/evo-files/tests/data/list_versions.json | 10 +++++----- packages/evo-files/tests/data/update_file.json | 2 +- packages/evo-files/tests/data/upsert_file.json | 2 +- packages/evo-objects/tests/data/get_object.json | 2 +- .../tests/data/get_object_validator_check.json | 2 +- .../evo-objects/tests/data/list_objects_0.json | 6 +++--- .../evo-objects/tests/data/list_objects_1.json | 4 ++-- .../tests/data/list_objects_for_instance_0.json | 2 +- .../tests/data/list_objects_for_instance_1.json | 2 +- packages/evo-objects/tests/data/list_versions.json | 14 +++++++------- .../src/evo/common/test_tools/consts.py | 2 +- .../tests/data/list_workspaces_0.json | 10 +++++----- .../tests/data/list_workspaces_1.json | 8 ++++---- .../tests/data/list_workspaces_summary.json | 4 ++-- .../data/list_workspaces_summary_paginated_0.json | 6 +++--- .../data/list_workspaces_summary_paginated_1.json | 6 +++--- .../evo-sdk-common/tests/data/new_workspace.json | 2 
+- 27 files changed, 67 insertions(+), 67 deletions(-) diff --git a/packages/evo-colormaps/tests/data/category_colormap_response.json b/packages/evo-colormaps/tests/data/category_colormap_response.json index 88625a87..5c4a7c6d 100644 --- a/packages/evo-colormaps/tests/data/category_colormap_response.json +++ b/packages/evo-colormaps/tests/data/category_colormap_response.json @@ -24,5 +24,5 @@ "modified_by": "00000000-0000-0000-0000-000000000010", "name": "category colormap 1", "schema": "category", - "self_link": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000008" + "self_link": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000008" } diff --git a/packages/evo-colormaps/tests/data/colormap_association_collection_response.json b/packages/evo-colormaps/tests/data/colormap_association_collection_response.json index 62e5521b..2092aa79 100644 --- a/packages/evo-colormaps/tests/data/colormap_association_collection_response.json +++ b/packages/evo-colormaps/tests/data/colormap_association_collection_response.json @@ -3,37 +3,37 @@ { "attribute_id": "a fairly unique ID", "colormap_id": "00000000-0000-0000-0000-000000000006", - "colormap_uri": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000006", + "colormap_uri": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000006", "created_at": "2024-09-16 01:30:00", "created_by": "00000000-0000-0000-0000-000000000010", "id": "00000000-0000-0000-0000-00000000001e", "modified_at": "2024-09-16 01:30:00", "modified_by": "00000000-0000-0000-0000-000000000010", - "self_link": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/00000000-0000-0000-0000-000000000014/associations/00000000-0000-0000-0000-00000000001e", + "self_link": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/00000000-0000-0000-0000-000000000014/associations/00000000-0000-0000-0000-00000000001e", "workspace_id": "00000000-0000-0000-0000-00000000162e" }, { "attribute_id": "00000000-0000-0000-0000-000000000002", "colormap_id": "00000000-0000-0000-0000-000000000007", - "colormap_uri": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000007", + "colormap_uri": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000007", "created_at": "2024-09-16 01:30:00", "created_by": "00000000-0000-0000-0000-000000000010", "id": "00000000-0000-0000-0000-00000000001f", "modified_at": "2024-09-16 01:30:00", "modified_by": "00000000-0000-0000-0000-000000000010", - "self_link": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/00000000-0000-0000-0000-000000000014/associations/00000000-0000-0000-0000-00000000001f", + "self_link": 
"https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/00000000-0000-0000-0000-000000000014/associations/00000000-0000-0000-0000-00000000001f", "workspace_id": "00000000-0000-0000-0000-00000000162e" }, { "attribute_id": "another fairly unique ID", "colormap_id": "00000000-0000-0000-0000-000000000008", - "colormap_uri": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000008", + "colormap_uri": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000008", "created_at": "2024-09-16 01:30:00", "created_by": "00000000-0000-0000-0000-000000000010", "id": "00000000-0000-0000-0000-000000000020", "modified_at": "2024-09-16 01:30:00", "modified_by": "00000000-0000-0000-0000-000000000010", - "self_link": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/00000000-0000-0000-0000-000000000014/associations/00000000-0000-0000-0000-000000000020", + "self_link": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/00000000-0000-0000-0000-000000000014/associations/00000000-0000-0000-0000-000000000020", "workspace_id": "00000000-0000-0000-0000-00000000162e" } ] diff --git a/packages/evo-colormaps/tests/data/colormap_association_response.json b/packages/evo-colormaps/tests/data/colormap_association_response.json index cd6b1a3d..639ab41e 100644 --- a/packages/evo-colormaps/tests/data/colormap_association_response.json +++ b/packages/evo-colormaps/tests/data/colormap_association_response.json @@ -1,12 +1,12 @@ { "attribute_id": "a very unique ID", "colormap_id": "00000000-0000-0000-0000-000000000006", - "colormap_uri": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000006", + "colormap_uri": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000006", "created_at": "2024-09-16 01:30:00", "created_by": "00000000-0000-0000-0000-000000000010", "id": "00000000-0000-0000-0000-00000000001e", "modified_at": "2024-09-16 01:30:00", "modified_by": "00000000-0000-0000-0000-000000000010", - "self_link": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/00000000-0000-0000-0000-000000000014/associations/00000000-0000-0000-0000-00000000001e", + "self_link": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/00000000-0000-0000-0000-000000000014/associations/00000000-0000-0000-0000-00000000001e", "workspace_id": "00000000-0000-0000-0000-00000000162e" } \ No newline at end of file diff --git a/packages/evo-colormaps/tests/data/colormap_collection_response.json b/packages/evo-colormaps/tests/data/colormap_collection_response.json index 10858592..2740fe4a 100644 --- a/packages/evo-colormaps/tests/data/colormap_collection_response.json +++ b/packages/evo-colormaps/tests/data/colormap_collection_response.json @@ -35,7 +35,7 @@ "modified_by": 
"00000000-0000-0000-0000-000000000010", "name": "continuous colormap 1", "schema": "continuous", - "self_link": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000006" + "self_link": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000006" }, { "colors": [ @@ -70,7 +70,7 @@ "modified_by": "00000000-0000-0000-0000-000000000010", "name": "discrete colormap 1", "schema": "discrete", - "self_link": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000007" + "self_link": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000007" }, { "colors": [ @@ -102,7 +102,7 @@ "modified_by": "00000000-0000-0000-0000-000000000010", "name": "category colormap 1", "schema": "category", - "self_link": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000008" + "self_link": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000008" } ] } diff --git a/packages/evo-colormaps/tests/data/continuous_colormap_response.json b/packages/evo-colormaps/tests/data/continuous_colormap_response.json index 9ce4f1e0..8133d47e 100644 --- a/packages/evo-colormaps/tests/data/continuous_colormap_response.json +++ b/packages/evo-colormaps/tests/data/continuous_colormap_response.json @@ -33,5 +33,5 @@ "modified_by": "00000000-0000-0000-0000-000000000010", "name": "continuous colormap 1", "schema": "continuous", - "self_link": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000006" + "self_link": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000006" } diff --git a/packages/evo-colormaps/tests/data/discrete_colormap_response.json b/packages/evo-colormaps/tests/data/discrete_colormap_response.json index 2b021962..f99ed11c 100644 --- a/packages/evo-colormaps/tests/data/discrete_colormap_response.json +++ b/packages/evo-colormaps/tests/data/discrete_colormap_response.json @@ -28,5 +28,5 @@ "modified_by": "00000000-0000-0000-0000-000000000010", "name": "discrete colormap 1", "schema": "discrete", - "self_link": "http://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000007" + "self_link": "https://unittest.localhost/colormap/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/colormaps/00000000-0000-0000-0000-000000000007" } diff --git a/packages/evo-files/tests/data/get_file.json b/packages/evo-files/tests/data/get_file.json index f9f84a31..c16002dd 100644 --- a/packages/evo-files/tests/data/get_file.json +++ b/packages/evo-files/tests/data/get_file.json @@ -11,12 +11,12 @@ "name": "x y", "email": "test@example.com" }, - "download": 
"http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", + "download": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", "etag": "", "file_id": "00000000-0000-0000-0000-000000000006", "name": "points.csv", "path": "/", - "self": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", + "self": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", "size": 10, "version_id": "1", "versions": [ @@ -28,7 +28,7 @@ "email": "test@example.com" }, "file_id": "00000000-0000-0000-0000-000000000006", - "link": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", + "link": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", "name": "points.csv", "path": "/", "size": 0, diff --git a/packages/evo-files/tests/data/get_file_long_name.json b/packages/evo-files/tests/data/get_file_long_name.json index cfdcef93..611f87a0 100644 --- a/packages/evo-files/tests/data/get_file_long_name.json +++ b/packages/evo-files/tests/data/get_file_long_name.json @@ -11,12 +11,12 @@ "name": "x y", "email": "test@example.com" }, - "download": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", + "download": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", "etag": "", "file_id": "00000000-0000-0000-0000-000000000006", "name": "pointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspointspoints.csv", "path": "/", - "self": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", + "self": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", "size": 10, "version_id": "1", "versions": [ @@ -28,7 +28,7 @@ "email": "test@example.com" }, "file_id": "00000000-0000-0000-0000-000000000006", - "link": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", + "link": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", "name": "points.csv", "path": "/", "size": 0, diff --git a/packages/evo-files/tests/data/list_files_0.json b/packages/evo-files/tests/data/list_files_0.json index 2cfaf500..b5a4c537 100644 --- a/packages/evo-files/tests/data/list_files_0.json +++ b/packages/evo-files/tests/data/list_files_0.json @@ -20,7 +20,7 @@ }, "etag": "", "links": { - "self": 
"http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/A/m.json" + "self": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/A/m.json" }, "size": 11 }, @@ -43,14 +43,14 @@ }, "etag": "", "links": { - "self": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/A/n.json" + "self": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/A/n.json" }, "size": 12 } ], "limit": 5000, "links": { - "next": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files?path=pointset/&limit=2&offset=2", + "next": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files?path=pointset/&limit=2&offset=2", "prev": null, "self": "" }, diff --git a/packages/evo-files/tests/data/list_files_1.json b/packages/evo-files/tests/data/list_files_1.json index 044a4bc8..2f45c0ed 100644 --- a/packages/evo-files/tests/data/list_files_1.json +++ b/packages/evo-files/tests/data/list_files_1.json @@ -20,7 +20,7 @@ }, "etag": "", "links": { - "self": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/B/o.json" + "self": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/B/o.json" }, "size": 13 } @@ -28,7 +28,7 @@ "limit": 5000, "links": { "next": null, - "prev": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files?limit=2&offset=0", + "prev": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files?limit=2&offset=0", "self": "" }, "offset": 0, diff --git a/packages/evo-files/tests/data/list_versions.json b/packages/evo-files/tests/data/list_versions.json index b78b3ab5..27d5b084 100644 --- a/packages/evo-files/tests/data/list_versions.json +++ b/packages/evo-files/tests/data/list_versions.json @@ -11,12 +11,12 @@ "name": "x y", "email": "test@example.com" }, - "download": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points.csv", + "download": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points.csv", "etag": "", "file_id": "00000000-0000-0000-0000-000000000006", "name": "points.csv", "path": "/", - "self": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points.csv", + "self": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points.csv", "size": 14, "version_id": "1", "versions": [ @@ -28,7 +28,7 @@ "email": "test@example.com" }, "file_id": "00000000-0000-0000-0000-000000000006", - "link": 
"http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points.csv", + "link": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points.csv", "name": "points.csv", "path": "/", "size": 14, @@ -42,7 +42,7 @@ "email": "test@example.com" }, "file_id": "00000000-0000-0000-0000-000000000006", - "link": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points.csv", + "link": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points.csv", "name": "points.csv", "path": "/", "size": 15, @@ -56,7 +56,7 @@ "email": "test@example.com" }, "file_id": "00000000-0000-0000-0000-000000000006", - "link": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points.csv", + "link": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points.csv", "name": "points.csv", "path": "/", "size": 16, diff --git a/packages/evo-files/tests/data/update_file.json b/packages/evo-files/tests/data/update_file.json index 364a53ca..72ad018d 100644 --- a/packages/evo-files/tests/data/update_file.json +++ b/packages/evo-files/tests/data/update_file.json @@ -1,5 +1,5 @@ { "file_id": "00000000-0000-0000-0000-000000000005", - "upload": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/00000000-0000-0000-0000-000000000005", + "upload": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/00000000-0000-0000-0000-000000000005", "version_id": "1" } \ No newline at end of file diff --git a/packages/evo-files/tests/data/upsert_file.json b/packages/evo-files/tests/data/upsert_file.json index 183db42e..bb45d26d 100644 --- a/packages/evo-files/tests/data/upsert_file.json +++ b/packages/evo-files/tests/data/upsert_file.json @@ -1,5 +1,5 @@ { "file_id": "00000000-0000-0000-0000-000000000005", - "upload": "http://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", + "upload": "https://unittest.localhost/path/file/v2/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/files/path/points_0.csv", "version_id": "1" } \ No newline at end of file diff --git a/packages/evo-objects/tests/data/get_object.json b/packages/evo-objects/tests/data/get_object.json index d85bc947..a9529db9 100644 --- a/packages/evo-objects/tests/data/get_object.json +++ b/packages/evo-objects/tests/data/get_object.json @@ -41,7 +41,7 @@ }, "etag": "", "links": { - "download": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json", + "download": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json", "data": [ { "id": "00000000-0000-0000-0000-000000000000", diff --git a/packages/evo-objects/tests/data/get_object_validator_check.json 
b/packages/evo-objects/tests/data/get_object_validator_check.json index ba48bde4..5b96efeb 100644 --- a/packages/evo-objects/tests/data/get_object_validator_check.json +++ b/packages/evo-objects/tests/data/get_object_validator_check.json @@ -39,7 +39,7 @@ }, "etag": "", "links": { - "download": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json", + "download": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json", "data": [ { "name": "995f2e6cab5ad17147d9c5fddf371189bef4b623f657dde91f175a0734ed17dc", diff --git a/packages/evo-objects/tests/data/list_objects_0.json b/packages/evo-objects/tests/data/list_objects_0.json index dafed5a9..baebf699 100644 --- a/packages/evo-objects/tests/data/list_objects_0.json +++ b/packages/evo-objects/tests/data/list_objects_0.json @@ -20,7 +20,7 @@ }, "etag": "", "links": { - "download": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json" + "download": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json" }, "stage": { "name": "Approved", @@ -47,7 +47,7 @@ }, "etag": "", "links": { - "download": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/n.json" + "download": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/n.json" } } ], @@ -55,7 +55,7 @@ "limit": 2, "total": 3, "links": { - "next": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects?path=pointset/&limit=2&continuation_token=a_continuation_token", + "next": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects?path=pointset/&limit=2&continuation_token=a_continuation_token", "prev": null } } diff --git a/packages/evo-objects/tests/data/list_objects_1.json b/packages/evo-objects/tests/data/list_objects_1.json index 0c9c8dbe..2d81291b 100644 --- a/packages/evo-objects/tests/data/list_objects_1.json +++ b/packages/evo-objects/tests/data/list_objects_1.json @@ -20,7 +20,7 @@ }, "etag": "", "links": { - "download": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/B/o.json" + "download": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/B/o.json" } } ], @@ -29,6 +29,6 @@ "total": 3, "links": { "next": null, - "prev": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects?limit=2" + "prev": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects?limit=2" } } diff --git a/packages/evo-objects/tests/data/list_objects_for_instance_0.json 
b/packages/evo-objects/tests/data/list_objects_for_instance_0.json index 59b7d0d6..c8c09b78 100644 --- a/packages/evo-objects/tests/data/list_objects_for_instance_0.json +++ b/packages/evo-objects/tests/data/list_objects_for_instance_0.json @@ -49,7 +49,7 @@ "limit": 2, "total": 3, "links": { - "next": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/objects?limit=2&continuation_token=a_continuation_token", + "next": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/objects?limit=2&continuation_token=a_continuation_token", "prev": null } } diff --git a/packages/evo-objects/tests/data/list_objects_for_instance_1.json b/packages/evo-objects/tests/data/list_objects_for_instance_1.json index 21367fe7..3afdfdef 100644 --- a/packages/evo-objects/tests/data/list_objects_for_instance_1.json +++ b/packages/evo-objects/tests/data/list_objects_for_instance_1.json @@ -26,6 +26,6 @@ "total": 3, "links": { "next": null, - "prev": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/objects?limit=2" + "prev": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/objects?limit=2" } } diff --git a/packages/evo-objects/tests/data/list_versions.json b/packages/evo-objects/tests/data/list_versions.json index 9e634026..b11599ce 100644 --- a/packages/evo-objects/tests/data/list_versions.json +++ b/packages/evo-objects/tests/data/list_versions.json @@ -21,7 +21,7 @@ }, "etag": "", "links": { - "download": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json", + "download": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json", "data": [] }, "versions": [{ @@ -35,11 +35,11 @@ "etag": "", "links": { "data": [{ - "download_url": "http://unittest.localhost/geoscience-data/00000000-0000-0000-0000-0000000004d2/00000000-0000-0000-0000-00000000162e/d565a304-a618-4020-a05b-a245f8177d2d?se=2025-03-26T00%3A26%3A11Z&sp=r&sv=2025-01-05&sr=b&skoid=e00b2565-5ef7-46ac-b8fd-b6aff6f93a9d&sktid=067e9632-ea4c-4ed9-9e6d-e294956e284b&skt=2025-03-25T23%3A32%3A48Z&ske=2025-03-26T00%3A32%3A48Z&sks=b&skv=2025-01-05&sig=6HCMQXBZJFvVSNYE%2FdP0E6D9cBTZ1OQfApTR97ccXew%3D", + "download_url": "https://unittest.localhost/geoscience-data/00000000-0000-0000-0000-0000000004d2/00000000-0000-0000-0000-00000000162e/d565a304-a618-4020-a05b-a245f8177d2d?se=2025-03-26T00%3A26%3A11Z&sp=r&sv=2025-01-05&sr=b&skoid=e00b2565-5ef7-46ac-b8fd-b6aff6f93a9d&sktid=067e9632-ea4c-4ed9-9e6d-e294956e284b&skt=2025-03-25T23%3A32%3A48Z&ske=2025-03-26T00%3A32%3A48Z&sks=b&skv=2025-01-05&sig=6HCMQXBZJFvVSNYE%2FdP0E6D9cBTZ1OQfApTR97ccXew%3D", "id": "d565a304-a618-4020-a05b-a245f8177d2d", "name": "35b8b8ba5479a34e905a1b9e212e1cb4a52ec484b969359f2c28f5a00311dbca" }], - "download": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json" + "download": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json" }, "stage": { "name": "Approved", @@ -57,11 +57,11 @@ "etag": "", "links": { "data": [{ - "download_url": 
"http://unittest.localhost/geoscience-data/00000000-0000-0000-0000-0000000004d2/00000000-0000-0000-0000-00000000162e/d565a304-a618-4020-a05b-a245f8177d2d?se=2025-03-26T00%3A26%3A11Z&sp=r&sv=2025-01-05&sr=b&skoid=e00b2565-5ef7-46ac-b8fd-b6aff6f93a9d&sktid=067e9632-ea4c-4ed9-9e6d-e294956e284b&skt=2025-03-25T23%3A32%3A48Z&ske=2025-03-26T00%3A32%3A48Z&sks=b&skv=2025-01-05&sig=6HCMQXBZJFvVSNYE%2FdP0E6D9cBTZ1OQfApTR97ccXew%3D", + "download_url": "https://unittest.localhost/geoscience-data/00000000-0000-0000-0000-0000000004d2/00000000-0000-0000-0000-00000000162e/d565a304-a618-4020-a05b-a245f8177d2d?se=2025-03-26T00%3A26%3A11Z&sp=r&sv=2025-01-05&sr=b&skoid=e00b2565-5ef7-46ac-b8fd-b6aff6f93a9d&sktid=067e9632-ea4c-4ed9-9e6d-e294956e284b&skt=2025-03-25T23%3A32%3A48Z&ske=2025-03-26T00%3A32%3A48Z&sks=b&skv=2025-01-05&sig=6HCMQXBZJFvVSNYE%2FdP0E6D9cBTZ1OQfApTR97ccXew%3D", "id": "d565a304-a618-4020-a05b-a245f8177d2d", "name": "35b8b8ba5479a34e905a1b9e212e1cb4a52ec484b969359f2c28f5a00311dbca" }], - "download": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json" + "download": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json" }, "stage": { "name": "Approved", @@ -79,11 +79,11 @@ "etag": "", "links": { "data": [{ - "download_url": "http://unittest.localhost/geoscience-data/00000000-0000-0000-0000-0000000004d2/00000000-0000-0000-0000-00000000162e/d565a304-a618-4020-a05b-a245f8177d2d?se=2025-03-26T00%3A26%3A11Z&sp=r&sv=2025-01-05&sr=b&skoid=e00b2565-5ef7-46ac-b8fd-b6aff6f93a9d&sktid=067e9632-ea4c-4ed9-9e6d-e294956e284b&skt=2025-03-25T23%3A32%3A48Z&ske=2025-03-26T00%3A32%3A48Z&sks=b&skv=2025-01-05&sig=6HCMQXBZJFvVSNYE%2FdP0E6D9cBTZ1OQfApTR97ccXew%3D", + "download_url": "https://unittest.localhost/geoscience-data/00000000-0000-0000-0000-0000000004d2/00000000-0000-0000-0000-00000000162e/d565a304-a618-4020-a05b-a245f8177d2d?se=2025-03-26T00%3A26%3A11Z&sp=r&sv=2025-01-05&sr=b&skoid=e00b2565-5ef7-46ac-b8fd-b6aff6f93a9d&sktid=067e9632-ea4c-4ed9-9e6d-e294956e284b&skt=2025-03-25T23%3A32%3A48Z&ske=2025-03-26T00%3A32%3A48Z&sks=b&skv=2025-01-05&sig=6HCMQXBZJFvVSNYE%2FdP0E6D9cBTZ1OQfApTR97ccXew%3D", "id": "d565a304-a618-4020-a05b-a245f8177d2d", "name": "35b8b8ba5479a34e905a1b9e212e1cb4a52ec484b969359f2c28f5a00311dbca" }], - "download": "http://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json" + "download": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json" } } ] diff --git a/packages/evo-sdk-common/src/evo/common/test_tools/consts.py b/packages/evo-sdk-common/src/evo/common/test_tools/consts.py index 4bd065bd..e91dd212 100644 --- a/packages/evo-sdk-common/src/evo/common/test_tools/consts.py +++ b/packages/evo-sdk-common/src/evo/common/test_tools/consts.py @@ -13,7 +13,7 @@ from evo.discovery import Hub, Organization -BASE_URL = "http://unittest.localhost/" +BASE_URL = "https://unittest.localhost/" ACCESS_TOKEN = "" HUB = Hub( diff --git a/packages/evo-sdk-common/tests/data/list_workspaces_0.json b/packages/evo-sdk-common/tests/data/list_workspaces_0.json index 653daeb2..ef4e4dc5 100644 --- a/packages/evo-sdk-common/tests/data/list_workspaces_0.json +++ 
b/packages/evo-sdk-common/tests/data/list_workspaces_0.json @@ -1,8 +1,8 @@ { "links": { - "first": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=0", - "last": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=2", - "next": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=2", + "first": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=0", + "last": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=2", + "next": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=2", "previous": null, "count": 2, "total": 3 @@ -28,7 +28,7 @@ }, "default_coordinate_system": "", "labels": [], - "self_link": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/00000000-0000-0000-0000-00000000000a" + "self_link": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/00000000-0000-0000-0000-00000000000a" }, { "id": "00000000-0000-0000-0000-00000000000b", @@ -50,7 +50,7 @@ }, "default_coordinate_system": "", "labels": [], - "self_link": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/00000000-0000-0000-0000-00000000000b" + "self_link": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/00000000-0000-0000-0000-00000000000b" } ] } diff --git a/packages/evo-sdk-common/tests/data/list_workspaces_1.json b/packages/evo-sdk-common/tests/data/list_workspaces_1.json index 472aecae..6dec7967 100644 --- a/packages/evo-sdk-common/tests/data/list_workspaces_1.json +++ b/packages/evo-sdk-common/tests/data/list_workspaces_1.json @@ -1,9 +1,9 @@ { "links": { - "first": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=0", - "last": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=2", + "first": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=0", + "last": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=2", "next": null, - "previous": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=0", + "previous": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces?limit=2&offset=0", "count": 1, "total": 3 }, @@ -28,7 +28,7 @@ }, "default_coordinate_system": "", "labels": [], - "self_link": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/00000000-0000-0000-0000-00000000000c" + "self_link": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/00000000-0000-0000-0000-00000000000c" } ] } \ No newline at end of file diff --git a/packages/evo-sdk-common/tests/data/list_workspaces_summary.json b/packages/evo-sdk-common/tests/data/list_workspaces_summary.json index 1c577014..2a456de6 100644 --- a/packages/evo-sdk-common/tests/data/list_workspaces_summary.json +++ b/packages/evo-sdk-common/tests/data/list_workspaces_summary.json @@ -1,10 +1,10 @@ { "links": { - "first": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary", + 
"first": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary", "next": null, "previous": null, "count": 3, - "last": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary", + "last": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary", "total": 3 }, "results": [ diff --git a/packages/evo-sdk-common/tests/data/list_workspaces_summary_paginated_0.json b/packages/evo-sdk-common/tests/data/list_workspaces_summary_paginated_0.json index 2589be52..de3030e4 100644 --- a/packages/evo-sdk-common/tests/data/list_workspaces_summary_paginated_0.json +++ b/packages/evo-sdk-common/tests/data/list_workspaces_summary_paginated_0.json @@ -1,10 +1,10 @@ { "links": { - "first": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=0", - "next": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=2", + "first": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=0", + "next": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=2", "previous": null, "count": 2, - "last": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=2", + "last": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=2", "total": 3 }, "results": [ diff --git a/packages/evo-sdk-common/tests/data/list_workspaces_summary_paginated_1.json b/packages/evo-sdk-common/tests/data/list_workspaces_summary_paginated_1.json index 31093789..d2a6ecbf 100644 --- a/packages/evo-sdk-common/tests/data/list_workspaces_summary_paginated_1.json +++ b/packages/evo-sdk-common/tests/data/list_workspaces_summary_paginated_1.json @@ -1,10 +1,10 @@ { "links": { - "first": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=0", + "first": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=0", "next": null, - "previous": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=0", + "previous": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=0", "count": 1, - "last": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=2", + "last": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces/summary?limit=2&offset=2", "total": 3 }, "results": [ diff --git a/packages/evo-sdk-common/tests/data/new_workspace.json b/packages/evo-sdk-common/tests/data/new_workspace.json index 81b4a70f..f1bbef9c 100644 --- a/packages/evo-sdk-common/tests/data/new_workspace.json +++ b/packages/evo-sdk-common/tests/data/new_workspace.json @@ -19,5 +19,5 @@ "ml_enabled": false, "default_coordinate_system": "", "labels": [], - "self_link": "http://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces" + "self_link": "https://unittest.localhost/workspace/orgs/00000000-0000-0000-0000-000000000000/workspaces" } From 04834fe0f0ecb6416f51dd2b5adece11266c1a05 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth 
<104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 12:29:42 +1300 Subject: [PATCH 15/32] Refactor downloading a geoscience object --- .../src/evo/objects/client/api_client.py | 44 +++-------- .../src/evo/objects/client/object_client.py | 76 ++++++++++++++++++- 2 files changed, 84 insertions(+), 36 deletions(-) diff --git a/packages/evo-objects/src/evo/objects/client/api_client.py b/packages/evo-objects/src/evo/objects/client/api_client.py index df8c8bdf..e0821c7f 100644 --- a/packages/evo-objects/src/evo/objects/client/api_client.py +++ b/packages/evo-objects/src/evo/objects/client/api_client.py @@ -20,14 +20,9 @@ from evo.common.utils import get_service_health, parse_order_by from .. import parse -from ..data import ObjectMetadata, ObjectOrderByEnum, ObjectVersion, OrgObjectMetadata, Stage +from ..data import ObjectMetadata, ObjectOrderByEnum, ObjectReference, ObjectVersion, OrgObjectMetadata, Stage from ..endpoints import MetadataApi, ObjectsApi, StagesApi -from ..endpoints.models import ( - GeoscienceObject, - GetObjectResponse, - MetadataUpdateBody, - UpdateGeoscienceObject, -) +from ..endpoints.models import GeoscienceObject, MetadataUpdateBody, UpdateGeoscienceObject from ..exceptions import ObjectUUIDError from ..io import ObjectDataDownload, ObjectDataUpload from .object_client import DownloadedObject @@ -366,17 +361,6 @@ async def update_geoscience_object( ) return parse.object_metadata(result, self._environment) - def _downloaded_object_from_response(self, response: GetObjectResponse) -> DownloadedObject: - """Parse object metadata and a geoscience object model instance from a get object response - - :param response: The response from one of the get object endpoints. - - :return: A tuple containing the object metadata and a data model of the requested geoscience object. - """ - metadata = parse.object_metadata(response, self._environment) - urls_by_name = {getattr(link, "name", link.id): link.download_url for link in response.links.data} - return DownloadedObject(response.object, metadata, urls_by_name, self._connector) - async def download_object_by_path( self, path: str, @@ -393,15 +377,13 @@ async def download_object_by_path( :return: A tuple containing the object metadata and a data model of the requested geoscience object. """ - response = await self._objects_api.get_object( - org_id=str(self._environment.org_id), - workspace_id=str(self._environment.workspace_id), - objects_path=path, - version=version, - additional_headers={"Accept-Encoding": "gzip"}, + reference = ObjectReference.new(environment=self._environment, object_path=path, version_id=version) + return await DownloadedObject.from_reference( + connector=self._connector, + reference=reference, + cache=None, # TODO: Add an optional cache to the ObjectAPIClient. request_timeout=request_timeout, ) - return self._downloaded_object_from_response(response) async def download_object_by_id( self, @@ -419,15 +401,13 @@ async def download_object_by_id( :return: A tuple containing the object metadata and a data model of the requested geoscience object. 
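+
+        A minimal usage sketch (illustrative; ``client`` stands for an ObjectAPIClient bound to the
+        target workspace):
+
+            downloaded = await client.download_object_by_id(object_id, version="2")
+            schema = downloaded.schema  # ObjectSchema parsed from the downloaded object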
""" - response = await self._objects_api.get_object_by_id( - org_id=str(self._environment.org_id), - workspace_id=str(self._environment.workspace_id), - object_id=str(object_id), - version=version, - additional_headers={"Accept-Encoding": "gzip"}, + reference = ObjectReference.new(environment=self._environment, object_id=object_id, version_id=version) + return await DownloadedObject.from_reference( + connector=self._connector, + reference=reference, + cache=None, # TODO: Add an optional cache to the ObjectAPIClient. request_timeout=request_timeout, ) - return self._downloaded_object_from_response(response) async def get_latest_object_versions( self, diff --git a/packages/evo-objects/src/evo/objects/client/object_client.py b/packages/evo-objects/src/evo/objects/client/object_client.py index bc3995c9..c276004d 100644 --- a/packages/evo-objects/src/evo/objects/client/object_client.py +++ b/packages/evo-objects/src/evo/objects/client/object_client.py @@ -15,11 +15,12 @@ from uuid import UUID from evo import logging -from evo.common import APIConnector +from evo.common import APIConnector, ICache from evo.common.io.exceptions import DataNotFoundError -from ..data import ObjectMetadata, ObjectSchema -from ..endpoints.models import GeoscienceObject +from .. import parse +from ..data import ObjectMetadata, ObjectReference, ObjectSchema +from ..endpoints import ObjectsApi, models from ..io import ObjectDataDownload __all__ = ["DownloadedObject"] @@ -31,12 +32,79 @@ class DownloadedObject: """A downloaded geoscience object.""" def __init__( - self, object_: GeoscienceObject, metadata: ObjectMetadata, urls_by_name: dict[str, str], connector: APIConnector + self, + object_: models.GeoscienceObject, + metadata: ObjectMetadata, + urls_by_name: dict[str, str], + connector: APIConnector, + cache: ICache | None = None, ) -> None: + """ + :param object_: The raw geoscience object model. + :param metadata: The parsed metadata for the object. + :param urls_by_name: A mapping of data names to their initial download URLs. + :param connector: The API connector to use for downloading data. + :param cache: An optional cache to use for data downloads. + """ self._object = object_ self._metadata = metadata self._urls_by_name = urls_by_name self._connector = connector + self._cache = cache + + @staticmethod + async def from_reference( + connector: APIConnector, + reference: ObjectReference | str, + cache: ICache | None = None, + request_timeout: int | float | tuple[int | float, int | float] | None = None, + ) -> DownloadedObject: + """Download a geoscience object from the service, given an object reference. + + :param connector: The API connector to use for downloading data. + :param reference: The reference to the object to download, or a URL as a string that can be parsed into + a reference. + :param cache: An optional cache to use for data downloads. + :param request_timeout: An optional timeout to use for API requests. See evo.common.APIConnector for details. + + :raises ValueError: If the reference is invalid, or if the connector base URL does not match the reference hub URL. 
+ """ + ref = ObjectReference(reference) # Parse the reference if it's a string + + if connector.base_url != ref.hub_url: + raise ValueError( + f"The connector base URL '{connector.base_url}' does not match the reference hub URL '{ref.hub_url}'" + ) + + api = ObjectsApi(connector) + + request_kwargs = dict( + org_id=str(ref.org_id), + workspace_id=str(ref.workspace_id), + version=ref.version_id, + additional_headers={"Accept-Encoding": "gzip"}, + request_timeout=request_timeout, + ) + + if ref.object_id is not None and ref.object_path is not None: + raise ValueError("Only one of object_id or object_path should be provided") + + if ref.object_id is not None: + response = await api.get_object_by_id(object_id=ref.object_id, **request_kwargs) + elif ref.object_path is not None: + response = await api.get_object(objects_path=ref.object_path, **request_kwargs) + else: + raise ValueError("Either object_id or object_path must be provided") + + metadata = parse.object_metadata(response, ref.environment) + urls_by_name = {getattr(link, "name", link.id): link.download_url for link in response.links.data} + return DownloadedObject( + object_=response.object, + metadata=metadata, + urls_by_name=urls_by_name, + connector=connector, + cache=cache, + ) @property def schema(self) -> ObjectSchema: From 81d773a17af10b55e41698ee11d769a3a67a33f9 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 12:30:06 +1300 Subject: [PATCH 16/32] Fix formatting of hub URL and object path in ObjectReference --- packages/evo-objects/src/evo/objects/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/evo-objects/src/evo/objects/data.py b/packages/evo-objects/src/evo/objects/data.py index 9b305720..3ab73a5b 100644 --- a/packages/evo-objects/src/evo/objects/data.py +++ b/packages/evo-objects/src/evo/objects/data.py @@ -94,7 +94,7 @@ def __new__(cls, value: str) -> ObjectReference: if parsed.scheme != "https": raise ValueError("Reference must be a valid HTTPS URL") - inst.hub_url = f"{parsed.scheme}://{parsed.netloc}" + inst.hub_url = f"{parsed.scheme}://{parsed.netloc}/" if match := cls._RE_PATH.fullmatch(parsed.path): inst.org_id = UUID(match.group("org_id")) @@ -105,7 +105,7 @@ def __new__(cls, value: str) -> ObjectReference: inst.object_path = None else: inst.object_id = None - inst.object_path = "/" + match.group("object_path").lstrip("/") + inst.object_path = match.group("object_path").lstrip("/") else: raise ValueError("Reference path is not valid") From 4326454287b6200a9903951c940db21fa0ec119c Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 12:39:38 +1300 Subject: [PATCH 17/32] Move parse.py to client submodule --- packages/evo-objects/src/evo/objects/client/api_client.py | 2 +- packages/evo-objects/src/evo/objects/client/object_client.py | 2 +- packages/evo-objects/src/evo/objects/{ => client}/parse.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) rename packages/evo-objects/src/evo/objects/{ => client}/parse.py (98%) diff --git a/packages/evo-objects/src/evo/objects/client/api_client.py b/packages/evo-objects/src/evo/objects/client/api_client.py index e0821c7f..5f908f34 100644 --- a/packages/evo-objects/src/evo/objects/client/api_client.py +++ b/packages/evo-objects/src/evo/objects/client/api_client.py @@ -19,12 +19,12 @@ from evo.common.data import EmptyResponse, Environment, OrderByOperatorEnum from evo.common.utils import 
get_service_health, parse_order_by -from .. import parse from ..data import ObjectMetadata, ObjectOrderByEnum, ObjectReference, ObjectVersion, OrgObjectMetadata, Stage from ..endpoints import MetadataApi, ObjectsApi, StagesApi from ..endpoints.models import GeoscienceObject, MetadataUpdateBody, UpdateGeoscienceObject from ..exceptions import ObjectUUIDError from ..io import ObjectDataDownload, ObjectDataUpload +from . import parse from .object_client import DownloadedObject try: diff --git a/packages/evo-objects/src/evo/objects/client/object_client.py b/packages/evo-objects/src/evo/objects/client/object_client.py index c276004d..afa38d1b 100644 --- a/packages/evo-objects/src/evo/objects/client/object_client.py +++ b/packages/evo-objects/src/evo/objects/client/object_client.py @@ -18,10 +18,10 @@ from evo.common import APIConnector, ICache from evo.common.io.exceptions import DataNotFoundError -from .. import parse from ..data import ObjectMetadata, ObjectReference, ObjectSchema from ..endpoints import ObjectsApi, models from ..io import ObjectDataDownload +from . import parse __all__ = ["DownloadedObject"] diff --git a/packages/evo-objects/src/evo/objects/parse.py b/packages/evo-objects/src/evo/objects/client/parse.py similarity index 98% rename from packages/evo-objects/src/evo/objects/parse.py rename to packages/evo-objects/src/evo/objects/client/parse.py index a234b127..5cab5884 100644 --- a/packages/evo-objects/src/evo/objects/parse.py +++ b/packages/evo-objects/src/evo/objects/client/parse.py @@ -15,8 +15,8 @@ from evo.common import Environment, Page, ServiceUser -from .data import ObjectMetadata, ObjectSchema, ObjectVersion, OrgObjectMetadata, Stage -from .endpoints import models +from ..data import ObjectMetadata, ObjectSchema, ObjectVersion, OrgObjectMetadata, Stage +from ..endpoints import models __all__ = [ "object_metadata", From 8092e4eed2da1359beae276adc91f8401635ef84 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 12:44:15 +1300 Subject: [PATCH 18/32] Add optional cache to ObjectAPIClient --- .../evo-objects/src/evo/objects/client/api_client.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/packages/evo-objects/src/evo/objects/client/api_client.py b/packages/evo-objects/src/evo/objects/client/api_client.py index 5f908f34..a71bd452 100644 --- a/packages/evo-objects/src/evo/objects/client/api_client.py +++ b/packages/evo-objects/src/evo/objects/client/api_client.py @@ -40,11 +40,17 @@ class ObjectAPIClient(BaseAPIClient): - def __init__(self, environment: Environment, connector: APIConnector) -> None: + def __init__(self, environment: Environment, connector: APIConnector, cache: ICache | None = None) -> None: + """ + :param environment: The target Evo environment, providing org and workspace IDs. + :param connector: The API connector to use for making API calls. + :param cache: An optional cache to use for data downloads. + """ super().__init__(environment, connector) self._stages_api = StagesApi(connector=connector) self._objects_api = ObjectsApi(connector=connector) self._metadata_api = MetadataApi(connector=connector) + self._cache = cache async def get_service_health(self, check_type: HealthCheckType = HealthCheckType.FULL) -> ServiceHealth: """Get the health of the geoscience object service. 
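A minimal caller sketch for the new constructor signature, assuming an existing ``environment``, ``connector``, and any ``ICache`` implementation as ``cache``:

    client = ObjectAPIClient(environment, connector, cache=cache)
    downloaded = await client.download_object_by_path("A/m.json")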
@@ -381,7 +387,7 @@ async def download_object_by_path( return await DownloadedObject.from_reference( connector=self._connector, reference=reference, - cache=None, # TODO: Add an optional cache to the ObjectAPIClient. + cache=self._cache, request_timeout=request_timeout, ) @@ -405,7 +411,7 @@ async def download_object_by_id( return await DownloadedObject.from_reference( connector=self._connector, reference=reference, - cache=None, # TODO: Add an optional cache to the ObjectAPIClient. + cache=self._cache, request_timeout=request_timeout, ) From faecdd6d55c126181cfc178c0f35c2fc1627df17 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 13:43:18 +1300 Subject: [PATCH 19/32] Add optional JMESPath support to DownloadedObject --- .../src/evo/objects/client/object_client.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/packages/evo-objects/src/evo/objects/client/object_client.py b/packages/evo-objects/src/evo/objects/client/object_client.py index afa38d1b..1ab810cf 100644 --- a/packages/evo-objects/src/evo/objects/client/object_client.py +++ b/packages/evo-objects/src/evo/objects/client/object_client.py @@ -12,6 +12,7 @@ from __future__ import annotations from collections.abc import Iterator, Sequence +from typing import Any from uuid import UUID from evo import logging @@ -23,6 +24,13 @@ from ..io import ObjectDataDownload from . import parse +try: + from evo import jmespath +except ImportError: + _JMESPATH_AVAILABLE = False +else: + _JMESPATH_AVAILABLE = True + __all__ = ["DownloadedObject"] logger = logging.getLogger("object.client") @@ -120,6 +128,18 @@ def as_dict(self) -> dict: """Get this object as a dictionary.""" return self._object.model_dump(mode="python", by_alias=True) + if _JMESPATH_AVAILABLE: + # Optional JMESPath support for searching within the object JSON content. + + def search(self, expression: str) -> Any: + """Search the object metadata using a JMESPath expression. + + :param expression: The JMESPath expression to use for the search. + + :return: The result of the search. + """ + return jmespath.search(expression, self.as_dict()) + def prepare_data_download(self, data_identifiers: Sequence[str | UUID]) -> Iterator[ObjectDataDownload]: """Prepare to download multiple data files from the geoscience object service, for this object. 
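A short sketch of the new ``search()`` hook, assuming ``downloaded`` is a ``DownloadedObject`` for a pointset-like object; the expressions are ordinary JMESPath:

    # Table info for the coordinates array.
    coords_info = downloaded.search("locations.coordinates")

    # Names of the category attributes, if any.
    names = downloaded.search("locations.attributes[?attribute_type=='category'].name")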
From a904cfe020defddc158199bfd50f61fd73bc698f Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 14:01:12 +1300 Subject: [PATCH 20/32] Move ParquetLoader to a separate submodule --- .../src/evo/objects/loader/__init__.py | 25 +++++++++++++++++++ .../{utils => loader}/parquet_loader.py | 21 ++++++++++------ .../evo/objects/{utils => loader}/types.py | 0 .../src/evo/objects/utils/__init__.py | 4 --- .../evo-objects/src/evo/objects/utils/data.py | 18 ++++++------- 5 files changed, 48 insertions(+), 20 deletions(-) create mode 100644 packages/evo-objects/src/evo/objects/loader/__init__.py rename packages/evo-objects/src/evo/objects/{utils => loader}/parquet_loader.py (88%) rename packages/evo-objects/src/evo/objects/{utils => loader}/types.py (100%) diff --git a/packages/evo-objects/src/evo/objects/loader/__init__.py b/packages/evo-objects/src/evo/objects/loader/__init__.py new file mode 100644 index 00000000..412a5a20 --- /dev/null +++ b/packages/evo-objects/src/evo/objects/loader/__init__.py @@ -0,0 +1,25 @@ +# Copyright © 2025 Bentley Systems, Incorporated +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + import pyarrow # noqa: F401 +except ImportError: + raise ImportError("The 'pyarrow' package is required to use ParquetLoader") from None + +from .parquet_loader import ParquetLoader +from .types import ArrayTableInfo, LookupTableInfo, TableInfo + +__all__ = [ + "ArrayTableInfo", + "LookupTableInfo", + "ParquetLoader", + "TableInfo", +] diff --git a/packages/evo-objects/src/evo/objects/utils/parquet_loader.py b/packages/evo-objects/src/evo/objects/loader/parquet_loader.py similarity index 88% rename from packages/evo-objects/src/evo/objects/utils/parquet_loader.py rename to packages/evo-objects/src/evo/objects/loader/parquet_loader.py index 2162ab6a..b13edfe0 100644 --- a/packages/evo-objects/src/evo/objects/utils/parquet_loader.py +++ b/packages/evo-objects/src/evo/objects/loader/parquet_loader.py @@ -1,9 +1,22 @@ +# Copyright © 2025 Bentley Systems, Incorporated +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import annotations from io import BytesIO from logging import getLogger from typing import cast +import pyarrow as pa +import pyarrow.parquet as pq from pydantic import TypeAdapter from evo.common import ICache, IFeedback, ITransport @@ -11,15 +24,9 @@ from evo.common.utils import NoFeedback from ..exceptions import SchemaValidationError -from . 
import ArrowTableFormat, KnownTableFormat +from ..utils import ArrowTableFormat, KnownTableFormat from .types import TableInfo -try: - import pyarrow as pa - import pyarrow.parquet as pq -except ImportError: - raise ImportError("The 'pyarrow' package is required to use ParquetLoader") from None - try: import pandas as pd except ImportError: diff --git a/packages/evo-objects/src/evo/objects/utils/types.py b/packages/evo-objects/src/evo/objects/loader/types.py similarity index 100% rename from packages/evo-objects/src/evo/objects/utils/types.py rename to packages/evo-objects/src/evo/objects/loader/types.py diff --git a/packages/evo-objects/src/evo/objects/utils/__init__.py b/packages/evo-objects/src/evo/objects/utils/__init__.py index 5ce0a58b..ccd86e99 100644 --- a/packages/evo-objects/src/evo/objects/utils/__init__.py +++ b/packages/evo-objects/src/evo/objects/utils/__init__.py @@ -25,7 +25,6 @@ from .data import ObjectDataClient from .table_formats import all_known_formats, get_known_format from .tables import ArrowTableFormat, BaseTableFormat, KnownTableFormat -from .types import ArrayTableInfo, LookupTableInfo, TableInfo # We _used_ to export Table and DataFrame from this package as custom protocols, but we are using the actual # pyarrow.Table and pandas.DataFrame types now. We are importing these types here from pyarrow and pandas @@ -33,13 +32,10 @@ # removed in a future release. __all__ = [ - "ArrayTableInfo", "ArrowTableFormat", "BaseTableFormat", "KnownTableFormat", - "LookupTableInfo", "ObjectDataClient", - "TableInfo", "all_known_formats", "get_known_format", ] diff --git a/packages/evo-objects/src/evo/objects/utils/data.py b/packages/evo-objects/src/evo/objects/utils/data.py index 19d0e60b..b9acc77f 100644 --- a/packages/evo-objects/src/evo/objects/utils/data.py +++ b/packages/evo-objects/src/evo/objects/utils/data.py @@ -21,7 +21,6 @@ from evo.common.utils import NoFeedback, PartialFeedback from ..io import _CACHE_SCOPE, ObjectDataUpload -from .types import TableInfo try: import pyarrow as pa @@ -125,7 +124,7 @@ async def upload_referenced_data(self, object_model: dict, fb: IFeedback = NoFee ) fb.progress(1) - def save_table(self, table: pa.Table) -> TableInfo: + def save_table(self, table: pa.Table) -> dict: """Save a pyarrow table to a file, returning the table info as a dictionary. :param table: The pyarrow table to save. @@ -141,7 +140,7 @@ def save_table(self, table: pa.Table) -> TableInfo: table_info = known_format.save_table(table=table, destination=self.cache_location) return table_info - async def upload_table(self, table: pa.Table, fb: IFeedback = NoFeedback) -> TableInfo: + async def upload_table(self, table: pa.Table, fb: IFeedback = NoFeedback) -> dict: """Upload pyarrow table to the geoscience object service, returning a GO model of the uploaded data. :param table: The table to be uploaded. @@ -161,7 +160,7 @@ async def upload_table(self, table: pa.Table, fb: IFeedback = NoFeedback) -> Tab return table_info async def download_table( - self, object_id: UUID, version_id: str, table_info: TableInfo, fb: IFeedback = NoFeedback + self, object_id: UUID, version_id: str, table_info: dict, fb: IFeedback = NoFeedback ) -> pa.Table: """Download pyarrow table from the geoscience object service. @@ -180,8 +179,9 @@ async def download_table( :raises TableFormatError: If the data does not match the expected format. :raises SchemaValidationError: If the data has a different number of rows than expected. 
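        A minimal call sketch, where ``table_info`` is the plain dict stored in
        the object JSON (``{"data": ..., "length": ..., "width": ...,
        "data_type": ...}``)::

            table = await data_client.download_table(object_id, version_id, table_info)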
""" - from ..client import ObjectAPIClient # Import here to avoid circular import. - from .parquet_loader import ParquetLoader + # Import here to avoid circular import. + from ..client import ObjectAPIClient + from ..loader import ParquetLoader client = ObjectAPIClient(self._environment, self._connector) (download,) = [d async for d in client.prepare_data_download(object_id, version_id, [table_info["data"]])] @@ -195,7 +195,7 @@ async def download_table( if _PD_AVAILABLE: # Optional support for pandas dataframes. Depends on both pyarrow and pandas. - def save_dataframe(self, dataframe: pd.DataFrame) -> TableInfo: + def save_dataframe(self, dataframe: pd.DataFrame) -> dict: """Save a pandas dataframe to a file, returning the table info as a dictionary. :param dataframe: The pandas dataframe to save. @@ -207,7 +207,7 @@ def save_dataframe(self, dataframe: pd.DataFrame) -> TableInfo: """ return self.save_table(pa.Table.from_pandas(dataframe)) - async def upload_dataframe(self, dataframe: pd.DataFrame, fb: IFeedback = NoFeedback) -> TableInfo: + async def upload_dataframe(self, dataframe: pd.DataFrame, fb: IFeedback = NoFeedback) -> dict: """Upload pandas dataframe to the geoscience object service, returning a GO model of the uploaded data. :param dataframe: The pandas dataframe to be uploaded. @@ -221,7 +221,7 @@ async def upload_dataframe(self, dataframe: pd.DataFrame, fb: IFeedback = NoFeed return table_info async def download_dataframe( - self, object_id: UUID, version_id: str, table_info: TableInfo, fb: IFeedback = NoFeedback + self, object_id: UUID, version_id: str, table_info: dict, fb: IFeedback = NoFeedback ) -> pd.DataFrame: """Download pandas dataframe data from the geoscience object service. From 0d7d890f54e76b6b0e4b3cc696fec41fedcacb65 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 14:09:48 +1300 Subject: [PATCH 21/32] Add optional support to DownloadedObject for downloading tables as pyarrow tables, pandas dataframes, or numpy arrays --- .../src/evo/objects/client/object_client.py | 92 ++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/packages/evo-objects/src/evo/objects/client/object_client.py b/packages/evo-objects/src/evo/objects/client/object_client.py index 1ab810cf..bf9e2263 100644 --- a/packages/evo-objects/src/evo/objects/client/object_client.py +++ b/packages/evo-objects/src/evo/objects/client/object_client.py @@ -15,9 +15,12 @@ from typing import Any from uuid import UUID +from pydantic import ConfigDict, TypeAdapter + from evo import logging -from evo.common import APIConnector, ICache +from evo.common import APIConnector, ICache, IFeedback from evo.common.io.exceptions import DataNotFoundError +from evo.common.utils import NoFeedback from ..data import ObjectMetadata, ObjectReference, ObjectSchema from ..endpoints import ObjectsApi, models @@ -31,6 +34,31 @@ else: _JMESPATH_AVAILABLE = True +try: + import pyarrow as pa + + from ..loader import ParquetLoader, TableInfo +except ImportError: + _LOADER_AVAILABLE = False +else: + _LOADER_AVAILABLE = True + + _TABLE_INFO_VALIDATOR: TypeAdapter[TableInfo] = TypeAdapter(TableInfo, config=ConfigDict(extra="ignore")) + +try: + import pandas as pd +except ImportError: + _PD_AVAILABLE = False +else: + _PD_AVAILABLE = True + +try: + import numpy as np +except ImportError: + _NP_AVAILABLE = False +else: + _NP_AVAILABLE = True + __all__ = ["DownloadedObject"] logger = logging.getLogger("object.client") @@ -159,3 +187,65 @@ def 
prepare_data_download(self, data_identifiers: Sequence[str | UUID]) -> Itera connector=self._connector, metadata=self._metadata, urls_by_name=filtered_urls_by_name ): yield ctx + + if _LOADER_AVAILABLE: + # Optional support for loading Parquet data using PyArrow. + + def get_parquet_loader(self, table_info: TableInfo | str) -> ParquetLoader: + """Get a ParquetLoader for the data referenced by the given table info or data reference string. + + :param table_info: The table info dict, JMESPath to table info, or data reference string. + + :returns: A ParquetLoader that can be used to download and read the referenced data. + """ + if isinstance(table_info, str): + if not _JMESPATH_AVAILABLE: + raise ValueError("The 'jmespath' package is required to use JMESPath expressions") from None + elif isinstance(resolved := self.search(table_info), jmespath.JMESPathObjectProxy): + table_info = _TABLE_INFO_VALIDATOR.validate_python(resolved.raw) + else: + raise ValueError(f"Expected table info, got {type(resolved)}") + else: + table_info = _TABLE_INFO_VALIDATOR.validate_python(table_info) + + (download,) = self.prepare_data_download([table_info["data"]]) + return ParquetLoader(download, table_info, self._connector.transport, self._cache) + + async def download_table(self, table_info: TableInfo | str, fb: IFeedback = NoFeedback) -> pa.Table: + """Download the data referenced by the given table info or data reference string as a PyArrow Table. + + :param table_info: The table info dict, JMESPath to table info, or data reference string. + :param fb: An optional feedback instance to report download progress to. + + :returns: A PyArrow Table containing the downloaded data. + """ + loader = self.get_parquet_loader(table_info) + return await loader.load_as_table(fb) + + if _PD_AVAILABLE: + # Optional support for loading data as Pandas DataFrames. Requires parquet support via PyArrow as well. + + async def download_dataframe(self, table_info: TableInfo | str, fb: IFeedback = NoFeedback) -> pd.DataFrame: + """Download the data referenced by the given table info or data reference string as a Pandas DataFrame. + + :param table_info: The table info dict, JMESPath to table info, or data reference string. + :param fb: An optional feedback instance to report download progress to. + + :returns: A Pandas DataFrame containing the downloaded data. + """ + loader = self.get_parquet_loader(table_info) + return await loader.load_as_dataframe(fb) + + if _NP_AVAILABLE: + # Optional support for loading data as NumPy arrays. Requires parquet support via PyArrow as well. + + async def download_array(self, table_info: TableInfo | str, fb: IFeedback = NoFeedback) -> np.ndarray: + """Download the data referenced by the given table info or data reference string as a NumPy array. + + :param table_info: The table info dict, JMESPath to table info, or data reference string. + :param fb: An optional feedback instance to report download progress to. + + :returns: A NumPy array containing the downloaded data. 
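+        A minimal usage sketch, assuming ``obj`` is a downloaded pointset whose
+        coordinates live at ``locations.coordinates``::
+
+            xyz = await obj.download_array("locations.coordinates")
+            # An (n_rows, 3) array for a pointset's coordinates.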
+ """ + loader = self.get_parquet_loader(table_info) + return await loader.load_as_array(fb) From 2d30f681a1b47965828b8b02dbab2caa37bdf85e Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 17:04:05 +1300 Subject: [PATCH 22/32] Export ObjectReference from evo.objects --- packages/evo-objects/src/evo/objects/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/evo-objects/src/evo/objects/__init__.py b/packages/evo-objects/src/evo/objects/__init__.py index e81312be..dba64df8 100644 --- a/packages/evo-objects/src/evo/objects/__init__.py +++ b/packages/evo-objects/src/evo/objects/__init__.py @@ -10,7 +10,7 @@ # limitations under the License. from .client import DownloadedObject, ObjectAPIClient -from .data import ObjectMetadata, ObjectSchema, ObjectVersion, SchemaVersion, Stage +from .data import ObjectMetadata, ObjectReference, ObjectSchema, ObjectVersion, SchemaVersion, Stage from .io import ObjectDataDownload, ObjectDataUpload __all__ = [ @@ -19,6 +19,7 @@ "ObjectDataDownload", "ObjectDataUpload", "ObjectMetadata", + "ObjectReference", "ObjectSchema", "ObjectVersion", "SchemaVersion", From 2533605b8c5d26025cca048a0bd16ee2d159d952 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 17:04:47 +1300 Subject: [PATCH 23/32] Add unit tests for DownloadedObject --- .../tests/data/get_object_detailed.json | 108 +++++ packages/evo-objects/tests/helpers.py | 9 + .../evo-objects/tests/test_data_client.py | 22 +- .../tests/test_downloaded_object.py | 375 ++++++++++++++++++ 4 files changed, 498 insertions(+), 16 deletions(-) create mode 100644 packages/evo-objects/tests/data/get_object_detailed.json create mode 100644 packages/evo-objects/tests/test_downloaded_object.py diff --git a/packages/evo-objects/tests/data/get_object_detailed.json b/packages/evo-objects/tests/data/get_object_detailed.json new file mode 100644 index 00000000..7bdcf214 --- /dev/null +++ b/packages/evo-objects/tests/data/get_object_detailed.json @@ -0,0 +1,108 @@ +{ + "object_path": "A/m.json", + "object_id": "00000000-0000-0000-0000-000000000002", + "object": { + "schema": "/objects/pointset/1.0.1/pointset.schema.json", + "uuid": "00000000-0000-0000-0000-000000000002", + "name": "Sample pointset", + "description": "A sample pointset object", + "bounding_box": { + "min_x": 0.0, + "max_x": 1.0, + "min_y": 2.0, + "max_y": 3.0, + "min_z": 4.0, + "max_z": 5.0 + }, + "coordinate_reference_system": { + "epsg_code": 2048 + }, + "locations": { + "coordinates": { + "data": "0000000000000000000000000000000000000000000000000000000000000000", + "length": 123, + "width": 3, + "data_type": "float64" + }, + "attributes": [ + { + "table": { + "data": "0000000000000000000000000000000000000000000000000000000000000001", + "length": 12, + "keys_data_type": "int32", + "values_data_type": "string" + }, + "values": { + "data": "0000000000000000000000000000000000000000000000000000000000000002", + "length": 123, + "width": 1, + "data_type": "int32" + }, + "name": "Stn", + "nan_description": { + "values": [ + 0 + ] + }, + "attribute_type": "category" + }, + { + "name": "InvRes", + "nan_description": { + "values": [] + }, + "values": { + "data": "0000000000000000000000000000000000000000000000000000000000000003", + "length": 123, + "width": 1, + "data_type": "float64" + }, + "attribute_type": "scalar" + } + ] + } + }, + "version_id": "2023-08-03T05:47:18.3402289Z", + "created_at": 
"2023-08-03T05:47:18Z", + "created_by": { + "id": "00000000-0000-0000-0000-0000000003e8", + "name": "Test User", + "email": "t.user@example.com" + }, + "modified_at": "2023-08-04T05:47:18Z", + "modified_by": { + "id": "00000000-0000-0000-0000-0000000003e8", + "name": "Test User", + "email": "t.user@example.com" + }, + "etag": "", + "links": { + "download": "https://unittest.localhost/path/geoscience-object/orgs/00000000-0000-0000-0000-0000000004d2/workspaces/00000000-0000-0000-0000-00000000162e/objects/path/A/m.json", + "data": [ + { + "id": "00000000-0000-0000-0000-000000000000", + "name": "0000000000000000000000000000000000000000000000000000000000000000", + "download_url": "https://storage.unittest.localhost/wheres/my/data/0000000000000000000000000000000000000000000000000000000000000000" + }, + { + "id": "00000000-0000-0000-0000-000000000001", + "name": "0000000000000000000000000000000000000000000000000000000000000001", + "download_url": "https://storage.unittest.localhost/wheres/my/data/0000000000000000000000000000000000000000000000000000000000000001" + }, + { + "id": "00000000-0000-0000-0000-000000000002", + "name": "0000000000000000000000000000000000000000000000000000000000000002", + "download_url": "https://storage.unittest.localhost/wheres/my/data/0000000000000000000000000000000000000000000000000000000000000002" + }, + { + "id": "00000000-0000-0000-0000-000000000003", + "name": "0000000000000000000000000000000000000000000000000000000000000003", + "download_url": "https://storage.unittest.localhost/wheres/my/data/0000000000000000000000000000000000000000000000000000000000000003" + } + ] + }, + "stage": { + "name": "Approved", + "stage_id": "00000000-0000-0000-0000-000000000888" + } +} \ No newline at end of file diff --git a/packages/evo-objects/tests/helpers.py b/packages/evo-objects/tests/helpers.py index 384cbb8d..fd18496e 100644 --- a/packages/evo-objects/tests/helpers.py +++ b/packages/evo-objects/tests/helpers.py @@ -13,9 +13,11 @@ import sys from collections.abc import Iterator from datetime import datetime, timezone +from io import BytesIO import numpy import pyarrow as pa +import pyarrow.parquet as pq from evo.objects.utils.tables import BaseTableFormat, _ColumnFormat @@ -154,3 +156,10 @@ def get_sample_table( for column_format in column_formats ] return pa.table(sample_data, names=sample_schema.names).cast(sample_schema) + + +def get_sample_table_and_bytes(table_format: BaseTableFormat, n_rows: int) -> tuple[pa.Table, bytes]: + memory = BytesIO() + table = get_sample_table(table_format, n_rows) + pq.write_table(table, where=memory, version="2.4", compression="gzip") + return table, memory.getvalue() diff --git a/packages/evo-objects/tests/test_data_client.py b/packages/evo-objects/tests/test_data_client.py index 51839098..1af51e32 100644 --- a/packages/evo-objects/tests/test_data_client.py +++ b/packages/evo-objects/tests/test_data_client.py @@ -10,12 +10,9 @@ # limitations under the License. 
import json -from io import BytesIO from unittest import mock from uuid import UUID -import pyarrow as pa -import pyarrow.parquet as pq from pandas.testing import assert_frame_equal from data import load_test_data @@ -23,15 +20,8 @@ from evo.common.io.exceptions import DataExistsError from evo.common.test_tools import TestWithConnector, TestWithStorage from evo.common.utils import NoFeedback, PartialFeedback -from evo.objects.utils import BaseTableFormat, KnownTableFormat, ObjectDataClient -from helpers import NoImport, UnloadModule, get_sample_table - - -def _get_sample_table_and_bytes(table_format: BaseTableFormat, n_rows: int) -> tuple[pa.Table, bytes]: - memory = BytesIO() - table = get_sample_table(table_format, n_rows) - pq.write_table(table, where=memory, version="2.4", compression="gzip") - return table, memory.getvalue() +from evo.objects.utils import KnownTableFormat, ObjectDataClient +from helpers import NoImport, UnloadModule, get_sample_table_and_bytes class TestObjectDataClient(TestWithConnector, TestWithStorage): @@ -329,7 +319,7 @@ async def test_download_table(self) -> None: } mock_data_id: str = mock_table_info["data"] expected_filename = self.data_client.cache_location / mock_data_id - sample_table, payload_bytes = _get_sample_table_and_bytes( + sample_table, payload_bytes = get_sample_table_and_bytes( KnownTableFormat.from_table_info(mock_table_info), 1 ) @@ -369,7 +359,7 @@ async def test_download_dataframe(self) -> None: } mock_data_id: str = mock_table_info["data"] expected_filename = self.data_client.cache_location / mock_data_id - sample_table, payload_bytes = _get_sample_table_and_bytes( + sample_table, payload_bytes = get_sample_table_and_bytes( KnownTableFormat.from_table_info(mock_table_info), 1 ) @@ -425,7 +415,7 @@ async def test_download_table_already_downloaded(self) -> None: } mock_data_id: str = mock_table_info["data"] expected_filename = self.data_client.cache_location / mock_data_id - sample_table, payload_bytes = _get_sample_table_and_bytes( + sample_table, payload_bytes = get_sample_table_and_bytes( KnownTableFormat.from_table_info(mock_table_info), 1 ) @@ -468,7 +458,7 @@ async def test_download_dataframe_already_downloaded(self) -> None: } mock_data_id: str = mock_table_info["data"] expected_filename = self.data_client.cache_location / mock_data_id - sample_table, payload_bytes = _get_sample_table_and_bytes( + sample_table, payload_bytes = get_sample_table_and_bytes( KnownTableFormat.from_table_info(mock_table_info), 1 ) diff --git a/packages/evo-objects/tests/test_downloaded_object.py b/packages/evo-objects/tests/test_downloaded_object.py new file mode 100644 index 00000000..62ecf360 --- /dev/null +++ b/packages/evo-objects/tests/test_downloaded_object.py @@ -0,0 +1,375 @@ +# Copyright © 2025 Bentley Systems, Incorporated +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
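+# Reference shapes exercised below (sketch only; _OBJECTS_URL is defined later
+# in this module):
+#   ObjectReference(f"{_OBJECTS_URL}/00000000-0000-0000-0000-000000000002")
+#   ObjectReference(f"{_OBJECTS_URL}/path/A/m.json?version=2023-08-03T05:47:18.3402289Z")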
+ +import contextlib +import json +from collections.abc import Generator +from typing import cast +from unittest import mock +from urllib.parse import quote +from uuid import UUID + +import pyarrow as pa +from numpy.testing import assert_array_equal +from pandas.testing import assert_frame_equal +from parameterized import parameterized + +from data import load_test_data +from evo.common import RequestMethod +from evo.common.test_tools import ( + BASE_URL, + ORG, + WORKSPACE_ID, + DownloadRequestHandler, + TestWithConnector, + TestWithStorage, +) +from evo.common.utils import NoFeedback +from evo.jmespath import JMESPathObjectProxy +from evo.objects import DownloadedObject, ObjectReference +from evo.objects.endpoints import models +from evo.objects.io import _CACHE_SCOPE +from evo.objects.loader import ParquetLoader, TableInfo +from evo.objects.utils import KnownTableFormat +from helpers import NoImport, UnloadModule, get_sample_table_and_bytes + +_OBJECTS_URL = f"{BASE_URL.rstrip('/')}/geoscience-object/orgs/{ORG.id}/workspaces/{WORKSPACE_ID}/objects" + +_TABLE_INFO_VARIANTS: list[tuple[str, TableInfo | str]] = [ + ( + "with TableInfo dict", + { + "data": "0000000000000000000000000000000000000000000000000000000000000000", + "length": 123, + "width": 3, + "data_type": "float64", + }, + ), + ("with JMESPath reference", "locations.coordinates"), +] + + +class TestDownloadedObject(TestWithConnector, TestWithStorage): + def setUp(self) -> None: + from evo.objects.client import parse + + TestWithConnector.setUp(self) + TestWithStorage.setUp(self) + + raw = models.GetObjectResponse.model_validate(load_test_data("get_object_detailed.json")) + self.object = DownloadedObject( + object_=raw.object, + metadata=parse.object_metadata(raw, self.environment), + urls_by_name={link.name: link.download_url for link in raw.links.data}, + connector=self.connector, + cache=self.cache, + ) + + def tearDown(self) -> None: + # Clear cache between tests to avoid cached files interfering with subsequent tests. 
+ self.cache.clear_cache() + + @parameterized.expand( + [ + ("by id as string", f"{_OBJECTS_URL}/00000000-0000-0000-0000-000000000002"), + ( + "by id as ObjectReference", + ObjectReference(f"{_OBJECTS_URL}/00000000-0000-0000-0000-000000000002"), + ), + ( + "by id with version id", + ObjectReference( + f"{_OBJECTS_URL}/00000000-0000-0000-0000-000000000002?version=2023-08-03T05:47:18.3402289Z" + ), + ), + ("by path as string", f"{_OBJECTS_URL}/path/A/m.json"), + ("by path as ObjectReference", ObjectReference(f"{_OBJECTS_URL}/path/A/m.json")), + ( + "by path with version id", + ObjectReference(f"{_OBJECTS_URL}/path/A/m.json?version=2023-08-03T05:47:18.3402289Z"), + ), + ] + ) + async def test_from_reference(self, _label: str, reference: str) -> None: + """Test downloading a geoscience object by reference.""" + get_object_response = load_test_data("get_object.json") + expected_uuid = UUID(int=2) + expected_object_dict = { + "schema": "/objects/pointset/1.0.1/pointset.schema.json", + "uuid": UUID("00000000-0000-0000-0000-000000000002"), + "name": "Sample pointset", + "description": "A sample pointset object", + "bounding_box": {"min_x": 0.0, "max_x": 0.0, "min_y": 0.0, "max_y": 0.0, "min_z": 0.0, "max_z": 0.0}, + "coordinate_reference_system": {"epsg_code": 2048}, + "locations": { + "coordinates": { + "data": "0000000000000000000000000000000000000000000000000000000000000001", + "length": 1, + "width": 3, + "data_type": "float64", + } + }, + } + expected_path = "A/m.json" + expected_version = "2023-08-03T05:47:18.3402289Z" + with self.transport.set_http_response(status_code=200, content=json.dumps(get_object_response)): + actual_object = await DownloadedObject.from_reference(self.connector, reference, self.cache) + + ref = ObjectReference(reference) + if ref.object_id is not None: + expected_request_path = f"{_OBJECTS_URL}/{ref.object_id}" + else: + expected_request_path = f"{_OBJECTS_URL}/path/{ref.object_path}" + + if ref.version_id is not None: + expected_request_path += f"?version={quote(ref.version_id)}" + + self.assert_request_made( + method=RequestMethod.GET, + path=expected_request_path, + headers={"Accept": "application/json", "Accept-Encoding": "gzip"}, + ) + # Check metadata. + actual_metadata = actual_object.metadata + self.assertEqual(expected_path, actual_metadata.path) + self.assertEqual("A", actual_metadata.parent) + self.assertEqual("m.json", actual_metadata.name) + self.assertEqual(expected_uuid, actual_metadata.id) + self.assertEqual(expected_version, actual_metadata.version_id) + + # Check geoscience object. + self.assertEqual(expected_object_dict, actual_object.as_dict()) + + def test_search(self) -> None: + """Test the JMESPath search implementation.""" + expected_result = JMESPathObjectProxy( + { + "x": [0.0, 1.0], + "y": [2.0, 3.0], + "z": [4.0, 5.0], + } + ) + actual_result = self.object.search("bounding_box | {x: [min_x, max_x], y: [min_y, max_y], z: [min_z, max_z]}") + self.assertEqual(expected_result, actual_result) + + def _assert_optional_method(self, method_name: str, *, unload: list[str], no_import: list[str]) -> None: + # Verify the method exists before unloading any modules. 
+ from evo.objects.client import DownloadedObject + + self.assertTrue( + all( + [ + hasattr(DownloadedObject, method_name), + hasattr(self.object, method_name), + ] + ), + f"DownloadedObject.{method_name} should be available for this test to be valid", + ) + + with UnloadModule("evo.objects.client.object_client", *unload), NoImport(*no_import): + # Re-import the class to ensure the module is re-evaluated without the optional dependency. + from evo.objects.client import DownloadedObject + + # Re-create the object to ensure the class is re-evaluated without the optional dependency. + client = DownloadedObject( + object_=self.object._object, + metadata=self.object.metadata, + urls_by_name=self.object._urls_by_name, + connector=self.object._connector, + cache=self.object._cache, + ) + self.assertFalse( + all( + [ + hasattr(DownloadedObject, method_name), + hasattr(client, method_name), + ] + ), + f"DownloadedObject.{method_name} should not be available if " + f"{', '.join(no_import)} {'is' if len(no_import) == 1 else 'are'} not available", + ) + + def test_search_is_optional(self) -> None: + """Test that the JMESPath search implementation is optional.""" + self._assert_optional_method("search", unload=["evo.jmespath"], no_import=["jmespath"]) + + @parameterized.expand(_TABLE_INFO_VARIANTS) + def test_get_parquet_loader(self, _label: str, table_info: TableInfo | str) -> None: + """Test getting a ParquetLoader instance.""" + loader = self.object.get_parquet_loader(table_info) + self.assertIsInstance(loader, ParquetLoader) + + def test_get_parquet_loader_jmespath_support_is_optional(self) -> None: + """Test that the JMESPath support for get_parquet_loader is optional.""" + with ( + UnloadModule("evo.objects.client.object_client", "evo.objects.loader.parquet_loader", "evo.jmespath"), + NoImport("jmespath"), + ): + from evo.objects.client import DownloadedObject + + client = DownloadedObject( + object_=self.object._object, + metadata=self.object.metadata, + urls_by_name=self.object._urls_by_name, + connector=self.object._connector, + cache=self.object._cache, + ) + + with self.assertRaises(ValueError): + client.get_parquet_loader("locations.coordinates") + + def test_get_parquet_loader_is_optional(self) -> None: + """Test that the get_parquet_loader method is optional.""" + self._assert_optional_method( + "get_parquet_loader", unload=["evo.objects.loader.parquet_loader"], no_import=["pyarrow"] + ) + + @contextlib.contextmanager + def _patch_downloading_table(self, table_info: TableInfo | str) -> Generator[pa.Table, None, None]: + mock_table_info = table_info + if isinstance(mock_table_info, str): + mock_table_info = cast(TableInfo, self.object.search(mock_table_info)) + + mock_data_id = mock_table_info["data"] + expected_filename = self.cache.get_location(self.environment, _CACHE_SCOPE) / mock_data_id + sample_table, payload_bytes = get_sample_table_and_bytes( + KnownTableFormat.from_table_info(mock_table_info), mock_table_info["length"] + ) + with mock.patch("evo.common.io.download.HTTPSource", autospec=True) as mock_source: + + async def _mock_download_file_side_effect(*args, **kwargs): + expected_download_url = self.object._urls_by_name[mock_data_id] + actual_download_url = await kwargs["url_generator"]() + self.assertEqual(expected_filename, kwargs["filename"]) + self.assertEqual(expected_download_url, actual_download_url) + self.assertIs(self.transport, kwargs["transport"]) + self.assertIs(NoFeedback, kwargs["fb"]) + expected_filename.write_bytes(payload_bytes) + + 
mock_source.download_file.side_effect = _mock_download_file_side_effect + yield sample_table + + mock_source.download_file.assert_called_once() + self.transport.assert_no_requests() + + @parameterized.expand(_TABLE_INFO_VARIANTS) + async def test_download_table(self, _label: str, table_info: TableInfo | str) -> None: + """Test downloading parquet data as a pyarrow.Table.""" + with self._patch_downloading_table(table_info) as sample_table: + actual_table = await self.object.download_table(table_info) + + # Should use cache second time. + # The _patch_downloading_table context manager verifies this by checking the data is only downloaded once. + cached_table = await self.object.download_table(table_info) + + self.assertEqual(sample_table, actual_table) + self.assertEqual(sample_table, cached_table) + + @contextlib.contextmanager + def _patch_downloading_table_in_memory(self, table_info: TableInfo | str) -> Generator[pa.Table, None, None]: + self.object._cache = None # Disable the cache for this test. + + mock_table_info = table_info + if isinstance(mock_table_info, str): + mock_table_info = cast(TableInfo, self.object.search(mock_table_info)) + + sample_table, payload_bytes = get_sample_table_and_bytes( + KnownTableFormat.from_table_info(mock_table_info), mock_table_info["length"] + ) + + # Use the DownloadRequestHandler from evo.common.test_tools.io to mock the binary download. + download_handler = DownloadRequestHandler(data=payload_bytes) + self.transport.set_request_handler(download_handler) + yield sample_table + + @parameterized.expand(_TABLE_INFO_VARIANTS) + async def test_download_table_without_cache(self, _label: str, table_info: TableInfo | str) -> None: + """Test downloading parquet data in memory as a pyarrow.Table.""" + with self._patch_downloading_table_in_memory(table_info) as sample_table: + actual_table = await self.object.download_table(table_info) + + self.assertEqual(sample_table, actual_table) + + def test_download_table_is_optional(self) -> None: + """Test that the download_table method is not available when pyarrow is not installed.""" + self._assert_optional_method( + "download_table", unload=["evo.objects.loader.parquet_loader"], no_import=["pyarrow"] + ) + + @parameterized.expand(_TABLE_INFO_VARIANTS) + async def test_download_dataframe(self, _label: str, table_info: TableInfo | str) -> None: + """Test downloading parquet data as a pandas.DataFrame.""" + with self._patch_downloading_table(table_info) as sample_table: + actual_dataframe = await self.object.download_dataframe(table_info) + + # Should use cache second time. + # The _patch_downloading_table context manager verifies this by checking the data is only downloaded once. 
+ cached_dataframe = await self.object.download_dataframe(table_info) + + expected_dataframe = sample_table.to_pandas() + assert_frame_equal(expected_dataframe, actual_dataframe) + assert_frame_equal(expected_dataframe, cached_dataframe) + + @parameterized.expand(_TABLE_INFO_VARIANTS) + async def test_download_dataframe_without_cache(self, _label: str, table_info: TableInfo | str) -> None: + """Test downloading parquet data in memory as a pandas.DataFrame.""" + with self._patch_downloading_table_in_memory(table_info) as sample_table: + actual_dataframe = await self.object.download_dataframe(table_info) + + expected_dataframe = sample_table.to_pandas() + assert_frame_equal(expected_dataframe, actual_dataframe) + + @parameterized.expand( + [ + ("pyarrow",), + ("pandas",), + ] + ) + def test_download_dataframe_is_optional(self, missing: str) -> None: + """Test that the download_dataframe method is not available when pandas or pyarrow is not installed.""" + self._assert_optional_method( + "download_dataframe", unload=["evo.objects.loader.parquet_loader"], no_import=[missing] + ) + + @parameterized.expand(_TABLE_INFO_VARIANTS) + async def test_download_array(self, _label: str, table_info: TableInfo | str) -> None: + """Test downloading parquet data as a numpy.ndarray.""" + with self._patch_downloading_table(table_info) as sample_table: + actual_array = await self.object.download_array(table_info) + + # Should use cache second time. + # The _patch_downloading_table context manager verifies this by checking the data is only downloaded once. + cached_array = await self.object.download_array(table_info) + + expected_array = sample_table.to_pandas().to_numpy() + assert_array_equal(expected_array, actual_array, strict=True) + assert_array_equal(expected_array, cached_array, strict=True) + + @parameterized.expand(_TABLE_INFO_VARIANTS) + async def test_download_array_without_cache(self, _label: str, table_info: TableInfo | str) -> None: + """Test downloading parquet data in memory as a numpy.ndarray.""" + with self._patch_downloading_table_in_memory(table_info) as sample_table: + actual_array = await self.object.download_array(table_info) + + expected_array = sample_table.to_pandas().to_numpy() + assert_array_equal(expected_array, actual_array, strict=True) + + @parameterized.expand( + [ + ("pyarrow",), + ("numpy",), + ] + ) + def test_download_array_is_optional(self, missing: str) -> None: + """Test that the download_array method is not available when numpy or pyarrow is not installed.""" + self._assert_optional_method( + "download_array", unload=["evo.objects.loader.parquet_loader"], no_import=[missing] + ) From e124d4087164f48406e0ac3623b32275dd30283a Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Mon, 6 Oct 2025 17:06:30 +1300 Subject: [PATCH 24/32] Bump evo-objects to 0.3.0 --- packages/evo-objects/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/evo-objects/pyproject.toml b/packages/evo-objects/pyproject.toml index ad81aa6e..722ff02d 100644 --- a/packages/evo-objects/pyproject.toml +++ b/packages/evo-objects/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "evo-objects" description = "Python SDK for using the Seequent Evo Geoscience Object API" -version = "0.2.3" +version = "0.3.0" requires-python = ">=3.10" license-files = ["LICENSE.md"] dynamic = ["readme"] From c066364bdae0d0d04717ff1b307d9131f80629c3 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> 
Date: Mon, 6 Oct 2025 17:20:40 +1300 Subject: [PATCH 25/32] Run uv lock --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 946aec84..4c7b78e0 100644 --- a/uv.lock +++ b/uv.lock @@ -786,7 +786,7 @@ test = [ [[package]] name = "evo-objects" -version = "0.2.3" +version = "0.3.0" source = { editable = "packages/evo-objects" } dependencies = [ { name = "evo-sdk-common", extra = ["jmespath"] }, From 4cd763ec18431bbdb4490e2c15da8bb3cc3cd3a2 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Tue, 7 Oct 2025 11:40:08 +1300 Subject: [PATCH 26/32] Update evo-objects quickstart --- .../docs/examples/quickstart.ipynb | 102 +++++++++++++++--- 1 file changed, 87 insertions(+), 15 deletions(-) diff --git a/packages/evo-objects/docs/examples/quickstart.ipynb b/packages/evo-objects/docs/examples/quickstart.ipynb index 70b63392..fb1ef3e0 100644 --- a/packages/evo-objects/docs/examples/quickstart.ipynb +++ b/packages/evo-objects/docs/examples/quickstart.ipynb @@ -44,7 +44,7 @@ "environment = manager.get_environment()\n", "connector = manager.get_connector()\n", "\n", - "object_client = ObjectAPIClient(environment, connector)\n", + "object_client = ObjectAPIClient(environment, connector, manager.cache) # Cache is optional\n", "service_health = await object_client.get_service_health()\n", "\n", "print(f\"Object API is {service_health.status.name.lower()}\")\n", @@ -267,16 +267,6 @@ "\n", "downloaded_object = await object_client.download_object_by_path(\"sdk/v2/sample-pointset.json\")\n", "metadata = downloaded_object.metadata\n", - "downloaded_dict = downloaded_object.as_dict()\n", - "print(downloaded_dict)\n", - "\n", - "# Use the data client to download the parquet data.\n", - "downloaded_data = await data_client.download_table(\n", - " object_id=metadata.id,\n", - " version_id=metadata.version_id,\n", - " table_info=downloaded_dict[\"locations\"][\"coordinates\"],\n", - " fb=FeedbackWidget(\"Downloading data\"),\n", - ")\n", "\n", "if metadata.created_by is not None and metadata.created_by.name is not None:\n", " accreditation = f\"{metadata.created_by.name}\"\n", @@ -284,10 +274,92 @@ " accreditation = \"an unknown user\"\n", "created_at_str = metadata.created_at.astimezone().strftime(\"on %Y-%m-%d at %H:%M:%S\")\n", "print(f\"{metadata.path} :: uploaded by {accreditation} {created_at_str}\")\n", - "print(downloaded_dict)\n", + "\n", + "# Downloaded object supports JMESPath queries for more expressive access to JSON data.\n", + "print(\n", + " downloaded_object.search( # Project only a few fields for display\n", + " \"\"\"\n", + " {\n", + " name: name,\n", + " uuid: uuid,\n", + " schema: schema,\n", + " coordinate_reference_system: coordinate_reference_system,\n", + " bounding_box: {\n", + " min: [bounding_box.min_x, bounding_box.min_y, bounding_box.min_z],\n", + " max: [bounding_box.max_x, bounding_box.max_y, bounding_box.max_z]\n", + " }\n", + " }\n", + " \"\"\"\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from evo.objects import DownloadedObject, ObjectReference\n", + "\n", + "# If you already know the object you want, you don't even need the object client\n", + "ref = ObjectReference.new(\n", + " environment=manager.get_environment(),\n", + " object_path=\"sdk/v2/sample-pointset.json\",\n", + " version_id=metadata.version_id, # The version ID is optional\n", + ")\n", + "print(\"ObjectReference URL:\", ref) # An 
object reference can also be a string in the format printed here.\n", + "downloaded_object = await DownloadedObject.from_reference(\n", + " connector=manager.get_connector(),\n", + " reference=ref,\n", + " cache=manager.cache,\n", + ")\n", + "downloaded_object.search(\"@\") # Pretty-print the entire object via a JMESPath proxy object" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Download parquet data from a pointset object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "table_info = downloaded_object.search(\"locations.coordinates\") # Use a JMESPath expression to find the table info\n", + "\n", + "# Download parquet data by table info reference\n", + "downloaded_data = await downloaded_object.download_table(table_info, fb=FeedbackWidget(\"Downloading pyarrow.Table\"))\n", + "\n", + "# OR you can just use the JMESPath expression in the download_table call directly\n", + "downloaded_data = await downloaded_object.download_table(\"locations.coordinates\")\n", "print(downloaded_data)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# A similar interface can be used to download as a pandas DataFrame\n", + "await downloaded_object.download_dataframe(\"locations.coordinates\", fb=FeedbackWidget(\"Downloading pandas.DataFrame\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# And as a NumPy array\n", + "await downloaded_object.download_array(\"locations.coordinates\", fb=FeedbackWidget(\"Downloading numpy.ndarray\"))" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -324,7 +396,7 @@ "stages = await object_client.list_stages()\n", "\n", "for stage in stages:\n", - " print(f\"{stage.name} ({stage.stage_id})\")" + " print(f\"{stage.name} ({stage.id})\")" ] }, { @@ -344,7 +416,7 @@ "metadata = downloaded_object.metadata\n", "print(f\"Current stage: {metadata}\")\n", "\n", - "await object_client.set_stage(metadata.id, version_id=metadata.version_id, stage_id=stages[1].stage_id)\n", + "await object_client.set_stage(metadata.id, version_id=metadata.version_id, stage_id=stages[1].id)\n", "\n", "updated_metadata = await object_client.download_object_by_id(metadata.id)\n", "print(f\"Updated stage: {updated_metadata.metadata.stage}\")" @@ -362,7 +434,7 @@ ], "metadata": { "kernelspec": { - "display_name": "venv", + "display_name": "evo-sdk", "language": "python", "name": "python3" }, From 878b6b47f2e37edb5901fb386489ef012d89ee71 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Tue, 7 Oct 2025 11:57:07 +1300 Subject: [PATCH 27/32] JMESPath support is not optional in evo-objects --- .../src/evo/objects/client/object_client.py | 28 +++++---------- .../tests/test_downloaded_object.py | 35 ++++--------------- 2 files changed, 14 insertions(+), 49 deletions(-) diff --git a/packages/evo-objects/src/evo/objects/client/object_client.py b/packages/evo-objects/src/evo/objects/client/object_client.py index bf9e2263..932c7072 100644 --- a/packages/evo-objects/src/evo/objects/client/object_client.py +++ b/packages/evo-objects/src/evo/objects/client/object_client.py @@ -17,7 +17,7 @@ from pydantic import ConfigDict, TypeAdapter -from evo import logging +from evo import jmespath, logging from evo.common import APIConnector, ICache, IFeedback from evo.common.io.exceptions import DataNotFoundError from evo.common.utils import 
NoFeedback
@@ -27,13 +27,6 @@
 from ..io import ObjectDataDownload
 from . import parse
 
-try:
-    from evo import jmespath
-except ImportError:
-    _JMESPATH_AVAILABLE = False
-else:
-    _JMESPATH_AVAILABLE = True
-
 try:
     import pyarrow as pa
 
@@ -156,17 +149,14 @@ def as_dict(self) -> dict:
         """Get this object as a dictionary."""
         return self._object.model_dump(mode="python", by_alias=True)
 
-    if _JMESPATH_AVAILABLE:
-        # Optional JMESPath support for searching within the object JSON content.
+    def search(self, expression: str) -> Any:
+        """Search the object's JSON content using a JMESPath expression.
 
-        def search(self, expression: str) -> Any:
-            """Search the object metadata using a JMESPath expression.
+        :param expression: The JMESPath expression to use for the search.
 
-            :param expression: The JMESPath expression to use for the search.
-
-            :return: The result of the search.
-            """
-            return jmespath.search(expression, self.as_dict())
+        :return: The result of the search.
+        """
+        return jmespath.search(expression, self.as_dict())
 
     def prepare_data_download(self, data_identifiers: Sequence[str | UUID]) -> Iterator[ObjectDataDownload]:
         """Prepare to download multiple data files from the geoscience object service, for this object.
@@ -199,9 +189,7 @@ def get_parquet_loader(self, table_info: TableInfo | str) -> ParquetLoader:
             :returns: A ParquetLoader that can be used to download and read the referenced data.
             """
             if isinstance(table_info, str):
-                if not _JMESPATH_AVAILABLE:
-                    raise ValueError("The 'jmespath' package is required to use JMESPath expressions") from None
-                elif isinstance(resolved := self.search(table_info), jmespath.JMESPathObjectProxy):
+                if isinstance(resolved := self.search(table_info), jmespath.JMESPathObjectProxy):
                     table_info = _TABLE_INFO_VALIDATOR.validate_python(resolved.raw)
                 else:
                     raise ValueError(f"Expected table info, got {type(resolved)}")
diff --git a/packages/evo-objects/tests/test_downloaded_object.py b/packages/evo-objects/tests/test_downloaded_object.py
index 62ecf360..95ae9e48 100644
--- a/packages/evo-objects/tests/test_downloaded_object.py
+++ b/packages/evo-objects/tests/test_downloaded_object.py
@@ -160,6 +160,12 @@ def test_search(self) -> None:
         actual_result = self.object.search("bounding_box | {x: [min_x, max_x], y: [min_y, max_y], z: [min_z, max_z]}")
         self.assertEqual(expected_result, actual_result)
 
+    @parameterized.expand(_TABLE_INFO_VARIANTS)
+    def test_get_parquet_loader(self, _label: str, table_info: TableInfo | str) -> None:
+        """Test getting a ParquetLoader instance."""
+        loader = self.object.get_parquet_loader(table_info)
+        self.assertIsInstance(loader, ParquetLoader)
+
     def _assert_optional_method(self, method_name: str, *, unload: list[str], no_import: list[str]) -> None:
         # Verify the method exists before unloading any modules.
from evo.objects.client import DownloadedObject @@ -197,35 +203,6 @@ def _assert_optional_method(self, method_name: str, *, unload: list[str], no_imp f"{', '.join(no_import)} {'is' if len(no_import) == 1 else 'are'} not available", ) - def test_search_is_optional(self) -> None: - """Test that the JMESPath search implementation is optional.""" - self._assert_optional_method("search", unload=["evo.jmespath"], no_import=["jmespath"]) - - @parameterized.expand(_TABLE_INFO_VARIANTS) - def test_get_parquet_loader(self, _label: str, table_info: TableInfo | str) -> None: - """Test getting a ParquetLoader instance.""" - loader = self.object.get_parquet_loader(table_info) - self.assertIsInstance(loader, ParquetLoader) - - def test_get_parquet_loader_jmespath_support_is_optional(self) -> None: - """Test that the JMESPath support for get_parquet_loader is optional.""" - with ( - UnloadModule("evo.objects.client.object_client", "evo.objects.loader.parquet_loader", "evo.jmespath"), - NoImport("jmespath"), - ): - from evo.objects.client import DownloadedObject - - client = DownloadedObject( - object_=self.object._object, - metadata=self.object.metadata, - urls_by_name=self.object._urls_by_name, - connector=self.object._connector, - cache=self.object._cache, - ) - - with self.assertRaises(ValueError): - client.get_parquet_loader("locations.coordinates") - def test_get_parquet_loader_is_optional(self) -> None: """Test that the get_parquet_loader method is optional.""" self._assert_optional_method( From a56f875f9638c7c63061f8426838f5de0bce0444 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Tue, 7 Oct 2025 12:53:44 +1300 Subject: [PATCH 28/32] get_parquet_loader() doesn't need to be part of the public API --- .../src/evo/objects/client/object_client.py | 8 ++++---- .../evo-objects/tests/test_downloaded_object.py | 14 +------------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/packages/evo-objects/src/evo/objects/client/object_client.py b/packages/evo-objects/src/evo/objects/client/object_client.py index 932c7072..d4f262ef 100644 --- a/packages/evo-objects/src/evo/objects/client/object_client.py +++ b/packages/evo-objects/src/evo/objects/client/object_client.py @@ -181,7 +181,7 @@ def prepare_data_download(self, data_identifiers: Sequence[str | UUID]) -> Itera if _LOADER_AVAILABLE: # Optional support for loading Parquet data using PyArrow. - def get_parquet_loader(self, table_info: TableInfo | str) -> ParquetLoader: + def _get_parquet_loader(self, table_info: TableInfo | str) -> ParquetLoader: """Get a ParquetLoader for the data referenced by the given table info or data reference string. :param table_info: The table info dict, JMESPath to table info, or data reference string. @@ -207,7 +207,7 @@ async def download_table(self, table_info: TableInfo | str, fb: IFeedback = NoFe :returns: A PyArrow Table containing the downloaded data. """ - loader = self.get_parquet_loader(table_info) + loader = self._get_parquet_loader(table_info) return await loader.load_as_table(fb) if _PD_AVAILABLE: @@ -221,7 +221,7 @@ async def download_dataframe(self, table_info: TableInfo | str, fb: IFeedback = :returns: A Pandas DataFrame containing the downloaded data. 
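             :raises ValueError: If a table info reference string cannot be resolved to table info.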
""" - loader = self.get_parquet_loader(table_info) + loader = self._get_parquet_loader(table_info) return await loader.load_as_dataframe(fb) if _NP_AVAILABLE: @@ -235,5 +235,5 @@ async def download_array(self, table_info: TableInfo | str, fb: IFeedback = NoFe :returns: A NumPy array containing the downloaded data. """ - loader = self.get_parquet_loader(table_info) + loader = self._get_parquet_loader(table_info) return await loader.load_as_array(fb) diff --git a/packages/evo-objects/tests/test_downloaded_object.py b/packages/evo-objects/tests/test_downloaded_object.py index 95ae9e48..9cdd3eea 100644 --- a/packages/evo-objects/tests/test_downloaded_object.py +++ b/packages/evo-objects/tests/test_downloaded_object.py @@ -37,7 +37,7 @@ from evo.objects import DownloadedObject, ObjectReference from evo.objects.endpoints import models from evo.objects.io import _CACHE_SCOPE -from evo.objects.loader import ParquetLoader, TableInfo +from evo.objects.loader import TableInfo from evo.objects.utils import KnownTableFormat from helpers import NoImport, UnloadModule, get_sample_table_and_bytes @@ -160,12 +160,6 @@ def test_search(self) -> None: actual_result = self.object.search("bounding_box | {x: [min_x, max_x], y: [min_y, max_y], z: [min_z, max_z]}") self.assertEqual(expected_result, actual_result) - @parameterized.expand(_TABLE_INFO_VARIANTS) - def test_get_parquet_loader(self, _label: str, table_info: TableInfo | str) -> None: - """Test getting a ParquetLoader instance.""" - loader = self.object.get_parquet_loader(table_info) - self.assertIsInstance(loader, ParquetLoader) - def _assert_optional_method(self, method_name: str, *, unload: list[str], no_import: list[str]) -> None: # Verify the method exists before unloading any modules. from evo.objects.client import DownloadedObject @@ -203,12 +197,6 @@ def _assert_optional_method(self, method_name: str, *, unload: list[str], no_imp f"{', '.join(no_import)} {'is' if len(no_import) == 1 else 'are'} not available", ) - def test_get_parquet_loader_is_optional(self) -> None: - """Test that the get_parquet_loader method is optional.""" - self._assert_optional_method( - "get_parquet_loader", unload=["evo.objects.loader.parquet_loader"], no_import=["pyarrow"] - ) - @contextlib.contextmanager def _patch_downloading_table(self, table_info: TableInfo | str) -> Generator[pa.Table, None, None]: mock_table_info = table_info From 4112b6bfb4150e370afbb4a3f3ae6d390fd886c6 Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Wed, 8 Oct 2025 10:18:00 +1300 Subject: [PATCH 29/32] Rename `evo.objects.loader` -> `evo.objects.parquet` --- .../evo/objects/{loader => parquet}/__init__.py | 2 +- .../parquet_loader.py => parquet/loader.py} | 0 .../src/evo/objects/{loader => parquet}/types.py | 0 .../evo-objects/src/evo/objects/utils/tables.py | 4 +--- .../evo-objects/tests/test_downloaded_object.py | 14 ++++---------- 5 files changed, 6 insertions(+), 14 deletions(-) rename packages/evo-objects/src/evo/objects/{loader => parquet}/__init__.py (95%) rename packages/evo-objects/src/evo/objects/{loader/parquet_loader.py => parquet/loader.py} (100%) rename packages/evo-objects/src/evo/objects/{loader => parquet}/types.py (100%) diff --git a/packages/evo-objects/src/evo/objects/loader/__init__.py b/packages/evo-objects/src/evo/objects/parquet/__init__.py similarity index 95% rename from packages/evo-objects/src/evo/objects/loader/__init__.py rename to packages/evo-objects/src/evo/objects/parquet/__init__.py index 412a5a20..1ddbb174 
100644 --- a/packages/evo-objects/src/evo/objects/loader/__init__.py +++ b/packages/evo-objects/src/evo/objects/parquet/__init__.py @@ -14,7 +14,7 @@ except ImportError: raise ImportError("The 'pyarrow' package is required to use ParquetLoader") from None -from .parquet_loader import ParquetLoader +from .loader import ParquetLoader from .types import ArrayTableInfo, LookupTableInfo, TableInfo __all__ = [ diff --git a/packages/evo-objects/src/evo/objects/loader/parquet_loader.py b/packages/evo-objects/src/evo/objects/parquet/loader.py similarity index 100% rename from packages/evo-objects/src/evo/objects/loader/parquet_loader.py rename to packages/evo-objects/src/evo/objects/parquet/loader.py diff --git a/packages/evo-objects/src/evo/objects/loader/types.py b/packages/evo-objects/src/evo/objects/parquet/types.py similarity index 100% rename from packages/evo-objects/src/evo/objects/loader/types.py rename to packages/evo-objects/src/evo/objects/parquet/types.py diff --git a/packages/evo-objects/src/evo/objects/utils/tables.py b/packages/evo-objects/src/evo/objects/utils/tables.py index 5ff336de..baae199a 100644 --- a/packages/evo-objects/src/evo/objects/utils/tables.py +++ b/packages/evo-objects/src/evo/objects/utils/tables.py @@ -346,9 +346,7 @@ def from_table_info(cls, table_info: dict) -> KnownTableFormat: return KnownTableFormat(name=type_name, columns=columns, field_names=table_info.get("field_names")) @classmethod - @deprecated( - "KnownTableFormat.load_table is deprecated, use evo.objects.client.parquet_loader.ParquetLoader instead" - ) + @deprecated("KnownTableFormat.load_table is deprecated, use evo.objects.parquet.ParquetLoader instead") def load_table(cls, table_info: dict, source: Path) -> pa.Table: """Load parquet data as a pyarrow.Table and verify the format against the provided table info. 
diff --git a/packages/evo-objects/tests/test_downloaded_object.py b/packages/evo-objects/tests/test_downloaded_object.py index 9cdd3eea..62e8681a 100644 --- a/packages/evo-objects/tests/test_downloaded_object.py +++ b/packages/evo-objects/tests/test_downloaded_object.py @@ -37,7 +37,7 @@ from evo.objects import DownloadedObject, ObjectReference from evo.objects.endpoints import models from evo.objects.io import _CACHE_SCOPE -from evo.objects.loader import TableInfo +from evo.objects.parquet import TableInfo from evo.objects.utils import KnownTableFormat from helpers import NoImport, UnloadModule, get_sample_table_and_bytes @@ -265,9 +265,7 @@ async def test_download_table_without_cache(self, _label: str, table_info: Table def test_download_table_is_optional(self) -> None: """Test that the download_table method is not available when pyarrow is not installed.""" - self._assert_optional_method( - "download_table", unload=["evo.objects.loader.parquet_loader"], no_import=["pyarrow"] - ) + self._assert_optional_method("download_table", unload=["evo.objects.parquet.loader"], no_import=["pyarrow"]) @parameterized.expand(_TABLE_INFO_VARIANTS) async def test_download_dataframe(self, _label: str, table_info: TableInfo | str) -> None: @@ -300,9 +298,7 @@ async def test_download_dataframe_without_cache(self, _label: str, table_info: T ) def test_download_dataframe_is_optional(self, missing: str) -> None: """Test that the download_dataframe method is not available when pandas or pyarrow is not installed.""" - self._assert_optional_method( - "download_dataframe", unload=["evo.objects.loader.parquet_loader"], no_import=[missing] - ) + self._assert_optional_method("download_dataframe", unload=["evo.objects.parquet.loader"], no_import=[missing]) @parameterized.expand(_TABLE_INFO_VARIANTS) async def test_download_array(self, _label: str, table_info: TableInfo | str) -> None: @@ -335,6 +331,4 @@ async def test_download_array_without_cache(self, _label: str, table_info: Table ) def test_download_array_is_optional(self, missing: str) -> None: """Test that the download_array method is not available when numpy or pyarrow is not installed.""" - self._assert_optional_method( - "download_array", unload=["evo.objects.loader.parquet_loader"], no_import=[missing] - ) + self._assert_optional_method("download_array", unload=["evo.objects.parquet.loader"], no_import=[missing]) From 8b0786073d0698f10dc0ed54c38c095f0f029d3a Mon Sep 17 00:00:00 2001 From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com> Date: Wed, 8 Oct 2025 11:39:29 +1300 Subject: [PATCH 30/32] Refactor ParquetLoader into ParquetDownloader + ParquetLoader --- .../src/evo/objects/client/object_client.py | 32 ++- .../src/evo/objects/parquet/__init__.py | 3 +- .../src/evo/objects/parquet/loader.py | 225 ++++++++++++------ .../evo-objects/src/evo/objects/utils/data.py | 11 +- 4 files changed, 177 insertions(+), 94 deletions(-) diff --git a/packages/evo-objects/src/evo/objects/client/object_client.py b/packages/evo-objects/src/evo/objects/client/object_client.py index d4f262ef..bc72bcda 100644 --- a/packages/evo-objects/src/evo/objects/client/object_client.py +++ b/packages/evo-objects/src/evo/objects/client/object_client.py @@ -11,7 +11,8 @@ from __future__ import annotations -from collections.abc import Iterator, Sequence +import contextlib +from collections.abc import AsyncGenerator, Iterator, Sequence from typing import Any from uuid import UUID @@ -30,7 +31,7 @@ try: import pyarrow as pa - from ..loader import ParquetLoader, TableInfo + 
   from ..loader import ParquetLoader, TableInfo
+    from ..parquet import ParquetDownloader, ParquetLoader, TableInfo
 except ImportError:
     _LOADER_AVAILABLE = False
 else:
@@ -181,12 +182,17 @@ def prepare_data_download(self, data_identifiers: Sequence[str | UUID]) -> Itera
     if _LOADER_AVAILABLE:
         # Optional support for loading Parquet data using PyArrow.
 
-        def _get_parquet_loader(self, table_info: TableInfo | str) -> ParquetLoader:
-            """Get a ParquetLoader for the data referenced by the given table info or data reference string.
+        @contextlib.asynccontextmanager
+        async def _with_parquet_loader(
+            self, table_info: TableInfo | str, fb: IFeedback
+        ) -> AsyncGenerator[ParquetLoader, None]:
+            """Download parquet data and get a ParquetLoader for the data referenced by the given
+            table info or data reference string.
 
             :param table_info: The table info dict, JMESPath to table info, or data reference string.
+            :param fb: A feedback instance to report download progress to.
 
-            :returns: A ParquetLoader that can be used to download and read the referenced data.
+            :yields: A ParquetLoader that can be used to read the referenced data.
             """
             if isinstance(table_info, str):
                 if isinstance(resolved := self.search(table_info), jmespath.JMESPathObjectProxy):
@@ -197,7 +203,9 @@ def _get_parquet_loader(self, table_info: TableInfo | str) -> ParquetLoader:
                 table_info = _TABLE_INFO_VALIDATOR.validate_python(table_info)
 
             (download,) = self.prepare_data_download([table_info["data"]])
-            return ParquetLoader(download, table_info, self._connector.transport, self._cache)
+            async with ParquetDownloader(download, self._connector.transport, self._cache).with_feedback(fb) as loader:
+                loader.validate_with_table_info(table_info)
+                yield loader
 
         async def download_table(self, table_info: TableInfo | str, fb: IFeedback = NoFeedback) -> pa.Table:
             """Download the data referenced by the given table info or data reference string as a PyArrow Table.
@@ -207,8 +215,8 @@ async def download_table(self, table_info: TableInfo | str, fb: IFeedback = NoFe
 
             :returns: A PyArrow Table containing the downloaded data.
             """
-            loader = self._get_parquet_loader(table_info)
-            return await loader.load_as_table(fb)
+            async with self._with_parquet_loader(table_info, fb) as loader:
+                return loader.load_as_table()
 
     if _PD_AVAILABLE:
         # Optional support for loading data as Pandas DataFrames. Requires parquet support via PyArrow as well.
@@ -221,8 +229,8 @@ async def download_dataframe(self, table_info: TableInfo | str, fb: IFeedback =
 
             :returns: A Pandas DataFrame containing the downloaded data.
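             :raises SchemaValidationError: If the downloaded data does not match the referenced table info.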
""" - loader = self._get_parquet_loader(table_info) - return await loader.load_as_array(fb) + async with self._with_parquet_loader(table_info, fb) as loader: + return loader.load_as_array() diff --git a/packages/evo-objects/src/evo/objects/parquet/__init__.py b/packages/evo-objects/src/evo/objects/parquet/__init__.py index 1ddbb174..efb0ae70 100644 --- a/packages/evo-objects/src/evo/objects/parquet/__init__.py +++ b/packages/evo-objects/src/evo/objects/parquet/__init__.py @@ -14,12 +14,13 @@ except ImportError: raise ImportError("The 'pyarrow' package is required to use ParquetLoader") from None -from .loader import ParquetLoader +from .loader import ParquetDownloader, ParquetLoader from .types import ArrayTableInfo, LookupTableInfo, TableInfo __all__ = [ "ArrayTableInfo", "LookupTableInfo", + "ParquetDownloader", "ParquetLoader", "TableInfo", ] diff --git a/packages/evo-objects/src/evo/objects/parquet/loader.py b/packages/evo-objects/src/evo/objects/parquet/loader.py index b13edfe0..e9a7d7b9 100644 --- a/packages/evo-objects/src/evo/objects/parquet/loader.py +++ b/packages/evo-objects/src/evo/objects/parquet/loader.py @@ -11,8 +11,11 @@ from __future__ import annotations +import contextlib +from collections.abc import AsyncGenerator from io import BytesIO from logging import getLogger +from types import TracebackType from typing import cast import pyarrow as pa @@ -23,7 +26,7 @@ from evo.common.io import BytesDestination, ChunkedIOManager, Download, HTTPSource from evo.common.utils import NoFeedback -from ..exceptions import SchemaValidationError +from ..exceptions import SchemaValidationError, TableFormatError from ..utils import ArrowTableFormat, KnownTableFormat from .types import TableInfo @@ -41,7 +44,10 @@ else: _NP_AVAILABLE = True -__all__ = ["ParquetLoader"] +__all__ = [ + "ParquetDownloader", + "ParquetLoader", +] logger = getLogger(__name__) @@ -49,110 +55,177 @@ class ParquetLoader: - """A loader for Parquet data from a geoscience object.""" + """A loader for Parquet data from a pyarrow.parquet.ParquetFile. - def __init__( - self, download: Download, table_info: TableInfo, transport: ITransport, cache: ICache | None = None - ) -> None: + This class adds standardised support for validating Geoscience Object table info + against the loaded Parquet schema, as well as convenience methods for loading + the data as a PyArrow Table, Pandas DataFrame, or NumPy array. + """ + + def __init__(self, pa_file: pa.NativeFile) -> None: """ - :param download: The download information for the Parquet data. - :param table_info: The expected table information for validation. - :param transport: The transport to use for data downloads. - :param cache: An optional cache to use for data downloads. + :param pa_file: A PyArrow NativeFile containing the Parquet data. 
""" - self._download = download - validated_table_info = _TABLE_INFO_ADAPTER.validate_python(table_info) - self._expected_format = KnownTableFormat.from_table_info(validated_table_info) - self._expected_length = table_info["length"] - self._transport = transport - self._cache = cache - - async def _reader_from_cache(self, fb: IFeedback) -> pa.NativeFile: - cached = await self._download.download_to_cache(self._cache, self._transport, fb=fb) - return pa.OSFile(str(cached), "r") - - async def _reader_from_memory(self, fb: IFeedback) -> pa.NativeFile: - # Initialize a buffer to store the downloaded data in memory - memory = BytesIO() + self._pa_file = pa_file + self._parquet_file: pq.ParquetFile | None = None + + def __enter__(self) -> ParquetLoader: + if self._parquet_file is not None: + raise RuntimeError("ParquetLoader is already in use") + self._parquet_file = pq.ParquetFile(self._pa_file.__enter__()) + return self + + async def __aenter__(self) -> ParquetLoader: + # Delegate to the synchronous context manager. + # This implementation is just to support async with + # syntax for combination with ParquetDownloader below. + return self.__enter__() + + def __exit__( + self, + exc_type: type[Exception] | None, + exc_val: Exception | None, + exc_tb: TracebackType | None, + ) -> None: + self._parquet_file = None + return self._pa_file.__exit__(exc_type, exc_val, exc_tb) + + async def __aexit__( + self, + exc_type: type[Exception] | None, + exc_val: Exception | None, + exc_tb: TracebackType | None, + ) -> None: + # Delegate to the synchronous context manager. + # This implementation is just to support async with + # syntax for combination with ParquetDownloader below. + return self.__exit__(exc_type, exc_val, exc_tb) - # Use ChunkedIOManager to download the data into the memory buffer - manager = ChunkedIOManager() - async with HTTPSource(self._download.get_download_url, self._transport) as source: - destination = BytesDestination(memory) - await manager.run(source, destination, fb=fb) + def validate_with_table_info(self, table_info: TableInfo) -> None: + """Validate the provided TableInfo against the loaded Parquet schema. - # Reset the buffer's position to the beginning - memory.seek(0) - return pa.BufferReader(memory.getbuffer()) + :param table_info: The TableInfo to validate against the loaded Parquet schema. - async def _reader(self, fb: IFeedback) -> pa.NativeFile: - if self._cache is not None: - return await self._reader_from_cache(fb) - else: - return await self._reader_from_memory(fb) + :raises SchemaValidationError: If the loaded Parquet schema does not match the expected schema. 
+ """ + if (pa_file := self._parquet_file) is None: + raise RuntimeError("ParquetLoader context is not active") - def _validate_data(self, data: pq.ParquetFile) -> None: logger.debug("Checking parquet data format") - actual_format = ArrowTableFormat.from_schema(data.schema_arrow) - KnownTableFormat._check_format(self._expected_format, actual_format) + + validated_table_info = _TABLE_INFO_ADAPTER.validate_python(table_info) + expected_format = KnownTableFormat.from_table_info(validated_table_info) + actual_format = ArrowTableFormat.from_schema(pa_file.schema_arrow) + try: + expected_format._check_format(actual_format) + except TableFormatError as e: + raise SchemaValidationError(str(e)) from None logger.debug("Checking parquet data length") - actual_length = data.metadata.num_rows - if self._expected_length != actual_length: + actual_length = pa_file.metadata.num_rows + if table_info["length"] != actual_length: raise SchemaValidationError( - f"Row count ({actual_length}) does not match expectation ({self._expected_length})" + f"Row count ({actual_length}) does not match expectation ({table_info['length']})" ) logger.debug("Parquet metadata checks succeeded") - async def load_as_table(self, fb: IFeedback = NoFeedback) -> pa.Table: - """Load the Parquet data as a PyArrow Table. - - :param fb: An optional feedback interface to report progress. - - :raises SchemaValidationError: If the data does not match the expected schema. - """ - with await self._reader(fb) as reader: - data = pq.ParquetFile(reader) - self._validate_data(data) - return data.read() + def load_as_table(self) -> pa.Table: + """Load the Parquet data as a PyArrow Table.""" + if self._parquet_file is None: + raise RuntimeError("ParquetLoader context is not active") + else: + return self._parquet_file.read() if _PD_AVAILABLE: # Optional support for pandas dataframes - async def load_as_dataframe(self, fb: IFeedback = NoFeedback) -> pd.DataFrame: - """Load the Parquet data as a Pandas DataFrame. - - :param fb: An optional feedback interface to report progress. - - :raises SchemaValidationError: If the data does not match the expected schema. - """ - table = await self.load_as_table(fb) + def load_as_dataframe(self) -> pd.DataFrame: + """Load the Parquet data as a Pandas DataFrame.""" + table = self.load_as_table() return table.to_pandas() if _NP_AVAILABLE: # Optional support for numpy arrays - async def load_as_array(self, fb: IFeedback = NoFeedback) -> np.ndarray: + def load_as_array(self) -> np.ndarray: """Load the Parquet data as a NumPy array. The array will have a shape of (N,) for single-column data or (N, M) for multi-column data, where N is the number of rows and M is the number of columns. The target data _must_ have a uniform dtype. - :param fb: An optional feedback interface to report progress. - - :raises SchemaValidationError: If the data does not match the expected schema. + :return: A NumPy array containing the data. 
""" - try: - dtype = np.dtype(self._expected_format.data_type) - except TypeError: - raise SchemaValidationError( - f"Unsupported data type '{self._expected_format.data_type}' cannot be loaded as a numpy array" - ) - - table = await self.load_as_table(fb) + table = self.load_as_table() columns = cast(list[np.ndarray], [col.combine_chunks().to_numpy() for col in table.itercolumns()]) if len(columns) == 1: - return columns[0].astype(dtype) + return columns[0] else: - return np.column_stack(columns).astype(dtype) + return np.column_stack(columns) + + +class ParquetDownloader: + """A downloader for Parquet data that provides a ParquetLoader for reading the data. + + This class supports downloading the data to a cache or to memory, and provides + a ParquetLoader for reading the downloaded data. + """ + + def __init__(self, download: Download, transport: ITransport, cache: ICache | None = None) -> None: + """ + :param download: The download information for the Parquet data. + :param transport: The transport to use for data downloads. + :param cache: An optional cache to use for data downloads. + """ + self._evo_download = download + self._transport = transport + self._cache = cache + + async def _download_to_cache(self, fb: IFeedback) -> pa.OSFile: + cached = await self._evo_download.download_to_cache(self._cache, self._transport, fb=fb) + return pa.OSFile(str(cached), "r") + + async def _download_to_memory(self, fb: IFeedback) -> pa.BufferReader: + # Initialize a buffer to store the downloaded data in memory + memory = BytesIO() + + # Use ChunkedIOManager to download the data into the memory buffer + manager = ChunkedIOManager() + async with HTTPSource(self._evo_download.get_download_url, self._transport) as source: + destination = BytesDestination(memory) + await manager.run(source, destination, fb=fb) + + # Reset the buffer's position to the beginning + memory.seek(0) + return pa.BufferReader(memory.getbuffer()) + + async def download(self, fb: IFeedback = NoFeedback) -> ParquetLoader: + """Download the Parquet data and return a ParquetLoader for reading it. + + :param fb: An optional feedback instance to report download progress to. + + :return: A ParquetLoader that can be used to read the downloaded data. + """ + if self._cache is not None: + file = await self._download_to_cache(fb) + else: + file = await self._download_to_memory(fb) + + return ParquetLoader(file) + + @contextlib.asynccontextmanager + async def __aenter__(self) -> AsyncGenerator[ParquetLoader, None]: + # Delegate to the download method to get a ParquetLoader. + async with await self.download() as loader: + yield loader + + @contextlib.asynccontextmanager + async def with_feedback(self, fb: IFeedback) -> AsyncGenerator[ParquetLoader, None]: + """Async context manager to download the Parquet data with feedback and provide a ParquetLoader for reading it. + + :param fb: A feedback instance to report download progress to. + + :yields: A ParquetLoader that can be used to read the downloaded data. + """ + async with await self.download(fb=fb) as loader: + yield loader diff --git a/packages/evo-objects/src/evo/objects/utils/data.py b/packages/evo-objects/src/evo/objects/utils/data.py index b9acc77f..cbb38844 100644 --- a/packages/evo-objects/src/evo/objects/utils/data.py +++ b/packages/evo-objects/src/evo/objects/utils/data.py @@ -181,16 +181,17 @@ async def download_table( """ # Import here to avoid circular import. 
        from ..client import ObjectAPIClient
-        from ..loader import ParquetLoader
+        from ..parquet import ParquetDownloader
 
         client = ObjectAPIClient(self._environment, self._connector)
         (download,) = [d async for d in client.prepare_data_download(object_id, version_id, [table_info["data"]])]
 
         # Defer downloading the table to the new ParquetLoader class.
-        loader = ParquetLoader(
-            download=download, table_info=table_info, transport=self._connector.transport, cache=self._cache
-        )
-        return await loader.load_as_table(fb=fb)
+        async with ParquetDownloader(
+            download=download, transport=self._connector.transport, cache=self._cache
+        ).with_feedback(fb) as loader:
+            loader.validate_with_table_info(table_info)
+            return loader.load_as_table()
 
     if _PD_AVAILABLE:
         # Optional support for pandas dataframes. Depends on both pyarrow and pandas.

From 4836794ea2d06e942fef85fd9b3b17b9a21e6fe8 Mon Sep 17 00:00:00 2001
From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com>
Date: Wed, 8 Oct 2025 12:09:09 +1300
Subject: [PATCH 31/32] Fix pydantic error in python 3.11

---
 packages/evo-objects/src/evo/objects/parquet/types.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/evo-objects/src/evo/objects/parquet/types.py b/packages/evo-objects/src/evo/objects/parquet/types.py
index 8ef092dc..abb1775f 100644
--- a/packages/evo-objects/src/evo/objects/parquet/types.py
+++ b/packages/evo-objects/src/evo/objects/parquet/types.py
@@ -12,7 +12,7 @@
 import sys
 from typing import TypeAlias
 
-if sys.version_info >= (3, 11):
+if sys.version_info >= (3, 12):
     from typing import NotRequired, TypedDict
 else:
     from typing_extensions import NotRequired, TypedDict

From a43c437c38e5f40ef2845863f6a8e613dfedee63 Mon Sep 17 00:00:00 2001
From: Chris Wordsworth <104798236+wordsworthc@users.noreply.github.com>
Date: Wed, 8 Oct 2025 13:53:29 +1300
Subject: [PATCH 32/32] Fix NoImport test util for macos

---
 packages/evo-objects/tests/helpers.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/packages/evo-objects/tests/helpers.py b/packages/evo-objects/tests/helpers.py
index fd18496e..7a91a5b0 100644
--- a/packages/evo-objects/tests/helpers.py
+++ b/packages/evo-objects/tests/helpers.py
@@ -30,16 +30,23 @@ def __init__(self, *names: str) -> None:
         :param names: The names of the modules to prevent from being imported.
         """
         self._names = names
+        self._unloaded_modules = {}
 
     def __enter__(self) -> None:
        for name in self._names:
-            # Set the module to None to prevent it from being imported.
+            # If the module is already imported, save it so it can be restored on exit.
+            self._unloaded_modules[name] = sys.modules.get(name)
+            # Set the module to None to prevent it from being (re-)imported.
             sys.modules[name] = None
 
     def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
-        for name in self._names:
-            # Remove the module from sys.modules to clean up.
-            del sys.modules[name]
+        # Restore the unloaded modules, dropping the None placeholder for any
+        # module that was never imported in the first place.
+        for name, module in self._unloaded_modules.items():
+            if module is None:
+                sys.modules.pop(name, None)
+            else:
+                sys.modules[name] = module
 
 
 class UnloadModule: