From 05f2c154ee26fd86ed6f5d1dfe55b96128a2d817 Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Tue, 27 Jan 2026 14:56:41 +0100 Subject: [PATCH 01/14] refactor loading resources with different formats --- unitpackage/entry.py | 4 +- unitpackage/loaders/baseloader.py | 14 +++ unitpackage/local.py | 179 +++++++++++++++++++----------- 3 files changed, 133 insertions(+), 64 deletions(-) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index 5ddfca1..2bc9f67 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -526,9 +526,9 @@ def mutable_resource(self): if not self.resource.custom["MutableResource"]: from frictionless import Schema - from unitpackage.local import create_df_resource + from unitpackage.local import create_df_resource_from_tabular_resource - self.resource.custom["MutableResource"] = create_df_resource(self.resource) + self.resource.custom["MutableResource"] = create_df_resource_from_tabular_resource(self.resource) self.resource.custom["MutableResource"].schema = Schema.from_descriptor( self.resource.schema.to_dict() ) diff --git a/unitpackage/loaders/baseloader.py b/unitpackage/loaders/baseloader.py index e5dd4b6..37f0e5b 100644 --- a/unitpackage/loaders/baseloader.py +++ b/unitpackage/loaders/baseloader.py @@ -422,6 +422,20 @@ def df(self): 0 0 0 1 1 1 + A file with two column header lines, which is sometimes, for example, + used for storing units to the values:: + + >>> from io import StringIO + >>> file = StringIO(r'''a,b + ... m,s + ... 0,0 + ... 1,1''') + >>> csv = BaseLoader(file, column_header_lines=2) + >>> csv.df + a / m b / s + 0 0 0 + 1 1 1 + """ import pandas as pd diff --git a/unitpackage/local.py b/unitpackage/local.py index 9f05eaa..f116987 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -36,15 +36,79 @@ logger = logging.getLogger("unitpackage") -def create_df_resource(resource): +def create_tabular_resource_from_csv(csvname): + r""" + Return a Data Package built from a :param metadata: dict and tabular data + in :param csvname: str. + + The :param fields: list must must be structured such as + `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]`. + + EXAMPLES:: + + >>> resource = create_tabular_resource_from_csv("./examples/from_csv/from_csv.csv") + >>> resource # doctest: +NORMALIZE_WHITESPACE + {'name': 'from_csv', ... + + >>> resource.format + 'csv' + + """ + + csv_basename = os.path.basename(csvname) + + resource = Resource( + path=csv_basename, + basepath=os.path.dirname(csvname) or ".", + ) + + resource.infer() + + return resource + +def create_df_resource_from_df(df): + r""" + Return a pandas dataframe resource for a pandas DataFrame. + + EXAMPLES:: + + >>> data = {'x': [1, 2, 3], 'y': [4, 5, 6]} + >>> df = pd.DataFrame(data) + >>> from unitpackage.local import create_df_resource_from_df + >>> resource = create_df_resource_from_df(df) + >>> resource # doctest: +NORMALIZE_WHITESPACE + {'name': 'memory', + 'type': 'table', + 'data': [], + 'format': 'pandas', ... + + >>> resource.data + x y + 0 1 4 + 1 2 5 + 2 3 6 + + >>> resource.format + 'pandas' + + + """ + df_resource = Resource(df) + df_resource.infer() + + return df_resource + + +def create_df_resource_from_tabular_resource(resource): r""" Return a pandas dataframe resource for a frictionless Tabular Resource. 
EXAMPLES:: >>> from frictionless import Package - >>> resource = Package("./examples/local/no_bibliography/no_bibliography.json").resources[0] - >>> df_resource = create_df_resource(resource) # doctest: +NORMALIZE_WHITESPACE + >>> from unitpackage.local import create_df_resource_from_tabular_resource + >>> tabular_resource = Package("./examples/local/no_bibliography/no_bibliography.json").resources[0] + >>> df_resource = create_df_resource_from_tabular_resource(tabular_resource) # doctest: +NORMALIZE_WHITESPACE >>> df_resource {'name': 'memory', ... @@ -70,10 +134,6 @@ def create_df_resource(resource): 2 3 6 """ - if not resource: - raise ValueError( - "dataframe resource can not be created since the Data Package has no resources." - ) descriptor_path = ( resource.basepath + "/" + resource.path if resource.basepath else resource.path ) @@ -122,6 +182,54 @@ def collect_datapackages(data): return [Package(package) for package in packages] +def update_fields(schema, fields): + original_schema = schema + if not isinstance(fields, list): + raise ValueError( + "'fields' must be a list such as \ + [{'name': '', 'unit':''}]`, \ + e.g., `[{'name':'E', 'unit': 'mV}, {'name':'T', 'unit': 'K}]`" + ) + + # remove field if it is not a Mapping instance + from collections.abc import Mapping + + for field in fields: + if not isinstance(field, Mapping): + raise ValueError( + "'field' must be a dict such as {'name': '', 'unit':''},\ + e.g., `{'name':'j', 'unit': 'uA / cm2'}`" + ) + + provided_schema = Schema.from_descriptor({"fields": fields}, allow_invalid=True) + + new_fields = [] + unspecified_fields = [] + + for name in original_schema.field_names: + if name in provided_schema.field_names: + new_fields.append( + provided_schema.get_field(name).to_dict() + | original_schema.get_field(name).to_dict() + ) + else: + new_fields.append(original_schema.get_field(name).to_dict()) + + if len(unspecified_fields) != 0: + logger.warning( + f"Additional information were not provided for fields {unspecified_fields}." + ) + + unused_provided_fields = [] + for name in provided_schema.field_names: + if name not in original_schema.field_names: + unused_provided_fields.append(name) + if len(unused_provided_fields) != 0: + logger.warning( + f"Fields with names {unused_provided_fields} was provided but does not appear in the field names of tabular resource {original_schema.field_names}." 
+ ) + + return Schema.from_descriptor({"fields": new_fields}) def create_unitpackage(csvname, metadata=None, fields=None): r""" @@ -163,67 +271,14 @@ def create_unitpackage(csvname, metadata=None, fields=None): """ - csv_basename = os.path.basename(csvname) - - resource = Resource( - path=csv_basename, - basepath=os.path.dirname(csvname) or ".", - ) - - resource.infer() + resource = create_tabular_resource_from_csv(csvname) resource.custom.setdefault("metadata", {}) resource.custom["metadata"].setdefault("echemdb", metadata) if fields: # Update fields in the Resource describing the data in the CSV - resource_schema = resource.schema - if not isinstance(fields, list): - raise ValueError( - "'fields' must be a list such as \ - [{'name': '', 'unit':''}]`, \ - e.g., `[{'name':'E', 'unit': 'mV}, {'name':'T', 'unit': 'K}]`" - ) - - # remove field if it is not a Mapping instance - from collections.abc import Mapping - - for field in fields: - if not isinstance(field, Mapping): - raise ValueError( - "'field' must be a dict such as {'name': '', 'unit':''},\ - e.g., `{'name':'j', 'unit': 'uA / cm2'}`" - ) - - provided_schema = Schema.from_descriptor({"fields": fields}, allow_invalid=True) - - new_fields = [] - unspecified_fields = [] - - for name in resource_schema.field_names: - if name in provided_schema.field_names: - new_fields.append( - provided_schema.get_field(name).to_dict() - | resource_schema.get_field(name).to_dict() - ) - else: - new_fields.append(resource_schema.get_field(name).to_dict()) - - if len(unspecified_fields) != 0: - logger.warning( - f"Additional information were not provided for fields {unspecified_fields}." - ) - - unused_provided_fields = [] - for name in provided_schema.field_names: - if name not in resource_schema.field_names: - unused_provided_fields.append(name) - if len(unused_provided_fields) != 0: - logger.warning( - f"Fields with names {unused_provided_fields} was provided but does not appear in the field names of tabular resource {resource_schema.field_names}." 
- ) - - resource.schema = Schema.from_descriptor({"fields": new_fields}) + resource.schema = update_fields(resource.schema, fields) package = Package(resources=[resource]) From 2fbafb5bbc0f4857ba9b1e6412eb5b6110b11d0f Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Wed, 28 Jan 2026 09:49:32 +0100 Subject: [PATCH 02/14] refactor loading from df --- pixi.lock | 19 ++++++++++- unitpackage/entry.py | 78 +++++++++++++++++++++++++++++++------------- unitpackage/local.py | 45 ++++++++++++++++++------- 3 files changed, 108 insertions(+), 34 deletions(-) diff --git a/pixi.lock b/pixi.lock index 2c94af5..13e2aa0 100644 --- a/pixi.lock +++ b/pixi.lock @@ -5,6 +5,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -1579,6 +1581,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -3386,6 +3390,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -5074,6 +5080,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -6219,6 +6227,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -7816,6 +7826,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -9428,6 +9440,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -11031,6 +11045,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -12644,6 +12660,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -40375,7 +40393,6 @@ packages: - plotly>=5,<7 - pybtex>=0.25,<0.26 requires_python: '>=3.10' - editable: true - conda: https://conda.anaconda.org/conda-forge/noarch/uri-template-1.3.0-pyhd8ed1ab_1.conda sha256: 
e0eb6c8daf892b3056f08416a96d68b0a358b7c46b99c8a50481b22631a4dfc0 md5: e7cb0f5745e4c5035a460248334af7eb diff --git a/unitpackage/entry.py b/unitpackage/entry.py index 2bc9f67..d79d735 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -503,7 +503,7 @@ def add_offset(self, field_name=None, offset=None, unit=""): @property def mutable_resource(self): r""" - Return the data of this entry's "MutableResource" as a data frame. + Return the entry's "MutableResource". EXAMPLES:: @@ -524,11 +524,24 @@ def mutable_resource(self): self.resource.custom.setdefault("MutableResource", "") if not self.resource.custom["MutableResource"]: - from frictionless import Schema + if not self.resource.format in ["csv", "pandas"]: + raise ValueError( + "MutableResource can only be created from resources of format 'csv' or 'pandas'." + ) + + if self.resource.format == "csv": + + from unitpackage.local import create_df_resource_from_tabular_resource + + self.resource.custom["MutableResource"] = ( + create_df_resource_from_tabular_resource(self.resource) + ) - from unitpackage.local import create_df_resource_from_tabular_resource + elif self.resource.format == "pandas": + self.resource.custom["MutableResource"] = self.resource + + from frictionless import Schema - self.resource.custom["MutableResource"] = create_df_resource_from_tabular_resource(self.resource) self.resource.custom["MutableResource"].schema = Schema.from_descriptor( self.resource.schema.to_dict() ) @@ -556,6 +569,18 @@ def df(self): {'name': 'E', 'type': 'number', 'unit': 'V', 'reference': 'RHE'}, {'name': 'j', 'type': 'number', 'unit': 'A / m2'}] + TESTS:: + + >>> import pandas as pd + >>> from unitpackage.entry import Entry + >>> df = pd.DataFrame({'x':[1,2,3], 'y':[2,3,4]}) + >>> entry = Entry.from_df(df=df, basename='test_df') + >>> entry.df + x y + 0 1 2 + 1 2 3 + 2 3 4 + """ return self.mutable_resource.data @@ -718,7 +743,7 @@ def from_csv(cls, csvname, metadata=None, fields=None): Units describing the fields can be provided:: - >>> import os + >>> from unitpackage.entry import Entry >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] >>> entry = Entry.from_csv(csvname='examples/from_csv/from_csv.csv', fields=fields) >>> entry @@ -730,7 +755,6 @@ def from_csv(cls, csvname, metadata=None, fields=None): Metadata can be appended:: - >>> import os >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] >>> metadata = {'user':'Max Doe'} >>> entry = Entry.from_csv(csvname='examples/from_csv/from_csv.csv', metadata=metadata, fields=fields) @@ -742,7 +766,6 @@ def from_csv(cls, csvname, metadata=None, fields=None): A filename containing upper case characters:: - >>> import os >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] >>> entry = Entry.from_csv(csvname='examples/from_csv/UpperCase.csv', fields=fields) >>> entry @@ -757,9 +780,15 @@ def from_csv(cls, csvname, metadata=None, fields=None): ... 
""" - from unitpackage.local import create_unitpackage + from unitpackage.local import ( + create_tabular_resource_from_csv, + create_unitpackage, + ) - package = create_unitpackage(csvname=csvname, metadata=metadata, fields=fields) + resource = create_tabular_resource_from_csv(csvname=csvname) + package = create_unitpackage( + resource=resource, metadata=metadata, fields=fields + ) return cls(resource=package.resources[0]) @@ -891,9 +920,13 @@ def from_local(cls, filename): return cls(resource=package.resources[0]) @classmethod - def from_df(cls, df, metadata=None, fields=None, outdir=None, *, basename): + def from_df(cls, df, metadata=None, fields=None, *, basename): r""" Returns an entry constructed from a pandas dataframe. + A name `basename` for the entry must be provided. + The name must be lower-case and contain only alphanumeric + characters along with `.` , `_` or `-` characters'. + (Upper case characters are converted to lower case.) EXAMPLES:: @@ -941,21 +974,15 @@ def from_df(cls, df, metadata=None, fields=None, outdir=None, *, basename): [{'name': 'x', 'type': 'integer', 'unit': 'm'}, {'name': 'y', 'type': 'integer'}] """ - if outdir is None: - import atexit - import shutil - import tempfile - - outdir = tempfile.mkdtemp() - atexit.register(shutil.rmtree, outdir) + from unitpackage.local import create_df_resource_from_df, create_unitpackage - csvname = basename + ".csv" + resource = create_df_resource_from_df(df) + resource.name = basename.lower() - df.to_csv(os.path.join(outdir, csvname), index=False) - - return cls.from_csv( - os.path.join(outdir, csvname), metadata=metadata, fields=fields + package = create_unitpackage( + resource=resource, metadata=metadata, fields=fields ) + return cls(resource=package.resources[0]) def save(self, *, outdir, basename=None): r""" @@ -1028,6 +1055,7 @@ def save(self, *, outdir, basename=None): os.makedirs(outdir) basename = basename or self.identifier + basename = basename.lower() csv_name = os.path.join(outdir, basename + ".csv") json_name = os.path.join(outdir, basename + ".json") @@ -1038,6 +1066,12 @@ def save(self, *, outdir, basename=None): self.resource.path = basename + ".csv" self.resource.name = basename + # convert a pandas resource into a csv resource + if self.resource.format == "pandas": + self.resource.format = "csv" + self.resource.mediatype = "text/csv" + del self.resource.data + resource = self.resource.to_dict() # update the fields from the main resource with those from the "MutableResource"resource diff --git a/unitpackage/local.py b/unitpackage/local.py index f116987..e294af1 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -66,6 +66,7 @@ def create_tabular_resource_from_csv(csvname): return resource + def create_df_resource_from_df(df): r""" Return a pandas dataframe resource for a pandas DataFrame. @@ -182,7 +183,29 @@ def collect_datapackages(data): return [Package(package) for package in packages] + def update_fields(schema, fields): + r""" + Return a new Schema based on :param schema: where the fields have been + updated with the information in :param fields:. + + The :param fields: list must must be structured such as + `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]`. 
+ + EXAMPLES:: + + >>> from unitpackage.local import update_fields, create_tabular_resource_from_csv + >>> schema = create_tabular_resource_from_csv("./examples/from_csv/from_csv.csv").schema + >>> schema + {'fields': [{'name': 'E', 'type': 'integer'}, {'name': 'I', 'type': 'integer'}]} + + >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] + >>> new_schema = update_fields(schema, fields) + >>> new_schema # doctest: +NORMALIZE_WHITESPACE + {'fields': [{'name': 'E', 'type': 'integer', 'unit': 'mV'}, + {'name': 'I', 'type': 'integer', 'unit': 'A'}]} + + """ original_schema = schema if not isinstance(fields, list): raise ValueError( @@ -205,6 +228,7 @@ def update_fields(schema, fields): new_fields = [] unspecified_fields = [] + unused_provided_fields = [] for name in original_schema.field_names: if name in provided_schema.field_names: @@ -212,6 +236,8 @@ def update_fields(schema, fields): provided_schema.get_field(name).to_dict() | original_schema.get_field(name).to_dict() ) + elif name not in original_schema.field_names: + unused_provided_fields.append(name) else: new_fields.append(original_schema.get_field(name).to_dict()) @@ -220,10 +246,6 @@ def update_fields(schema, fields): f"Additional information were not provided for fields {unspecified_fields}." ) - unused_provided_fields = [] - for name in provided_schema.field_names: - if name not in original_schema.field_names: - unused_provided_fields.append(name) if len(unused_provided_fields) != 0: logger.warning( f"Fields with names {unused_provided_fields} was provided but does not appear in the field names of tabular resource {original_schema.field_names}." @@ -231,7 +253,8 @@ def update_fields(schema, fields): return Schema.from_descriptor({"fields": new_fields}) -def create_unitpackage(csvname, metadata=None, fields=None): + +def create_unitpackage(resource, metadata=None, fields=None): r""" Return a Data Package built from a :param metadata: dict and tabular data in :param csvname: str. @@ -241,8 +264,10 @@ def create_unitpackage(csvname, metadata=None, fields=None): EXAMPLES:: + >>> from unitpackage.local import create_tabular_resource_from_csv, create_unitpackage + >>> resource = create_tabular_resource_from_csv("./examples/from_csv/from_csv.csv") >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] - >>> package = create_unitpackage("./examples/from_csv/from_csv.csv", fields=fields) + >>> package = create_unitpackage(resource=resource, fields=fields) >>> package # doctest: +NORMALIZE_WHITESPACE {'resources': [{'name': ... @@ -252,7 +277,7 @@ def create_unitpackage(csvname, metadata=None, fields=None): Invalid fields:: >>> fields = 'not a list' - >>> package = create_unitpackage("./examples/from_csv/from_csv.csv", fields=fields) # doctest: +NORMALIZE_WHITESPACE + >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... 
ValueError: 'fields' must be a list such as @@ -262,17 +287,15 @@ def create_unitpackage(csvname, metadata=None, fields=None): More fields than required:: >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] - >>> package = create_unitpackage("./examples/from_csv/from_csv.csv", fields=fields) # doctest: +NORMALIZE_WHITESPACE + >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE Part of the fields specified: >>> fields = [{'name':'E', 'unit': 'mV'}] - >>> package = create_unitpackage("./examples/from_csv/from_csv.csv", fields=fields) # doctest: +NORMALIZE_WHITESPACE + >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE """ - resource = create_tabular_resource_from_csv(csvname) - resource.custom.setdefault("metadata", {}) resource.custom["metadata"].setdefault("echemdb", metadata) From 0fb4f7ed2bb32db948929bed62fc607677dec0cb Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Wed, 28 Jan 2026 09:55:50 +0100 Subject: [PATCH 03/14] Move docstrings to update fields --- unitpackage/local.py | 45 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/unitpackage/local.py b/unitpackage/local.py index e294af1..6cee4ef 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -205,6 +205,29 @@ def update_fields(schema, fields): {'fields': [{'name': 'E', 'type': 'integer', 'unit': 'mV'}, {'name': 'I', 'type': 'integer', 'unit': 'A'}]} + TESTS: + + Invalid fields:: + + >>> fields = 'not a list' + >>> new_schema = update_fields(schema, fields) + Traceback (most recent call last): + ... + ValueError: 'fields' must be a list such as + [{'name': '', 'unit':''}]`, + e.g., `[{'name':'E', 'unit': 'mV}, {'name':'T', 'unit': 'K}]` + + More fields than required:: + + >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] + >>> new_schema = update_fields(schema, fields) # doctest: +NORMALIZE_WHITESPACE + + Part of the fields specified: + + >>> fields = [{'name':'E', 'unit': 'mV'}] + >>> new_schema = update_fields(schema, fields) # doctest: +NORMALIZE_WHITESPACE + + """ original_schema = schema if not isinstance(fields, list): @@ -272,28 +295,6 @@ def create_unitpackage(resource, metadata=None, fields=None): {'resources': [{'name': ... - TESTS: - - Invalid fields:: - - >>> fields = 'not a list' - >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE - Traceback (most recent call last): - ... 
- ValueError: 'fields' must be a list such as - [{'name': '', 'unit':''}]`, - e.g., `[{'name':'E', 'unit': 'mV}, {'name':'T', 'unit': 'K}]` - - More fields than required:: - - >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] - >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE - - Part of the fields specified: - - >>> fields = [{'name':'E', 'unit': 'mV'}] - >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE - """ resource.custom.setdefault("metadata", {}) From 460fb63bdf85d3f0b0d1f9bd565cd1664d6e7c2b Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Wed, 28 Jan 2026 11:15:46 +0100 Subject: [PATCH 04/14] include baseloader --- unitpackage/entry.py | 25 ++++++++-- unitpackage/local.py | 109 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 113 insertions(+), 21 deletions(-) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index d79d735..3d8b32a 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -55,7 +55,7 @@ # ******************************************************************** # This file is part of unitpackage. # -# Copyright (C) 2021-2025 Albert Engstfeld +# Copyright (C) 2021-2026 Albert Engstfeld # Copyright (C) 2021 Johannes Hermann # Copyright (C) 2021-2022 Julian Rüth # Copyright (C) 2021 Nicolas Hörmann @@ -735,7 +735,17 @@ def plot(self, x_label=None, y_label=None, name=None): return fig @classmethod - def from_csv(cls, csvname, metadata=None, fields=None): + def from_csv( + cls, + csvname, + encoding=None, + header_lines=None, + column_header_lines=None, + decimal=None, + delimiters=None, + metadata=None, + fields=None, + ): r""" Returns an entry constructed from a CSV with a single header line. @@ -785,7 +795,16 @@ def from_csv(cls, csvname, metadata=None, fields=None): create_unitpackage, ) - resource = create_tabular_resource_from_csv(csvname=csvname) + # pylint: disable=duplicate-code + resource = create_tabular_resource_from_csv( + csvname=csvname, + encoding=encoding, + header_lines=header_lines, + column_header_lines=column_header_lines, + decimal=decimal, + delimiters=delimiters, + ) + package = create_unitpackage( resource=resource, metadata=metadata, fields=fields ) diff --git a/unitpackage/local.py b/unitpackage/local.py index 6cee4ef..70fbdbe 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -6,7 +6,7 @@ # ******************************************************************** # This file is part of unitpackage. # -# Copyright (C) 2021-2025 Albert Engstfeld +# Copyright (C) 2021-2026 Albert Engstfeld # Copyright (C) 2021 Johannes Hermann # Copyright (C) 2021 Julian Rüth # Copyright (C) 2021 Nicolas Hörmann @@ -36,35 +36,109 @@ logger = logging.getLogger("unitpackage") -def create_tabular_resource_from_csv(csvname): +def create_tabular_resource_from_csv( + csvname, + encoding=None, + header_lines=None, + column_header_lines=None, + decimal=None, + delimiters=None, +): r""" - Return a Data Package built from a :param metadata: dict and tabular data - in :param csvname: str. - - The :param fields: list must must be structured such as - `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]`. + Return a resource built from a provided CSV. EXAMPLES:: - >>> resource = create_tabular_resource_from_csv("./examples/from_csv/from_csv.csv") + For standard CSV files (single header line and subsequent + lines with data, using `.` as decimal separator.) 
+ a tabular data resource is created:: + + >>> filename = './examples/from_csv/from_csv.csv' + >>> resource = create_tabular_resource_from_csv(filename) >>> resource # doctest: +NORMALIZE_WHITESPACE - {'name': 'from_csv', ... + {'name': 'from_csv', + 'type': 'table', + 'path': 'from_csv.csv', + 'scheme': 'file', + 'format': 'csv', + 'mediatype': 'text/csv', ... - >>> resource.format - 'csv' + For CSV files with a more complex structure (header, multiple column header lines, or other separators) + a pandas dataframe resource is created instead:: - """ + >>> filename = 'examples/from_csv/from_csv_multiple_headers.csv' + >>> resource = create_tabular_resource_from_csv(csvname=filename, column_header_lines=2) + >>> resource # doctest: +NORMALIZE_WHITESPACE + {'name': 'memory', + 'type': 'table', + 'data': [], + 'format': 'pandas', + 'mediatype': 'application/pandas', + 'schema': {'fields': [{'name': 'E / V', 'type': 'integer'}, + {'name': 'j / A / cm2', 'type': 'integer'}]}} + + """ csv_basename = os.path.basename(csvname) - resource = Resource( - path=csv_basename, - basepath=os.path.dirname(csvname) or ".", + if not header_lines and not column_header_lines and not decimal and not delimiters: + resource = Resource( + path=csv_basename, + basepath=os.path.dirname(csvname) or ".", + ) + resource.infer() + return resource + + # pylint: disable=duplicate-code + return create_df_resource_from_csv( + csvname, + encoding=encoding, + header_lines=header_lines, + column_header_lines=column_header_lines, + decimal=decimal, + delimiters=delimiters, ) - resource.infer() - return resource +def create_df_resource_from_csv( + csvname, + encoding=None, + header_lines=None, + column_header_lines=None, + decimal=None, + delimiters=None, +): + r""" + Create a pandas dataframe resource from a CSV file. + + EXAMPLES:: + + >>> from unitpackage.local import create_df_resource_from_csv + >>> filename = 'examples/from_csv/from_csv_multiple_headers.csv' + >>> resource = create_df_resource_from_csv(csvname='examples/from_csv/from_csv_multiple_headers.csv', column_header_lines=2) + >>> resource # doctest: +NORMALIZE_WHITESPACE + {'name': 'memory', + 'type': 'table', + 'data': [], + 'format': 'pandas', + 'mediatype': 'application/pandas', + 'schema': {'fields': [{'name': 'E / V', 'type': 'integer'}, + {'name': 'j / A / cm2', 'type': 'integer'}]}} + + """ + + from unitpackage.loaders.baseloader import BaseLoader + + with open(csvname, "r", encoding=encoding or "utf-8") as f: + csv = BaseLoader( + f, + header_lines=header_lines, + column_header_lines=column_header_lines, + decimal=decimal, + delimiters=delimiters, + ) + + return create_df_resource_from_df(csv.df) def create_df_resource_from_df(df): @@ -296,7 +370,6 @@ def create_unitpackage(resource, metadata=None, fields=None): ... 
""" - resource.custom.setdefault("metadata", {}) resource.custom["metadata"].setdefault("echemdb", metadata) From b399db315331c355fe6a4d6d1e50e2b1aa1d4ec0 Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Wed, 28 Jan 2026 11:27:51 +0100 Subject: [PATCH 05/14] fix pixi -v in workflows and add missing file --- .github/workflows/doc.yml | 2 +- .github/workflows/lint.yml | 2 +- .github/workflows/test.yml | 2 +- examples/from_csv/from_csv_multiple_headers.csv | 4 ++++ 4 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 examples/from_csv/from_csv_multiple_headers.csv diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml index c66ca99..ef13b7b 100644 --- a/.github/workflows/doc.yml +++ b/.github/workflows/doc.yml @@ -21,7 +21,7 @@ jobs: uses: actions/checkout@v6 with: { submodules: recursive } - uses: prefix-dev/setup-pixi@v0.9.3 - with: { pixi-version: v0.50.2 } + with: { pixi-version: v0.63.2 } - name: build documentation run: | pixi run doc diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index dc1f92a..2d4a795 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -18,7 +18,7 @@ jobs: - name: checkout uses: actions/checkout@v6 - uses: prefix-dev/setup-pixi@v0.9.3 - with: { pixi-version: v0.50.2 } + with: { pixi-version: v0.63.2 } - name: pylint run: pixi run pylint - name: black diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4903b2e..963b3db 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -33,7 +33,7 @@ jobs: with: { submodules: recursive } - uses: prefix-dev/setup-pixi@v0.9.3 with: - pixi-version: v0.50.2 + pixi-version: v0.63.2 - name: doctest run: | pixi run -e ${{ matrix.environment }} doctest ${{ matrix.remote-data == 'remote' && '--remote-data' || '' }} diff --git a/examples/from_csv/from_csv_multiple_headers.csv b/examples/from_csv/from_csv_multiple_headers.csv new file mode 100644 index 0000000..e8b1307 --- /dev/null +++ b/examples/from_csv/from_csv_multiple_headers.csv @@ -0,0 +1,4 @@ +E,j +V,A / cm2 +1,2 +3,4 From 69e2d5d06567ceaa6e60dfe90bc3db6c7445845d Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Thu, 29 Jan 2026 15:34:39 +0100 Subject: [PATCH 06/14] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- unitpackage/entry.py | 5 +++-- unitpackage/local.py | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index 3d8b32a..a4e1227 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -524,7 +524,7 @@ def mutable_resource(self): self.resource.custom.setdefault("MutableResource", "") if not self.resource.custom["MutableResource"]: - if not self.resource.format in ["csv", "pandas"]: + if self.resource.format not in ["csv", "pandas"]: raise ValueError( "MutableResource can only be created from resources of format 'csv' or 'pandas'." 
) @@ -1089,7 +1089,8 @@ def save(self, *, outdir, basename=None): if self.resource.format == "pandas": self.resource.format = "csv" self.resource.mediatype = "text/csv" - del self.resource.data + if hasattr(self.resource, "data"): + del self.resource.data resource = self.resource.to_dict() diff --git a/unitpackage/local.py b/unitpackage/local.py index 70fbdbe..8867746 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -148,6 +148,7 @@ def create_df_resource_from_df(df): EXAMPLES:: >>> data = {'x': [1, 2, 3], 'y': [4, 5, 6]} + >>> import pandas as pd >>> df = pd.DataFrame(data) >>> from unitpackage.local import create_df_resource_from_df >>> resource = create_df_resource_from_df(df) @@ -327,25 +328,30 @@ def update_fields(schema, fields): unspecified_fields = [] unused_provided_fields = [] + # First, update fields that exist in the original schema, + # and record which original fields have no additional information provided. for name in original_schema.field_names: if name in provided_schema.field_names: new_fields.append( provided_schema.get_field(name).to_dict() | original_schema.get_field(name).to_dict() ) - elif name not in original_schema.field_names: - unused_provided_fields.append(name) else: + unspecified_fields.append(name) new_fields.append(original_schema.get_field(name).to_dict()) + # Then, record any provided fields that are not present in the original schema. + for name in provided_schema.field_names: + if name not in original_schema.field_names: + unused_provided_fields.append(name) if len(unspecified_fields) != 0: logger.warning( - f"Additional information were not provided for fields {unspecified_fields}." + f"Additional information was not provided for fields {unspecified_fields}." ) if len(unused_provided_fields) != 0: logger.warning( - f"Fields with names {unused_provided_fields} was provided but does not appear in the field names of tabular resource {original_schema.field_names}." + f"Fields with names {unused_provided_fields} were provided but do not appear in the field names of tabular resource {original_schema.field_names}." ) return Schema.from_descriptor({"fields": new_fields}) @@ -354,9 +360,9 @@ def update_fields(schema, fields): def create_unitpackage(resource, metadata=None, fields=None): r""" Return a Data Package built from a :param metadata: dict and tabular data - in :param csvname: str. + in :param resource: frictionless.Resource. - The :param fields: list must must be structured such as + The :param fields: list must be structured such as `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]`. EXAMPLES:: From 86953dc3506c739c756fbfe586125bbb4d60bf3e Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Thu, 29 Jan 2026 20:55:08 +0100 Subject: [PATCH 07/14] removed deprecated `cv` modules --- doc/news/load-metadata.rst | 24 +++ unitpackage/cv/__init__.py | 0 unitpackage/cv/cv_collection.py | 128 -------------- unitpackage/cv/cv_entry.py | 286 -------------------------------- 4 files changed, 24 insertions(+), 414 deletions(-) create mode 100644 doc/news/load-metadata.rst delete mode 100644 unitpackage/cv/__init__.py delete mode 100644 unitpackage/cv/cv_collection.py delete mode 100644 unitpackage/cv/cv_entry.py diff --git a/doc/news/load-metadata.rst b/doc/news/load-metadata.rst new file mode 100644 index 0000000..450b3a7 --- /dev/null +++ b/doc/news/load-metadata.rst @@ -0,0 +1,24 @@ +**Added:** + +* Added . + +**Changed:** + +* Changed . + +**Removed:** + +* Removed deprecated module `cv_collection`. 
+* Removed deprecated module `cv_entry`. + +**Fixed:** + +* Fixed . + +**Deprecated:** + +* Deprecated . + +**Performance:** + +* Improved . diff --git a/unitpackage/cv/__init__.py b/unitpackage/cv/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/unitpackage/cv/cv_collection.py b/unitpackage/cv/cv_collection.py deleted file mode 100644 index f7db1bc..0000000 --- a/unitpackage/cv/cv_collection.py +++ /dev/null @@ -1,128 +0,0 @@ -r""" -A Collection of Cyclic Voltammograms. It provides additional functionalities compared to -the :class:`Collection` specific to Cyclic Voltammograms and electrochemical data. - -EXAMPLES: - -Create a collection from local `frictionless Data Packages `__ -in the `data/` directory:: - - >>> from unitpackage.cv.cv_collection import CVCollection - >>> collection = CVCollection.from_local('data/') - -Create a collection from the Data Packages published in the `echemdb data repository -`_ displayed on the `echemdb website -`_.:: - - >>> collection = CVCollection.from_remote() # doctest: +REMOTE_DATA - -Search the collection for entries from a single publication:: - - >>> collection.filter(lambda entry: entry.source.url == 'https://doi.org/10.1039/C0CP01001D') # doctest: +REMOTE_DATA - [CVEntry('alves_2011_electrochemistry_6010_f1a_solid'), ... - -""" - -# ******************************************************************** -# This file is part of unitpackage. -# -# Copyright (C) 2021-2025 Albert Engstfeld -# Copyright (C) 2021 Johannes Hermann -# Copyright (C) 2021-2022 Julian Rüth -# Copyright (C) 2021 Nicolas Hörmann -# -# unitpackage is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# unitpackage is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with unitpackage. If not, see . -# ******************************************************************** -import logging -import warnings - -from unitpackage.collection import Collection - -logger = logging.getLogger("unitpackage") - - -class CVCollection(Collection): - r""" - A collection of `frictionless Data Packages `__. - - Essentially this is just a list of data packages with some additional - convenience wrap for use in the `echemdb data repository `_ - displayed on the `echemdb website `_. - - EXAMPLES: - - An example collection:: - - >>> collection = Collection.create_example() - >>> collection.package.resource_names # doctest: +NORMALIZE_WHITESPACE - ['alves_2011_electrochemistry_6010_f1a_solid', - 'engstfeld_2018_polycrystalline_17743_f4b_1', - 'no_bibliography'] - - """ - - from unitpackage.cv.cv_entry import CVEntry - - Entry = CVEntry - - def __init__(self, *args, **kwargs): - warnings.warn( - f"{self.__class__.__name__} is deprecated. Loading the echemdb database has been moved to `echemdb.Echemdb` and will be removed or refactored in a future version.", - category=DeprecationWarning, - stacklevel=2, - ) - super().__init__(*args, **kwargs) - - def materials(self): - r""" - Return the substrate materials in the collection. 
- - EXAMPLES:: - - >>> collection = CVCollection.create_example() - >>> collection.materials() == {'Cu', 'Ru'} - True - - """ - # pylint: disable=R0801 - import pandas as pd - - return set( - pd.unique(pd.Series([entry.get_electrode("WE").material for entry in self])) - ) - - def describe(self): - r""" - Return some statistics about the collection. - - EXAMPLES:: - - >>> collection = CVCollection.create_example() - >>> collection.describe() == \ - ... {'number of references': 2, - ... 'number of entries': 3, - ... 'materials': {'Cu', 'Ru'}} - True - - """ - # pylint: disable=R0801 - return { - "number of references": ( - 0 - if isinstance(self.bibliography, str) - else len(self.bibliography.entries) - ), - "number of entries": len(self), - "materials": self.materials(), - } diff --git a/unitpackage/cv/cv_entry.py b/unitpackage/cv/cv_entry.py deleted file mode 100644 index 805b5bd..0000000 --- a/unitpackage/cv/cv_entry.py +++ /dev/null @@ -1,286 +0,0 @@ -r""" -A Data Package describing a Cyclic Voltammogram (CV) found in the field of electrochemistry. -It provides additional functionalities compared to the class :class:`~unitpackage.entry.Entry`. - -These are the individual elements of a :class:`~unitpackage.cv.cv_collection.CVCollection`. - -EXAMPLES: - -Create a collection from local `frictionless Data Packages `__ -in the `data/` directory:: - - >>> from unitpackage.cv.cv_collection import CVCollection - >>> collection = CVCollection.from_local('data/') - -We can directly access the material of an electrode used in the experiment, -such as the WE, CE or REF:: - - >>> from unitpackage.cv.cv_collection import CVCollection - >>> db = CVCollection.create_example() - >>> entry = db['alves_2011_electrochemistry_6010_f1a_solid'] - >>> entry.get_electrode('WE').material - 'Ru' - -The :meth:`~unitpackage.cv.cv_entry.CVEntry.plot` creates a typical representation of a CV, -where ``I`` or. ``j`` is plotted vs. ``U`` or. ``E``:: - - >>> entry.plot() - Figure(...) - -""" - -# ******************************************************************** -# This file is part of unitpackage. -# -# Copyright (C) 2021-2025 Albert Engstfeld -# Copyright (C) 2021 Johannes Hermann -# Copyright (C) 2021-2022 Julian Rüth -# Copyright (C) 2021 Nicolas Hörmann -# -# unitpackage is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# unitpackage is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with unitpackage. If not, see . -# ******************************************************************** -import logging -import warnings - -from unitpackage.entry import Entry - -logger = logging.getLogger("unitpackage") - - -class CVEntry(Entry): - r""" - A `frictionless Data Package `_ describing a CV. - - EXAMPLES: - - An entry can be created directly from a Data Package that has been created - with `svgdigitizer's `_ `cv` command. 
- However, entries are normally obtained by opening a :class:`~unitpackage.cv.cv_collection.CVCollection` of entries:: - - >>> from unitpackage.cv.cv_collection import CVCollection - >>> collection = CVCollection.create_example() - >>> entry = next(iter(collection)) - - """ - - def __init__(self, *args, **kwargs): - warnings.warn( - f"{self.__class__.__name__} is deprecated. Use `echemdb.echemdb_entry.EchemdbEntry` instead.", - category=DeprecationWarning, - stacklevel=2, - ) - super().__init__(*args, **kwargs) - - def __repr__(self): - r""" - Return a printable representation of this entry. - - EXAMPLES:: - - >>> entry = CVEntry.create_examples()[0] - >>> entry - CVEntry('alves_2011_electrochemistry_6010_f1a_solid') - - """ - return f"CVEntry({self.identifier!r})" - - def get_electrode(self, name): - r""" - Returns an electrode with the specified name. - - EXAMPLES:: - - >>> entry = CVEntry.create_examples()[0] - >>> entry.get_electrode('WE') # doctest: +NORMALIZE_WHITESPACE - {'name': 'WE', 'function': 'workingElectrode', 'type': 'single crystal', - 'crystallographicOrientation': '0001', 'material': 'Ru', - 'preparationProcedure': 'Sputtering and flash annealing under UHV - conditions with repeated cycles of oxygen adsorption and desorption.', - 'shape': {'height': {'unit': 'mm', 'value': 2}, 'type': 'hat shaped'}, - 'source': {'supplier': 'Mateck'}} - - TESTS:: - - >>> entry.get_electrode('foo') # doctest: +NORMALIZE_WHITESPACE - Traceback (most recent call last): - ... - KeyError: "Electrode with name 'foo' does not exist" - - """ - # pylint: disable=R0801 - for electrode in self.system.electrodes: - if electrode["name"] == name: - return electrode - - raise KeyError(f"Electrode with name '{name}' does not exist") - - def rescale(self, units): - r""" - Return a rescaled :class:`~unitpackage.cv.cv_entry.CVEntry` with axes in the specified ``units``. - - Usage is essentially the same as for :meth:`~unitpackage.entry.Entry.rescale`, i.e., - new units are expected as dict, where the key is the axis name and the value - the new unit, such as ``{'j': 'uA / cm2', 't': 'h'}``. - - Additionally, the entry can be rescaled to the axes' units of the original data. - These units must be defined in the metadata of the resource, - within the key ``figure_description.fields``:: - - >>> entry = CVEntry.create_examples()[0] - >>> rescaled_entry = entry.rescale(units='original') - >>> rescaled_entry.mutable_resource.schema.fields # doctest: +NORMALIZE_WHITESPACE - [{'name': 't', 'type': 'number', 'unit': 's'}, - {'name': 'E', 'type': 'number', 'unit': 'V', 'reference': 'RHE'}, - {'name': 'j', 'type': 'number', 'unit': 'mA / cm2'}] - - """ - # pylint: disable=R0801 - if units == "original": - units = { - field["name"]: field["unit"] for field in self.figureDescription.fields - } - - return super().rescale(units) - - def _normalize_field_name(self, field_name): - r""" - Return the name of a field name of the `unitpackage` resource. - - If 'j' is requested but is not present in the resource, - 'I' is returned instead. - - EXAMPLES:: - - >>> entry = CVEntry.create_examples()[0] - >>> entry._normalize_field_name('j') - 'j' - >>> entry._normalize_field_name('x') - Traceback (most recent call last): - ... - ValueError: No axis with name 'x' found. 
- - """ - # pylint: disable=R0801 - if field_name in self.mutable_resource.schema.field_names: - return field_name - if field_name == "j": - return self._normalize_field_name("I") - raise ValueError(f"No axis with name '{field_name}' found.") - - def thumbnail(self, width=96, height=72, dpi=72, **kwds): - r""" - Return a thumbnail of the entry's curve as a PNG byte stream. - - EXAMPLES:: - - >>> entry = CVEntry.create_examples()[0] - >>> thumb = entry.thumbnail() - >>> thumb.startswith(b'\x89PNG') # different python versions may produce different binary outputs using " or '. - True - - The PNG's ``width`` and ``height`` can be specified in pixels. - Additional keyword arguments are passed to the data frame plotting - method:: - - >>> thumb = entry.thumbnail(width=4, height=2, color='red', linewidth=2) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - >>> thumb.startswith(b'\x89PNG') # different python versions may produce different binary outputs using " or '. - True - - """ - # pylint: disable=R0801 - kwds.setdefault("color", "b") - kwds.setdefault("linewidth", 1) - kwds.setdefault("legend", False) - - import matplotlib.pyplot - - # A reasonable DPI setting that should work for most screens is the default value of 72. - fig, axis = matplotlib.pyplot.subplots( - 1, 1, figsize=[width / dpi, height / dpi], dpi=dpi - ) - self.df.plot( - "E", - self._normalize_field_name("j"), - ax=axis, - **kwds, - ) - - matplotlib.pyplot.axis("off") - matplotlib.pyplot.close(fig) - - import io - - buffer = io.BytesIO() - fig.savefig(buffer, format="png", transparent=True, dpi=dpi) - - buffer.seek(0) - return buffer.read() - - def plot(self, x_label="E", y_label="j", name=None): - r""" - Return a plot of this entry. - The default plot is a Cyclic Voltammogram ('j vs E'). - When `j` is not present in the data, `I` is used instead. - - EXAMPLES:: - - >>> entry = CVEntry.create_examples()[0] - >>> entry.plot() - Figure(...) - - The plot can also be returned with custom axis dimensions (field names) available in the resource:: - - >>> entry.plot(x_label='t', y_label='E') - Figure(...) - - A plot resembling the original figure can be obtained by first rescaling:: - - >>> rescaled_entry = entry.rescale('original') - >>> rescaled_entry.plot() - Figure(...) - - """ - # pylint: disable=R0801 - x_label = self._normalize_field_name(x_label) - y_label = self._normalize_field_name(y_label) - - def figure_name(): - if ( - hasattr(self.resource, "source") - and hasattr(self.resource.source, "figure") - and hasattr(self.resource.source, "curve") - ): - return f"Fig. {self.source.figure}: {self.source.curve}" - - return self.identifier - - fig = super().plot(x_label=x_label, y_label=y_label, name=name or figure_name()) - - def reference(label): - if not label == "E": - return "" - field = self.mutable_resource.schema.get_field(label).to_dict() - if "reference" not in field: - return "" - return f" vs. 
{field['reference']}" - - def axis_label(label): - return f"{label} [{self.field_unit(label)}{reference(label)}]" - - fig.update_layout( - xaxis_title=axis_label(x_label), - yaxis_title=axis_label(y_label), - ) - - return fig From 7ef5f47b931fcc5cb21b72c9e277e41bc5d9f36d Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Thu, 29 Jan 2026 21:12:37 +0100 Subject: [PATCH 08/14] Add class property `default_metadata_key` --- pixi.lock | 2 +- pyproject.toml | 1 - unitpackage/collection.py | 63 +--------- unitpackage/database/echemdb.py | 58 ++++++++- unitpackage/database/echemdb_entry.py | 127 +++++++++++++++++++- unitpackage/descriptor.py | 8 +- unitpackage/entry.py | 167 ++++++++------------------ unitpackage/local.py | 2 +- 8 files changed, 242 insertions(+), 186 deletions(-) diff --git a/pixi.lock b/pixi.lock index 13e2aa0..2345818 100644 --- a/pixi.lock +++ b/pixi.lock @@ -40383,7 +40383,7 @@ packages: - pypi: ./ name: unitpackage version: 0.11.2 - sha256: 6ee96f38df146d7ad06c80c2571eed38cdc16d6de7f0262440a1b092a35f6364 + sha256: af8d578e9c797241a93a5f9807b13eee883afc722b8fef79022d4fcb8beeca0c requires_dist: - astropy>=5,<8 - click>=8,<9 diff --git a/pyproject.toml b/pyproject.toml index b2c5f98..ce01c11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,6 @@ unitpackage = "unitpackage.entrypoint:cli" [tool.setuptools] packages = [ "unitpackage", - "unitpackage.cv", "unitpackage.database", "unitpackage.electrochemistry", "unitpackage.loaders", diff --git a/unitpackage/collection.py b/unitpackage/collection.py index e4c1f0f..5a8300e 100644 --- a/unitpackage/collection.py +++ b/unitpackage/collection.py @@ -20,7 +20,7 @@ Search the collection for entries, for example, from a single publication providing its DOI:: - >>> collection.filter(lambda entry: entry.source.url == 'https://doi.org/10.1039/C0CP01001D') # doctest: +REMOTE_DATA + >>> collection.filter(lambda entry: entry.echemdb.source.url == 'https://doi.org/10.1039/C0CP01001D') # doctest: +REMOTE_DATA [Entry('alves_2011_electrochemistry_6010_f1a_solid'), ... """ @@ -28,7 +28,7 @@ # ******************************************************************** # This file is part of unitpackage. # -# Copyright (C) 2021-2025 Albert Engstfeld +# Copyright (C) 2021-2026 Albert Engstfeld # Copyright (C) 2021 Johannes Hermann # Copyright (C) 2021-2022 Julian Rüth # Copyright (C) 2021 Nicolas Hörmann @@ -47,7 +47,6 @@ # along with unitpackage. If not, see . # ******************************************************************** import logging -from functools import cached_property from frictionless import Package @@ -138,60 +137,6 @@ def create_example(cls): package=package, ) - @cached_property - def bibliography(self): - r""" - Return a pybtex database of all bibtex bibliography files, - associated with the entries. - - EXAMPLES:: - - >>> collection = Collection.create_example() - >>> collection.bibliography - BibliographyData( - entries=OrderedCaseInsensitiveDict([ - ('alves_2011_electrochemistry_6010', Entry('article', - ... - ('engstfeld_2018_polycrystalline_17743', Entry('article', - ... - - A derived collection includes only the bibliographic entries of the remaining entries:: - - >>> collection.filter(lambda entry: entry.source.citationKey != 'alves_2011_electrochemistry_6010').bibliography - BibliographyData( - entries=OrderedCaseInsensitiveDict([ - ('engstfeld_2018_polycrystalline_17743', Entry('article', - ... 
- - A collection with entries without bibliography:: - - >>> collection = Collection.create_example()["no_bibliography"] - >>> collection.bibliography - '' - - """ - from pybtex.database import BibliographyData - - bib_data = BibliographyData( - { - entry.bibliography.key: entry.bibliography - for entry in self - if entry.bibliography - } - ) - - if isinstance(bib_data, str): - return bib_data - - # Remove duplicates from the bibliography - bib_data_ = BibliographyData() - - for key, entry in bib_data.entries.items(): - if key not in bib_data_.entries: - bib_data_.add_entry(key, entry) - - return bib_data_ - def filter(self, predicate): r""" Return the subset of the collection that satisfies predicate. @@ -199,7 +144,7 @@ def filter(self, predicate): EXAMPLES:: >>> collection = Collection.create_example() - >>> collection.filter(lambda entry: entry.source.url == 'https://doi.org/10.1039/C0CP01001D') + >>> collection.filter(lambda entry: entry.echemdb.source.url == 'https://doi.org/10.1039/C0CP01001D') [Entry('alves_2011_electrochemistry_6010_f1a_solid')] @@ -610,7 +555,7 @@ def from_remote(cls, url=None, data=None, outdir=None): >>> from unitpackage.collection import Collection >>> collection = Collection.from_remote() # doctest: +REMOTE_DATA - >>> collection.filter(lambda entry: entry.source.url == 'https://doi.org/10.1039/C0CP01001D') # doctest: +REMOTE_DATA + >>> collection.filter(lambda entry: entry.echemdb.source.url == 'https://doi.org/10.1039/C0CP01001D') # doctest: +REMOTE_DATA [Entry('alves_2011_electrochemistry_6010_f1a_solid'), Entry('alves_2011_electrochemistry_6010_f2_red')] The folder containing the data in the zip can be specified with the :param data:. diff --git a/unitpackage/database/echemdb.py b/unitpackage/database/echemdb.py index 38928a3..99c09ec 100644 --- a/unitpackage/database/echemdb.py +++ b/unitpackage/database/echemdb.py @@ -22,7 +22,7 @@ # ******************************************************************** # This file is part of unitpackage. # -# Copyright (C) 2021-2025 Albert Engstfeld +# Copyright (C) 2021-2026 Albert Engstfeld # Copyright (C) 2021 Johannes Hermann # Copyright (C) 2021-2022 Julian Rüth # Copyright (C) 2021 Nicolas Hörmann @@ -41,6 +41,7 @@ # along with unitpackage. If not, see . # ******************************************************************** import logging +from functools import cached_property from unitpackage.collection import Collection @@ -114,3 +115,58 @@ def describe(self): "number of entries": len(self), "materials": self.materials(), } + + @cached_property + def bibliography(self): + r""" + Return a pybtex database of all bibtex bibliography files, + associated with the entries. + + EXAMPLES:: + + >>> from unitpackage.database.echemdb import Echemdb + >>> collection = Echemdb.create_example() + >>> collection.bibliography + BibliographyData( + entries=OrderedCaseInsensitiveDict([ + ('alves_2011_electrochemistry_6010', Entry('article', + ... + ('engstfeld_2018_polycrystalline_17743', Entry('article', + ... + + A derived collection includes only the bibliographic entries of the remaining entries:: + + >>> collection.filter(lambda entry: entry.source.citationKey != 'alves_2011_electrochemistry_6010').bibliography + BibliographyData( + entries=OrderedCaseInsensitiveDict([ + ('engstfeld_2018_polycrystalline_17743', Entry('article', + ... 
+ + A collection with entries without bibliography:: + + >>> collection = Echemdb.create_example()["no_bibliography"] + >>> collection.bibliography + '' + + """ + from pybtex.database import BibliographyData + + bib_data = BibliographyData( + { + entry.bibliography.key: entry.bibliography + for entry in self + if entry.bibliography + } + ) + + if isinstance(bib_data, str): + return bib_data + + # Remove duplicates from the bibliography + bib_data_ = BibliographyData() + + for key, entry in bib_data.entries.items(): + if key not in bib_data_.entries: + bib_data_.add_entry(key, entry) + + return bib_data_ diff --git a/unitpackage/database/echemdb_entry.py b/unitpackage/database/echemdb_entry.py index d08d02d..e4ec061 100644 --- a/unitpackage/database/echemdb_entry.py +++ b/unitpackage/database/echemdb_entry.py @@ -21,12 +21,32 @@ >>> entry.plot() Figure(...) + Data Entries containing published data, + also contain information on the source of the data.:: + + >>> from unitpackage.database.echemdb import Echemdb + >>> db = Echemdb.create_example() + >>> entry = db['alves_2011_electrochemistry_6010_f1a_solid'] + >>> entry.bibliography # doctest: +NORMALIZE_WHITESPACE +REMOTE_DATA + Entry('article', + fields=[ + ('title', 'Electrochemistry at Ru(0001) in a flowing CO-saturated electrolyte—reactive and inert adlayer phases'), + ('journal', 'Physical Chemistry Chemical Physics'), + ('volume', '13'), + ('number', '13'), + ('pages', '6010--6021'), + ('year', '2011'), + ('publisher', 'Royal Society of Chemistry'), + ('abstract', 'We investigated ...')], + persons={'author': [Person('Alves, Otavio B'), Person('Hoster, Harry E'), Person('Behm, Rolf J{\\"u}rgen')]}) + + """ # ******************************************************************** # This file is part of unitpackage. # -# Copyright (C) 2021-2025 Albert Engstfeld +# Copyright (C) 2021-2026 Albert Engstfeld # Copyright (C) 2021 Johannes Hermann # Copyright (C) 2021-2022 Julian Rüth # Copyright (C) 2021 Nicolas Hörmann @@ -66,6 +86,9 @@ class EchemdbEntry(Entry): """ + default_metadata_key = "echemdb" + """Use 'echemdb' key to access descriptor metadata.""" + def __repr__(self): r""" Return a printable representation of this entry. @@ -79,6 +102,108 @@ def __repr__(self): """ return f"Echemdb({self.identifier!r})" + @property + def bibliography(self): + r""" + Return a pybtex bibliography object associated with this entry. + + EXAMPLES:: + + >>> entry = EchemdbEntry.create_examples()[0] + >>> entry.bibliography # doctest: +NORMALIZE_WHITESPACE + Entry('article', + fields=[ + ('title', ... + ... + + >>> entry_no_bib = EchemdbEntry.create_examples(name="no_bibliography")[0] + >>> entry_no_bib.bibliography + '' + + """ + metadata = self._default_metadata.setdefault("source", {}) + citation = metadata.setdefault("bibdata", "") + + if not citation: + logger.warning(f"Entry with name {self.identifier} has no bibliography.") + return citation + + from pybtex.database import parse_string + + bibliography = parse_string(citation, "bibtex") + return bibliography.entries[self.source.citationKey] + + def citation(self, backend="text"): + r""" + Return a formatted reference for the entry's bibliography such as: + + J. Doe, et al., Journal Name, volume (YEAR) page, "Title" + + Rendering default is plain text 'text', but can be changed to any format + supported by pybtex, such as markdown 'md', 'latex' or 'html'. + + EXAMPLES:: + + >>> entry = EchemdbEntry.create_examples()[0] + >>> entry.citation(backend='text') + 'O. B. Alves et al. 
Electrochemistry at Ru(0001) in a flowing CO-saturated electrolyte—reactive and inert adlayer phases. Physical Chemistry Chemical Physics, 13(13):6010–6021, 2011.' + >>> print(entry.citation(backend='md')) + O\. B\. Alves *et al\.* + *Electrochemistry at Ru\(0001\) in a flowing CO\-saturated electrolyte—reactive and inert adlayer phases*\. + *Physical Chemistry Chemical Physics*, 13\(13\):6010–6021, 2011\. + + """ + from pybtex.style.formatting.unsrt import Style + + # TODO:: Remove `class EchemdbStyle` from citation and improve citation style. (see #104) + class EchemdbStyle(Style): + r""" + A citation style for the echemdb website. + """ + + def format_names(self, role, as_sentence=True): + from pybtex.style.template import node + + @node + def names(_, context, role): + persons = context["entry"].persons[role] + style = context["style"] + + names = [ + style.format_name(person, style.abbreviate_names) + for person in persons + ] + + if len(names) == 1: + return names[0].format_data(context) + + from pybtex.style.template import tag, words + + # pylint: disable=no-value-for-parameter + return words(sep=" ")[names[0], tag("i")["et al."]].format_data( + context + ) + + # pylint: disable=no-value-for-parameter + names = names(role) + + from pybtex.style.template import sentence + + return sentence[names] if as_sentence else names + + def format_title(self, e, which_field, as_sentence=True): + from pybtex.style.template import field, sentence, tag + + # pylint: disable=no-value-for-parameter + title = tag("i")[field(which_field)] + return sentence[title] if as_sentence else title + + return ( + EchemdbStyle(abbreviate_names=True) + .format_entry("unused", self.bibliography) + .text.render_as(backend) + ) + def get_electrode(self, name): r""" Returns an electrode with the specified name. diff --git a/unitpackage/descriptor.py b/unitpackage/descriptor.py index f1c0345..7dc1092 100644 --- a/unitpackage/descriptor.py +++ b/unitpackage/descriptor.py @@ -45,7 +45,7 @@ # ******************************************************************** # This file is part of unitpackage. 
# -# Copyright (C) 2021-2023 Albert Engstfeld +# Copyright (C) 2021-2026 Albert Engstfeld # Copyright (C) 2021 Johannes Hermann # Copyright (C) 2021 Julian Rüth # Copyright (C) 2021 Nicolas Hörmann @@ -171,7 +171,7 @@ class QuantityDescriptor(GenericDescriptor): >>> from unitpackage.entry import Entry >>> entry = Entry.create_examples()[0] - >>> temperature = entry.system.electrolyte.temperature + >>> temperature = entry.echemdb.system.electrolyte.temperature >>> temperature 298.15 K @@ -188,7 +188,7 @@ def quantity(self): >>> from unitpackage.entry import Entry >>> entry = Entry.create_examples()[0] - >>> temperature = entry.system.electrolyte.temperature + >>> temperature = entry.echemdb.system.electrolyte.temperature >>> temperature.quantity @@ -205,7 +205,7 @@ def __repr__(self): >>> from unitpackage.entry import Entry >>> entry = Entry.create_examples()[0] - >>> temperature = entry.system.electrolyte.temperature + >>> temperature = entry.echemdb.system.electrolyte.temperature >>> temperature 298.15 K diff --git a/unitpackage/entry.py b/unitpackage/entry.py index a4e1227..6e9f95a 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -15,7 +15,7 @@ Metadata included in an entry is accessible as an attribute:: >>> entry = Entry.create_examples()[0] - >>> entry.source # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + >>> entry.echemdb.source # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE {'citationKey': 'alves_2011_electrochemistry_6010', 'url': 'https://doi.org/10.1039/C0CP01001D', 'figure': '1a', @@ -31,25 +31,6 @@ 1 0.020000 -0.102158 -0.981762 ... -Data Entries containing published data, -also contain information on the source of the data.:: - - >>> from unitpackage.collection import Collection - >>> db = Collection.create_example() - >>> entry = db['alves_2011_electrochemistry_6010_f1a_solid'] - >>> entry.bibliography # doctest: +NORMALIZE_WHITESPACE +REMOTE_DATA - Entry('article', - fields=[ - ('title', 'Electrochemistry at Ru(0001) in a flowing CO-saturated electrolyte—reactive and inert adlayer phases'), - ('journal', 'Physical Chemistry Chemical Physics'), - ('volume', '13'), - ('number', '13'), - ('pages', '6010--6021'), - ('year', '2011'), - ('publisher', 'Royal Society of Chemistry'), - ('abstract', 'We investigated ...')], - persons={'author': [Person('Alves, Otavio B'), Person('Hoster, Harry E'), Person('Behm, Rolf J{\\"u}rgen')]}) - """ # ******************************************************************** @@ -115,6 +96,10 @@ class Entry: """ + default_metadata_key = "" + """Default metadata key to use when accessing the descriptor. + If empty string, the entire metadata dict is used. Subclasses can override this.""" + def __init__(self, resource): self.resource = resource @@ -142,10 +127,9 @@ def __dir__(self): >>> entry = Entry.create_examples()[0] >>> dir(entry) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE - [... 'bibliography', 'citation', 'create_examples', 'curation', - 'dataDescription', 'df', 'experimental', 'field_unit', 'figureDescription', - 'from_csv', 'from_df', 'from_local', 'identifier', 'mutable_resource', 'plot', - 'rename_fields', 'rescale', 'resource', 'save', 'source', 'system', 'yaml'] + [... 
'create_examples', 'default_metadata_key', 'df', 'echemdb', 'field_unit', + 'from_csv', 'from_df', 'from_local', 'identifier', 'mutable_resource', + 'plot', 'rename_fields', 'rescale', 'resource', 'save', 'yaml'] """ return list(set(dir(self._descriptor) + object.__dir__(self))) @@ -157,7 +141,7 @@ def __getattr__(self, name): EXAMPLES:: >>> entry = Entry.create_examples()[0] - >>> entry.source # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + >>> entry.echemdb.source # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE {'citationKey': 'alves_2011_electrochemistry_6010', 'url': 'https://doi.org/10.1039/C0CP01001D', 'figure': '1a', @@ -166,7 +150,7 @@ def __getattr__(self, name): The returned descriptor can again be accessed in the same way:: - >>> entry.system.electrolyte.components[0].name + >>> entry.echemdb.system.electrolyte.components[0].name 'H2O' """ @@ -179,7 +163,7 @@ def __getitem__(self, name): EXAMPLES:: >>> entry = Entry.create_examples()[0] - >>> entry["source"] # doctest: +NORMALIZE_WHITESPACE + >>> entry["echemdb"]["source"] # doctest: +NORMALIZE_WHITESPACE {'citationKey': 'alves_2011_electrochemistry_6010', 'url': 'https://doi.org/10.1039/C0CP01001D', 'figure': '1a', @@ -191,123 +175,70 @@ def __getitem__(self, name): @property def _descriptor(self): - return Descriptor(self.resource.custom["metadata"]["echemdb"]) - - @property - def _metadata(self): r""" - Returns the metadata named "echemdb" associated with this entry. - - EXAMPLES:: + Return a Descriptor object wrapping the entry's metadata. - >>> entry = Entry.create_examples()[0] - >>> entry._metadata # doctest: +NORMALIZE_WHITESPACE - {...'source': {'citationKey': 'alves_2011_electrochemistry_6010',...} + The metadata structure depends on the :attr:`default_metadata_key` class attribute: - """ - return self.resource.custom["metadata"]["echemdb"] + - If ``default_metadata_key`` is an empty string (default in :class:`Entry`), + the entire ``metadata`` dict is wrapped as the descriptor. + - If ``default_metadata_key`` is set to a non-empty string (e.g., "echemdb" in subclasses), + the descriptor wraps only the metadata under that specific key. - @property - def bibliography(self): - r""" - Return a pybtex bibliography object associated with this entry. + This allows subclasses to work with different metadata structures while maintaining + a consistent interface through the Descriptor class. EXAMPLES:: >>> entry = Entry.create_examples()[0] - >>> entry.bibliography # doctest: +NORMALIZE_WHITESPACE - Entry('article', - fields=[ - ('title', ... - ... - - >>> entry_no_bib = Entry.create_examples(name="no_bibliography")[0] - >>> entry_no_bib.bibliography - '' + >>> entry._descriptor # doctest: +ELLIPSIS + {'echemdb': ...} - """ - metadata = self._metadata.setdefault("source", {}) - citation = metadata.setdefault("bibdata", "") - - if not citation: - logger.warning(f"Entry with name {self.identifier} has no bibliography.") - return citation + >>> entry.echemdb.source.citationKey + 'alves_2011_electrochemistry_6010' - from pybtex.database import parse_string - bibliography = parse_string(citation, "bibtex") - return bibliography.entries[self.source.citationKey] + """ + return Descriptor(self._default_metadata) - def citation(self, backend="text"): + @property + def _metadata(self): r""" - Return a formatted reference for the entry's bibliography such as: - - J. Doe, et al., Journal Name, volume (YEAR) page, "Title" + Returns the metadata associated with this entry. 
- Rendering default is plain text 'text', but can be changed to any format - supported by pybtex, such as markdown 'md', 'latex' or 'html'. + The metadata may contain keys which nest entire metadata schemas (e.g., "echemdb", "myExperiment", etc.). + Use :attr:`_default_metadata` to access the subset determined by :attr:`default_metadata_key`. EXAMPLES:: >>> entry = Entry.create_examples()[0] - >>> entry.citation(backend='text') - 'O. B. Alves et al. Electrochemistry at Ru(0001) in a flowing CO-saturated electrolyte—reactive and inert adlayer phases. Physical Chemistry Chemical Physics, 13(13):6010–6021, 2011.' - >>> print(entry.citation(backend='md')) - O\. B\. Alves *et al\.* - *Electrochemistry at Ru\(0001\) in a flowing CO\-saturated electrolyte—reactive and inert adlayer phases*\. - *Physical Chemistry Chemical Physics*, 13\(13\):6010–6021, 2011\. + >>> entry._metadata # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + {...'echemdb': {...'source': {'citationKey': 'alves_2011_electrochemistry_6010',...}...} """ - from pybtex.style.formatting.unsrt import Style - - # TODO:: Remove `class EchemdbStyle` from citation and improve citation style. (see #104) - class EchemdbStyle(Style): - r""" - A citation style for the echemdb website. - """ - - def format_names(self, role, as_sentence=True): - from pybtex.style.template import node - - @node - def names(_, context, role): - persons = context["entry"].persons[role] - style = context["style"] - - names = [ - style.format_name(person, style.abbreviate_names) - for person in persons - ] - - if len(names) == 1: - return names[0].format_data(context) - - from pybtex.style.template import tag, words + return self.resource.custom["metadata"] - # pylint: disable=no-value-for-parameter - return words(sep=" ")[names[0], tag("i")["et al."]].format_data( - context - ) - - # pylint: disable=no-value-for-parameter - names = names(role) + @property + def _default_metadata(self): + r""" + Returns the metadata subset based on :attr:`default_metadata_key`. - from pybtex.style.template import sentence + If :attr:`default_metadata_key` is empty, returns the entire metadata dict. + Otherwise, returns the metadata under the specified key. - return sentence[names] if as_sentence else names + This is useful for subclasses that want to work with a specific metadata structure. 
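# ---------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): it mimics the
# ``default_metadata_key`` lookup described in ``Entry._default_metadata``
# — an empty key exposes the whole metadata dict, while a non-empty key
# (as used by subclasses such as ``EchemdbEntry``) scopes the descriptor
# to that sub-dictionary. Plain dicts stand in for resource metadata.
# ---------------------------------------------------------------------
metadata = {"echemdb": {"source": {"citationKey": "alves_2011_electrochemistry_6010"}}}


def default_metadata(metadata, default_metadata_key=""):
    """Return the metadata subset selected by ``default_metadata_key``."""
    if default_metadata_key and default_metadata_key in metadata:
        return metadata[default_metadata_key]
    return metadata


assert default_metadata(metadata) is metadata                         # Entry: whole dict
assert default_metadata(metadata, "echemdb") == metadata["echemdb"]   # EchemdbEntry: scoped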
- def format_title(self, e, which_field, as_sentence=True): - from pybtex.style.template import field, sentence, tag + EXAMPLES:: - # pylint: disable=no-value-for-parameter - title = tag("i")[field(which_field)] - return sentence[title] if as_sentence else title + >>> entry = Entry.create_examples()[0] + >>> entry._default_metadata # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + {...'echemdb': {...'source': {'citationKey': 'alves_2011_electrochemistry_6010',...}...} - return ( - EchemdbStyle(abbreviate_names=True) - .format_entry("unused", self.bibliography) - .text.render_as(backend) - ) + """ + metadata = self._metadata + if self.default_metadata_key and self.default_metadata_key in metadata: + return metadata[self.default_metadata_key] + return metadata def field_unit(self, field_name): r""" diff --git a/unitpackage/local.py b/unitpackage/local.py index 8867746..148f161 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -377,7 +377,7 @@ def create_unitpackage(resource, metadata=None, fields=None): """ resource.custom.setdefault("metadata", {}) - resource.custom["metadata"].setdefault("echemdb", metadata) + resource.custom["metadata"] = metadata if fields: # Update fields in the Resource describing the data in the CSV From 9f658d57cdb2ed8d5da0b9f128b779ae8087b7e8 Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Thu, 29 Jan 2026 23:37:23 +0100 Subject: [PATCH 09/14] Add appproach for loading metadata from file --- unitpackage/entry.py | 186 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 185 insertions(+), 1 deletion(-) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index 6e9f95a..f17eb3d 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -62,6 +62,163 @@ logger = logging.getLogger("unitpackage") +class ResourceMetadata: + r""" + Manages metadata for an Entry, supporting both dict and attribute access, + and providing methods to load metadata from various sources. + + EXAMPLES:: + + >>> entry = Entry.create_examples()[0] + >>> entry.metadata['echemdb']['source']['citationKey'] + 'alves_2011_electrochemistry_6010' + + >>> entry.metadata.echemdb.source.citationKey + 'alves_2011_electrochemistry_6010' + + """ + + def __init__(self, entry): + object.__setattr__(self, "_entry", entry) + + @property + def _metadata(self): + return self._entry.resource.custom.setdefault("metadata", {}) + + @property + def _descriptor(self): + return Descriptor(self._metadata) + + def __getitem__(self, key): + r""" + Dict-style access to metadata with descriptor support. + + EXAMPLES:: + + >>> entry = Entry.create_examples()[0] + >>> entry.metadata['echemdb']['source']['citationKey'] + 'alves_2011_electrochemistry_6010' + + """ + return self._descriptor[key] + + def __setitem__(self, key, value): + r""" + Dict-style assignment to metadata. + + EXAMPLES:: + + >>> entry = Entry.create_examples()[0] + >>> entry.metadata['custom_key'] = {'data': 'value'} + >>> entry.metadata['custom_key'] + {'data': 'value'} + + """ + self._metadata[key] = value + + def __getattr__(self, name): + r""" + Attribute-style access to metadata with full descriptor support. + + EXAMPLES:: + + >>> entry = Entry.create_examples()[0] + >>> entry.metadata.echemdb.source.citationKey + 'alves_2011_electrochemistry_6010' + + """ + return getattr(self._descriptor, name) + + def from_dict(self, data): + r""" + Load metadata from a dictionary. 
+ + EXAMPLES:: + + >>> entry = Entry.create_examples()[0] + >>> entry.metadata.from_dict({'echemdb': {'source': {'citationKey': 'test'}}}) + >>> entry.metadata['echemdb']['source']['citationKey'] + 'test' + + """ + self._entry.resource.custom["metadata"] = data + + def _add_metadata(self, key, data): + r""" + Add metadata under a specific key. + + EXAMPLES:: + + >>> entry = Entry.create_examples()[0] + >>> entry.metadata._add_metadata('custom_key', {'data': 'value'}) + >>> entry.metadata['custom_key'] + {'data': 'value'} + + """ + if key: + self._entry.resource.custom["metadata"][key] = data + else: + self._entry.resource.custom["metadata"] = data + + def from_yaml(self, filename, key=None): + r""" + Load metadata from a YAML file. + + If a key is provided, the loaded data is stored under that key. + Otherwise, it replaces the entire metadata dict. + + EXAMPLES:: + + >>> import os + >>> import tempfile + >>> import yaml + >>> entry = Entry.create_examples()[0] + >>> with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + ... yaml.dump({'source': {'citationKey': 'yaml_test'}}, f) + ... temp_path = f.name + >>> entry.metadata.from_yaml(temp_path, key='echemdb') + >>> entry.metadata['echemdb']['source']['citationKey'] + 'yaml_test' + >>> os.unlink(temp_path) + + """ + import yaml + + with open(filename, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + + self._add_metadata(key, data) + + def from_json(self, filename, key=None): + r""" + Load metadata from a JSON file. + + If a key is provided, the loaded data is stored under that key. + Otherwise, it replaces the entire metadata dict. + + EXAMPLES:: + + >>> import os + >>> import json + >>> import tempfile + >>> entry = Entry.create_examples()[0] + >>> with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + ... json.dump({'source': {'citationKey': 'json_test'}}, f) + ... temp_path = f.name + >>> entry.metadata.from_json(temp_path, key='echemdb') + >>> entry.metadata['echemdb']['source']['citationKey'] + 'json_test' + >>> os.unlink(temp_path) + + """ + import json + + with open(filename, "r", encoding="utf-8") as f: + data = json.load(f) + + self._add_metadata(key, data) + + class Entry: r""" A `frictionless Resource `_ @@ -103,6 +260,33 @@ class Entry: def __init__(self, resource): self.resource = resource + @property + def metadata(self): + r""" + Access and manage entry metadata. + + Returns a ResourceMetadata that supports both dict and attribute-style access. + Allows loading metadata from various sources. Modifications are applied in-place. + + EXAMPLES:: + + >>> entry = Entry.create_examples()[0] + >>> entry.metadata['echemdb']['source']['citationKey'] + 'alves_2011_electrochemistry_6010' + + >>> entry.metadata.echemdb['source']['citationKey'] + 'alves_2011_electrochemistry_6010' + + Load metadata from a dict:: + + >>> new_entry = Entry.create_examples()[0] + >>> new_entry.metadata.from_dict({'echemdb': {'test': 'data'}}) + >>> new_entry.metadata['echemdb']['test'] + 'data' + + """ + return ResourceMetadata(self) + @property def identifier(self): r""" @@ -128,7 +312,7 @@ def __dir__(self): >>> entry = Entry.create_examples()[0] >>> dir(entry) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [... 
'create_examples', 'default_metadata_key', 'df', 'echemdb', 'field_unit', - 'from_csv', 'from_df', 'from_local', 'identifier', 'mutable_resource', + 'from_csv', 'from_df', 'from_local', 'identifier', 'metadata', 'mutable_resource', 'plot', 'rename_fields', 'rescale', 'resource', 'save', 'yaml'] """ From 4e790f7909c0e68289ed31958af567c7edcd6f36 Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Thu, 29 Jan 2026 23:40:44 +0100 Subject: [PATCH 10/14] Add repr to ResourceMetadata --- unitpackage/entry.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index f17eb3d..f8608b2 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -70,6 +70,9 @@ class ResourceMetadata: EXAMPLES:: >>> entry = Entry.create_examples()[0] + >>> entry.metadata # doctest: +ELLIPSIS + {'echemdb': {'experimental': ... + >>> entry.metadata['echemdb']['source']['citationKey'] 'alves_2011_electrochemistry_6010' @@ -81,6 +84,9 @@ class ResourceMetadata: def __init__(self, entry): object.__setattr__(self, "_entry", entry) + def __repr__(self): + return repr(self._metadata) + @property def _metadata(self): return self._entry.resource.custom.setdefault("metadata", {}) From a0395fee57e878a9fde4fa7bc223668e29525b9e Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Fri, 30 Jan 2026 08:02:40 +0100 Subject: [PATCH 11/14] Move metadata methods in a separate module --- unitpackage/entry.py | 168 +------------------------------ unitpackage/metadata.py | 218 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 221 insertions(+), 165 deletions(-) create mode 100644 unitpackage/metadata.py diff --git a/unitpackage/entry.py b/unitpackage/entry.py index f8608b2..205929c 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -58,173 +58,11 @@ import os.path from unitpackage.descriptor import Descriptor +from unitpackage.metadata import MetadataDescriptor logger = logging.getLogger("unitpackage") -class ResourceMetadata: - r""" - Manages metadata for an Entry, supporting both dict and attribute access, - and providing methods to load metadata from various sources. - - EXAMPLES:: - - >>> entry = Entry.create_examples()[0] - >>> entry.metadata # doctest: +ELLIPSIS - {'echemdb': {'experimental': ... - - >>> entry.metadata['echemdb']['source']['citationKey'] - 'alves_2011_electrochemistry_6010' - - >>> entry.metadata.echemdb.source.citationKey - 'alves_2011_electrochemistry_6010' - - """ - - def __init__(self, entry): - object.__setattr__(self, "_entry", entry) - - def __repr__(self): - return repr(self._metadata) - - @property - def _metadata(self): - return self._entry.resource.custom.setdefault("metadata", {}) - - @property - def _descriptor(self): - return Descriptor(self._metadata) - - def __getitem__(self, key): - r""" - Dict-style access to metadata with descriptor support. - - EXAMPLES:: - - >>> entry = Entry.create_examples()[0] - >>> entry.metadata['echemdb']['source']['citationKey'] - 'alves_2011_electrochemistry_6010' - - """ - return self._descriptor[key] - - def __setitem__(self, key, value): - r""" - Dict-style assignment to metadata. - - EXAMPLES:: - - >>> entry = Entry.create_examples()[0] - >>> entry.metadata['custom_key'] = {'data': 'value'} - >>> entry.metadata['custom_key'] - {'data': 'value'} - - """ - self._metadata[key] = value - - def __getattr__(self, name): - r""" - Attribute-style access to metadata with full descriptor support. 
- - EXAMPLES:: - - >>> entry = Entry.create_examples()[0] - >>> entry.metadata.echemdb.source.citationKey - 'alves_2011_electrochemistry_6010' - - """ - return getattr(self._descriptor, name) - - def from_dict(self, data): - r""" - Load metadata from a dictionary. - - EXAMPLES:: - - >>> entry = Entry.create_examples()[0] - >>> entry.metadata.from_dict({'echemdb': {'source': {'citationKey': 'test'}}}) - >>> entry.metadata['echemdb']['source']['citationKey'] - 'test' - - """ - self._entry.resource.custom["metadata"] = data - - def _add_metadata(self, key, data): - r""" - Add metadata under a specific key. - - EXAMPLES:: - - >>> entry = Entry.create_examples()[0] - >>> entry.metadata._add_metadata('custom_key', {'data': 'value'}) - >>> entry.metadata['custom_key'] - {'data': 'value'} - - """ - if key: - self._entry.resource.custom["metadata"][key] = data - else: - self._entry.resource.custom["metadata"] = data - - def from_yaml(self, filename, key=None): - r""" - Load metadata from a YAML file. - - If a key is provided, the loaded data is stored under that key. - Otherwise, it replaces the entire metadata dict. - - EXAMPLES:: - - >>> import os - >>> import tempfile - >>> import yaml - >>> entry = Entry.create_examples()[0] - >>> with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: - ... yaml.dump({'source': {'citationKey': 'yaml_test'}}, f) - ... temp_path = f.name - >>> entry.metadata.from_yaml(temp_path, key='echemdb') - >>> entry.metadata['echemdb']['source']['citationKey'] - 'yaml_test' - >>> os.unlink(temp_path) - - """ - import yaml - - with open(filename, "r", encoding="utf-8") as f: - data = yaml.safe_load(f) - - self._add_metadata(key, data) - - def from_json(self, filename, key=None): - r""" - Load metadata from a JSON file. - - If a key is provided, the loaded data is stored under that key. - Otherwise, it replaces the entire metadata dict. - - EXAMPLES:: - - >>> import os - >>> import json - >>> import tempfile - >>> entry = Entry.create_examples()[0] - >>> with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: - ... json.dump({'source': {'citationKey': 'json_test'}}, f) - ... temp_path = f.name - >>> entry.metadata.from_json(temp_path, key='echemdb') - >>> entry.metadata['echemdb']['source']['citationKey'] - 'json_test' - >>> os.unlink(temp_path) - - """ - import json - - with open(filename, "r", encoding="utf-8") as f: - data = json.load(f) - - self._add_metadata(key, data) - - class Entry: r""" A `frictionless Resource `_ @@ -271,7 +109,7 @@ def metadata(self): r""" Access and manage entry metadata. - Returns a ResourceMetadata that supports both dict and attribute-style access. + Returns a MetadataDescriptor that supports both dict and attribute-style access. Allows loading metadata from various sources. Modifications are applied in-place. EXAMPLES:: @@ -291,7 +129,7 @@ def metadata(self): 'data' """ - return ResourceMetadata(self) + return MetadataDescriptor(self) @property def identifier(self): diff --git a/unitpackage/metadata.py b/unitpackage/metadata.py new file mode 100644 index 0000000..ef9a573 --- /dev/null +++ b/unitpackage/metadata.py @@ -0,0 +1,218 @@ +r""" +Metadata management for unitpackage entries. + +This module provides the MetadataDescriptor class that manages metadata +for Entry objects, supporting both dict and attribute access, and providing +methods to load metadata from various sources (YAML, JSON, dict). 
+ +EXAMPLES: + +Access metadata with dict-style or attribute-style syntax:: + + >>> from unitpackage.entry import Entry + >>> entry = Entry.create_examples()[0] + >>> entry.metadata['echemdb']['source']['citationKey'] + 'alves_2011_electrochemistry_6010' + + >>> entry.metadata.echemdb.source.citationKey + 'alves_2011_electrochemistry_6010' + +Load metadata from external sources:: + + >>> entry.metadata.from_dict({'custom': {'key': 'value'}}) + >>> entry.metadata['custom']['key'] + 'value' + +""" + +# ******************************************************************** +# This file is part of unitpackage. +# +# Copyright (C) 2026 Albert Engstfeld +# +# unitpackage is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# unitpackage is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with unitpackage. If not, see . +# ******************************************************************** + +from unitpackage.descriptor import Descriptor + + +class MetadataDescriptor: + r""" + Manages metadata for an Entry, supporting both dict and attribute access, + and providing methods to load metadata from various sources. + + EXAMPLES:: + + >>> from unitpackage.entry import Entry + >>> entry = Entry.create_examples()[0] + >>> entry.metadata # doctest: +ELLIPSIS + {'echemdb': {'experimental': ... + + >>> entry.metadata['echemdb']['source']['citationKey'] + 'alves_2011_electrochemistry_6010' + + >>> entry.metadata.echemdb.source.citationKey + 'alves_2011_electrochemistry_6010' + + """ + + def __init__(self, entry): + object.__setattr__(self, "_entry", entry) + + def __repr__(self): + return repr(self._metadata) + + @property + def _metadata(self): + return self._entry.resource.custom.setdefault("metadata", {}) + + @property + def _descriptor(self): + return Descriptor(self._metadata) + + def __getitem__(self, key): + r""" + Dict-style access to metadata with descriptor support. + + EXAMPLES:: + + >>> from unitpackage.entry import Entry + >>> entry = Entry.create_examples()[0] + >>> entry.metadata['echemdb']['source']['citationKey'] + 'alves_2011_electrochemistry_6010' + + """ + return self._descriptor[key] + + def __setitem__(self, key, value): + r""" + Dict-style assignment to metadata. + + EXAMPLES:: + + >>> from unitpackage.entry import Entry + >>> entry = Entry.create_examples()[0] + >>> entry.metadata['custom_key'] = {'data': 'value'} + >>> entry.metadata['custom_key'] + {'data': 'value'} + + """ + self._metadata[key] = value + + def __getattr__(self, name): + r""" + Attribute-style access to metadata with full descriptor support. + + EXAMPLES:: + + >>> from unitpackage.entry import Entry + >>> entry = Entry.create_examples()[0] + >>> entry.metadata.echemdb.source.citationKey + 'alves_2011_electrochemistry_6010' + + """ + return getattr(self._descriptor, name) + + def from_dict(self, data): + r""" + Load metadata from a dictionary. 
+ + EXAMPLES:: + + >>> from unitpackage.entry import Entry + >>> entry = Entry.create_examples()[0] + >>> entry.metadata.from_dict({'echemdb': {'source': {'citationKey': 'test'}}}) + >>> entry.metadata['echemdb']['source']['citationKey'] + 'test' + + """ + self._entry.resource.custom["metadata"] = data + + def _add_metadata(self, key, data): + r""" + Add metadata under a specific key. + + EXAMPLES:: + + >>> from unitpackage.entry import Entry + >>> entry = Entry.create_examples()[0] + >>> entry.metadata._add_metadata('custom_key', {'data': 'value'}) + >>> entry.metadata['custom_key'] + {'data': 'value'} + + """ + if key: + self._entry.resource.custom["metadata"][key] = data + else: + self._entry.resource.custom["metadata"] = data + + def from_yaml(self, filename, key=None): + r""" + Load metadata from a YAML file. + + If a key is provided, the loaded data is stored under that key. + Otherwise, it replaces the entire metadata dict. + + EXAMPLES:: + + >>> import os + >>> import tempfile + >>> import yaml + >>> from unitpackage.entry import Entry + >>> entry = Entry.create_examples()[0] + >>> with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + ... yaml.dump({'source': {'citationKey': 'yaml_test'}}, f) + ... temp_path = f.name + >>> entry.metadata.from_yaml(temp_path, key='echemdb') + >>> entry.metadata['echemdb']['source']['citationKey'] + 'yaml_test' + >>> os.unlink(temp_path) + + """ + import yaml + + with open(filename, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + + self._add_metadata(key, data) + + def from_json(self, filename, key=None): + r""" + Load metadata from a JSON file. + + If a key is provided, the loaded data is stored under that key. + Otherwise, it replaces the entire metadata dict. + + EXAMPLES:: + + >>> import os + >>> import json + >>> import tempfile + >>> from unitpackage.entry import Entry + >>> entry = Entry.create_examples()[0] + >>> with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + ... json.dump({'source': {'citationKey': 'json_test'}}, f) + ... temp_path = f.name + >>> entry.metadata.from_json(temp_path, key='echemdb') + >>> entry.metadata['echemdb']['source']['citationKey'] + 'json_test' + >>> os.unlink(temp_path) + + """ + import json + + with open(filename, "r", encoding="utf-8") as f: + data = json.load(f) + + self._add_metadata(key, data) From fcdfac0dfc687fbe3dd90101820d8f8d108a48dc Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Fri, 30 Jan 2026 08:15:26 +0100 Subject: [PATCH 12/14] Add load_metadata for chaining --- unitpackage/entry.py | 66 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index 205929c..3dffe93 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -131,6 +131,67 @@ def metadata(self): """ return MetadataDescriptor(self) + def load_metadata(self, filename, format=None, key=None): + r""" + Load metadata from a file and return self for method chaining. + + The file format is auto-detected from the extension if not specified. + Supported formats are 'yaml' and 'json'. + + EXAMPLES: + + Load metadata from a YAML file:: + + >>> import os + >>> import tempfile + >>> import yaml + >>> entry = Entry.create_examples()[0] + >>> with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + ... yaml.dump({'source': {'citationKey': 'chain_test'}}, f) + ... 
temp_path = f.name + >>> entry.load_metadata(temp_path, key='echemdb').metadata.echemdb.source.citationKey + 'chain_test' + >>> os.unlink(temp_path) + + Load metadata from a JSON file with auto-detection:: + + >>> import os + >>> import json + >>> import tempfile + >>> entry = Entry.create_examples()[0] + >>> with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: + ... json.dump({'custom': {'data': 'value'}}, f) + ... temp_path = f.name + >>> entry.load_metadata(temp_path).metadata.custom.data + 'value' + >>> os.unlink(temp_path) + + + """ + # Auto-detect format from file extension if not specified + if format is None: + if filename.endswith('.yaml') or filename.endswith('.yml'): + format = 'yaml' + elif filename.endswith('.json'): + format = 'json' + else: + raise ValueError( + f"Cannot auto-detect format for '{filename}'. " + "Please specify format='yaml' or format='json'" + ) + + # Load metadata using the appropriate method + if format == 'yaml': + self.metadata.from_yaml(filename, key=key) + elif format == 'json': + self.metadata.from_json(filename, key=key) + else: + raise ValueError( + f"Unsupported format '{format}'. Use 'yaml' or 'json'" + ) + + return self + @property def identifier(self): r""" @@ -156,8 +217,9 @@ def __dir__(self): >>> entry = Entry.create_examples()[0] >>> dir(entry) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [... 'create_examples', 'default_metadata_key', 'df', 'echemdb', 'field_unit', - 'from_csv', 'from_df', 'from_local', 'identifier', 'metadata', 'mutable_resource', - 'plot', 'rename_fields', 'rescale', 'resource', 'save', 'yaml'] + 'from_csv', 'from_df', 'from_local', 'identifier', 'load_metadata', + 'metadata', 'mutable_resource', 'plot', 'rename_fields', 'rescale', 'resource', + 'save', 'yaml'] """ return list(set(dir(self._descriptor) + object.__dir__(self))) From bb9a2d1f5861316e5e6374e11cbb203ed35ffcac Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Fri, 30 Jan 2026 18:33:58 +0100 Subject: [PATCH 13/14] Add metadata class and update file creation handling --- unitpackage/entry.py | 155 +++++++++++++++++++++++++------------- unitpackage/entrypoint.py | 8 +- unitpackage/local.py | 108 +++++++++++++++----------- 3 files changed, 174 insertions(+), 97 deletions(-) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index 3dffe93..d3f2ab2 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -14,6 +14,7 @@ Metadata included in an entry is accessible as an attribute:: + >>> from unitpackage.entry import Entry >>> entry = Entry.create_examples()[0] >>> entry.echemdb.source # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE {'citationKey': 'alves_2011_electrochemistry_6010', @@ -31,6 +32,28 @@ 1 0.020000 -0.102158 -0.981762 ... 
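# ---------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): it isolates the
# extension-based dispatch that ``Entry.load_metadata`` performs before
# delegating to ``metadata.from_yaml`` or ``metadata.from_json``.
# The filenames below are hypothetical.
# ---------------------------------------------------------------------
def detect_metadata_format(filename, file_format=None):
    """Return 'yaml' or 'json', auto-detected from the extension if not given."""
    if file_format is not None:
        return file_format
    if filename.endswith((".yaml", ".yml")):
        return "yaml"
    if filename.endswith(".json"):
        return "json"
    raise ValueError(f"Cannot auto-detect format for {filename!r}")


assert detect_metadata_format("metadata.yml") == "yaml"
assert detect_metadata_format("metadata.json") == "json"
assert detect_metadata_format("metadata.txt", file_format="yaml") == "yaml"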
+Entries can be created from from various sources, such as csv files or pandas dataframes:: + + >>> entry = Entry.from_csv(csvname='examples/from_csv/from_csv.csv') + >>> entry + Entry('from_csv') + +Information on the fields such as units can be updated:: + + >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] + >>> entry = entry.update_fields(fields=fields) + >>> entry.mutable_resource.schema.fields # doctest: +NORMALIZE_WHITESPACE + [{'name': 'E', 'type': 'integer', 'unit': 'mV'}, + {'name': 'I', 'type': 'integer', 'unit': 'A'}] + +Metadata to the resource can be updated in-place:: + + >>> metadata = {'echemdb': {'source': {'citationKey': 'new_key'}}} + >>> entry.metadata.from_dict(metadata) + >>> entry.metadata + {'echemdb': {'source': {'citationKey': 'new_key'}}} + + """ # ******************************************************************** @@ -131,7 +154,7 @@ def metadata(self): """ return MetadataDescriptor(self) - def load_metadata(self, filename, format=None, key=None): + def load_metadata(self, filename, file_format=None, key=None): r""" Load metadata from a file and return self for method chaining. @@ -169,25 +192,25 @@ def load_metadata(self, filename, format=None, key=None): """ # Auto-detect format from file extension if not specified - if format is None: - if filename.endswith('.yaml') or filename.endswith('.yml'): - format = 'yaml' - elif filename.endswith('.json'): - format = 'json' + if file_format is None: + if filename.endswith(".yaml") or filename.endswith(".yml"): + file_format = "yaml" + elif filename.endswith(".json"): + file_format = "json" else: raise ValueError( f"Cannot auto-detect format for '{filename}'. " - "Please specify format='yaml' or format='json'" + "Please specify file_format='yaml' or file_format='json'" ) # Load metadata using the appropriate method - if format == 'yaml': + if file_format == "yaml": self.metadata.from_yaml(filename, key=key) - elif format == 'json': + elif file_format == "json": self.metadata.from_json(filename, key=key) else: raise ValueError( - f"Unsupported format '{format}'. Use 'yaml' or 'json'" + f"Unsupported format '{file_format}'. Use 'yaml' or 'json'" ) return self @@ -218,8 +241,8 @@ def __dir__(self): >>> dir(entry) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [... 
'create_examples', 'default_metadata_key', 'df', 'echemdb', 'field_unit', 'from_csv', 'from_df', 'from_local', 'identifier', 'load_metadata', - 'metadata', 'mutable_resource', 'plot', 'rename_fields', 'rescale', 'resource', - 'save', 'yaml'] + 'metadata', 'mutable_resource', 'plot', 'rename_fields', 'rescale', + 'resource', 'save', 'update_fields', 'yaml'] """ return list(set(dir(self._descriptor) + object.__dir__(self))) @@ -306,7 +329,7 @@ def _metadata(self): {...'echemdb': {...'source': {'citationKey': 'alves_2011_electrochemistry_6010',...}...} """ - return self.resource.custom["metadata"] + return self.resource.custom.setdefault("metadata", {}) @property def _default_metadata(self): @@ -644,9 +667,10 @@ def add_columns(self, df, new_fields): fields = [field.to_dict() for field in self.mutable_resource.schema.fields] fields.extend(new_fields) - return self.from_df( - df=df_, metadata=self._metadata, basename=self.identifier, fields=fields - ) + entry = self.from_df(df=df_, basename=self.identifier).update_fields(fields) + entry.metadata.from_dict(self._metadata) + + return entry def __repr__(self): r""" @@ -755,6 +779,56 @@ def plot(self, x_label=None, y_label=None, name=None): return fig + def update_fields(self, fields): + r""" + Return a new entry with updated fields in the MutableResource. + + The :param fields: list must must be structured such as + `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]`. + + EXAMPLES:: + + >>> from unitpackage.entry import Entry + >>> entry = Entry.create_examples()[0] + >>> entry.mutable_resource.schema.fields # doctest: +NORMALIZE_WHITESPACE + [{'name': 't', 'type': 'number', 'unit': 's'}, + {'name': 'E', 'type': 'number', 'unit': 'V', 'reference': 'RHE'}, + {'name': 'j', 'type': 'number', 'unit': 'A / m2'}] + + Updating the fields returns the same entry with updated field metadata:: + + >>> fields = [{'name':'E', 'unit': 'mV'}, + ... {'name':'j', 'unit': 'uA / cm2'}, + ... {'name':'x', 'unit': 'm'}] + >>> entry.update_fields(fields) + Entry('alves_2011_electrochemistry_6010_f1a_solid') + + >>> entry.mutable_resource.schema.fields # doctest: +NORMALIZE_WHITESPACE + [{'name': 't', 'type': 'number', 'unit': 's'}, + {'name': 'E', 'type': 'number', 'unit': 'mV', 'reference': 'RHE'}, + {'name': 'j', 'type': 'number', 'unit': 'uA / cm2'}] + + >>> new_entry = entry.update_fields(fields) + >>> new_entry.mutable_resource.schema.fields # doctest: +NORMALIZE_WHITESPACE + [{'name': 't', 'type': 'number', 'unit': 's'}, + {'name': 'E', 'type': 'number', 'unit': 'mV', 'reference': 'RHE'}, + {'name': 'j', 'type': 'number', 'unit': 'uA / cm2'}] + + """ + from unitpackage.local import update_fields + + updated_fields = update_fields( + self.mutable_resource.schema.to_dict()["fields"], fields + ) + + from frictionless import Schema + + original_schema = self.mutable_resource.schema.to_dict() + original_schema["fields"] = updated_fields + self.mutable_resource.schema = Schema.from_descriptor(original_schema) + + return self + @classmethod def from_csv( cls, @@ -764,8 +838,6 @@ def from_csv( column_header_lines=None, decimal=None, delimiters=None, - metadata=None, - fields=None, ): r""" Returns an entry constructed from a CSV with a single header line. 
@@ -775,8 +847,7 @@ def from_csv( Units describing the fields can be provided:: >>> from unitpackage.entry import Entry - >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] - >>> entry = Entry.from_csv(csvname='examples/from_csv/from_csv.csv', fields=fields) + >>> entry = Entry.from_csv(csvname='examples/from_csv/from_csv.csv') >>> entry Entry('from_csv') @@ -784,21 +855,12 @@ def from_csv( {'name': 'from_csv', ... - Metadata can be appended:: - - >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] - >>> metadata = {'user':'Max Doe'} - >>> entry = Entry.from_csv(csvname='examples/from_csv/from_csv.csv', metadata=metadata, fields=fields) - >>> entry.user - 'Max Doe' - .. important:: Upper case filenames are converted to lower case entry identifiers! A filename containing upper case characters:: - >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] - >>> entry = Entry.from_csv(csvname='examples/from_csv/UpperCase.csv', fields=fields) + >>> entry = Entry.from_csv(csvname='examples/from_csv/UpperCase.csv') >>> entry Entry('uppercase') @@ -811,10 +873,7 @@ def from_csv( ... """ - from unitpackage.local import ( - create_tabular_resource_from_csv, - create_unitpackage, - ) + from unitpackage.local import create_tabular_resource_from_csv # pylint: disable=duplicate-code resource = create_tabular_resource_from_csv( @@ -826,11 +885,7 @@ def from_csv( delimiters=delimiters, ) - package = create_unitpackage( - resource=resource, metadata=metadata, fields=fields - ) - - return cls(resource=package.resources[0]) + return cls(resource) @classmethod def _modify_fields(cls, original, alternative, keep_original_name_as=None): @@ -960,7 +1015,7 @@ def from_local(cls, filename): return cls(resource=package.resources[0]) @classmethod - def from_df(cls, df, metadata=None, fields=None, *, basename): + def from_df(cls, df, *, basename): r""" Returns an entry constructed from a pandas dataframe. A name `basename` for the entry must be provided. 
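# ---------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): the construction
# style this patch series moves towards, assuming the API shown in the
# docstrings — ``from_df``/``from_csv`` no longer take ``metadata=`` or
# ``fields=``; instead ``update_fields()`` is chained and
# ``entry.metadata.from_dict()`` updates the metadata in place.
# Requires pandas and the unitpackage version from this branch.
# ---------------------------------------------------------------------
import pandas as pd

from unitpackage.entry import Entry

df = pd.DataFrame({"E": [0.0, 0.1, 0.2], "j": [1.0, 2.0, 3.0]})

entry = Entry.from_df(df=df, basename="sketch").update_fields(
    fields=[{"name": "E", "unit": "V"}, {"name": "j", "unit": "A / m2"}]
)
entry.metadata.from_dict({"echemdb": {"source": {"citationKey": "sketch_2026"}}})

print(entry.metadata.echemdb.source.citationKey)  # sketch_2026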
@@ -982,9 +1037,10 @@ def from_df(cls, df, metadata=None, fields=None, *, basename): >>> import os >>> fields = [{'name':'x', 'unit': 'm'}, {'name':'P', 'unit': 'um'}, {'name':'E', 'unit': 'V'}] >>> metadata = {'user':'Max Doe'} - >>> entry = Entry.from_df(df=df, basename='test_df', metadata=metadata, fields=fields) - >>> entry.user - 'Max Doe' + >>> entry = Entry.from_df(df=df, basename='test_df').update_fields(fields=fields) + >>> entry.metadata.from_dict(metadata) + >>> entry.metadata + {'user': 'Max Doe'} Save the entry:: @@ -1008,21 +1064,17 @@ def from_df(cls, df, metadata=None, fields=None, *, basename): Verify that all fields are properly created even when they are not specified as fields:: >>> fields = [{'name':'x', 'unit': 'm'}, {'name':'P', 'unit': 'um'}, {'name':'E', 'unit': 'V'}] - >>> metadata = {'user':'Max Doe'} - >>> entry = Entry.from_df(df=df, basename='test_df', metadata=metadata, fields=fields) + >>> entry = Entry.from_df(df=df, basename='test_df').update_fields(fields=fields) >>> entry.resource.schema.fields [{'name': 'x', 'type': 'integer', 'unit': 'm'}, {'name': 'y', 'type': 'integer'}] """ - from unitpackage.local import create_df_resource_from_df, create_unitpackage + from unitpackage.local import create_df_resource_from_df resource = create_df_resource_from_df(df) resource.name = basename.lower() - package = create_unitpackage( - resource=resource, metadata=metadata, fields=fields - ) - return cls(resource=package.resources[0]) + return cls(resource) def save(self, *, outdir, basename=None): r""" @@ -1085,7 +1137,8 @@ def save(self, *, outdir, basename=None): >>> from unitpackage.entry import Entry >>> df = pd.DataFrame({'x':[1,2,3], 'y':[2,3,4]}) >>> basename = 'save_datetime' - >>> entry = Entry.from_df(df=df, basename=basename, metadata={'currentTime':datetime.now()}) + >>> entry = Entry.from_df(df=df, basename=basename) + >>> entry.metadata.from_dict({'currentTime':datetime.now()}) >>> entry.save(outdir='./test/generated') >>> os.path.exists(f'test/generated/{basename}.json') and os.path.exists(f'test/generated/{basename}.csv') True diff --git a/unitpackage/entrypoint.py b/unitpackage/entrypoint.py index 1d68f68..7d2df5c 100644 --- a/unitpackage/entrypoint.py +++ b/unitpackage/entrypoint.py @@ -114,9 +114,11 @@ def convert(csv, device, outdir, metadata): with open(csv, "r") as file: # pylint: disable=unspecified-encoding loader = BaseLoader(file) - entry = Entry.from_df( - df=loader.df, basename=Path(csv).stem, metadata=metadata, fields=fields - ) + entry = Entry.from_df(df=loader.df, basename=Path(csv).stem) + if fields: + entry = entry.update_fields(fields=fields) + if metadata: + entry.metadata.from_dict(metadata) entry.save(outdir=outdir) diff --git a/unitpackage/local.py b/unitpackage/local.py index 148f161..d3ba1ab 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -259,33 +259,38 @@ def collect_datapackages(data): return [Package(package) for package in packages] -def update_fields(schema, fields): +def update_fields(original_fields, new_fields): r""" - Return a new Schema based on :param schema: where the fields have been - updated with the information in :param fields:. + Return a new list of fields where a list of fields has been updated + based on a new list of fields. - The :param fields: list must must be structured such as - `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]`. 
+ The :param: original_fields: list and :param new_fields: list + must must be structured such as + `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]` + and each entry must contain a key `name` corresponding to a field name + in the original fields. EXAMPLES:: >>> from unitpackage.local import update_fields, create_tabular_resource_from_csv >>> schema = create_tabular_resource_from_csv("./examples/from_csv/from_csv.csv").schema - >>> schema - {'fields': [{'name': 'E', 'type': 'integer'}, {'name': 'I', 'type': 'integer'}]} + >>> original_fields = schema.to_dict()['fields'] + >>> original_fields # doctest: +NORMALIZE_WHITESPACE + [{'name': 'E', 'type': 'integer'}, + {'name': 'I', 'type': 'integer'}] - >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] - >>> new_schema = update_fields(schema, fields) - >>> new_schema # doctest: +NORMALIZE_WHITESPACE - {'fields': [{'name': 'E', 'type': 'integer', 'unit': 'mV'}, - {'name': 'I', 'type': 'integer', 'unit': 'A'}]} + >>> new_fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] + >>> updated_fields = update_fields(original_fields, new_fields) + >>> updated_fields # doctest: +NORMALIZE_WHITESPACE + [{'name': 'E', 'type': 'integer', 'unit': 'mV'}, + {'name': 'I', 'type': 'integer', 'unit': 'A'}] TESTS: Invalid fields:: >>> fields = 'not a list' - >>> new_schema = update_fields(schema, fields) + >>> updated_fields = update_fields(original_fields, fields) Traceback (most recent call last): ... ValueError: 'fields' must be a list such as @@ -295,53 +300,67 @@ def update_fields(schema, fields): More fields than required:: >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] - >>> new_schema = update_fields(schema, fields) # doctest: +NORMALIZE_WHITESPACE + >>> updated_fields = update_fields(original_fields, fields) + >>> updated_fields # doctest: +NORMALIZE_WHITESPACE + [{'name': 'E', 'type': 'integer', 'unit': 'mV'}, + {'name': 'I', 'type': 'integer', 'unit': 'A'}] Part of the fields specified: >>> fields = [{'name':'E', 'unit': 'mV'}] - >>> new_schema = update_fields(schema, fields) # doctest: +NORMALIZE_WHITESPACE - + >>> updated_fields = update_fields(original_fields, fields) + >>> updated_fields # doctest: +NORMALIZE_WHITESPACE + [{'name': 'E', 'type': 'integer', 'unit': 'mV'}, + {'name': 'I', 'type': 'integer'}] """ - original_schema = schema - if not isinstance(fields, list): - raise ValueError( - "'fields' must be a list such as \ - [{'name': '', 'unit':''}]`, \ - e.g., `[{'name':'E', 'unit': 'mV}, {'name':'T', 'unit': 'K}]`" - ) - - # remove field if it is not a Mapping instance - from collections.abc import Mapping - for field in fields: - if not isinstance(field, Mapping): + def validate_field_structure(fields): + if not isinstance(fields, list): raise ValueError( - "'field' must be a dict such as {'name': '', 'unit':''},\ - e.g., `{'name':'j', 'unit': 'uA / cm2'}`" + "'fields' must be a list such as \ + [{'name': '', 'unit':''}]`, \ + e.g., `[{'name':'E', 'unit': 'mV}, {'name':'T', 'unit': 'K}]`" ) - provided_schema = Schema.from_descriptor({"fields": fields}, allow_invalid=True) + # remove field if it is not a Mapping instance + from collections.abc import Mapping - new_fields = [] + for field in fields: + if not isinstance(field, Mapping): + raise ValueError( + "'field' must be a dict such as {'name': '', 'unit':''},\ + e.g., `{'name':'j', 'unit': 'uA / cm2'}`" + ) + + validate_field_structure(original_fields) 
+ validate_field_structure(new_fields) + + original_schema = Schema({"fields": original_fields}) + + # Create a lookup dict for provided fields by name + provided_fields_dict = { + field["name"]: field for field in new_fields if "name" in field + } + + updated_fields = [] unspecified_fields = [] unused_provided_fields = [] # First, update fields that exist in the original schema, # and record which original fields have no additional information provided. for name in original_schema.field_names: - if name in provided_schema.field_names: - new_fields.append( - provided_schema.get_field(name).to_dict() - | original_schema.get_field(name).to_dict() - ) + if name in provided_fields_dict: + # Start with original field, then update only the keys provided in the input + updated_field = original_schema.get_field(name).to_dict() + updated_field.update(provided_fields_dict[name]) + updated_fields.append(updated_field) else: unspecified_fields.append(name) - new_fields.append(original_schema.get_field(name).to_dict()) + updated_fields.append(original_schema.get_field(name).to_dict()) # Then, record any provided fields that are not present in the original schema. - for name in provided_schema.field_names: + for name in provided_fields_dict.keys(): if name not in original_schema.field_names: unused_provided_fields.append(name) if len(unspecified_fields) != 0: @@ -354,7 +373,7 @@ def update_fields(schema, fields): f"Fields with names {unused_provided_fields} were provided but do not appear in the field names of tabular resource {original_schema.field_names}." ) - return Schema.from_descriptor({"fields": new_fields}) + return updated_fields def create_unitpackage(resource, metadata=None, fields=None): @@ -369,8 +388,8 @@ def create_unitpackage(resource, metadata=None, fields=None): >>> from unitpackage.local import create_tabular_resource_from_csv, create_unitpackage >>> resource = create_tabular_resource_from_csv("./examples/from_csv/from_csv.csv") - >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] - >>> package = create_unitpackage(resource=resource, fields=fields) + >>> new_fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] + >>> package = create_unitpackage(resource=resource, fields=new_fields) >>> package # doctest: +NORMALIZE_WHITESPACE {'resources': [{'name': ... 
@@ -381,7 +400,10 @@ def create_unitpackage(resource, metadata=None, fields=None): if fields: # Update fields in the Resource describing the data in the CSV - resource.schema = update_fields(resource.schema, fields) + updated_fields = update_fields(resource.schema.to_dict()["fields"], fields) + original_schema = resource.schema.to_dict() + original_schema["fields"] = updated_fields + resource.schema = Schema.from_descriptor(original_schema) package = Package(resources=[resource]) From 15a8ff84d7dcb77ce49773230c508dc1e76673cc Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Sat, 31 Jan 2026 13:34:31 +0100 Subject: [PATCH 14/14] Fix resource name upon importing complex csv --- unitpackage/entry.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index d3f2ab2..569b666 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -659,6 +659,13 @@ def add_columns(self, df, new_fields): >>> new_entry.field_unit('P/A') Unit("A V / m2") + TESTS: + + Validate that the identifier is preserved:: + + >>> new_entry.identifier + 'alves_2011_electrochemistry_6010_f1a_solid' + """ import pandas as pd @@ -840,11 +847,9 @@ def from_csv( delimiters=None, ): r""" - Returns an entry constructed from a CSV with a single header line. + Returns an entry constructed from a CSV. - EXAMPLES: - - Units describing the fields can be provided:: + EXAMPLES:: >>> from unitpackage.entry import Entry >>> entry = Entry.from_csv(csvname='examples/from_csv/from_csv.csv') @@ -872,6 +877,19 @@ def from_csv( 'path': 'UpperCase.csv', ... + CSV with a more complex structure, such as multiple header lines can be constructed:: + + >>> filename = 'examples/from_csv/from_csv_multiple_headers.csv' + >>> entry = Entry.from_csv(csvname='examples/from_csv/from_csv_multiple_headers.csv', column_header_lines=2) + >>> entry.resource # doctest: +NORMALIZE_WHITESPACE + {'name': 'from_csv_multiple_headers', + 'type': 'table', + 'data': [], + 'format': 'pandas', + 'mediatype': 'application/pandas', + 'schema': {'fields': [{'name': 'E / V', 'type': 'integer'}, + {'name': 'j / A / cm2', 'type': 'integer'}]}} + """ from unitpackage.local import create_tabular_resource_from_csv @@ -885,6 +903,13 @@ def from_csv( delimiters=delimiters, ) + from pathlib import Path + + if resource.name == "memory": + resource.name = Path( + csvname + ).stem.lower() # Use stem (filename without extension) + return cls(resource) @classmethod
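# ---------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): a plain-dict
# restatement of the merge rule in the rewritten
# ``unitpackage.local.update_fields`` — keys provided for a field of the
# same name are layered over the inferred field, unmentioned original
# fields pass through unchanged, and provided names that do not occur in
# the original schema are dropped (the real function logs warnings).
# ---------------------------------------------------------------------
def merge_fields(original_fields, new_fields):
    provided = {f["name"]: f for f in new_fields if "name" in f}
    merged = []
    for field in original_fields:
        updated = dict(field)                       # keep inferred keys such as 'type'
        updated.update(provided.get(field["name"], {}))
        merged.append(updated)
    return merged


print(merge_fields(
    [{"name": "E", "type": "integer"}, {"name": "I", "type": "integer"}],
    [{"name": "E", "unit": "mV"}, {"name": "x", "unit": "m"}],
))
# [{'name': 'E', 'type': 'integer', 'unit': 'mV'}, {'name': 'I', 'type': 'integer'}]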