From 05f2c154ee26fd86ed6f5d1dfe55b96128a2d817 Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Tue, 27 Jan 2026 14:56:41 +0100 Subject: [PATCH 1/6] refactor loading resources with different formats --- unitpackage/entry.py | 4 +- unitpackage/loaders/baseloader.py | 14 +++ unitpackage/local.py | 179 +++++++++++++++++++----------- 3 files changed, 133 insertions(+), 64 deletions(-) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index 5ddfca1..2bc9f67 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -526,9 +526,9 @@ def mutable_resource(self): if not self.resource.custom["MutableResource"]: from frictionless import Schema - from unitpackage.local import create_df_resource + from unitpackage.local import create_df_resource_from_tabular_resource - self.resource.custom["MutableResource"] = create_df_resource(self.resource) + self.resource.custom["MutableResource"] = create_df_resource_from_tabular_resource(self.resource) self.resource.custom["MutableResource"].schema = Schema.from_descriptor( self.resource.schema.to_dict() ) diff --git a/unitpackage/loaders/baseloader.py b/unitpackage/loaders/baseloader.py index e5dd4b6..37f0e5b 100644 --- a/unitpackage/loaders/baseloader.py +++ b/unitpackage/loaders/baseloader.py @@ -422,6 +422,20 @@ def df(self): 0 0 0 1 1 1 + A file with two column header lines, which is sometimes, for example, + used for storing units to the values:: + + >>> from io import StringIO + >>> file = StringIO(r'''a,b + ... m,s + ... 0,0 + ... 1,1''') + >>> csv = BaseLoader(file, column_header_lines=2) + >>> csv.df + a / m b / s + 0 0 0 + 1 1 1 + """ import pandas as pd diff --git a/unitpackage/local.py b/unitpackage/local.py index 9f05eaa..f116987 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -36,15 +36,79 @@ logger = logging.getLogger("unitpackage") -def create_df_resource(resource): +def create_tabular_resource_from_csv(csvname): + r""" + Return a Data Package built from a :param metadata: dict and tabular data + in :param csvname: str. + + The :param fields: list must must be structured such as + `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]`. + + EXAMPLES:: + + >>> resource = create_tabular_resource_from_csv("./examples/from_csv/from_csv.csv") + >>> resource # doctest: +NORMALIZE_WHITESPACE + {'name': 'from_csv', ... + + >>> resource.format + 'csv' + + """ + + csv_basename = os.path.basename(csvname) + + resource = Resource( + path=csv_basename, + basepath=os.path.dirname(csvname) or ".", + ) + + resource.infer() + + return resource + +def create_df_resource_from_df(df): + r""" + Return a pandas dataframe resource for a pandas DataFrame. + + EXAMPLES:: + + >>> data = {'x': [1, 2, 3], 'y': [4, 5, 6]} + >>> df = pd.DataFrame(data) + >>> from unitpackage.local import create_df_resource_from_df + >>> resource = create_df_resource_from_df(df) + >>> resource # doctest: +NORMALIZE_WHITESPACE + {'name': 'memory', + 'type': 'table', + 'data': [], + 'format': 'pandas', ... + + >>> resource.data + x y + 0 1 4 + 1 2 5 + 2 3 6 + + >>> resource.format + 'pandas' + + + """ + df_resource = Resource(df) + df_resource.infer() + + return df_resource + + +def create_df_resource_from_tabular_resource(resource): r""" Return a pandas dataframe resource for a frictionless Tabular Resource. EXAMPLES:: >>> from frictionless import Package - >>> resource = Package("./examples/local/no_bibliography/no_bibliography.json").resources[0] - >>> df_resource = create_df_resource(resource) # doctest: +NORMALIZE_WHITESPACE + >>> from unitpackage.local import create_df_resource_from_tabular_resource + >>> tabular_resource = Package("./examples/local/no_bibliography/no_bibliography.json").resources[0] + >>> df_resource = create_df_resource_from_tabular_resource(tabular_resource) # doctest: +NORMALIZE_WHITESPACE >>> df_resource {'name': 'memory', ... @@ -70,10 +134,6 @@ def create_df_resource(resource): 2 3 6 """ - if not resource: - raise ValueError( - "dataframe resource can not be created since the Data Package has no resources." - ) descriptor_path = ( resource.basepath + "/" + resource.path if resource.basepath else resource.path ) @@ -122,6 +182,54 @@ def collect_datapackages(data): return [Package(package) for package in packages] +def update_fields(schema, fields): + original_schema = schema + if not isinstance(fields, list): + raise ValueError( + "'fields' must be a list such as \ + [{'name': '', 'unit':''}]`, \ + e.g., `[{'name':'E', 'unit': 'mV}, {'name':'T', 'unit': 'K}]`" + ) + + # remove field if it is not a Mapping instance + from collections.abc import Mapping + + for field in fields: + if not isinstance(field, Mapping): + raise ValueError( + "'field' must be a dict such as {'name': '', 'unit':''},\ + e.g., `{'name':'j', 'unit': 'uA / cm2'}`" + ) + + provided_schema = Schema.from_descriptor({"fields": fields}, allow_invalid=True) + + new_fields = [] + unspecified_fields = [] + + for name in original_schema.field_names: + if name in provided_schema.field_names: + new_fields.append( + provided_schema.get_field(name).to_dict() + | original_schema.get_field(name).to_dict() + ) + else: + new_fields.append(original_schema.get_field(name).to_dict()) + + if len(unspecified_fields) != 0: + logger.warning( + f"Additional information were not provided for fields {unspecified_fields}." + ) + + unused_provided_fields = [] + for name in provided_schema.field_names: + if name not in original_schema.field_names: + unused_provided_fields.append(name) + if len(unused_provided_fields) != 0: + logger.warning( + f"Fields with names {unused_provided_fields} was provided but does not appear in the field names of tabular resource {original_schema.field_names}." + ) + + return Schema.from_descriptor({"fields": new_fields}) def create_unitpackage(csvname, metadata=None, fields=None): r""" @@ -163,67 +271,14 @@ def create_unitpackage(csvname, metadata=None, fields=None): """ - csv_basename = os.path.basename(csvname) - - resource = Resource( - path=csv_basename, - basepath=os.path.dirname(csvname) or ".", - ) - - resource.infer() + resource = create_tabular_resource_from_csv(csvname) resource.custom.setdefault("metadata", {}) resource.custom["metadata"].setdefault("echemdb", metadata) if fields: # Update fields in the Resource describing the data in the CSV - resource_schema = resource.schema - if not isinstance(fields, list): - raise ValueError( - "'fields' must be a list such as \ - [{'name': '', 'unit':''}]`, \ - e.g., `[{'name':'E', 'unit': 'mV}, {'name':'T', 'unit': 'K}]`" - ) - - # remove field if it is not a Mapping instance - from collections.abc import Mapping - - for field in fields: - if not isinstance(field, Mapping): - raise ValueError( - "'field' must be a dict such as {'name': '', 'unit':''},\ - e.g., `{'name':'j', 'unit': 'uA / cm2'}`" - ) - - provided_schema = Schema.from_descriptor({"fields": fields}, allow_invalid=True) - - new_fields = [] - unspecified_fields = [] - - for name in resource_schema.field_names: - if name in provided_schema.field_names: - new_fields.append( - provided_schema.get_field(name).to_dict() - | resource_schema.get_field(name).to_dict() - ) - else: - new_fields.append(resource_schema.get_field(name).to_dict()) - - if len(unspecified_fields) != 0: - logger.warning( - f"Additional information were not provided for fields {unspecified_fields}." - ) - - unused_provided_fields = [] - for name in provided_schema.field_names: - if name not in resource_schema.field_names: - unused_provided_fields.append(name) - if len(unused_provided_fields) != 0: - logger.warning( - f"Fields with names {unused_provided_fields} was provided but does not appear in the field names of tabular resource {resource_schema.field_names}." - ) - - resource.schema = Schema.from_descriptor({"fields": new_fields}) + resource.schema = update_fields(resource.schema, fields) package = Package(resources=[resource]) From 2fbafb5bbc0f4857ba9b1e6412eb5b6110b11d0f Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Wed, 28 Jan 2026 09:49:32 +0100 Subject: [PATCH 2/6] refactor loading from df --- pixi.lock | 19 ++++++++++- unitpackage/entry.py | 78 +++++++++++++++++++++++++++++++------------- unitpackage/local.py | 45 ++++++++++++++++++------- 3 files changed, 108 insertions(+), 34 deletions(-) diff --git a/pixi.lock b/pixi.lock index 2c94af5..13e2aa0 100644 --- a/pixi.lock +++ b/pixi.lock @@ -5,6 +5,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -1579,6 +1581,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -3386,6 +3390,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -5074,6 +5080,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -6219,6 +6227,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -7816,6 +7826,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -9428,6 +9440,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -11031,6 +11045,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -12644,6 +12660,8 @@ environments: - url: https://conda.anaconda.org/conda-forge/ indexes: - https://pypi.org/simple + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -40375,7 +40393,6 @@ packages: - plotly>=5,<7 - pybtex>=0.25,<0.26 requires_python: '>=3.10' - editable: true - conda: https://conda.anaconda.org/conda-forge/noarch/uri-template-1.3.0-pyhd8ed1ab_1.conda sha256: e0eb6c8daf892b3056f08416a96d68b0a358b7c46b99c8a50481b22631a4dfc0 md5: e7cb0f5745e4c5035a460248334af7eb diff --git a/unitpackage/entry.py b/unitpackage/entry.py index 2bc9f67..d79d735 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -503,7 +503,7 @@ def add_offset(self, field_name=None, offset=None, unit=""): @property def mutable_resource(self): r""" - Return the data of this entry's "MutableResource" as a data frame. + Return the entry's "MutableResource". EXAMPLES:: @@ -524,11 +524,24 @@ def mutable_resource(self): self.resource.custom.setdefault("MutableResource", "") if not self.resource.custom["MutableResource"]: - from frictionless import Schema + if not self.resource.format in ["csv", "pandas"]: + raise ValueError( + "MutableResource can only be created from resources of format 'csv' or 'pandas'." + ) + + if self.resource.format == "csv": + + from unitpackage.local import create_df_resource_from_tabular_resource + + self.resource.custom["MutableResource"] = ( + create_df_resource_from_tabular_resource(self.resource) + ) - from unitpackage.local import create_df_resource_from_tabular_resource + elif self.resource.format == "pandas": + self.resource.custom["MutableResource"] = self.resource + + from frictionless import Schema - self.resource.custom["MutableResource"] = create_df_resource_from_tabular_resource(self.resource) self.resource.custom["MutableResource"].schema = Schema.from_descriptor( self.resource.schema.to_dict() ) @@ -556,6 +569,18 @@ def df(self): {'name': 'E', 'type': 'number', 'unit': 'V', 'reference': 'RHE'}, {'name': 'j', 'type': 'number', 'unit': 'A / m2'}] + TESTS:: + + >>> import pandas as pd + >>> from unitpackage.entry import Entry + >>> df = pd.DataFrame({'x':[1,2,3], 'y':[2,3,4]}) + >>> entry = Entry.from_df(df=df, basename='test_df') + >>> entry.df + x y + 0 1 2 + 1 2 3 + 2 3 4 + """ return self.mutable_resource.data @@ -718,7 +743,7 @@ def from_csv(cls, csvname, metadata=None, fields=None): Units describing the fields can be provided:: - >>> import os + >>> from unitpackage.entry import Entry >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] >>> entry = Entry.from_csv(csvname='examples/from_csv/from_csv.csv', fields=fields) >>> entry @@ -730,7 +755,6 @@ def from_csv(cls, csvname, metadata=None, fields=None): Metadata can be appended:: - >>> import os >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] >>> metadata = {'user':'Max Doe'} >>> entry = Entry.from_csv(csvname='examples/from_csv/from_csv.csv', metadata=metadata, fields=fields) @@ -742,7 +766,6 @@ def from_csv(cls, csvname, metadata=None, fields=None): A filename containing upper case characters:: - >>> import os >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] >>> entry = Entry.from_csv(csvname='examples/from_csv/UpperCase.csv', fields=fields) >>> entry @@ -757,9 +780,15 @@ def from_csv(cls, csvname, metadata=None, fields=None): ... """ - from unitpackage.local import create_unitpackage + from unitpackage.local import ( + create_tabular_resource_from_csv, + create_unitpackage, + ) - package = create_unitpackage(csvname=csvname, metadata=metadata, fields=fields) + resource = create_tabular_resource_from_csv(csvname=csvname) + package = create_unitpackage( + resource=resource, metadata=metadata, fields=fields + ) return cls(resource=package.resources[0]) @@ -891,9 +920,13 @@ def from_local(cls, filename): return cls(resource=package.resources[0]) @classmethod - def from_df(cls, df, metadata=None, fields=None, outdir=None, *, basename): + def from_df(cls, df, metadata=None, fields=None, *, basename): r""" Returns an entry constructed from a pandas dataframe. + A name `basename` for the entry must be provided. + The name must be lower-case and contain only alphanumeric + characters along with `.` , `_` or `-` characters'. + (Upper case characters are converted to lower case.) EXAMPLES:: @@ -941,21 +974,15 @@ def from_df(cls, df, metadata=None, fields=None, outdir=None, *, basename): [{'name': 'x', 'type': 'integer', 'unit': 'm'}, {'name': 'y', 'type': 'integer'}] """ - if outdir is None: - import atexit - import shutil - import tempfile - - outdir = tempfile.mkdtemp() - atexit.register(shutil.rmtree, outdir) + from unitpackage.local import create_df_resource_from_df, create_unitpackage - csvname = basename + ".csv" + resource = create_df_resource_from_df(df) + resource.name = basename.lower() - df.to_csv(os.path.join(outdir, csvname), index=False) - - return cls.from_csv( - os.path.join(outdir, csvname), metadata=metadata, fields=fields + package = create_unitpackage( + resource=resource, metadata=metadata, fields=fields ) + return cls(resource=package.resources[0]) def save(self, *, outdir, basename=None): r""" @@ -1028,6 +1055,7 @@ def save(self, *, outdir, basename=None): os.makedirs(outdir) basename = basename or self.identifier + basename = basename.lower() csv_name = os.path.join(outdir, basename + ".csv") json_name = os.path.join(outdir, basename + ".json") @@ -1038,6 +1066,12 @@ def save(self, *, outdir, basename=None): self.resource.path = basename + ".csv" self.resource.name = basename + # convert a pandas resource into a csv resource + if self.resource.format == "pandas": + self.resource.format = "csv" + self.resource.mediatype = "text/csv" + del self.resource.data + resource = self.resource.to_dict() # update the fields from the main resource with those from the "MutableResource"resource diff --git a/unitpackage/local.py b/unitpackage/local.py index f116987..e294af1 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -66,6 +66,7 @@ def create_tabular_resource_from_csv(csvname): return resource + def create_df_resource_from_df(df): r""" Return a pandas dataframe resource for a pandas DataFrame. @@ -182,7 +183,29 @@ def collect_datapackages(data): return [Package(package) for package in packages] + def update_fields(schema, fields): + r""" + Return a new Schema based on :param schema: where the fields have been + updated with the information in :param fields:. + + The :param fields: list must must be structured such as + `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]`. + + EXAMPLES:: + + >>> from unitpackage.local import update_fields, create_tabular_resource_from_csv + >>> schema = create_tabular_resource_from_csv("./examples/from_csv/from_csv.csv").schema + >>> schema + {'fields': [{'name': 'E', 'type': 'integer'}, {'name': 'I', 'type': 'integer'}]} + + >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] + >>> new_schema = update_fields(schema, fields) + >>> new_schema # doctest: +NORMALIZE_WHITESPACE + {'fields': [{'name': 'E', 'type': 'integer', 'unit': 'mV'}, + {'name': 'I', 'type': 'integer', 'unit': 'A'}]} + + """ original_schema = schema if not isinstance(fields, list): raise ValueError( @@ -205,6 +228,7 @@ def update_fields(schema, fields): new_fields = [] unspecified_fields = [] + unused_provided_fields = [] for name in original_schema.field_names: if name in provided_schema.field_names: @@ -212,6 +236,8 @@ def update_fields(schema, fields): provided_schema.get_field(name).to_dict() | original_schema.get_field(name).to_dict() ) + elif name not in original_schema.field_names: + unused_provided_fields.append(name) else: new_fields.append(original_schema.get_field(name).to_dict()) @@ -220,10 +246,6 @@ def update_fields(schema, fields): f"Additional information were not provided for fields {unspecified_fields}." ) - unused_provided_fields = [] - for name in provided_schema.field_names: - if name not in original_schema.field_names: - unused_provided_fields.append(name) if len(unused_provided_fields) != 0: logger.warning( f"Fields with names {unused_provided_fields} was provided but does not appear in the field names of tabular resource {original_schema.field_names}." @@ -231,7 +253,8 @@ def update_fields(schema, fields): return Schema.from_descriptor({"fields": new_fields}) -def create_unitpackage(csvname, metadata=None, fields=None): + +def create_unitpackage(resource, metadata=None, fields=None): r""" Return a Data Package built from a :param metadata: dict and tabular data in :param csvname: str. @@ -241,8 +264,10 @@ def create_unitpackage(csvname, metadata=None, fields=None): EXAMPLES:: + >>> from unitpackage.local import create_tabular_resource_from_csv, create_unitpackage + >>> resource = create_tabular_resource_from_csv("./examples/from_csv/from_csv.csv") >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}] - >>> package = create_unitpackage("./examples/from_csv/from_csv.csv", fields=fields) + >>> package = create_unitpackage(resource=resource, fields=fields) >>> package # doctest: +NORMALIZE_WHITESPACE {'resources': [{'name': ... @@ -252,7 +277,7 @@ def create_unitpackage(csvname, metadata=None, fields=None): Invalid fields:: >>> fields = 'not a list' - >>> package = create_unitpackage("./examples/from_csv/from_csv.csv", fields=fields) # doctest: +NORMALIZE_WHITESPACE + >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE Traceback (most recent call last): ... ValueError: 'fields' must be a list such as @@ -262,17 +287,15 @@ def create_unitpackage(csvname, metadata=None, fields=None): More fields than required:: >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] - >>> package = create_unitpackage("./examples/from_csv/from_csv.csv", fields=fields) # doctest: +NORMALIZE_WHITESPACE + >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE Part of the fields specified: >>> fields = [{'name':'E', 'unit': 'mV'}] - >>> package = create_unitpackage("./examples/from_csv/from_csv.csv", fields=fields) # doctest: +NORMALIZE_WHITESPACE + >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE """ - resource = create_tabular_resource_from_csv(csvname) - resource.custom.setdefault("metadata", {}) resource.custom["metadata"].setdefault("echemdb", metadata) From 0fb4f7ed2bb32db948929bed62fc607677dec0cb Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Wed, 28 Jan 2026 09:55:50 +0100 Subject: [PATCH 3/6] Move docstrings to update fields --- unitpackage/local.py | 45 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/unitpackage/local.py b/unitpackage/local.py index e294af1..6cee4ef 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -205,6 +205,29 @@ def update_fields(schema, fields): {'fields': [{'name': 'E', 'type': 'integer', 'unit': 'mV'}, {'name': 'I', 'type': 'integer', 'unit': 'A'}]} + TESTS: + + Invalid fields:: + + >>> fields = 'not a list' + >>> new_schema = update_fields(schema, fields) + Traceback (most recent call last): + ... + ValueError: 'fields' must be a list such as + [{'name': '', 'unit':''}]`, + e.g., `[{'name':'E', 'unit': 'mV}, {'name':'T', 'unit': 'K}]` + + More fields than required:: + + >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] + >>> new_schema = update_fields(schema, fields) # doctest: +NORMALIZE_WHITESPACE + + Part of the fields specified: + + >>> fields = [{'name':'E', 'unit': 'mV'}] + >>> new_schema = update_fields(schema, fields) # doctest: +NORMALIZE_WHITESPACE + + """ original_schema = schema if not isinstance(fields, list): @@ -272,28 +295,6 @@ def create_unitpackage(resource, metadata=None, fields=None): {'resources': [{'name': ... - TESTS: - - Invalid fields:: - - >>> fields = 'not a list' - >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE - Traceback (most recent call last): - ... - ValueError: 'fields' must be a list such as - [{'name': '', 'unit':''}]`, - e.g., `[{'name':'E', 'unit': 'mV}, {'name':'T', 'unit': 'K}]` - - More fields than required:: - - >>> fields = [{'name':'E', 'unit': 'mV'}, {'name':'I', 'unit': 'A'}, {'name':'x', 'unit': 'm'}] - >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE - - Part of the fields specified: - - >>> fields = [{'name':'E', 'unit': 'mV'}] - >>> package = create_unitpackage(resource=resource, fields=fields) # doctest: +NORMALIZE_WHITESPACE - """ resource.custom.setdefault("metadata", {}) From 460fb63bdf85d3f0b0d1f9bd565cd1664d6e7c2b Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Wed, 28 Jan 2026 11:15:46 +0100 Subject: [PATCH 4/6] include baseloader --- unitpackage/entry.py | 25 ++++++++-- unitpackage/local.py | 109 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 113 insertions(+), 21 deletions(-) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index d79d735..3d8b32a 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -55,7 +55,7 @@ # ******************************************************************** # This file is part of unitpackage. # -# Copyright (C) 2021-2025 Albert Engstfeld +# Copyright (C) 2021-2026 Albert Engstfeld # Copyright (C) 2021 Johannes Hermann # Copyright (C) 2021-2022 Julian Rüth # Copyright (C) 2021 Nicolas Hörmann @@ -735,7 +735,17 @@ def plot(self, x_label=None, y_label=None, name=None): return fig @classmethod - def from_csv(cls, csvname, metadata=None, fields=None): + def from_csv( + cls, + csvname, + encoding=None, + header_lines=None, + column_header_lines=None, + decimal=None, + delimiters=None, + metadata=None, + fields=None, + ): r""" Returns an entry constructed from a CSV with a single header line. @@ -785,7 +795,16 @@ def from_csv(cls, csvname, metadata=None, fields=None): create_unitpackage, ) - resource = create_tabular_resource_from_csv(csvname=csvname) + # pylint: disable=duplicate-code + resource = create_tabular_resource_from_csv( + csvname=csvname, + encoding=encoding, + header_lines=header_lines, + column_header_lines=column_header_lines, + decimal=decimal, + delimiters=delimiters, + ) + package = create_unitpackage( resource=resource, metadata=metadata, fields=fields ) diff --git a/unitpackage/local.py b/unitpackage/local.py index 6cee4ef..70fbdbe 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -6,7 +6,7 @@ # ******************************************************************** # This file is part of unitpackage. # -# Copyright (C) 2021-2025 Albert Engstfeld +# Copyright (C) 2021-2026 Albert Engstfeld # Copyright (C) 2021 Johannes Hermann # Copyright (C) 2021 Julian Rüth # Copyright (C) 2021 Nicolas Hörmann @@ -36,35 +36,109 @@ logger = logging.getLogger("unitpackage") -def create_tabular_resource_from_csv(csvname): +def create_tabular_resource_from_csv( + csvname, + encoding=None, + header_lines=None, + column_header_lines=None, + decimal=None, + delimiters=None, +): r""" - Return a Data Package built from a :param metadata: dict and tabular data - in :param csvname: str. - - The :param fields: list must must be structured such as - `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]`. + Return a resource built from a provided CSV. EXAMPLES:: - >>> resource = create_tabular_resource_from_csv("./examples/from_csv/from_csv.csv") + For standard CSV files (single header line and subsequent + lines with data, using `.` as decimal separator.) + a tabular data resource is created:: + + >>> filename = './examples/from_csv/from_csv.csv' + >>> resource = create_tabular_resource_from_csv(filename) >>> resource # doctest: +NORMALIZE_WHITESPACE - {'name': 'from_csv', ... + {'name': 'from_csv', + 'type': 'table', + 'path': 'from_csv.csv', + 'scheme': 'file', + 'format': 'csv', + 'mediatype': 'text/csv', ... - >>> resource.format - 'csv' + For CSV files with a more complex structure (header, multiple column header lines, or other separators) + a pandas dataframe resource is created instead:: - """ + >>> filename = 'examples/from_csv/from_csv_multiple_headers.csv' + >>> resource = create_tabular_resource_from_csv(csvname=filename, column_header_lines=2) + >>> resource # doctest: +NORMALIZE_WHITESPACE + {'name': 'memory', + 'type': 'table', + 'data': [], + 'format': 'pandas', + 'mediatype': 'application/pandas', + 'schema': {'fields': [{'name': 'E / V', 'type': 'integer'}, + {'name': 'j / A / cm2', 'type': 'integer'}]}} + + """ csv_basename = os.path.basename(csvname) - resource = Resource( - path=csv_basename, - basepath=os.path.dirname(csvname) or ".", + if not header_lines and not column_header_lines and not decimal and not delimiters: + resource = Resource( + path=csv_basename, + basepath=os.path.dirname(csvname) or ".", + ) + resource.infer() + return resource + + # pylint: disable=duplicate-code + return create_df_resource_from_csv( + csvname, + encoding=encoding, + header_lines=header_lines, + column_header_lines=column_header_lines, + decimal=decimal, + delimiters=delimiters, ) - resource.infer() - return resource +def create_df_resource_from_csv( + csvname, + encoding=None, + header_lines=None, + column_header_lines=None, + decimal=None, + delimiters=None, +): + r""" + Create a pandas dataframe resource from a CSV file. + + EXAMPLES:: + + >>> from unitpackage.local import create_df_resource_from_csv + >>> filename = 'examples/from_csv/from_csv_multiple_headers.csv' + >>> resource = create_df_resource_from_csv(csvname='examples/from_csv/from_csv_multiple_headers.csv', column_header_lines=2) + >>> resource # doctest: +NORMALIZE_WHITESPACE + {'name': 'memory', + 'type': 'table', + 'data': [], + 'format': 'pandas', + 'mediatype': 'application/pandas', + 'schema': {'fields': [{'name': 'E / V', 'type': 'integer'}, + {'name': 'j / A / cm2', 'type': 'integer'}]}} + + """ + + from unitpackage.loaders.baseloader import BaseLoader + + with open(csvname, "r", encoding=encoding or "utf-8") as f: + csv = BaseLoader( + f, + header_lines=header_lines, + column_header_lines=column_header_lines, + decimal=decimal, + delimiters=delimiters, + ) + + return create_df_resource_from_df(csv.df) def create_df_resource_from_df(df): @@ -296,7 +370,6 @@ def create_unitpackage(resource, metadata=None, fields=None): ... """ - resource.custom.setdefault("metadata", {}) resource.custom["metadata"].setdefault("echemdb", metadata) From b399db315331c355fe6a4d6d1e50e2b1aa1d4ec0 Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Wed, 28 Jan 2026 11:27:51 +0100 Subject: [PATCH 5/6] fix pixi -v in workflows and add missing file --- .github/workflows/doc.yml | 2 +- .github/workflows/lint.yml | 2 +- .github/workflows/test.yml | 2 +- examples/from_csv/from_csv_multiple_headers.csv | 4 ++++ 4 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 examples/from_csv/from_csv_multiple_headers.csv diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml index c66ca99..ef13b7b 100644 --- a/.github/workflows/doc.yml +++ b/.github/workflows/doc.yml @@ -21,7 +21,7 @@ jobs: uses: actions/checkout@v6 with: { submodules: recursive } - uses: prefix-dev/setup-pixi@v0.9.3 - with: { pixi-version: v0.50.2 } + with: { pixi-version: v0.63.2 } - name: build documentation run: | pixi run doc diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index dc1f92a..2d4a795 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -18,7 +18,7 @@ jobs: - name: checkout uses: actions/checkout@v6 - uses: prefix-dev/setup-pixi@v0.9.3 - with: { pixi-version: v0.50.2 } + with: { pixi-version: v0.63.2 } - name: pylint run: pixi run pylint - name: black diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4903b2e..963b3db 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -33,7 +33,7 @@ jobs: with: { submodules: recursive } - uses: prefix-dev/setup-pixi@v0.9.3 with: - pixi-version: v0.50.2 + pixi-version: v0.63.2 - name: doctest run: | pixi run -e ${{ matrix.environment }} doctest ${{ matrix.remote-data == 'remote' && '--remote-data' || '' }} diff --git a/examples/from_csv/from_csv_multiple_headers.csv b/examples/from_csv/from_csv_multiple_headers.csv new file mode 100644 index 0000000..e8b1307 --- /dev/null +++ b/examples/from_csv/from_csv_multiple_headers.csv @@ -0,0 +1,4 @@ +E,j +V,A / cm2 +1,2 +3,4 From 69e2d5d06567ceaa6e60dfe90bc3db6c7445845d Mon Sep 17 00:00:00 2001 From: Albert Engstfeld Date: Thu, 29 Jan 2026 15:34:39 +0100 Subject: [PATCH 6/6] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- unitpackage/entry.py | 5 +++-- unitpackage/local.py | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/unitpackage/entry.py b/unitpackage/entry.py index 3d8b32a..a4e1227 100644 --- a/unitpackage/entry.py +++ b/unitpackage/entry.py @@ -524,7 +524,7 @@ def mutable_resource(self): self.resource.custom.setdefault("MutableResource", "") if not self.resource.custom["MutableResource"]: - if not self.resource.format in ["csv", "pandas"]: + if self.resource.format not in ["csv", "pandas"]: raise ValueError( "MutableResource can only be created from resources of format 'csv' or 'pandas'." ) @@ -1089,7 +1089,8 @@ def save(self, *, outdir, basename=None): if self.resource.format == "pandas": self.resource.format = "csv" self.resource.mediatype = "text/csv" - del self.resource.data + if hasattr(self.resource, "data"): + del self.resource.data resource = self.resource.to_dict() diff --git a/unitpackage/local.py b/unitpackage/local.py index 70fbdbe..8867746 100644 --- a/unitpackage/local.py +++ b/unitpackage/local.py @@ -148,6 +148,7 @@ def create_df_resource_from_df(df): EXAMPLES:: >>> data = {'x': [1, 2, 3], 'y': [4, 5, 6]} + >>> import pandas as pd >>> df = pd.DataFrame(data) >>> from unitpackage.local import create_df_resource_from_df >>> resource = create_df_resource_from_df(df) @@ -327,25 +328,30 @@ def update_fields(schema, fields): unspecified_fields = [] unused_provided_fields = [] + # First, update fields that exist in the original schema, + # and record which original fields have no additional information provided. for name in original_schema.field_names: if name in provided_schema.field_names: new_fields.append( provided_schema.get_field(name).to_dict() | original_schema.get_field(name).to_dict() ) - elif name not in original_schema.field_names: - unused_provided_fields.append(name) else: + unspecified_fields.append(name) new_fields.append(original_schema.get_field(name).to_dict()) + # Then, record any provided fields that are not present in the original schema. + for name in provided_schema.field_names: + if name not in original_schema.field_names: + unused_provided_fields.append(name) if len(unspecified_fields) != 0: logger.warning( - f"Additional information were not provided for fields {unspecified_fields}." + f"Additional information was not provided for fields {unspecified_fields}." ) if len(unused_provided_fields) != 0: logger.warning( - f"Fields with names {unused_provided_fields} was provided but does not appear in the field names of tabular resource {original_schema.field_names}." + f"Fields with names {unused_provided_fields} were provided but do not appear in the field names of tabular resource {original_schema.field_names}." ) return Schema.from_descriptor({"fields": new_fields}) @@ -354,9 +360,9 @@ def update_fields(schema, fields): def create_unitpackage(resource, metadata=None, fields=None): r""" Return a Data Package built from a :param metadata: dict and tabular data - in :param csvname: str. + in :param resource: frictionless.Resource. - The :param fields: list must must be structured such as + The :param fields: list must be structured such as `[{'name':'E', 'unit': 'mV'}, {'name':'T', 'unit': 'K'}]`. EXAMPLES::