Added resource end_date: make the resource.csv path configurable via --resource-path

alexiglaser · alexiglaser · commit a079ccb1996d · 2024-12-02T16:31:01.000Z
diff --git a/digital_land/cli.py b/digital_land/cli.py
@@ -142,6 +142,7 @@ def convert_cmd(input_path, output_path):
 @dataset_resource_dir
 @issue_dir
 @click.option("--cache-dir", type=click.Path(), default="var/cache/parquet")
+@click.option("--resource-path", type=click.Path(), default="collection/resource.csv")
 @click.argument("input-paths", nargs=-1, type=click.Path(exists=True))
 @click.pass_context
 def dataset_create_cmd(
@@ -153,6 +154,7 @@ def dataset_create_cmd(
     dataset_resource_dir,
     issue_dir,
     cache_dir,
+    resource_path,
 ):
     return dataset_create(
         input_paths=input_paths,
@@ -165,6 +167,7 @@ def dataset_create_cmd(
         dataset_resource_dir=dataset_resource_dir,
         issue_dir=issue_dir,
         cache_dir=cache_dir,
+        resource_path=resource_path,
     )
 
 
diff --git a/digital_land/commands.py b/digital_land/commands.py
@@ -361,6 +361,7 @@ def dataset_create(
     column_field_dir="var/column-field",
     dataset_resource_dir="var/dataset-resource",
     cache_dir="var/cache/parquet",
+    resource_path="collection/resource.csv",
 ):
     cache_dir = os.path.join(cache_dir, dataset)
 
@@ -409,6 +410,7 @@ def dataset_create(
         organisation=organisation,
         path=output_path,
         cache_dir=cache_dir,
+        resource_path=resource_path,
         specification_dir=None,  # TBD: package should use this specification object
     )
     pqpackage.create_temp_table(input_paths)
diff --git a/digital_land/package/datasetparquet.py b/digital_land/package/datasetparquet.py
@@ -26,13 +26,14 @@
 
 
 class DatasetParquetPackage(Package):
-    def __init__(self, dataset, organisation, cache_dir, **kwargs):
+    def __init__(self, dataset, organisation, cache_dir, resource_path, **kwargs):
         self.suffix = ".parquet"
         super().__init__(dataset, tables=tables, indexes=indexes, **kwargs)
         self.dataset = dataset
         self.organisation = organisation
         self.cache_dir = cache_dir
         self._spatialite = None
+        self.resource_path = resource_path
         # Persistent connection for the class. Given name to ensure that table is stored on disk (not purely in memory)
         os.makedirs(cache_dir, exist_ok=True)
         self.duckdb_file = os.path.join(cache_dir, f"{dataset}.duckdb")
@@ -192,7 +193,7 @@ def load_entities(self):
             SELECT {fields_str} FROM (
                 SELECT {fields_str}, CASE WHEN resource_csv."end-date" IS NULL THEN '2999-12-31' ELSE resource_csv."end-date" END AS resource_end_date
                 FROM temp_table
-                LEFT JOIN read_csv_auto('collection/resource.csv') resource_csv
+                LEFT JOIN read_csv_auto('{self.resource_path}') resource_csv
                 ON temp_table.resource = resource_csv.resource
                 QUALIFY ROW_NUMBER() OVER (
                     PARTITION BY entity, field
diff --git a/tests/acceptance/test_dataset_create.py b/tests/acceptance/test_dataset_create.py
@@ -68,13 +68,23 @@ def issue_dir(session_tmp_path):
     return issue_dir
 
 
+@pytest.fixture
+def resource_path(session_tmp_path):
+    resource_path = session_tmp_path / "resource.csv"
+    columns = ["resource", "end-date"]
+    with open(resource_path, "w") as f:
+        f.write(",".join(columns) + "\n")
+    return resource_path
+
+
 def test_acceptance_dataset_create(
     session_tmp_path,
     organisation_path,
     input_paths,
     issue_dir,
     cache_path,
     dataset_dir,
+    resource_path,
 ):
     output_path = dataset_dir / f"{test_dataset}.sqlite3"
 
@@ -99,6 +109,8 @@ def test_acceptance_dataset_create(
             str(issue_dir),
             "--cache-dir",
             str(cache_path),
+            "--resource-path",
+            str(resource_path),
         ]
         + input_paths,
         catch_exceptions=False,
diff --git a/tests/integration/test_package_datasetparquet.py b/tests/integration/test_package_datasetparquet.py
@@ -518,14 +518,21 @@ def test_dataset_parquet_package(temp_dir):
 
     # Leave hash3.csv empty except for the headers (to test that an empty csv doesn't screw things up).
     with open(input_paths[2], "w") as f:
-        f.write(",".join(map(lambda x: str(x) if x is not np.nan else "", row)) + "\n")
+        f.write(",".join(columns) + "\n")
+        # f.write(",".join(map(lambda x: str(x) if x is not np.nan else "", row)) + "\n")
+
+    resource_path = str(temp_dir / "resource.csv")
+    resource_columns = ["resource", "end-date"]
+    with open(resource_path, "w") as f:
+        f.write(",".join(resource_columns) + "\n")
 
     # Instantiate the DatasetParquetPackage with temp_dir input paths and a mock schema
     package = DatasetParquetPackage(
         dataset="conservation-area",
         organisation=MockOrganisation(os.path.join(temp_dir, "organisation.csv")),
         path=os.path.join(temp_dir, "integration_test.sqlite3"),
         cache_dir=temp_dir,
+        resource_path=resource_path,
         specification_dir=None,
     )
     package.create_temp_table(input_paths)