From 125b7e81277f8e1e837398726e0cb2a10dea9e79 Mon Sep 17 00:00:00 2001 From: "Heine, Matthew" Date: Thu, 7 Mar 2024 08:46:45 -0800 Subject: [PATCH 1/3] several fixes 1. Added azure dir to .dockerignore 2. Updated Dockerfile to use newer version of node (16 is end of life) 3. Modified config.yaml to point to the correct S3 paths for comstock & resstock. 4. Modified the code to allow for custom table prefixes. The autogenerated ones were very long and unwieldy. 5. Updated config.yaml to use the Table Prefix input for all datasets. Note that generate_table_prefix from utils.py is no longer used. Should I remove? --- .dockerignore | 3 +- Dockerfile | 2 +- oedi/AWS/data_lake/construct.py | 3 +- oedi/AWS/data_lake/stack.py | 10 ++++-- oedi/AWS/utils.py | 3 +- oedi/config.yaml | 59 +++++++++++++++++++++++++++++---- 6 files changed, 66 insertions(+), 14 deletions(-) diff --git a/.dockerignore b/.dockerignore index 8ee3850..20bb136 100644 --- a/.dockerignore +++ b/.dockerignore @@ -18,4 +18,5 @@ coverage.xml .git .pytest_cache .env -*.egg-info \ No newline at end of file +*.egg-info +azure diff --git a/Dockerfile b/Dockerfile index 09e1db2..9c56a29 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM node:16-bullseye-slim +FROM node:20-bullseye-slim # Setup environment variables ENV LC_ALL=C.UTF-8 diff --git a/oedi/AWS/data_lake/construct.py b/oedi/AWS/data_lake/construct.py index 973355b..2c3213e 100644 --- a/oedi/AWS/data_lake/construct.py +++ b/oedi/AWS/data_lake/construct.py @@ -63,10 +63,9 @@ def create_crawler_role(self): managed_policies=managed_policies, ) - def create_crawler(self, location, tags): + def create_crawler(self, location, table_prefix, tags): """Create crawler in data lake by given dataset location.""" crawler_name = generate_crawler_name(s3url=location) - table_prefix = generate_table_prefix(s3url=location) if not self.crawler_role: self.crawler_role() diff --git a/oedi/AWS/data_lake/stack.py b/oedi/AWS/data_lake/stack.py index dbb24a8..943e82f 100644 --- a/oedi/AWS/data_lake/stack.py +++ b/oedi/AWS/data_lake/stack.py @@ -27,5 +27,11 @@ def __init__(self, scope: Construct, config: OEDIConfigBase) -> None: data_lake.create_database() data_lake.create_crawler_role() #TODO: data_lake.create_workgroup() - for dataset_location in database['Locations']: - data_lake.create_crawler(location=dataset_location, tags=tags) + if 'Table Prefixes' in database.keys(): + table_prefixes = database['Table Prefixes'] # Prefix for each table + elif 'Table Prefix' in database.keys(): + table_prefixes = [database['Table Prefix']] * len(database['Locations']) # One prefix for all tables + else: + table_prefixes = ['table_'] * len(database['Locations']) # No prefix specified, use generic prefix + for dataset_location, table_prefix in zip(database['Locations'], table_prefixes): + data_lake.create_crawler(location=dataset_location, table_prefix=table_prefix, tags=tags) diff --git a/oedi/AWS/utils.py b/oedi/AWS/utils.py index 6cab413..295bafa 100644 --- a/oedi/AWS/utils.py +++ b/oedi/AWS/utils.py @@ -32,6 +32,7 @@ def generate_crawler_name(s3url): bucket, path = parse_s3url(s3url) dashed_path = path.replace("/", "-") name = f"{bucket}-{dashed_path}".replace("_", "-") +# name = name[-128:] # Crawler names have a limit of 128 characters return name.lower() @@ -93,5 +94,5 @@ def generate_table_prefix(s3url): prefix = os.path.dirname(path).replace("/", "-") + "_" table_prefix = prefix.replace("-", "_").lower() - + table_prefix = table_prefix[-128:] # A table prefix has a limit of 128 characters return table_prefix diff --git a/oedi/config.yaml b/oedi/config.yaml index ca604c7..f076561 100644 --- a/oedi/config.yaml +++ b/oedi/config.yaml @@ -10,13 +10,46 @@ AWS: - s3://oedi-data-lake/pv-rooftop/developable-planes/ - s3://oedi-data-lake/pv-rooftop/rasd/ - s3://oedi-data-lake/pv-rooftop-pr/developable-planes/ - - Identifier: buildstock - Name: oedi_buildstock + Table Prefixes: + - pv_rooftop_ + - pv_rooftop_ + - pv_rooftop_ + - pv_rooftop_ + - pv_rooftop_pr_ + - Identifier: comstock + Name: oedi_comstock_amy2018_release_2 Locations: - - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/comstock_amy2018_release_1/ - - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/comstock_tmy3_release_1/ - - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/resstock_amy2018_release_1/ - - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2021/resstock_tmy3_release_1/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2023/comstock_amy2018_release_2/metadata/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2023/comstock_amy2018_release_2/weather/amy2018/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2023/comstock_amy2018_release_2/metadata_and_annual_results/national/parquet/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2023/comstock_amy2018_release_2/timeseries_individual_buildings/by_puma_midwest/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2023/comstock_amy2018_release_2/timeseries_individual_buildings/by_puma_northeast/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2023/comstock_amy2018_release_2/timeseries_individual_buildings/by_puma_south/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2023/comstock_amy2018_release_2/timeseries_individual_buildings/by_puma_west/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2023/comstock_amy2018_release_2/timeseries_individual_buildings/by_state/ + Table Prefixes: + - amy_2018_ + - weather_ + - metadata_and_annual_results_national_ + - timeseries_individual_buildings_ + - timeseries_individual_buildings_ + - timeseries_individual_buildings_ + - timeseries_individual_buildings_ + - timeseries_individual_buildings_ + - Identifier: resstock + Name: oedi_resstock_2022_tmy3_1.1 + Locations: + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2022/resstock_tmy3_release_1.1/metadata/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2022/resstock_tmy3_release_1.1/metadata_and_annual_results/national/parquet/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2022/resstock_tmy3_release_1.1/metadata_income/parquet/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2022/resstock_tmy3_release_1.1/timeseries_individual_buildings/by_state/ + - s3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2022/resstock_tmy3_release_1.1/weather/ + Table Prefixes: + - resstock_tmy3_ + - metadata_and_annual_results_national_ + - metadata_income_ + - timeseries_individual_buildings_ + - resstock_tmy3_ - Identifier: tracking_the_sun Name: oedi_tracking_the_sun Locations: @@ -26,6 +59,7 @@ AWS: - s3://oedi-data-lake/tracking-the-sun/2021/ - s3://oedi-data-lake/tracking-the-sun/2022/ - s3://oedi-data-lake/tracking-the-sun/2023/ + Table Prefix: tracking_the_sun_ - Identifier: atb Name: oedi_atb Locations: @@ -38,6 +72,16 @@ AWS: - s3://oedi-data-lake/ATB/transportation/parquet/2022/fuels - s3://oedi-data-lake/ATB/transportation/parquet/2022/vehicles - s3://oedi-data-lake/ATB/transportation/parquet/2022/vehicles_fuels + Table Prefixes: + - atb_electricity_ + - atb_electricity_ + - atb_electricity_ + - atb_electricity_ + - atb_electricity_ + - atb_transportation_2022_ + - atb_transportation_2022_ + - atb_transportation_2022_ + - atb_transportation_2022_ - Identifier: pvdaq Name: oedi_pvdaq Locations: @@ -50,6 +94,7 @@ AWS: - s3://oedi-data-lake/pvdaq/parquet/mount/ - s3://oedi-data-lake/pvdaq/parquet/other-instruments/ - s3://oedi-data-lake/pvdaq/parquet/pvdata/ + Table Prefix: pv_daq_ - Identifier: nso Name: oedi_nso Locations: @@ -60,7 +105,7 @@ AWS: - s3://oedi-data-lake-rawdata/NSO-2/loads_20Hz/ - s3://oedi-data-lake-rawdata/NSO-2/wake_masts_1min/ - s3://oedi-data-lake-rawdata/NSO-2/wake_masts_20Hz/ - + Table Prefix: 'NSO_' Staging Location: s3://user-owned-staging-bucket/ Tags: - Key: Project From 3bd78acca6884b6711098f09bcac223397883abf Mon Sep 17 00:00:00 2001 From: "Heine, Matthew" Date: Thu, 7 Mar 2024 14:51:54 -0800 Subject: [PATCH 2/3] updates based on comments --- oedi/__init__.py | 2 +- oedi/config.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/oedi/__init__.py b/oedi/__init__.py index 788da1f..fe404ae 100644 --- a/oedi/__init__.py +++ b/oedi/__init__.py @@ -1 +1 @@ -__version__ = "0.2.4" +__version__ = "0.2.5" diff --git a/oedi/config.yaml b/oedi/config.yaml index f076561..fde25a5 100644 --- a/oedi/config.yaml +++ b/oedi/config.yaml @@ -94,7 +94,7 @@ AWS: - s3://oedi-data-lake/pvdaq/parquet/mount/ - s3://oedi-data-lake/pvdaq/parquet/other-instruments/ - s3://oedi-data-lake/pvdaq/parquet/pvdata/ - Table Prefix: pv_daq_ + Table Prefix: pvdaq_ - Identifier: nso Name: oedi_nso Locations: @@ -105,10 +105,10 @@ AWS: - s3://oedi-data-lake-rawdata/NSO-2/loads_20Hz/ - s3://oedi-data-lake-rawdata/NSO-2/wake_masts_1min/ - s3://oedi-data-lake-rawdata/NSO-2/wake_masts_20Hz/ - Table Prefix: 'NSO_' + Table Prefix: 'nso_' Staging Location: s3://user-owned-staging-bucket/ Tags: - Key: Project Value: OEDI - Key: Release - Value: 0.2.4 + Value: 0.2.5 From d90d01bfce509b50a8e5c856bb097b2db35830a1 Mon Sep 17 00:00:00 2001 From: jgu2 Date: Fri, 7 Feb 2025 09:54:59 -0700 Subject: [PATCH 3/3] Update year 2025 --- LICENSE | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index da7e522..40c09c1 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2024 Alliance for Sustainable Energy, LLC and Skye Analytics, Inc. +Copyright (c) 2025 Alliance for Sustainable Energy, LLC and Skye Analytics, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 29ef781..f80dc54 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Open Data Access Tools The Open Energy Data Initiative (OEDI) provides a number of tools to enable the use of the open data published through this initiative. The source is largely written in Python, including Jupyter notebooks. -Copyright (c) 2024 Alliance for Sustainable Energy, LLC and Skye Analytics, Inc. +Copyright (c) 2025 Alliance for Sustainable Energy, LLC and Skye Analytics, Inc. Open Data Access Tools: NREL SWR-20-57. Azure Data Tools: SWR-23-92.