From fbbb788a04aaa700df0ee26005025dde8d6432d2 Mon Sep 17 00:00:00 2001
From: Alex Smith
Date: Tue, 13 Jan 2026 13:29:37 -0500
Subject: [PATCH 1/2] ny_aeba - update read paths to use S3

---
 reports/ny_aeba_grid/notebooks/analysis.qmd | 65 +++++++------------
 .../utils/make_hourly_nyiso_load.R | 65 +++++++++++++++++++
 2 files changed, 89 insertions(+), 41 deletions(-)
 create mode 100644 reports/ny_aeba_grid/utils/make_hourly_nyiso_load.R

diff --git a/reports/ny_aeba_grid/notebooks/analysis.qmd b/reports/ny_aeba_grid/notebooks/analysis.qmd
index 2555749..662da5d 100644
--- a/reports/ny_aeba_grid/notebooks/analysis.qmd
+++ b/reports/ny_aeba_grid/notebooks/analysis.qmd
@@ -29,48 +29,31 @@ source("/workspaces/reports2/lib/ggplot/switchbox_theme.R")

 ```{r}
-path_nyiso_hourly_load_csv <- "/workspaces/reports2/data/nyiso/hourly/20151025-20251026 NYISO Hourly Actual Load.csv"
-path_nyiso_hourly_load_parquet <- "/workspaces/reports2/data/nyiso/hourly/nyiso_hourly_load.parquet"
-
-# Based on the image, the CSV's columns are: Date, Load, Zone (and probably more for time parsing)
-# Let's cleanly parse the timestamp, extract date/hour, and aggregate total load (sum) across all zones per hour
-
-# first look for the parquet, fallback to loading and parsing CSV if not found
-if (file.exists(path_nyiso_hourly_load_parquet)) {
-  nyiso_hourly_load <- read_parquet(path_nyiso_hourly_load_parquet)
-} else {
-  nyiso_hourly_load <- read_csv(path_nyiso_hourly_load_csv)
-
-  nyiso_hourly_load <- nyiso_hourly_load |>
-    # Rename columns to lowercase for easier handling if needed
-    rename(
-      datetime = Date,
-      load = Load,
-      zone = Zone
-    ) |>
-    # ensure load is numeric
-    mutate(load = as.numeric(load)) |>
-    # Parse the datetime (given as e.g. "10/25/2015 7:00:00 PM" -- note double space between date and hour)
-    mutate(
-      datetime = lubridate::mdy_hms(
-        datetime,
-        tz = "America/New_York",
-        quiet = TRUE
-      ),
-      year = lubridate::year(datetime),
-      month = lubridate::month(datetime),
-      day = lubridate::day(datetime),
-      hour = lubridate::hour(datetime)
-    ) |>
-    select(-datetime)
-
-  # save to parquet
-  write_parquet(
-    nyiso_hourly_load,
-    "/workspaces/reports2/data/nyiso/hourly/nyiso_hourly_load.parquet"
-  )
-}
+# Read processed NYISO hourly load data from S3
+# Set up S3 filesystem with arrow
+s3_bucket <- arrow::s3_bucket(
+  bucket = "data.sb",
+  region = "us-west-2"
+)
+s3_file_path <- "ny_aeba_grid/nyiso/hourly/nyiso_hourly_load.parquet"
+
+nyiso_hourly_load <- tryCatch(
+  {
+    arrow::read_parquet(s3_bucket$path(s3_file_path))
+  },
+  error = function(e) {
+    cat("\n")
+    cat("ERROR: Could not read processed NYISO hourly load data from S3.\n")
+    cat("Bucket: data.sb\n")
+    cat("Path:", s3_file_path, "\n")
+    cat("Error details:", conditionMessage(e), "\n\n")
+    cat("The processed parquet file may not exist yet, or there may be an AWS credentials issue.\n")
+    cat("Please run the data processing script first:\n")
+    cat("  Rscript reports/ny_aeba_grid/utils/make_hourly_nyiso_load.R\n\n")
+    stop("Missing required data file on S3", call. = FALSE)
+  }
+)

 # add a "NY_STATE" zone, which is the sum of all zones
 nyiso_monthly_peak_load <- nyiso_hourly_load |>
diff --git a/reports/ny_aeba_grid/utils/make_hourly_nyiso_load.R b/reports/ny_aeba_grid/utils/make_hourly_nyiso_load.R
new file mode 100644
index 0000000..6c3cb1e
--- /dev/null
+++ b/reports/ny_aeba_grid/utils/make_hourly_nyiso_load.R
@@ -0,0 +1,65 @@
+#!/usr/bin/env Rscript
+
+# Utility script to build the processed NYISO hourly load dataset.
+# Reads the raw CSV from S3, processes it, and uploads the parquet back to S3.
+
+library(tidyverse)
+library(arrow)
+library(lubridate)
+
+# Set up S3 filesystem
+s3_bucket <- arrow::s3_bucket(
+  bucket = "data.sb",
+  region = "us-west-2"
+)
+
+# S3 file paths (without bucket prefix)
+s3_csv_file <- "ny_aeba_grid/nyiso/hourly/20151025-20251026 NYISO Hourly Actual Load.csv"
+s3_parquet_file <- "ny_aeba_grid/nyiso/hourly/nyiso_hourly_load.parquet"
+
+cat("Starting NYISO hourly load data processing...\n")
+cat("Reading CSV from S3: data.sb/", s3_csv_file, "\n", sep = "")
+
+# Read CSV from S3
+nyiso_hourly_load <- arrow::read_csv_arrow(s3_bucket$path(s3_csv_file))
+
+cat("CSV loaded. Processing data...\n")
+
+# Apply transformations
+nyiso_hourly_load <- nyiso_hourly_load |>
+  # Rename columns to lowercase for easier handling
+  rename(
+    datetime = Date,
+    load = Load,
+    zone = Zone
+  ) |>
+  # ensure load is numeric
+  mutate(load = as.numeric(load)) |>
+  # Parse the datetime (given as e.g. "10/25/2015 7:00:00 PM" -- note double space between date and hour)
+  mutate(
+    datetime = lubridate::mdy_hms(
+      datetime,
+      tz = "America/New_York",
+      quiet = TRUE
+    ),
+    year = lubridate::year(datetime),
+    month = lubridate::month(datetime),
+    day = lubridate::day(datetime),
+    hour = lubridate::hour(datetime)
+  ) |>
+  select(-datetime)
+
+cat("Data processed successfully.\n")
+cat("Writing parquet to S3: data.sb/", s3_parquet_file, "\n", sep = "")
+
+# Write parquet to S3
+arrow::write_parquet(
+  nyiso_hourly_load,
+  s3_bucket$path(s3_parquet_file)
+)
+
+cat("✓ Parquet file uploaded to S3 successfully!\n")
+cat("Summary:\n")
+cat("  - Total rows:", nrow(nyiso_hourly_load), "\n")
+cat("  - Columns:", paste(names(nyiso_hourly_load), collapse = ", "), "\n")
+cat("  - Year range:", min(nyiso_hourly_load$year), "to", max(nyiso_hourly_load$year), "\n")
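[Editor's note] Patch 1's error handler distinguishes two failure modes: the parquet not existing yet versus an AWS credentials problem. Below is a minimal sketch for telling them apart from an R session before rendering the notebook. It is not part of the patch: the explicit access_key/secret_key arguments are an assumption (arrow forwards them through s3_bucket() to S3FileSystem$create(); by default the standard AWS credential chain is used).

    library(arrow)

    # Same bucket and key as in the patch; keys are passed explicitly so a
    # credentials failure surfaces here rather than as a generic read error.
    s3_bucket <- arrow::s3_bucket(
      bucket = "data.sb",
      region = "us-west-2",
      access_key = Sys.getenv("AWS_ACCESS_KEY_ID"),
      secret_key = Sys.getenv("AWS_SECRET_ACCESS_KEY")
    )
    nyiso_hourly_load <- arrow::read_parquet(
      s3_bucket$path("ny_aeba_grid/nyiso/hourly/nyiso_hourly_load.parquet")
    )
    stopifnot(nrow(nyiso_hourly_load) > 0)

If this succeeds but the notebook still fails, the problem is in the notebook environment rather than the data or credentials.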
From 16ea4b27b1b72b4b2a7b394f4725202bb8720f60 Mon Sep 17 00:00:00 2001
From: Alex Smith
Date: Wed, 14 Jan 2026 10:26:59 -0500
Subject: [PATCH 2/2] update path name for local testing

---
 reports/ri_hp_rates/notebooks/analysis.qmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reports/ri_hp_rates/notebooks/analysis.qmd b/reports/ri_hp_rates/notebooks/analysis.qmd
index 823181b..f63076e 100644
--- a/reports/ri_hp_rates/notebooks/analysis.qmd
+++ b/reports/ri_hp_rates/notebooks/analysis.qmd
@@ -39,7 +39,7 @@ path_heat_pump_plots <- file.path(path_to_lib, "rates_analysis", "heat_pump_rate
 path_create_housing_units <- file.path(path_to_lib, "rates_analysis", "create_sb_housing_units.R")

 # Data paths
-path_monthly_data <- file.path(path_to_data, "resstock", "2024_release2_tmy3", "load_curve_monthly")
+path_monthly_data <- file.path(path_to_data, "resstock", "2024_release2_tmy3_2", "load_curve_monthly")
 path_supply_year_metadata_dir <- file.path(path_to_data, "resstock", "2024_release2_tmy3", "metadata")
 path_fuel_oil_supply_rates <- file.path(path_to_data, "eia", "heating_oil", "ri_eia_heating_oil_prices_monthly.parquet")
"ri_eia_propane_prices_monthly.parquet")