From 5b573d2ffc0fdb7b637cd07168da818ed9f93d06 Mon Sep 17 00:00:00 2001
From: Simon Topp <stopp@usgs.gov>
Date: Fri, 2 Jul 2021 16:08:05 -0400
Subject: [PATCH 01/15] Initial code for pulling lake temperatures from Landsat
 and Era5

---
 GEE_pull_functions.py | 106 ++++++++++++++++++++++++++++++++++++++++++
 LakeExport.py         |  92 ++++++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+)
 create mode 100644 GEE_pull_functions.py
 create mode 100644 LakeExport.py

diff --git a/GEE_pull_functions.py b/GEE_pull_functions.py
new file mode 100644
index 0000000..120a8e7
--- /dev/null
+++ b/GEE_pull_functions.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+"""
+Google Earth Engine Reflectance Pull Functions
+Created on Mon Apr  9 14:24:13 2018
+@author: simontopp
+"""
+import ee
+import time
+import os
+
+## Recreate the fmask based on Collection 2 Pixel QA band
+
+def AddFmask(image):
+    qa = image.select('pixel_qa')
+    water = qa.bitwiseAnd(1 << 7)
+    cloud = qa.bitwiseAnd(1 << 1).Or(qa.bitwiseAnd(1 << 2)).Or(qa.bitwiseAnd(1 << 3))
+    snow = qa.bitwiseAnd(1 << 5)
+    cloudshadow = qa.bitwiseAnd(1 << 4)
+
+    fmask = (water.gt(0).rename('fmask')
+             .where(snow.gt(0), ee.Image(3))
+             .where(cloudshadow.gt(0), ee.Image(2))
+             .where(cloud.gt(0), ee.Image(4))
+             .updateMask(qa.gte(0)))
+    # mask the fmask so that it has the same footprint as the quality (BQA) band
+    return image.addBands(fmask)
+
+
+## Calculuate hillshadow to correct DWSE
+def CalcHillShadows(image, geo):
+    MergedDEM = ee.Image("users/eeProject/MERIT").clip(geo.buffer(3000))
+    hillShadow = (ee.Terrain.hillShadow(MergedDEM, ee.Number(image.get('SOLAR_AZIMUTH_ANGLE')),
+                                        ee.Number(90).subtract(image.get('SOLAR_ZENITH_ANGLE')), 30).rename(
+        ['hillShadow']))
+    return hillShadow
+
+
+## Buffer the lake sites
+def dpBuff(i):
+    dist = i.get('distance')
+    buffdist = ee.Number(dist).min(120)
+    return i.buffer(buffdist)
+
+
+## Remove geometries
+def removeGeo(i):
+    return i.setGeometry(None)
+
+
+## Create water mask and extract lake medians
+
+## Set up the reflectance pull
+def RefPull(image):
+    f = AddFmask(image).select('fmask')
+    water = f.eq(1).rename('water')
+    clouds = f.gte(2).rename('clouds')
+    #hs = CalcHillShadows(image, tile.geometry()).select('hillShadow')
+    #era5match = (ee.Image(era5.filterDate(ee.Date(image.get('system:time_start')).update(minute = 0, second = 0))
+    #             .first()).select('lake_mix_layer_temperature', 'lake_total_layer_temperature'))
+    era5match = ee.Image(0).rename('lake_mix_layer_temperature').addBands(ee.Image(1).rename('lake_total_layer_temperature'))
+    pixOut = (image
+              .addBands(era5match)
+              .updateMask(water)
+              .addBands(clouds)
+              .addBands(water))
+
+    combinedReducer = (ee.Reducer.median().unweighted()
+        .forEachBand(pixOut.select('temp', 'temp_qa', 'lake_mix_layer_temperature', 'lake_total_layer_temperature'))
+        .combine(ee.Reducer.mean().unweighted().forEachBand(pixOut.select('clouds')), 'cScore_', False)
+        .combine(ee.Reducer.count().unweighted().forEachBand(pixOut.select('water')), 'pCount_', False))
+
+
+    # Collect median reflectance and occurance values
+    # Make a cloud score, and get the water pixel count
+    lsout = pixOut.reduceRegions(lakes, combinedReducer, 30)
+
+    out = lsout.map(removeGeo)
+
+    return out
+
+
+##Function for limiting the max number of tasks sent to
+# earth engine at one time to avoid time out errors
+
+def maximum_no_of_tasks(MaxNActive, waitingPeriod):
+    ##maintain a maximum number of active tasks
+    time.sleep(10)
+    ## initialize submitting jobs
+    ts = list(ee.batch.Task.list())
+
+    NActive = 0
+    for task in ts:
+        if ('RUNNING' in str(task) or 'READY' in str(task)):
+            NActive += 1
+    ## wait if the number of current active tasks reach the maximum number
+    ## defined in MaxNActive
+    while (NActive >= MaxNActive):
+        time.sleep(waitingPeriod)  # if reach or over maximum no. of active tasks, wait for 2min and check again
+        ts = list(ee.batch.Task.list())
+        NActive = 0
+        for task in ts:
+            if ('RUNNING' in str(task) or 'READY' in str(task)):
+                NActive += 1
+    return ()
+
diff --git a/LakeExport.py b/LakeExport.py
new file mode 100644
index 0000000..67e3434
--- /dev/null
+++ b/LakeExport.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 27 12:07:02 2018
+
+@author: simontopp
+"""
+#%%
+import time
+import ee
+import os
+#import pandas as pd
+#import feather
+import GEE_pull_functions as f
+ee.Initialize()
+
+#Source necessary functions.
+exec(open('GEE_pull_functions.py').read())
+
+#water = ee.Image("JRC/GSW1_1/GlobalSurfaceWater").select('occurrence').gt(80)
+
+## Bring in EE Assets
+# Deepest point for CONUS Hydrolakes from Xiao Yang
+# Code available https://zenodo.org/record/4136755#.X5d54pNKgUE
+dp = (ee.FeatureCollection('users/sntopp/USGS/PGDL_lakes_deepest_point')
+  .filterMetadata('distance', "greater_than", 60))
+
+#CONUS Boundary
+us = ee.FeatureCollection("USDOS/LSIB_SIMPLE/2017")\
+    .filterMetadata('country_na', 'equals', 'United States')
+
+## WRS Tiles in descending (daytime) mode for CONUS
+wrs = ee.FeatureCollection('users/sntopp/wrs2_asc_desc')\
+    .filterBounds(us)\
+    .filterMetadata('MODE', 'equals', 'D')
+    
+pr = wrs.aggregate_array('PR').getInfo()
+
+l8 = ee.ImageCollection("LANDSAT/LC08/C02/T1_L2")
+l7 = ee.ImageCollection("LANDSAT/LE07/C02/T1_L2")
+era5 = ee.ImageCollection("ECMWF/ERA5_LAND/HOURLY")
+
+#Standardize band names between the various collections and aggregate 
+#them into one image collection
+bn8 = ['ST_B10', 'ST_QA', 'QA_PIXEL']
+bn57 = ['ST_B6', 'ST_QA', 'QA_PIXEL']
+bns = ['temp','temp_qa','pixel_qa']
+  
+ls7 = l7.select(bn57, bns)
+ls8 = l8.select(bn8, bns)
+
+ls = ee.ImageCollection(ls7.merge(ls8))\
+    .filter(ee.Filter.lt('CLOUD_COVER', 50))\
+    .filterBounds(us)  
+
+
+## Set up a counter and a list to keep track of what's been done already
+counter = 0
+done = []    
+
+#%%
+
+## In case something goofed, you should be able to just 
+## re-run this chunk with the following line filtering out 
+## what's already run. 
+pr = [i for i in pr if i not in done]
+
+for tiles in pr[0:3]:
+    tile = wrs.filterMetadata('PR', 'equals', tiles)
+    # For some reason we need to cast this to a list and back to a
+    # feature collection
+    lakes = dp.filterBounds(tile.geometry())\
+        .map(dpBuff)
+        
+    #lakes = ee.FeatureCollection(lakes.toList(10000))
+    stack = ls.filterBounds(tile.geometry().centroid())
+    out = stack.map(RefPull).flatten().filterMetadata('cScore_clouds','less_than',.5)
+    dataOut = ee.batch.Export.table.toDrive(collection = out,\
+                                            description = str(tiles),\
+                                            folder = 'EE_TempPull',\
+                                            fileFormat = 'csv')#,\
+                                            #selectors = [])
+    #Check how many existing tasks are running and take a break if it's >15  
+    maximum_no_of_tasks(25, 120)
+    #Send next task.
+    dataOut.start()
+    counter = counter + 1
+    done.append(tiles)
+    print('done_' + str(counter) + '_' + str(tiles))
+        
+#%%
+

From 1682292ac4fb45f47b52359bcab827b2c81321fb Mon Sep 17 00:00:00 2001
From: Simon Topp <stopp@usgs.gov>
Date: Mon, 5 Jul 2021 14:10:52 -0400
Subject: [PATCH 02/15] Fixed Era5 pull

---
 GEE_pull_functions.py | 24 ++++++++++++++++--------
 LakeExport.py         | 15 +++++++++++----
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/GEE_pull_functions.py b/GEE_pull_functions.py
index 120a8e7..3f1a3cf 100644
--- a/GEE_pull_functions.py
+++ b/GEE_pull_functions.py
@@ -47,8 +47,7 @@ def dpBuff(i):
 def removeGeo(i):
     return i.setGeometry(None)
 
-
-## Create water mask and extract lake medians
+        ## Create water mask and extract lake medians
 
 ## Set up the reflectance pull
 def RefPull(image):
@@ -56,12 +55,17 @@ def RefPull(image):
     water = f.eq(1).rename('water')
     clouds = f.gte(2).rename('clouds')
     #hs = CalcHillShadows(image, tile.geometry()).select('hillShadow')
-    #era5match = (ee.Image(era5.filterDate(ee.Date(image.get('system:time_start')).update(minute = 0, second = 0))
-    #             .first()).select('lake_mix_layer_temperature', 'lake_total_layer_temperature'))
-    era5match = ee.Image(0).rename('lake_mix_layer_temperature').addBands(ee.Image(1).rename('lake_total_layer_temperature'))
-    pixOut = (image
-              .addBands(era5match)
+    era5match = (era5.filterDate(ee.Date(image.get('system:time_start')).update(None, None, None, None, 0,0)))
+
+    era5out = (ee.Algorithms.If(era5match.size(),
+                     ee.Image(era5match.first()).select('lake_mix_layer_temperature', 'lake_total_layer_temperature'),
+                     ee.Image(0).rename('lake_mix_layer_temperature').addBands(ee.Image(0).rename('lake_total_layer_temperature')),
+                     )
+               )
+
+    pixOut = (image.select('temp', 'temp_qa')
               .updateMask(water)
+              .addBands(era5out)
               .addBands(clouds)
               .addBands(water))
 
@@ -70,12 +74,16 @@ def RefPull(image):
         .combine(ee.Reducer.mean().unweighted().forEachBand(pixOut.select('clouds')), 'cScore_', False)
         .combine(ee.Reducer.count().unweighted().forEachBand(pixOut.select('water')), 'pCount_', False))
 
+    def copyMeta(i):
+        return i.copyProperties(image, ["CLOUD_COVER", 'SPACECRAFT_ID', 'system:index', 'DATE_ACQUIRED'])
+
 
-    # Collect median reflectance and occurance values
+        # Collect median reflectance and occurance values
     # Make a cloud score, and get the water pixel count
     lsout = pixOut.reduceRegions(lakes, combinedReducer, 30)
 
     out = lsout.map(removeGeo)
+    out = out.map(copyMeta)
 
     return out
 
diff --git a/LakeExport.py b/LakeExport.py
index 67e3434..ccf60ff 100644
--- a/LakeExport.py
+++ b/LakeExport.py
@@ -65,7 +65,10 @@
 ## what's already run. 
 pr = [i for i in pr if i not in done]
 
-for tiles in pr[0:3]:
+if not os.path.isfile('log.txt'):
+    open('log.txt', 'x')
+
+for tiles in pr[0:1]:
     tile = wrs.filterMetadata('PR', 'equals', tiles)
     # For some reason we need to cast this to a list and back to a
     # feature collection
@@ -78,14 +81,18 @@
     dataOut = ee.batch.Export.table.toDrive(collection = out,\
                                             description = str(tiles),\
                                             folder = 'EE_TempPull',\
-                                            fileFormat = 'csv')#,\
-                                            #selectors = [])
-    #Check how many existing tasks are running and take a break if it's >15  
+                                            fileFormat = 'csv',\
+                                            selectors = ["system:index", "areakm", "cScore_clouds", "CLOUD_COVER", 'SPACECRAFT_ID', 'DATE_ACQUIRED', "distance", "lake_mix_layer_temperature", "lake_total_layer_temperature", "pCount_water", "site_id", "temp", "temp_qa"])
+    #Check how many existing tasks are running and take a break if it's >15
     maximum_no_of_tasks(25, 120)
     #Send next task.
     dataOut.start()
     counter = counter + 1
     done.append(tiles)
+    f = open('log.txt', 'a')
+    f.write(str(tiles) + '\n')
+    f.close()
+
     print('done_' + str(counter) + '_' + str(tiles))
         
 #%%

From a1bb3aadcb0cf2c4aef7472d358f0b73cbeee2b6 Mon Sep 17 00:00:00 2001
From: Simon Topp <stopp@usgs.gov>
Date: Thu, 8 Jul 2021 07:57:44 -0400
Subject: [PATCH 03/15] Commented code for review.

---
 GEE_pull_functions.py | 18 ++++--------------
 LakeExport.py         | 29 +++++++++++++++--------------
 2 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/GEE_pull_functions.py b/GEE_pull_functions.py
index 3f1a3cf..a3e6cce 100644
--- a/GEE_pull_functions.py
+++ b/GEE_pull_functions.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 """
 Google Earth Engine Reflectance Pull Functions
-Created on Mon Apr  9 14:24:13 2018
+
 @author: simontopp
 """
 import ee
@@ -10,7 +10,6 @@
 import os
 
 ## Recreate the fmask based on Collection 2 Pixel QA band
-
 def AddFmask(image):
     qa = image.select('pixel_qa')
     water = qa.bitwiseAnd(1 << 7)
@@ -27,16 +26,8 @@ def AddFmask(image):
     return image.addBands(fmask)
 
 
-## Calculuate hillshadow to correct DWSE
-def CalcHillShadows(image, geo):
-    MergedDEM = ee.Image("users/eeProject/MERIT").clip(geo.buffer(3000))
-    hillShadow = (ee.Terrain.hillShadow(MergedDEM, ee.Number(image.get('SOLAR_AZIMUTH_ANGLE')),
-                                        ee.Number(90).subtract(image.get('SOLAR_ZENITH_ANGLE')), 30).rename(
-        ['hillShadow']))
-    return hillShadow
-
-
-## Buffer the lake sites
+## Buffer the lake sites either 120 minutes or the distance to shore, whichever
+## is smaller
 def dpBuff(i):
     dist = i.get('distance')
     buffdist = ee.Number(dist).min(120)
@@ -47,14 +38,13 @@ def dpBuff(i):
 def removeGeo(i):
     return i.setGeometry(None)
 
-        ## Create water mask and extract lake medians
 
 ## Set up the reflectance pull
 def RefPull(image):
     f = AddFmask(image).select('fmask')
     water = f.eq(1).rename('water')
     clouds = f.gte(2).rename('clouds')
-    #hs = CalcHillShadows(image, tile.geometry()).select('hillShadow')
+
     era5match = (era5.filterDate(ee.Date(image.get('system:time_start')).update(None, None, None, None, 0,0)))
 
     era5out = (ee.Algorithms.If(era5match.size(),
diff --git a/LakeExport.py b/LakeExport.py
index ccf60ff..2bdea4f 100644
--- a/LakeExport.py
+++ b/LakeExport.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
 """
-Created on Tue Feb 27 12:07:02 2018
-
 @author: simontopp
 """
 #%%
@@ -11,17 +9,18 @@
 import os
 #import pandas as pd
 #import feather
-import GEE_pull_functions as f
+
+## Initialize Earth Engine
 ee.Initialize()
 
-#Source necessary functions.
+#Source necessary functions. We do this instead of 'import' because of EE quirk.
 exec(open('GEE_pull_functions.py').read())
 
 #water = ee.Image("JRC/GSW1_1/GlobalSurfaceWater").select('occurrence').gt(80)
 
 ## Bring in EE Assets
-# Deepest point for CONUS Hydrolakes from Xiao Yang
-# Code available https://zenodo.org/record/4136755#.X5d54pNKgUE
+# Deepest point for CONUS PGDL Lakes
+# DP Code available at https://zenodo.org/record/4136755#.X5d54pNKgUE
 dp = (ee.FeatureCollection('users/sntopp/USGS/PGDL_lakes_deepest_point')
   .filterMetadata('distance', "greater_than", 60))
 
@@ -36,6 +35,7 @@
     
 pr = wrs.aggregate_array('PR').getInfo()
 
+#Bring in temp data
 l8 = ee.ImageCollection("LANDSAT/LC08/C02/T1_L2")
 l7 = ee.ImageCollection("LANDSAT/LE07/C02/T1_L2")
 era5 = ee.ImageCollection("ECMWF/ERA5_LAND/HOURLY")
@@ -49,26 +49,27 @@
 ls7 = l7.select(bn57, bns)
 ls8 = l8.select(bn8, bns)
 
+##Do coarse cloud filtering
 ls = ee.ImageCollection(ls7.merge(ls8))\
     .filter(ee.Filter.lt('CLOUD_COVER', 50))\
     .filterBounds(us)  
 
 
 ## Set up a counter and a list to keep track of what's been done already
-counter = 0
-done = []    
+if not os.path.isfile('log.txt'):
+    open('log.txt', 'x')
 
-#%%
+done = open('log.txt', 'r')
+done = [x.strip() for x in done.readlines()]
+done = list(map(int, done))
+counter = len(done)
 
 ## In case something goofed, you should be able to just 
 ## re-run this chunk with the following line filtering out 
 ## what's already run. 
 pr = [i for i in pr if i not in done]
 
-if not os.path.isfile('log.txt'):
-    open('log.txt', 'x')
-
-for tiles in pr[0:1]:
+for tiles in pr:
     tile = wrs.filterMetadata('PR', 'equals', tiles)
     # For some reason we need to cast this to a list and back to a
     # feature collection
@@ -95,5 +96,5 @@
 
     print('done_' + str(counter) + '_' + str(tiles))
         
-#%%
+
 

From bf15af924c3e34b40e9855fa1ca3d96f3b60ea3e Mon Sep 17 00:00:00 2001
From: Simon Topp <stopp@usgs.gov>
Date: Thu, 8 Jul 2021 11:32:21 -0400
Subject: [PATCH 04/15] Initial look at RS temps.

---
 01_TempMunge.Rmd | 242 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)
 create mode 100644 01_TempMunge.Rmd

diff --git a/01_TempMunge.Rmd b/01_TempMunge.Rmd
new file mode 100644
index 0000000..28e6e77
--- /dev/null
+++ b/01_TempMunge.Rmd
@@ -0,0 +1,242 @@
+---
+title: "01_TempMunge"
+author: "Simon Topp"
+date: "07/08/2021"
+output: html_document
+editor_options: 
+  chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+library(tidyverse)
+library(lubridate)
+library(purrr)
+library(furrr)
+library(data.table)
+library(feather)
+library(sf)
+library(ggpmisc)
+library(ggpubr)
+library(Hmisc)
+library(Metrics)
+
+knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
+```
+
+```{r eval = F, include = F}
+## Read in whatever path you saved the raw pull data to
+files <- list.files('data/EE_TempPull', full.names = T)
+files <- files[file.size(files) > 1] ## 6 WRS tile returned no values
+
+## Munge it a little, basically zero clouds/shadow/ice and
+## at least 5 pixels
+munger <- function(file){
+  df <- read_csv(file) %>%
+    filter(!is.na(temp),
+           cScore_clouds == 0,
+           pCount_water > 5) %>%
+  select(-`system:index`)
+  return(df)
+}
+
+munged <- files %>% map_dfr(munger)
+
+munged <- munged %>% mutate(temp = 0.00341802*temp + 149 - 273.15,
+                                  temp_qa = temp_qa*0.01,
+                                  lake_mix_layer_temperature = lake_mix_layer_temperature - 273.15) %>%
+  rename(temp_e5_mix = lake_mix_layer_temperature, temp_e5_full = lake_total_layer_temperature, 
+         temp_ls = temp, temp_ls_qa = temp_qa, date = DATE_ACQUIRED, sat = SPACECRAFT_ID, 
+         scene_cloud_cover = CLOUD_COVER)
+
+write_feather(munged,'data/out/landsat_temps.feather')
+```
+
+```{r}
+munged <- read_feather('data/out/landsat_temps.feather')
+```
+
+## Between Landsat 7 and 8 we have `r nrow(munged)` temperature observations.
+
+```{r}
+
+obs <- read_csv('data/in/lake_surface_temp_obs.csv') %>%
+  rename(date = Date)
+
+same_day <- obs %>% inner_join(munged) %>% mutate(join = 'same_day')
+
+day_plus_one <- obs %>% inner_join(munged %>% mutate(date = date + 1)) %>% mutate(join = 'day_plus_1')
+
+day_minus_one <- obs %>% inner_join(munged %>% mutate(date = date -1)) %>% mutate(join = 'day_minus_1')
+
+matchups <- bind_rows(same_day, day_plus_one, day_minus_one)
+
+matchups_filt <- matchups %>%
+  filter(temp_ls_qa < 3,
+         scene_cloud_cover < 20,
+         distance > 90,
+         temp_ls >=0)
+
+rm(same_day, day_plus_one, day_minus_one)
+```
+
+## Of those, we have `r nrow(matchups)` coincident observations of RS and *in situ* temp (matchups).
+
+Let's look at some *inclusive* metrics. General filters here are:
+
+-   +/- 1 day matchups
+
+-   0 clouds/ice/cloud shadow over the 'deepest point'
+
+```{r}
+ggplot(matchups, aes(x = wtemp_obs, y = temp_ls)) + geom_hex() +
+  scale_fill_viridis_c(trans = 'log10') +
+  geom_smooth(method = 'lm') +
+  stat_poly_eq(formula = y~x, 
+                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
+                parse = TRUE) +
+  labs(x = 'Observed Temps', y = 'Landsat Temps', title = 'Landsat and Observed')
+
+matchups %>% summarise(rmse = rmse(wtemp_obs, temp_ls),
+                       mae = mae(wtemp_obs,temp_ls),
+                       smape = smape(wtemp_obs,temp_ls),
+                       bias = bias(wtemp_obs, temp_ls)) %>%
+  mutate(across(everything(), ~round(.,4))) %>%
+  kableExtra::kable() %>%
+  kableExtra::kable_styling()
+  
+
+ggplot(matchups, aes(x = wtemp_obs, y = temp_e5_mix)) + geom_hex() +
+  scale_fill_viridis_c(trans = 'log10') +
+  geom_smooth(method = 'lm') +
+  stat_poly_eq(formula = y~x, 
+                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
+                parse = TRUE) +
+  labs(x = 'Observed Temps', y = 'Era5 Mixed Layer Temps', 'Era5 and Observed')
+
+
+matchups %>% filter(!is.na(temp_e5_mix)) %>%
+  summarise(rmse = rmse(wtemp_obs, temp_e5_mix),
+                       mae = mae(wtemp_obs,temp_e5_mix),
+                       smape = smape(wtemp_obs,temp_e5_mix),
+                       bias = bias(wtemp_obs, temp_e5_mix)) %>%
+  mutate(across(everything(), ~round(.,4))) %>%
+  kableExtra::kable() %>%
+  kableExtra::kable_styling()
+```
+
+## Now the same but with *strict* filters. After qc filters we have `r nrow(matchups_filt)` matchups.
+
+General filters here are:
+
+-   +/- 1 day matchups
+
+-   0 clouds/ice/cloud shadow over the 'deepest point'
+
+-   Landsat uncertainty \< 3 degrees
+
+-   Scene cloud cover \< 20% - Distance to shore \> 90m
+
+```{r}
+ggplot(matchups_filt, aes(x = wtemp_obs, y = temp_ls)) + geom_hex() +
+  scale_fill_viridis_c(trans = 'log10') +
+  geom_smooth(method = 'lm') +
+  stat_poly_eq(formula = y~x, 
+                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
+                parse = TRUE) +
+  labs(x = 'Observed Temps', y = 'Landsat Temps', title = 'Landsat and Observed')
+
+matchups_filt %>% summarise(rmse = rmse(wtemp_obs, temp_ls),
+                       mae = mae(wtemp_obs,temp_ls),
+                       smape = smape(wtemp_obs,temp_ls),
+                       bias = bias(wtemp_obs, temp_ls)) %>%
+  mutate(across(everything(), ~round(.,4))) %>%
+  kableExtra::kable() %>%
+  kableExtra::kable_styling()
+
+ggplot(matchups_filt, aes(x = wtemp_obs, y = temp_e5_mix)) + geom_hex() +
+  scale_fill_viridis_c(trans = 'log10') +
+  geom_smooth(method = 'lm') +
+  stat_poly_eq(formula = y~x, 
+                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
+                parse = TRUE) +
+  labs(x = 'Observed Temps', y = 'Era5 Mixed Layer Temps', 'Era5 and Observed')
+
+
+matchups_filt %>% filter(!is.na(temp_e5_mix)) %>%
+  summarise(rmse = rmse(wtemp_obs, temp_e5_mix),
+                       mae = mae(wtemp_obs,temp_e5_mix),
+                       smape = smape(wtemp_obs,temp_e5_mix),
+                       bias = bias(wtemp_obs, temp_e5_mix)) %>%
+  mutate(across(everything(), ~round(.,4))) %>%
+  kableExtra::kable() %>%
+  kableExtra::kable_styling()
+```
+
+## Finally, just big lakes where we would expect to be most confident (distanct to shore \> 1000m)
+
+```{r}
+ggplot(matchups_filt %>% filter(distance > 1000), aes(x = wtemp_obs, y = temp_ls)) + geom_hex() +
+  scale_fill_viridis_c(trans = 'log10') +
+  geom_smooth(method = 'lm') +
+  stat_poly_eq(formula = y~x, 
+                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
+                parse = TRUE) +
+  labs(x = 'Observed Temps', y = 'Landsat Temps', title = 'Landsat and Observed')
+
+matchups_filt %>% filter(distance > 1000) %>% summarise(rmse = rmse(wtemp_obs, temp_ls),
+                       mae = mae(wtemp_obs,temp_ls),
+                       smape = smape(wtemp_obs,temp_ls),
+                       bias = bias(wtemp_obs, temp_ls)) %>%
+  mutate(across(everything(), ~round(.,4))) %>%
+  kableExtra::kable() %>%
+  kableExtra::kable_styling()
+```
+
+## Sliced and diced a couple ways
+
+```{r}
+lake_data <- read_csv('data/in/lake_metadata.csv')
+
+matchups <- matchups %>% left_join(lake_data) %>% 
+  mutate(residual = wtemp_obs - temp_ls,
+         abs_error = abs(residual))
+
+ggplot(matchups, aes(x = residual, fill = sat)) +
+  geom_density(alpha = .4) +
+  geom_vline(aes(xintercept = 0), color = 'red') +
+  labs(title = 'Residuals across sensors')
+
+matchups %>%
+  mutate(dist_group = cut_number(distance, 6)) %>%
+  ggplot(aes(x = dist_group, y = residual)) +
+  geom_violin(draw_quantiles = .5) +
+  labs(title = 'Residuals binned by distance to shore')
+
+
+## Spatially
+matchups.sf <- matchups %>% st_as_sf(coords = c('lake_lon_deg','lake_lat_deg'), crs = 4326) %>% st_transform(5070)
+
+usa <- maps::map('usa', plot = F) %>% st_as_sf() %>% st_transform(5070) 
+
+grid <- st_make_grid(usa, cellsize = c(200000,200000), square = F) %>% st_as_sf() %>% mutate(ID = row_number())
+
+rmse.sf <- matchups.sf %>% st_join(grid) %>% st_set_geometry(NULL) %>%
+  group_by(ID) %>%
+  summarise(rmse = rmse(wtemp_obs, temp_ls),
+            matchup.count = n()) %>%
+  inner_join(grid) %>%
+  st_as_sf() 
+
+ggplot(rmse.sf) + geom_sf(aes(fill = rmse)) +
+  scale_fill_viridis_c() +
+  geom_sf(data = usa, fill = 'transparent', color = 'black') +
+  theme_minimal() +
+  labs(title = 'Spatial Error Distribution')
+
+ggplot(rmse.sf) + geom_sf(aes(fill = matchup.count)) +
+  scale_fill_viridis_c(trans='log10') +
+  geom_sf(data = usa, fill = 'transparent', color = 'black') +
+  theme_minimal() +
+  labs(title = 'Matchup Count')
+
+```

From 1aaf8aa782a20a6a6f237da074e8f5d8197533ea Mon Sep 17 00:00:00 2001
From: Simon Topp <stopp@usgs.gov>
Date: Thu, 8 Jul 2021 13:44:07 -0400
Subject: [PATCH 05/15] Responded to reviews from PR request.

---
 01_TempMunge.Rmd      | 29 ++++++++---------------------
 GEE_pull_functions.py | 22 ++++++++++------------
 LakeExport.py         | 35 ++++++++++++++++-------------------
 3 files changed, 34 insertions(+), 52 deletions(-)

diff --git a/01_TempMunge.Rmd b/01_TempMunge.Rmd
index 28e6e77..cef58e9 100644
--- a/01_TempMunge.Rmd
+++ b/01_TempMunge.Rmd
@@ -20,7 +20,7 @@ library(ggpubr)
 library(Hmisc)
 library(Metrics)
 
-knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
+knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
 ```
 
 ```{r eval = F, include = F}
@@ -104,7 +104,7 @@ matchups %>% summarise(rmse = rmse(wtemp_obs, temp_ls),
   kableExtra::kable() %>%
   kableExtra::kable_styling()
   
-
+## One quick look at Era5, it's not great
 ggplot(matchups, aes(x = wtemp_obs, y = temp_e5_mix)) + geom_hex() +
   scale_fill_viridis_c(trans = 'log10') +
   geom_smooth(method = 'lm') +
@@ -152,27 +152,9 @@ matchups_filt %>% summarise(rmse = rmse(wtemp_obs, temp_ls),
   mutate(across(everything(), ~round(.,4))) %>%
   kableExtra::kable() %>%
   kableExtra::kable_styling()
-
-ggplot(matchups_filt, aes(x = wtemp_obs, y = temp_e5_mix)) + geom_hex() +
-  scale_fill_viridis_c(trans = 'log10') +
-  geom_smooth(method = 'lm') +
-  stat_poly_eq(formula = y~x, 
-                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
-                parse = TRUE) +
-  labs(x = 'Observed Temps', y = 'Era5 Mixed Layer Temps', 'Era5 and Observed')
-
-
-matchups_filt %>% filter(!is.na(temp_e5_mix)) %>%
-  summarise(rmse = rmse(wtemp_obs, temp_e5_mix),
-                       mae = mae(wtemp_obs,temp_e5_mix),
-                       smape = smape(wtemp_obs,temp_e5_mix),
-                       bias = bias(wtemp_obs, temp_e5_mix)) %>%
-  mutate(across(everything(), ~round(.,4))) %>%
-  kableExtra::kable() %>%
-  kableExtra::kable_styling()
 ```
 
-## Finally, just big lakes where we would expect to be most confident (distanct to shore \> 1000m)
+## Finally, just big lakes where we would expect to be most confident (distance to shore \> 1000m)
 
 ```{r}
 ggplot(matchups_filt %>% filter(distance > 1000), aes(x = wtemp_obs, y = temp_ls)) + geom_hex() +
@@ -206,6 +188,11 @@ ggplot(matchups, aes(x = residual, fill = sat)) +
   geom_vline(aes(xintercept = 0), color = 'red') +
   labs(title = 'Residuals across sensors')
 
+ggplot(matchups, aes(x = residual, fill = join)) +
+  geom_density(alpha = .4) +
+  geom_vline(aes(xintercept = 0), color = 'red') +
+  labs(title = 'Residuals across sensors')
+
 matchups %>%
   mutate(dist_group = cut_number(distance, 6)) %>%
   ggplot(aes(x = dist_group, y = residual)) +
diff --git a/GEE_pull_functions.py b/GEE_pull_functions.py
index a3e6cce..0dc2ad9 100644
--- a/GEE_pull_functions.py
+++ b/GEE_pull_functions.py
@@ -22,7 +22,7 @@ def AddFmask(image):
              .where(cloudshadow.gt(0), ee.Image(2))
              .where(cloud.gt(0), ee.Image(4))
              .updateMask(qa.gte(0)))
-    # mask the fmask so that it has the same footprint as the quality (BQA) band
+    ## mask the fmask so that it has the same footprint as the quality (BQA) band
     return image.addBands(fmask)
 
 
@@ -34,10 +34,6 @@ def dpBuff(i):
     return i.buffer(buffdist)
 
 
-## Remove geometries
-def removeGeo(i):
-    return i.setGeometry(None)
-
 
 ## Set up the reflectance pull
 def RefPull(image):
@@ -66,10 +62,12 @@ def RefPull(image):
 
     def copyMeta(i):
         return i.copyProperties(image, ["CLOUD_COVER", 'SPACECRAFT_ID', 'system:index', 'DATE_ACQUIRED'])
+    
+    ## Remove geometries
+    def removeGeo(i):
+        return i.setGeometry(None)
 
-
-        # Collect median reflectance and occurance values
-    # Make a cloud score, and get the water pixel count
+    # Collect reflectance values, cloud score, and pixel counts
     lsout = pixOut.reduceRegions(lakes, combinedReducer, 30)
 
     out = lsout.map(removeGeo)
@@ -78,11 +76,11 @@ def copyMeta(i):
     return out
 
 
-##Function for limiting the max number of tasks sent to
-# earth engine at one time to avoid time out errors
+## Function for limiting the max number of tasks sent to
+## earth engine at one time to avoid time out errors
 
 def maximum_no_of_tasks(MaxNActive, waitingPeriod):
-    ##maintain a maximum number of active tasks
+    ## maintain a maximum number of active tasks
     time.sleep(10)
     ## initialize submitting jobs
     ts = list(ee.batch.Task.list())
@@ -94,7 +92,7 @@ def maximum_no_of_tasks(MaxNActive, waitingPeriod):
     ## wait if the number of current active tasks reach the maximum number
     ## defined in MaxNActive
     while (NActive >= MaxNActive):
-        time.sleep(waitingPeriod)  # if reach or over maximum no. of active tasks, wait for 2min and check again
+        time.sleep(waitingPeriod) 
         ts = list(ee.batch.Task.list())
         NActive = 0
         for task in ts:
diff --git a/LakeExport.py b/LakeExport.py
index 2bdea4f..b32aeaf 100644
--- a/LakeExport.py
+++ b/LakeExport.py
@@ -3,28 +3,25 @@
 """
 @author: simontopp
 """
-#%%
+
 import time
 import ee
 import os
-#import pandas as pd
-#import feather
 
 ## Initialize Earth Engine
 ee.Initialize()
 
-#Source necessary functions. We do this instead of 'import' because of EE quirk.
+## Source necessary functions. We do this instead of 'import' because of EE quirk.
 exec(open('GEE_pull_functions.py').read())
 
-#water = ee.Image("JRC/GSW1_1/GlobalSurfaceWater").select('occurrence').gt(80)
-
 ## Bring in EE Assets
-# Deepest point for CONUS PGDL Lakes
-# DP Code available at https://zenodo.org/record/4136755#.X5d54pNKgUE
+## Deepest point (Chebyshev center) for CONUS PGDL Lakes
+## DP Code available at https://zenodo.org/record/4136755#.X5d54pNKgUE and
+## https://code.earthengine.google.com/8dac409b220bdfb051bb469bc5b3c708
 dp = (ee.FeatureCollection('users/sntopp/USGS/PGDL_lakes_deepest_point')
   .filterMetadata('distance', "greater_than", 60))
 
-#CONUS Boundary
+## CONUS Boundary
 us = ee.FeatureCollection("USDOS/LSIB_SIMPLE/2017")\
     .filterMetadata('country_na', 'equals', 'United States')
 
@@ -32,16 +29,17 @@
 wrs = ee.FeatureCollection('users/sntopp/wrs2_asc_desc')\
     .filterBounds(us)\
     .filterMetadata('MODE', 'equals', 'D')
-    
+
+## Run everything by path/row to speed up computation in EE.    
 pr = wrs.aggregate_array('PR').getInfo()
 
-#Bring in temp data
+## Bring in temp data
 l8 = ee.ImageCollection("LANDSAT/LC08/C02/T1_L2")
 l7 = ee.ImageCollection("LANDSAT/LE07/C02/T1_L2")
 era5 = ee.ImageCollection("ECMWF/ERA5_LAND/HOURLY")
 
-#Standardize band names between the various collections and aggregate 
-#them into one image collection
+## Standardize band names between the various collections and aggregate 
+## them into one image collection
 bn8 = ['ST_B10', 'ST_QA', 'QA_PIXEL']
 bn57 = ['ST_B6', 'ST_QA', 'QA_PIXEL']
 bns = ['temp','temp_qa','pixel_qa']
@@ -49,7 +47,7 @@
 ls7 = l7.select(bn57, bns)
 ls8 = l8.select(bn8, bns)
 
-##Do coarse cloud filtering
+## Do coarse cloud filtering
 ls = ee.ImageCollection(ls7.merge(ls8))\
     .filter(ee.Filter.lt('CLOUD_COVER', 50))\
     .filterBounds(us)  
@@ -71,12 +69,10 @@
 
 for tiles in pr:
     tile = wrs.filterMetadata('PR', 'equals', tiles)
-    # For some reason we need to cast this to a list and back to a
-    # feature collection
+    
     lakes = dp.filterBounds(tile.geometry())\
         .map(dpBuff)
         
-    #lakes = ee.FeatureCollection(lakes.toList(10000))
     stack = ls.filterBounds(tile.geometry().centroid())
     out = stack.map(RefPull).flatten().filterMetadata('cScore_clouds','less_than',.5)
     dataOut = ee.batch.Export.table.toDrive(collection = out,\
@@ -84,9 +80,10 @@
                                             folder = 'EE_TempPull',\
                                             fileFormat = 'csv',\
                                             selectors = ["system:index", "areakm", "cScore_clouds", "CLOUD_COVER", 'SPACECRAFT_ID', 'DATE_ACQUIRED', "distance", "lake_mix_layer_temperature", "lake_total_layer_temperature", "pCount_water", "site_id", "temp", "temp_qa"])
-    #Check how many existing tasks are running and take a break if it's >15
+    
+    ## Check how many existing tasks are running and take a break if it's >15
     maximum_no_of_tasks(25, 120)
-    #Send next task.
+    ## Send next task.
     dataOut.start()
     counter = counter + 1
     done.append(tiles)

From f382c32c5525d8ea5a731963832c3aed54a10ab9 Mon Sep 17 00:00:00 2001
From: Simon Topp <stopp@usgs.gov>
Date: Thu, 8 Jul 2021 13:55:01 -0400
Subject: [PATCH 06/15] Missed a couple comments in the PR!

---
 LakeExport.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/LakeExport.py b/LakeExport.py
index b32aeaf..2fed694 100644
--- a/LakeExport.py
+++ b/LakeExport.py
@@ -30,7 +30,7 @@
     .filterBounds(us)\
     .filterMetadata('MODE', 'equals', 'D')
 
-## Run everything by path/row to speed up computation in EE.    
+## Run everything by path/row to speed up computation in EE    
 pr = wrs.aggregate_array('PR').getInfo()
 
 ## Bring in temp data
@@ -38,8 +38,7 @@
 l7 = ee.ImageCollection("LANDSAT/LE07/C02/T1_L2")
 era5 = ee.ImageCollection("ECMWF/ERA5_LAND/HOURLY")
 
-## Standardize band names between the various collections and aggregate 
-## them into one image collection
+## Standardize band names between the various collections 
 bn8 = ['ST_B10', 'ST_QA', 'QA_PIXEL']
 bn57 = ['ST_B6', 'ST_QA', 'QA_PIXEL']
 bns = ['temp','temp_qa','pixel_qa']
@@ -47,7 +46,8 @@
 ls7 = l7.select(bn57, bns)
 ls8 = l8.select(bn8, bns)
 
-## Do coarse cloud filtering
+## Do coarse cloud filtering and aggregate 
+## them into one image collection
 ls = ee.ImageCollection(ls7.merge(ls8))\
     .filter(ee.Filter.lt('CLOUD_COVER', 50))\
     .filterBounds(us)  

From e7b6c9ebab55bf26bffdc19b8afc14b1af1ce9e6 Mon Sep 17 00:00:00 2001
From: Simon Topp <33098707+SimonTopp@users.noreply.github.com>
Date: Mon, 9 May 2022 16:05:12 -0400
Subject: [PATCH 07/15] Added python implementation of DP pull

---
 NHD_Deepest_Point_Pull.py | 101 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100755 NHD_Deepest_Point_Pull.py

diff --git a/NHD_Deepest_Point_Pull.py b/NHD_Deepest_Point_Pull.py
new file mode 100755
index 0000000..9083fd2
--- /dev/null
+++ b/NHD_Deepest_Point_Pull.py
@@ -0,0 +1,101 @@
+import ee
+import math
+ee.Initialize()
+
+## Deepest point calculation adapted from Xiao Yang
+### https: // doi.org / 10.5281 / zenodo.4136754
+#
+# table_filt = nhdhr.filter(ee.Filter.gte('AREASQK', .01))
+# print(table_filt.size())
+# print(table_filt)
+#
+# Map.addLayer(nhdhr)
+# // Map.centerObject(fail_point, 12)
+# // print(blah)
+
+
+def get_scale(polygon):
+    radius = polygon.get('areasqkm').getInfo()
+    radius = ee.Number(radius).divide(math.pi).sqrt().multiply(1000)
+    return radius.divide(7)
+
+
+def getUTMProj(lon, lat):
+    # see
+    # https: // apollomapping.com / blog / gtm - finding - a - utm - zone - number - easily and
+    # https: // sis.apache.org / faq.html
+    utmCode = ee.Number(lon).add(180).divide(6).ceil().int()
+    output = ee.Algorithms.If(ee.Number(lat).gte(0),
+                              ee.String('EPSG:326').cat(utmCode.format('%02d')),
+                              ee.String('EPSG:327').cat(utmCode.format('%02d')))
+    return output
+
+def GetLakeCenters(polygon):
+
+    ## Calculate both the deepest point an centroid
+    ## for the inpout polygon ( or multipolygon)
+    ## for each input, export geometries for both centroid and deepest point and their distance to shore.
+    scale = get_scale(polygon)
+    geo = polygon.geometry()
+    ct = geo.centroid(scale)
+    utmCode = getUTMProj(ct.coordinates().getNumber(0), ct.coordinates().getNumber(1))
+
+    polygonImg = ee.Image.constant(1).toByte().paint(ee.FeatureCollection(ee.Feature(geo, None)), 0)
+
+    dist = polygonImg.fastDistanceTransform(2056).updateMask(polygonImg.Not()).sqrt().reproject(utmCode, None, scale).multiply(scale) # convert unit from pixel to meter
+
+    # dist = (polygonImg.fastDistanceTransform(2056).updateMask(polygonImg.not ())
+    # .sqrt().reproject('EPSG:4326', None, scale).multiply(scale)  # convert unit from pixel to meter
+
+    maxDistance = (dist.reduceRegion(
+        reducer=ee.Reducer.max(),
+        geometry=geo,
+        scale=scale,
+        bestEffort=True,
+        tileScale=1
+    ).getNumber('distance'))
+
+    outputDp = (ee.Feature(dist.addBands(ee.Image.pixelLonLat()).updateMask(dist.gte(maxDistance))
+                          .sample(geo, scale).first()))
+
+    dp = ee.Geometry.Point([outputDp.get('longitude'), outputDp.get('latitude')])
+
+    regions = ee.FeatureCollection([ee.Feature(dp, {'type': 'dp'})])
+
+    output = dist.sampleRegions(
+        collection=regions,
+        properties=['type'],
+        scale=scale,
+        tileScale=1,
+        geometries=True)
+
+    return (ee.Feature(output.first()).copyProperties(polygon))
+
+assets_parent = ee.data.listAssets({'parent': 'projects/earthengine-legacy/assets/projects/sat-io/open-datasets/NHD'})['assets']
+
+
+assets_state =  (ee.FeatureCollection(f"{assets_parent[1]['id']}/NHDWaterbody")
+    .filter(ee.Filter.gte('areasqkm',0.0001))
+    .filter(ee.Filter.inList('ftype',[361,436,390])))
+
+dp = GetLakeCenters(ee.Feature(assets_state.first()))
+
+dp_buff = test.map(function(f)
+{
+return (f.buffer(f.getNumber('distance')))})
+
+var
+
+
+colorDp = {color: 'cyan'}
+Map.addLayer(nhdhr.filterBounds(geometry))
+Map.addLayer(dp_buff, colorDp, 'DpBuffer', true, 0.5)
+Map.addLayer(test, colorDp, 'Deepest Point', true)
+Map.centerObject(geometry)
+
+var
+dp_out = nhdhr.filter(ee.Filter.gte('AREASQK', 0.001))
+    .filter(ee.Filter.lt('AREASQK', 0.01)).map(GetLakeCenters)
+Export.table.toAsset(dp_out, 'USGS/NHD_hr_dp_f361_436_390_gt1hectare')
+
+

From f7e9b574873b38ac1696c278b84cb23df57eb8d4 Mon Sep 17 00:00:00 2001
From: Simon Topp <33098707+SimonTopp@users.noreply.github.com>
Date: Mon, 9 May 2022 16:11:35 -0400
Subject: [PATCH 08/15] within scene variance

---
 .gitignore            |   0
 01_TempMunge.Rmd      | 152 +++++++++++++++++++++++++++++++++++++++---
 GEE_pull_functions.py |   0
 LakeExport.py         |   0
 README.md             |   0
 5 files changed, 141 insertions(+), 11 deletions(-)
 mode change 100644 => 100755 .gitignore
 mode change 100644 => 100755 01_TempMunge.Rmd
 mode change 100644 => 100755 GEE_pull_functions.py
 mode change 100644 => 100755 LakeExport.py
 mode change 100644 => 100755 README.md

diff --git a/.gitignore b/.gitignore
old mode 100644
new mode 100755
diff --git a/01_TempMunge.Rmd b/01_TempMunge.Rmd
old mode 100644
new mode 100755
index cef58e9..173012e
--- a/01_TempMunge.Rmd
+++ b/01_TempMunge.Rmd
@@ -35,6 +35,7 @@ munger <- function(file){
     filter(!is.na(temp),
            cScore_clouds == 0,
            pCount_water > 5) %>%
+    mutate(LandsatID = map_chr(`system:index`, ~str_split(.,'_0000')[[1]][1])) %>%
   select(-`system:index`)
   return(df)
 }
@@ -58,7 +59,6 @@ munged <- read_feather('data/out/landsat_temps.feather')
 ## Between Landsat 7 and 8 we have `r nrow(munged)` temperature observations.
 
 ```{r}
-
 obs <- read_csv('data/in/lake_surface_temp_obs.csv') %>%
   rename(date = Date)
 
@@ -68,7 +68,10 @@ day_plus_one <- obs %>% inner_join(munged %>% mutate(date = date + 1)) %>% mutat
 
 day_minus_one <- obs %>% inner_join(munged %>% mutate(date = date -1)) %>% mutate(join = 'day_minus_1')
 
-matchups <- bind_rows(same_day, day_plus_one, day_minus_one)
+matchups <- bind_rows(same_day, day_plus_one, day_minus_one) %>%
+  mutate(resid = wtemp_obs - temp_ls,
+         abs_error = abs(resid),
+         e5_flag = abs(wtemp_obs - temp_e5_mix))
 
 matchups_filt <- matchups %>%
   filter(temp_ls_qa < 3,
@@ -107,11 +110,12 @@ matchups %>% summarise(rmse = rmse(wtemp_obs, temp_ls),
 ## One quick look at Era5, it's not great
 ggplot(matchups, aes(x = wtemp_obs, y = temp_e5_mix)) + geom_hex() +
   scale_fill_viridis_c(trans = 'log10') +
-  geom_smooth(method = 'lm') +
+  geom_smooth(method = 'lm', aes(color = 'Best Fit')) +
+  geom_abline(aes(slope=1, intercept=0,color = '1:1')) +
   stat_poly_eq(formula = y~x, 
                 aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
                 parse = TRUE) +
-  labs(x = 'Observed Temps', y = 'Era5 Mixed Layer Temps', 'Era5 and Observed')
+  labs(x = 'Observed Temps', y = 'Era5 Mixed Layer Temps', 'Era5 and Observed', titel='blue=Best Fit', 'Red=1:1')
 
 
 matchups %>% filter(!is.na(temp_e5_mix)) %>%
@@ -137,6 +141,42 @@ General filters here are:
 -   Scene cloud cover \< 20% - Distance to shore \> 90m
 
 ```{r}
+matchups_filt <- matchups %>%
+  filter(temp_ls_qa < 3,
+         scene_cloud_cover < 10,
+         distance > 200,
+         temp_ls >=0,
+         wtemp_obs >=0,
+         e5_flag < 5,
+         join != 'day_minus_1'
+         )
+
+plotter <- function(var){
+  matchups %>%
+    mutate(bins= cut_number(get(var), 5)) %>%
+    ggplot(aes(x = bins, y = abs_error)) +
+    geom_violin(draw_quantiles = c(.25,.5,.75)) +
+    geom_hline(aes(yintercept = 0), color = 'red')+
+    labs(x = var)
+}
+
+plotter('areakm')
+plotter('scene_cloud_cover')
+plotter('e5_flag')
+plotter('distance')
+plotter('temp_ls_qa')
+
+matchups_filt <- matchups %>%
+  filter(areakm > .2,
+         temp_ls_qa < 3,
+         scene_cloud_cover < 20,
+         distance > 250,
+         temp_ls >= 0,
+         wtemp_obs >=0,
+         e5_flag < 5,
+         join != 'day_minus_1'
+         )
+
 ggplot(matchups_filt, aes(x = wtemp_obs, y = temp_ls)) + geom_hex() +
   scale_fill_viridis_c(trans = 'log10') +
   geom_smooth(method = 'lm') +
@@ -150,14 +190,15 @@ matchups_filt %>% summarise(rmse = rmse(wtemp_obs, temp_ls),
                        smape = smape(wtemp_obs,temp_ls),
                        bias = bias(wtemp_obs, temp_ls)) %>%
   mutate(across(everything(), ~round(.,4))) %>%
-  kableExtra::kable() %>%
-  kableExtra::kable_styling()
+  kableExtra::kable() 
+
+check <- matchups_filt %>% mutate(se = (wtemp_obs - temp_ls)^2)
 ```
 
 ## Finally, just big lakes where we would expect to be most confident (distance to shore \> 1000m)
 
 ```{r}
-ggplot(matchups_filt %>% filter(distance > 1000), aes(x = wtemp_obs, y = temp_ls)) + geom_hex() +
+ggplot(matchups_filt %>% filter(distance > 1000), aes(x = wtemp_obs, y = temp_e5_mix)) + geom_hex() +
   scale_fill_viridis_c(trans = 'log10') +
   geom_smooth(method = 'lm') +
   stat_poly_eq(formula = y~x, 
@@ -165,10 +206,11 @@ ggplot(matchups_filt %>% filter(distance > 1000), aes(x = wtemp_obs, y = temp_ls
                 parse = TRUE) +
   labs(x = 'Observed Temps', y = 'Landsat Temps', title = 'Landsat and Observed')
 
-matchups_filt %>% filter(distance > 1000) %>% summarise(rmse = rmse(wtemp_obs, temp_ls),
-                       mae = mae(wtemp_obs,temp_ls),
-                       smape = smape(wtemp_obs,temp_ls),
-                       bias = bias(wtemp_obs, temp_ls)) %>%
+matchups_filt %>% filter(distance > 1000) %>%
+  summarise(rmse = rmse(wtemp_obs, temp_ls),
+                   mae = mae(wtemp_obs,temp_ls),
+                   smape = smape(wtemp_obs,temp_ls),
+                   bias = bias(wtemp_obs, temp_ls)) %>%
   mutate(across(everything(), ~round(.,4))) %>%
   kableExtra::kable() %>%
   kableExtra::kable_styling()
@@ -227,3 +269,91 @@ ggplot(rmse.sf) + geom_sf(aes(fill = matchup.count)) +
   labs(title = 'Matchup Count')
 
 ```
+
+## Take a look at within scene variance.
+```{r}
+summary_in <- matchups_filt
+
+scene_sd <- summary_in %>% group_by(LandsatID) %>%
+  summarise(count = n(),
+            resid_sd = sd(resid)) %>%
+  filter(count >5)
+
+overall_sd <- summary_in %>% filter(LandsatID %in% scene_sd$LandsatID) %>%
+  summarise(overall = sd(resid)) %>% .[[1]]
+
+ggplot(scene_sd, aes(x = resid_sd)) + geom_histogram() + geom_vline(aes(xintercept = overall_sd, color = 'Overall residual SD'))
+
+scene_counts <- dplyr::count(summary_in, LandsatID) %>% filter(n > 5)
+
+scene_bias <- summary_in %>% 
+  filter(LandsatID %in% scene_counts$LandsatID) %>%
+  group_by(LandsatID) %>%
+  summarise(bias = bias(wtemp_obs, temp_ls))
+
+scene_stats <- summary_in %>% 
+  filter(LandsatID %in% scene_counts$LandsatID) %>%
+  left_join(scene_bias) %>%
+  mutate(temp_no_bias = temp_ls+bias) %>%
+  group_by(LandsatID) %>%
+  summarise(rmse_bias_removed = rmse(wtemp_obs, temp_no_bias),
+            scene_bias = mean(bias))
+
+overall_bias = summary_in %>% filter(LandsatID %in% scene_counts$LandsatID) %>%
+  summarise(bias = bias(wtemp_obs, temp_ls)) %>%
+  .[[1]]
+
+overall_rmse = summary_in %>% filter(LandsatID %in% scene_counts$LandsatID) %>%
+  mutate(temp_ls = temp_ls + overall_bias) %>%
+  summarise(rmse = rmse(wtemp_obs,temp_ls))
+rmses <- tibble(overall_rmse_bias_removed = overall_rmse[[1]], median_scene_rmse_bias_removed = median(scene_stats$rmse_bias_removed)) %>%
+  pivot_longer(everything(), names_to = "RMSE_Summaries")
+
+ggplot(scene_stats, aes(x = rmse_bias_removed)) + geom_histogram() + 
+  geom_vline(data = rmses, aes(xintercept = value, color = RMSE_Summaries)) +
+  labs(x = "Distribution of image specific rmse after removing image bias")
+```
+
+
+```{r}
+mendota <- read_csv('data/in/Mendota_Era5Land_1984_2021.csv') 
+mendotaObs <- obs %>% filter(site_id == "nhdhr_143249470")
+
+mendota <- mendota %>% mutate(lake_mix_layer_temperature = lake_mix_layer_temperature - 273.15,
+                              date = as.Date(date)) %>%
+  inner_join(mendotaObs)
+
+library(plotly)
+
+ggplotly(
+  ggplot(mendota, aes(x=date)) +
+    geom_point(aes(y = lake_mix_layer_temperature, color='Era5')) +
+    geom_point(aes(y=wtemp_obs, color = 'Obs')) +
+    geom_line(aes(y = lake_mix_layer_temperature, color='Era5')) +
+    geom_line(aes(y=wtemp_obs, color = 'Obs')))
+
+ggplot(mendota, aes(x=lake_mix_layer_temperature)) + geom_histogram()
+
+ggplot
+
+
+mendota %>% rename(temp_e5_mix = lake_mix_layer_temperature) %>%
+  summarise(rmse = rmse(wtemp_obs, temp_e5_mix),
+                       mae = mae(wtemp_obs,temp_e5_mix),
+                       smape = smape(wtemp_obs,temp_e5_mix),
+                       bias = bias(wtemp_obs, temp_e5_mix)) %>%
+  mutate(across(everything(), ~round(.,4))) %>%
+  kableExtra::kable() %>%
+  kableExtra::kable_styling()
+
+mendota %>% rename(temp_e5_mix = lake_mix_layer_temperature) %>%
+ggplot(.,aes(x = wtemp_obs, y = temp_e5_mix)) + geom_hex() +
+  scale_fill_viridis_c(trans = 'log10') +
+  geom_smooth(method = 'lm', aes(color = 'Best Fit')) +
+  geom_abline(aes(slope=1, intercept=0,color = '1:1')) +
+  stat_poly_eq(formula = y~x, 
+                aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")), 
+                parse = TRUE) +
+  labs(x = 'Observed Temps', y = 'Era5 Mixed Layer Temps', 'Era5 and Observed', titel='blue=Best Fit', 'Red=1:1')
+```
+
diff --git a/GEE_pull_functions.py b/GEE_pull_functions.py
old mode 100644
new mode 100755
diff --git a/LakeExport.py b/LakeExport.py
old mode 100644
new mode 100755
diff --git a/README.md b/README.md
old mode 100644
new mode 100755

From ffb4558befca15993e553d407248ce067a511a56 Mon Sep 17 00:00:00 2001
From: Simon Topp <33098707+SimonTopp@users.noreply.github.com>
Date: Tue, 10 May 2022 15:35:36 -0400
Subject: [PATCH 09/15] Batched calculations by state and swapped EE community
 NHD dataset

---
 NHD_Deepest_Point_Pull.py | 73 ++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 31 deletions(-)

diff --git a/NHD_Deepest_Point_Pull.py b/NHD_Deepest_Point_Pull.py
index 9083fd2..f7fe587 100755
--- a/NHD_Deepest_Point_Pull.py
+++ b/NHD_Deepest_Point_Pull.py
@@ -1,23 +1,15 @@
 import ee
 import math
+import time
 ee.Initialize()
 
 ## Deepest point calculation adapted from Xiao Yang
 ### https: // doi.org / 10.5281 / zenodo.4136754
-#
-# table_filt = nhdhr.filter(ee.Filter.gte('AREASQK', .01))
-# print(table_filt.size())
-# print(table_filt)
-#
-# Map.addLayer(nhdhr)
-# // Map.centerObject(fail_point, 12)
-# // print(blah)
-
-
+### Functions
 def get_scale(polygon):
-    radius = polygon.get('areasqkm').getInfo()
+    radius = polygon.get('areasqkm')
     radius = ee.Number(radius).divide(math.pi).sqrt().multiply(1000)
-    return radius.divide(7)
+    return radius.divide(20)
 
 
 def getUTMProj(lon, lat):
@@ -69,33 +61,52 @@ def GetLakeCenters(polygon):
         tileScale=1,
         geometries=True)
 
-    return (ee.Feature(output.first()).copyProperties(polygon))
+    return (ee.Feature(output.first()).copyProperties(polygon,['permanent','areasqkm']))
 
-assets_parent = ee.data.listAssets({'parent': 'projects/earthengine-legacy/assets/projects/sat-io/open-datasets/NHD'})['assets']
+def buff_dp(dp):
+    return dp.buffer(dp.getNumber('distance'))
 
 
-assets_state =  (ee.FeatureCollection(f"{assets_parent[1]['id']}/NHDWaterbody")
-    .filter(ee.Filter.gte('areasqkm',0.0001))
-    .filter(ee.Filter.inList('ftype',[361,436,390])))
+def maximum_no_of_tasks(MaxNActive, waitingPeriod):
+    ## maintain a maximum number of active tasks
+    time.sleep(10)
+    ## initialize submitting jobs
+    ts = list(ee.batch.Task.list())
 
-dp = GetLakeCenters(ee.Feature(assets_state.first()))
+    NActive = 0
+    for task in ts:
+        if ('RUNNING' in str(task) or 'READY' in str(task)):
+            NActive += 1
+    ## wait if the number of current active tasks reach the maximum number
+    ## defined in MaxNActive
+    while (NActive >= MaxNActive):
+        time.sleep(waitingPeriod)
+        ts = list(ee.batch.Task.list())
+        NActive = 0
+        for task in ts:
+            if ('RUNNING' in str(task) or 'READY' in str(task)):
+                NActive += 1
+    return ()
 
-dp_buff = test.map(function(f)
-{
-return (f.buffer(f.getNumber('distance')))})
 
-var
+assets_parent = ee.data.listAssets({'parent': 'projects/earthengine-legacy/assets/projects/sat-io/open-datasets/NHD'})['assets']
+assets_parent = assets_parent[0:2]
+for i in range(len(assets_parent)):
+    state_asset = assets_parent[i]['id']
+    assets_state = (ee.FeatureCollection(f"{state_asset}/NHDWaterbody")
+    .filter(ee.Filter.gte('areasqkm',0.0001))
+    .filter(ee.Filter.inList('ftype',[361,436,390])))
+
+    dp = ee.FeatureCollection(assets_state).map(GetLakeCenters)
 
+    dataOut = ee.batch.Export.table.toAsset(collection=dp,description=state_asset.split('/')[-1],assetId=f"projects/earthengine-legacy/assets/users/sntopp/NHD/DeepestPoint/{state_asset.split('/')[-1]}")
 
-colorDp = {color: 'cyan'}
-Map.addLayer(nhdhr.filterBounds(geometry))
-Map.addLayer(dp_buff, colorDp, 'DpBuffer', true, 0.5)
-Map.addLayer(test, colorDp, 'Deepest Point', true)
-Map.centerObject(geometry)
+    ## Check how many existing tasks are running and take a break if it's >15
+    maximum_no_of_tasks(10, 240)
+    ## Send next task.
+    dataOut.start()
+    print(state_asset.split('/')[-1])
 
-var
-dp_out = nhdhr.filter(ee.Filter.gte('AREASQK', 0.001))
-    .filter(ee.Filter.lt('AREASQK', 0.01)).map(GetLakeCenters)
-Export.table.toAsset(dp_out, 'USGS/NHD_hr_dp_f361_436_390_gt1hectare')
+#f"projects/earthengine-legacy/assets/users/sntopp/NHD/{state_asset.split('/')[-1]}/NHDDeepestPoint"
 
 

From 8fa4fbb03fb60bf2b50070a44e79c791d641f5d3 Mon Sep 17 00:00:00 2001
From: Simon Topp <33098707+SimonTopp@users.noreply.github.com>
Date: Mon, 16 May 2022 09:59:10 -0400
Subject: [PATCH 10/15] small fixes to batched DP pull

---
 NHD_Deepest_Point_Pull.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/NHD_Deepest_Point_Pull.py b/NHD_Deepest_Point_Pull.py
index f7fe587..4d817c1 100755
--- a/NHD_Deepest_Point_Pull.py
+++ b/NHD_Deepest_Point_Pull.py
@@ -90,11 +90,11 @@ def maximum_no_of_tasks(MaxNActive, waitingPeriod):
 
 
 assets_parent = ee.data.listAssets({'parent': 'projects/earthengine-legacy/assets/projects/sat-io/open-datasets/NHD'})['assets']
-assets_parent = assets_parent[0:2]
+##assets_parent = assets_parent[5:15]
 for i in range(len(assets_parent)):
     state_asset = assets_parent[i]['id']
     assets_state = (ee.FeatureCollection(f"{state_asset}/NHDWaterbody")
-    .filter(ee.Filter.gte('areasqkm',0.0001))
+    .filter(ee.Filter.gte('areasqkm',0.001))
     .filter(ee.Filter.inList('ftype',[361,436,390])))
 
     dp = ee.FeatureCollection(assets_state).map(GetLakeCenters)
@@ -102,7 +102,7 @@ def maximum_no_of_tasks(MaxNActive, waitingPeriod):
     dataOut = ee.batch.Export.table.toAsset(collection=dp,description=state_asset.split('/')[-1],assetId=f"projects/earthengine-legacy/assets/users/sntopp/NHD/DeepestPoint/{state_asset.split('/')[-1]}")
 
     ## Check how many existing tasks are running and take a break if it's >15
-    maximum_no_of_tasks(10, 240)
+    maximum_no_of_tasks(15, 240)
     ## Send next task.
     dataOut.start()
     print(state_asset.split('/')[-1])

From 508cdc7cf81ec66e3d9dce3d7fa3975612a4563b Mon Sep 17 00:00:00 2001
From: Simon Topp <33098707+SimonTopp@users.noreply.github.com>
Date: Fri, 3 Jun 2022 14:36:12 -0400
Subject: [PATCH 11/15] Added upper and lower bounds to scale, removed great
 lakes

---
 NHD_Deepest_Point_Pull.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/NHD_Deepest_Point_Pull.py b/NHD_Deepest_Point_Pull.py
index 4d817c1..0800a66 100755
--- a/NHD_Deepest_Point_Pull.py
+++ b/NHD_Deepest_Point_Pull.py
@@ -9,7 +9,10 @@
 def get_scale(polygon):
     radius = polygon.get('areasqkm')
     radius = ee.Number(radius).divide(math.pi).sqrt().multiply(1000)
-    return radius.divide(20)
+    scale = radius.divide(20)
+    scale = ee.Algorithms.If(ee.Number(scale).lte(10),10,scale)
+    scale = ee.Algorithms.If(ee.Number(scale).gte(500),500,scale)
+    return ee.Number(scale)
 
 
 def getUTMProj(lon, lat):
@@ -88,13 +91,17 @@ def maximum_no_of_tasks(MaxNActive, waitingPeriod):
                 NActive += 1
     return ()
 
-
+assets_done = ee.data.listAssets({'parent': 'projects/earthengine-legacy/assets/users/sntopp/NHD/DeepestPoint'})['assets']
+ids_done = [i['id'].split('/')[-1] for i in assets_done]
 assets_parent = ee.data.listAssets({'parent': 'projects/earthengine-legacy/assets/projects/sat-io/open-datasets/NHD'})['assets']
-##assets_parent = assets_parent[5:15]
+assets_parent = [i for i in assets_parent if i['id'].split('/')[-1] not in ids_done]
+
+
 for i in range(len(assets_parent)):
     state_asset = assets_parent[i]['id']
     assets_state = (ee.FeatureCollection(f"{state_asset}/NHDWaterbody")
     .filter(ee.Filter.gte('areasqkm',0.001))
+    .filter(ee.Filter.lte('areasqkm',5000))  #Remove Great Lakes
     .filter(ee.Filter.inList('ftype',[361,436,390])))
 
     dp = ee.FeatureCollection(assets_state).map(GetLakeCenters)

From 2479b3e7748d36de8c68f35131289f43cff9ba4e Mon Sep 17 00:00:00 2001
From: Simon Topp <33098707+SimonTopp@users.noreply.github.com>
Date: Wed, 29 Jun 2022 15:44:07 -0400
Subject: [PATCH 12/15] Added EE calculation of area for additional threshold

---
 NHD_Deepest_Point_Pull.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/NHD_Deepest_Point_Pull.py b/NHD_Deepest_Point_Pull.py
index 0800a66..2eb77d0 100755
--- a/NHD_Deepest_Point_Pull.py
+++ b/NHD_Deepest_Point_Pull.py
@@ -10,7 +10,7 @@ def get_scale(polygon):
     radius = polygon.get('areasqkm')
     radius = ee.Number(radius).divide(math.pi).sqrt().multiply(1000)
     scale = radius.divide(20)
-    scale = ee.Algorithms.If(ee.Number(scale).lte(10),10,scale)
+    #scale = ee.Algorithms.If(ee.Number(scale).lte(10),10,scale)
     scale = ee.Algorithms.If(ee.Number(scale).gte(500),500,scale)
     return ee.Number(scale)
 
@@ -91,10 +91,14 @@ def maximum_no_of_tasks(MaxNActive, waitingPeriod):
                 NActive += 1
     return ()
 
+def calc_area(feature):
+    return(feature.set('area_calc',feature.area().divide(1e6)))
+
 assets_done = ee.data.listAssets({'parent': 'projects/earthengine-legacy/assets/users/sntopp/NHD/DeepestPoint'})['assets']
 ids_done = [i['id'].split('/')[-1] for i in assets_done]
 assets_parent = ee.data.listAssets({'parent': 'projects/earthengine-legacy/assets/projects/sat-io/open-datasets/NHD'})['assets']
 assets_parent = [i for i in assets_parent if i['id'].split('/')[-1] not in ids_done]
+#assets_parent = [i for i in assets_parent if i['id'].split('/')[-1] not in ['NHD_MO','NHD_TX','NHD_AK']]
 
 
 for i in range(len(assets_parent)):
@@ -104,7 +108,10 @@ def maximum_no_of_tasks(MaxNActive, waitingPeriod):
     .filter(ee.Filter.lte('areasqkm',5000))  #Remove Great Lakes
     .filter(ee.Filter.inList('ftype',[361,436,390])))
 
-    dp = ee.FeatureCollection(assets_state).map(GetLakeCenters)
+    ## Added to trouble shoot NM where some of the NHD areas are don't match the actual polygon areas
+    assets_state=ee.FeatureCollection(assets_state.map(calc_area)).filter(ee.Filter.gte('area_calc',0.001))
+
+    dp = assets_state.map(GetLakeCenters)
 
     dataOut = ee.batch.Export.table.toAsset(collection=dp,description=state_asset.split('/')[-1],assetId=f"projects/earthengine-legacy/assets/users/sntopp/NHD/DeepestPoint/{state_asset.split('/')[-1]}")
 

From a4ab1d66cb26017271a4846532fe9aafcec2a497 Mon Sep 17 00:00:00 2001
From: Simon Topp <33098707+SimonTopp@users.noreply.github.com>
Date: Wed, 13 Jul 2022 14:55:48 -0600
Subject: [PATCH 13/15] Started to munge national DP data >1 ha.

---
 NHD_Deepest_Point_Pull.py |  10 ++-
 NHD_HR_Lake_Pull.R        | 137 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100755 NHD_HR_Lake_Pull.R

diff --git a/NHD_Deepest_Point_Pull.py b/NHD_Deepest_Point_Pull.py
index 2eb77d0..c8fa1cf 100755
--- a/NHD_Deepest_Point_Pull.py
+++ b/NHD_Deepest_Point_Pull.py
@@ -48,7 +48,7 @@ def GetLakeCenters(polygon):
         scale=scale,
         bestEffort=True,
         tileScale=1
-    ).getNumber('distance'))
+    ).getNumber('distance').int16())
 
     outputDp = (ee.Feature(dist.addBands(ee.Image.pixelLonLat()).updateMask(dist.gte(maxDistance))
                           .sample(geo, scale).first()))
@@ -124,3 +124,11 @@ def calc_area(feature):
 #f"projects/earthengine-legacy/assets/users/sntopp/NHD/{state_asset.split('/')[-1]}/NHDDeepestPoint"
 
 
+state_dps = ee.data.listAssets({'parent': 'projects/earthengine-legacy/assets/users/sntopp/NHD/DeepestPoint'})['assets']
+
+for i in state_dps:
+    dps = ee.FeatureCollection(i['id'])
+    id =  i['id'].split('/')[-1]
+    dpsOut = ee.batch.Export.table.toDrive(dps,id,'EE_DP_Exports',id)
+    maximum_no_of_tasks(15, 60)
+    dpsOut.start()
\ No newline at end of file
diff --git a/NHD_HR_Lake_Pull.R b/NHD_HR_Lake_Pull.R
new file mode 100755
index 0000000..239e090
--- /dev/null
+++ b/NHD_HR_Lake_Pull.R
@@ -0,0 +1,137 @@
+## Mess around with tyler kings stuff
+#resolvable <- vroom::vroom('/Users/stopp/Downloads/NHDWaterbody_resolvable.csv')
+library(tidyverse)
+library(dataRetrieval)
+library(stringr)
+library(sf)
+library(httr)
+library(sbtools)
+sf::sf_use_s2(FALSE)
+library(mapview)
+mapviewOptions(fgb=FALSE)
+#library(smoothr)
+# 
+# ## Check the layers
+# st_layers('/Users/stopp/Downloads/NHD_H_National_GDB/NHD_H_National_GDB.gdb')
+# 
+# ## Pull the first few waterbodies to check the fields
+# check_fields <- st_read('/Users/stopp/Downloads/NHD_H_National_GDB/NHD_H_National_GDB.gdb',
+#                   query = 'SELECT * FROM "NHDWaterbody" WHERE FID < 5', layer = 'NHDWaterbody')
+# 
+# ## Set up a query to just bring in the fields we want for lakes and reservoirs (get rid of swamps, etc)
+# lakes <- st_read('/Users/stopp/Downloads/NHD_H_National_GDB/NHD_H_National_GDB.gdb',
+#                  query = 'SELECT PERMANENT_IDENTIFIER, GNIS_NAME, AREASQKM, ELEVATION, FTYPE, FCODE FROM "NHDWaterbody" WHERE FTYPE = 361 OR FTYPE = 436 OR FTYPE = 390', layer = 'NHDWaterbody') #%>%
+# 
+# #### Simplify everything in chunks 
+# step = 100000
+# for(i in seq(0,nrow(lakes),step)){
+#   
+#   end <- min(i+step, nrow(lakes))
+#   # For re-runs, skip what's been done
+#   files <- list.files('data/out/tmp_waterbody_chunks')
+#   if(T %in% grepl(sprintf('%i_%i.shp',i+1,end),files)) next
+#   
+#   ## Chunk and simplify
+#   lakes_out <- lakes[(i+1):end,] %>%
+#     st_cast(., "MULTIPOLYGON")%>%
+#     # bad Idaho site!
+#     filter(PERMANENT_IDENTIFIER != '{5BEDE13F-C94B-4501-B979-E00C29EA374B}') %>%
+#     sf::st_transform(crs = 4326) %>%
+#     sf::st_zm() %>%
+#     sf::st_buffer(dist = 0) %>%
+#     sf::st_simplify(preserveTopology = TRUE, dTolerance = 0.0001)
+#   
+#   st_write(lakes_out,sprintf('data/out/tmp_waterbody_chunks/%i_%i.shp',i+1,i+step))
+# }
+# 
+# rm(lakes)
+# 
+# chunks <- list.files('data/out/tmp_waterbody_chunks', pattern='*.shp',full.names = T)
+# lakes_simplified <- purrr::map_dfr(chunks, st_read)
+# 
+# object.size(lakes_simplified)
+# 
+# length(unique(lakes_simplified$PERMANE))
+# 
+# mapview(sample_n(lakes_simplified,5000))
+# 
+# st_write(lakes_simplified, 'data/out/NHD_waterbodies/NHD_waterbodies_simplified.shp')
+
+tk_resolvable <- vroom::vroom('/Users/stopp/Downloads/NHDWaterbody_resolvable.csv')
+
+files <- list.files('../../../My Drive/EE_DP_Exports', full.names = T)
+
+options(digits=15)
+dp_national <- read_csv(files[1]) %>%
+  mutate(coords = map(.geo, ~as.numeric(unlist(regmatches(.x,gregexpr("-?[[:digit:]]+\\.*-?[[:digit:]]*",.x))))),
+         long = map_dbl(coords, 1),
+         lat = map_dbl(coords,2),
+         distance = round(distance,3),
+         areasqkm = round(areasqkm, 4)) %>%
+  select(-c(coords, .geo,`system:index`))
+
+for(i in files[2:length(files)]){
+  state <- read_csv(i,show_col_types = F) %>%
+    mutate(coords = map(.geo, ~as.numeric(unlist(regmatches(.x,gregexpr("-?[[:digit:]]+\\.*-?[[:digit:]]*",.x))))),
+           long = map_dbl(coords, 1),
+           lat = map_dbl(coords,2),
+           distance = round(distance,3),
+           areasqkm = round(areasqkm, 4)) %>%
+    select(-c(coords, .geo,`system:index`))
+  
+  dp_national <- dp_national %>%
+    bind_rows(state) %>%
+    distinct(permanent,.keep_all=T)
+}
+
+dp_national <- dp_national %>%
+  select(areasqkm:lat)
+
+library(feather)
+write_feather(dp_national, 'data/out/nhd_hr_1ha_deepest_point_noTX.feather')
+
+
+match_ups <- dp_national %>%
+  inner_join(.,tk_resolvable, by = c('permanent'='permanent_identifier'))
+rm(dp_national, tk_resolvable)
+
+match_ups <- match_ups %>% mutate(resolvable = ifelse(resolvable == 'resolvable', 'Konrad resolvable', 'Konrad not resolvable'))
+
+thresholds <- tibble(distance = c(20,30,40,50,60,70,80,90))
+
+tk_resolvable <- match_ups %>% filter(resolvable == 'Konrad resolvable')
+
+thresholds$counts <- colSums(outer(tk_resolvable$distance, setNames(thresholds$distance, thresholds$distance), "<"))
+thresholds$labels <- paste0(thresholds$distance,' meters (n=',thresholds$counts,')')
+
+tk_resolvable %>%
+  ggplot(.,aes(x=distance)) + stat_bin(aes(y=cumsum(..count..)),geom="step") +
+  geom_vline(data= thresholds, aes(xintercept=distance, color = labels)) +
+  scale_color_viridis_d() +
+  scale_x_log10() +
+  theme_bw() +
+  facet_wrap(~resolvable, scales = 'free') +
+  labs(x = 'Maximum Distance from Shore (Meters)',
+       y= 'Cumulative Lake Count',
+       color='Res. Threshold (n disagreement)')
+
+
+thresholds <- tibble(distance = c(20,30,40,50,60,70,80,90))
+
+tk_non_resolvable <- match_ups %>% filter(resolvable == 'Konrad not resolvable')
+
+thresholds$counts <- colSums(outer(tk_non_resolvable$distance, setNames(thresholds$distance, thresholds$distance), ">"))
+thresholds$labels <- paste0(thresholds$distance,' meters (n=',thresholds$counts,')')
+
+tk_non_resolvable %>%
+  ggplot(.,aes(x=distance)) + stat_bin(aes(y=cumsum(..count..)),geom="step") +
+  geom_vline(data= thresholds, aes(xintercept=distance, color = labels)) +
+  scale_color_viridis_d() +
+  scale_x_log10() +
+  theme_bw() +
+  facet_wrap(~resolvable, scales = 'free') +
+  labs(x = 'Maximum Distance from Shore (Meters)',
+       y= 'Cumulative Lake Count',
+       color='Res. Threshold (n disagreement)')
+
+

From 552501617ee0b503e3a44915edbb4063981a58db Mon Sep 17 00:00:00 2001
From: Simon Topp <33098707+SimonTopp@users.noreply.github.com>
Date: Thu, 9 Feb 2023 14:41:21 -0500
Subject: [PATCH 14/15] Raw trends analysis and legacy TK resolvable push.

---
 02_raw_trends.R    | 125 +++++++++++++++++++++++++++++++++++++++++++++
 NHD_HR_Lake_Pull.R |  80 +++++++++--------------------
 2 files changed, 149 insertions(+), 56 deletions(-)
 create mode 100755 02_raw_trends.R

diff --git a/02_raw_trends.R b/02_raw_trends.R
new file mode 100755
index 0000000..c5c219a
--- /dev/null
+++ b/02_raw_trends.R
@@ -0,0 +1,125 @@
+#### This script is an initial analysis of lake surface temperature trends
+#### using temperatures from the Landsat.  ITS VERY PRELIMINARY and hacked together using
+#### two different EE pulls.
+library(tidyverse)
+library(feather)
+library(lubridate)
+library(trend)
+library(sf)
+library(scales)
+
+l7l8 <- read_feather('data/out/landsat_temps.feather')
+l7l8 <- l7l8 %>%
+  select(areakm, sat, date, distance, site_id, temp_ls, temp_ls_qa) %>%
+  filter(sat=='LANDSAT_7')
+
+ggplot(l7l8, aes(x=temp_ls)) + geom_histogram()
+
+l8l9 <- read_csv('/Users/stopp/Documents/repos/landsat-lake-temp-pull/data/out/L8_L9_cb_temps_munged.csv')
+l8l9 <- l8l9 %>%
+  select(areakm, sat = SPACECRAFT_ID, date, distance, site_id, temp_ls = ls_temp, temp_ls_qa = ls_temp_qa) %>%
+  mutate(sat=as.character(sat))
+
+ggplot(l8l9, aes(x=temp_ls)) + geom_histogram()
+
+ls_temps <- bind_rows(l7l8,l8l9) %>%
+  mutate(site_id = factor(site_id))
+
+rm(l7l8,l8l9)
+
+seasonally_aggregated <- ls_temps %>%
+  mutate(month = month(date),
+         year = year(date),
+         season = case_when((month %in% c(6,7,8)) ~ 'JJA',
+                            (month %in% c(9,10,11)) ~ "SON",
+                            (month %in% c(12,1,2)) ~ 'DJF',
+                            (month %in% c(3,4,5)) ~'MAM'),
+         season=factor(season)) %>%
+  group_by(site_id, year, season, distance, areakm) %>%
+  summarize(temp_ls = mean(temp_ls),
+            count = n())
+
+seasonally_aggregated <- seasonally_aggregated %>% filter(count > 1)
+
+counts <- seasonally_aggregated %>%
+  group_by(site_id, season) %>%
+  mutate(count = n())
+
+seasonally_aggregated <- seasonally_aggregated[counts$count > 20,]
+
+trends <- seasonally_aggregated %>% 
+  group_by(site_id, season) %>%
+  arrange(year) %>%
+  nest() %>%
+  mutate(mk = purrr::map(data, ~sens.slope(.$temp_ls)),
+         sen.slope = purrr::map_dbl(mk, 'estimates'),
+         p.value = purrr::map_dbl(mk, 'p.value')) %>%
+  select(site_id, season, sen.slope, p.value)
+
+ggplot(trends, aes(x=sen.slope, fill = season)) + 
+  geom_density(alpha=.4) +
+  xlim(-.3,.4) +
+  geom_vline(aes(xintercept=0), color = 'red') +
+  scale_fill_viridis_d()
+
+### Look at spatial trends
+lakes_sf <- read_csv('data/in/lake_metadata_20211217.csv') %>%
+  st_as_sf(coords = c('lake_lon_deg','lake_lat_deg'), crs = 4326) %>%
+  select(site_id, elevation_m, area_m2) %>%
+  st_transform(5070)
+
+trends_sf <- lakes_sf %>% inner_join(trends)
+
+usa <- maps::map('usa', plot = F) %>% st_as_sf() %>% st_transform(5070) 
+
+grid <- st_make_grid(usa, cellsize = c(75000,75000), square = F) %>% st_as_sf() %>% mutate(ID = row_number())
+
+grid_means <- grid %>% st_join(trends_sf %>% filter(abs(sen.slope) < .4) %>%st_transform(st_crs(grid)), left = F) %>%
+  st_set_geometry(NULL) %>% group_by(ID, season) %>%
+  summarize(mean_trend = mean(sen.slope))
+
+means_sf <- grid %>% inner_join(grid_means)
+
+ggplot() + 
+  geom_sf(data = usa) +
+  geom_sf(data = means_sf, aes(fill = mean_trend)) +
+  scale_fill_gradient2(low = muted('blue'), high=muted('red'), 'Mean Trend (°C/yr)') +
+  ggthemes::theme_map(base_size = 11) +
+  theme(legend.position = 'bottom') +
+  facet_wrap(~season) +
+  ggtitle("Mean Trend in Lake Temperature Since 1999")
+
+### Binned by Elevation
+trends_sf %>%
+  st_set_geometry(NULL) %>%
+  mutate(elevation_bin = cut_interval(elevation_m, 5)) %>%
+  ggplot(aes(x=elevation_bin,y = sen.slope,fill=elevation_bin)) +
+  coord_cartesian(ylim=c(-.15,.25)) +
+  geom_violin() +
+  geom_boxplot(width=.2) +
+  scale_fill_viridis_d(option='plasma', alpha=.3) +
+  theme(axis.text.x = element_text(angle=45,hjust=1,vjust=1),
+        legend.position = 'none') +
+  labs(x = 'Elevation (m)', y = 'Trend (°C/yr)') 
+
+### Binnned by lake size
+trends_sf %>%
+  st_set_geometry(NULL) %>%
+  mutate(size_bin = cut_number(area_m2/1e6, 10)) %>%
+  ggplot(aes(x=size_bin,y = sen.slope,fill=size_bin)) +
+  coord_cartesian(ylim=c(-.15,.25)) +
+  geom_violin() +
+  geom_boxplot(width=.2) +
+  scale_fill_viridis_d(option='plasma', alpha=.3) +
+  theme(axis.text.x = element_text(angle=45,hjust=1,vjust=1),
+        legend.position = 'none') +
+  labs(x = 'Lake Size Bin (km2)', y = 'Trend (°C/yr)') 
+
+seasonal_summary <- seasonally_aggregated %>%
+  group_by(season, site_id) %>%
+  mutate(temp_scaled = scale(temp_ls)) %>%
+  ungroup() %>%
+  group_by(season, year) %>%
+  summarise(mean_temp= mean(temp_scaled),
+            sd=sd(temp_scaled))
+
diff --git a/NHD_HR_Lake_Pull.R b/NHD_HR_Lake_Pull.R
index 239e090..e364dc7 100755
--- a/NHD_HR_Lake_Pull.R
+++ b/NHD_HR_Lake_Pull.R
@@ -1,64 +1,14 @@
 ## Mess around with tyler kings stuff
-#resolvable <- vroom::vroom('/Users/stopp/Downloads/NHDWaterbody_resolvable.csv')
 library(tidyverse)
-library(dataRetrieval)
-library(stringr)
 library(sf)
-library(httr)
-library(sbtools)
-sf::sf_use_s2(FALSE)
 library(mapview)
+library(feather)
 mapviewOptions(fgb=FALSE)
-#library(smoothr)
-# 
-# ## Check the layers
-# st_layers('/Users/stopp/Downloads/NHD_H_National_GDB/NHD_H_National_GDB.gdb')
-# 
-# ## Pull the first few waterbodies to check the fields
-# check_fields <- st_read('/Users/stopp/Downloads/NHD_H_National_GDB/NHD_H_National_GDB.gdb',
-#                   query = 'SELECT * FROM "NHDWaterbody" WHERE FID < 5', layer = 'NHDWaterbody')
-# 
-# ## Set up a query to just bring in the fields we want for lakes and reservoirs (get rid of swamps, etc)
-# lakes <- st_read('/Users/stopp/Downloads/NHD_H_National_GDB/NHD_H_National_GDB.gdb',
-#                  query = 'SELECT PERMANENT_IDENTIFIER, GNIS_NAME, AREASQKM, ELEVATION, FTYPE, FCODE FROM "NHDWaterbody" WHERE FTYPE = 361 OR FTYPE = 436 OR FTYPE = 390', layer = 'NHDWaterbody') #%>%
-# 
-# #### Simplify everything in chunks 
-# step = 100000
-# for(i in seq(0,nrow(lakes),step)){
-#   
-#   end <- min(i+step, nrow(lakes))
-#   # For re-runs, skip what's been done
-#   files <- list.files('data/out/tmp_waterbody_chunks')
-#   if(T %in% grepl(sprintf('%i_%i.shp',i+1,end),files)) next
-#   
-#   ## Chunk and simplify
-#   lakes_out <- lakes[(i+1):end,] %>%
-#     st_cast(., "MULTIPOLYGON")%>%
-#     # bad Idaho site!
-#     filter(PERMANENT_IDENTIFIER != '{5BEDE13F-C94B-4501-B979-E00C29EA374B}') %>%
-#     sf::st_transform(crs = 4326) %>%
-#     sf::st_zm() %>%
-#     sf::st_buffer(dist = 0) %>%
-#     sf::st_simplify(preserveTopology = TRUE, dTolerance = 0.0001)
-#   
-#   st_write(lakes_out,sprintf('data/out/tmp_waterbody_chunks/%i_%i.shp',i+1,i+step))
-# }
-# 
-# rm(lakes)
-# 
-# chunks <- list.files('data/out/tmp_waterbody_chunks', pattern='*.shp',full.names = T)
-# lakes_simplified <- purrr::map_dfr(chunks, st_read)
-# 
-# object.size(lakes_simplified)
-# 
-# length(unique(lakes_simplified$PERMANE))
-# 
-# mapview(sample_n(lakes_simplified,5000))
-# 
-# st_write(lakes_simplified, 'data/out/NHD_waterbodies/NHD_waterbodies_simplified.shp')
 
 tk_resolvable <- vroom::vroom('/Users/stopp/Downloads/NHDWaterbody_resolvable.csv')
 
+
+### Read in the exports from EE and do a little munging
 files <- list.files('../../../My Drive/EE_DP_Exports', full.names = T)
 
 options(digits=15)
@@ -87,15 +37,17 @@ for(i in files[2:length(files)]){
 dp_national <- dp_national %>%
   select(areasqkm:lat)
 
-library(feather)
-write_feather(dp_national, 'data/out/nhd_hr_1ha_deepest_point_noTX.feather')
+#write_feather(dp_national, 'data/out/nhd_hr_1ha_deepest_point_noTX.feather')
+dp_national <- read_feather('data/out/nhd_hr_1ha_deepest_point_noTX.feather')
+tk_resolvable <- vroom::vroom('/Users/stopp/Downloads/NHDWaterbody_resolvable.csv')
 
 
 match_ups <- dp_national %>%
   inner_join(.,tk_resolvable, by = c('permanent'='permanent_identifier'))
+
 rm(dp_national, tk_resolvable)
 
-match_ups <- match_ups %>% mutate(resolvable = ifelse(resolvable == 'resolvable', 'Konrad resolvable', 'Konrad not resolvable'))
+match_ups <- match_ups %>% mutate(resolvable = ifelse(resolvable == 1, 'Konrad resolvable', 'Konrad not resolvable'))
 
 thresholds <- tibble(distance = c(20,30,40,50,60,70,80,90))
 
@@ -135,3 +87,19 @@ tk_non_resolvable %>%
        color='Res. Threshold (n disagreement)')
 
 
+
+tk30_res <- tk_resolvable %>% filter(distance < 30)
+tk30_not_res <- tk_non_resolvable %>% filter(distance >30)
+
+ggplot(tk30_res, aes(x = areasqkm)) +
+  geom_histogram()
+
+ggplot(tk30_not_res, aes(x= areasqkm)) + 
+  geom_histogram() +
+  scale_x_log10()
+
+
+tk30_res %>% slice_sample(prop=.2) %>% 
+  st_as_sf(., coords=c('long','lat'), crs=4326) %>%
+  mapview(zcol = 'distance')
+  

From 2c497a4ac4446e9fde17be260b484e1893a7a422 Mon Sep 17 00:00:00 2001
From: Simon Topp <33098707+SimonTopp@users.noreply.github.com>
Date: Thu, 9 Feb 2023 15:16:47 -0500
Subject: [PATCH 15/15] add trends

---
 02_raw_trends.R | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/02_raw_trends.R b/02_raw_trends.R
index c5c219a..cea5a1d 100755
--- a/02_raw_trends.R
+++ b/02_raw_trends.R
@@ -115,11 +115,21 @@ trends_sf %>%
         legend.position = 'none') +
   labs(x = 'Lake Size Bin (km2)', y = 'Trend (°C/yr)') 
 
+### Look at actual timeseries to see if they look really goofy
 seasonal_summary <- seasonally_aggregated %>%
-  group_by(season, site_id) %>%
+  ungroup() %>%
+  mutate(area_bin = cut_number(areakm, 10)) %>%
+  group_by(season, site_id, area_bin) %>%
   mutate(temp_scaled = scale(temp_ls)) %>%
   ungroup() %>%
-  group_by(season, year) %>%
+  group_by(season, year, area_bin) %>%
   summarise(mean_temp= mean(temp_scaled),
             sd=sd(temp_scaled))
 
+### 
+ggplot(seasonal_summary, aes(x=year, y = mean_temp, color = area_bin, fill = area_bin)) +
+  geom_ribbon(aes(ymin = mean_temp-sd, ymax=mean_temp+sd), alpha = .3) +
+  geom_line() +
+  facet_wrap(~season)
+
+