Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file modified .gitignore
100644 → 100755
Empty file.
359 changes: 359 additions & 0 deletions 01_TempMunge.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,359 @@
---
title: "01_TempMunge"
author: "Simon Topp"
date: "07/08/2021"
output: html_document
editor_options:
chunk_output_type: console
---

```{r setup, include=FALSE}
library(tidyverse)
library(lubridate)
library(purrr)
library(furrr)
library(data.table)
library(feather)
library(sf)
library(ggpmisc)
library(ggpubr)
library(Hmisc)
library(Metrics)

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```

```{r eval = F, include = F}
## Read in whatever path you saved the raw pull data to
files <- list.files('data/EE_TempPull', full.names = T)
files <- files[file.size(files) > 1] ## 6 WRS tile returned no values

## Munge it a little, basically zero clouds/shadow/ice and
## at least 5 pixels
munger <- function(file){
df <- read_csv(file) %>%
filter(!is.na(temp),
cScore_clouds == 0,
pCount_water > 5) %>%
mutate(LandsatID = map_chr(`system:index`, ~str_split(.,'_0000')[[1]][1])) %>%
select(-`system:index`)
return(df)
}

munged <- files %>% map_dfr(munger)

munged <- munged %>% mutate(temp = 0.00341802*temp + 149 - 273.15,
temp_qa = temp_qa*0.01,
lake_mix_layer_temperature = lake_mix_layer_temperature - 273.15) %>%
rename(temp_e5_mix = lake_mix_layer_temperature, temp_e5_full = lake_total_layer_temperature,
temp_ls = temp, temp_ls_qa = temp_qa, date = DATE_ACQUIRED, sat = SPACECRAFT_ID,
scene_cloud_cover = CLOUD_COVER)

write_feather(munged,'data/out/landsat_temps.feather')
```

```{r}
munged <- read_feather('data/out/landsat_temps.feather')
```

## Between Landsat 7 and 8 we have `r nrow(munged)` temperature observations.

```{r}
obs <- read_csv('data/in/lake_surface_temp_obs.csv') %>%
rename(date = Date)

same_day <- obs %>% inner_join(munged) %>% mutate(join = 'same_day')

day_plus_one <- obs %>% inner_join(munged %>% mutate(date = date + 1)) %>% mutate(join = 'day_plus_1')

day_minus_one <- obs %>% inner_join(munged %>% mutate(date = date -1)) %>% mutate(join = 'day_minus_1')

matchups <- bind_rows(same_day, day_plus_one, day_minus_one) %>%
mutate(resid = wtemp_obs - temp_ls,
abs_error = abs(resid),
e5_flag = abs(wtemp_obs - temp_e5_mix))

matchups_filt <- matchups %>%
filter(temp_ls_qa < 3,
scene_cloud_cover < 20,
distance > 90,
temp_ls >=0)

rm(same_day, day_plus_one, day_minus_one)
```

## Of those, we have `r nrow(matchups)` coincident observations of RS and *in situ* temp (matchups).

Let's look at some *inclusive* metrics. General filters here are:

- +/- 1 day matchups

- 0 clouds/ice/cloud shadow over the 'deepest point'

```{r}
ggplot(matchups, aes(x = wtemp_obs, y = temp_ls)) + geom_hex() +
scale_fill_viridis_c(trans = 'log10') +
geom_smooth(method = 'lm') +
stat_poly_eq(formula = y~x,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE) +
labs(x = 'Observed Temps', y = 'Landsat Temps', title = 'Landsat and Observed')

matchups %>% summarise(rmse = rmse(wtemp_obs, temp_ls),
mae = mae(wtemp_obs,temp_ls),
smape = smape(wtemp_obs,temp_ls),
bias = bias(wtemp_obs, temp_ls)) %>%
mutate(across(everything(), ~round(.,4))) %>%
kableExtra::kable() %>%
kableExtra::kable_styling()

## One quick look at Era5, it's not great
ggplot(matchups, aes(x = wtemp_obs, y = temp_e5_mix)) + geom_hex() +
scale_fill_viridis_c(trans = 'log10') +
geom_smooth(method = 'lm', aes(color = 'Best Fit')) +
geom_abline(aes(slope=1, intercept=0,color = '1:1')) +
stat_poly_eq(formula = y~x,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE) +
labs(x = 'Observed Temps', y = 'Era5 Mixed Layer Temps', 'Era5 and Observed', titel='blue=Best Fit', 'Red=1:1')


matchups %>% filter(!is.na(temp_e5_mix)) %>%
summarise(rmse = rmse(wtemp_obs, temp_e5_mix),
mae = mae(wtemp_obs,temp_e5_mix),
smape = smape(wtemp_obs,temp_e5_mix),
bias = bias(wtemp_obs, temp_e5_mix)) %>%
mutate(across(everything(), ~round(.,4))) %>%
kableExtra::kable() %>%
kableExtra::kable_styling()
```

## Now the same but with *strict* filters. After qc filters we have `r nrow(matchups_filt)` matchups.

General filters here are:

- +/- 1 day matchups

- 0 clouds/ice/cloud shadow over the 'deepest point'

- Landsat uncertainty \< 3 degrees

- Scene cloud cover \< 20% - Distance to shore \> 90m

```{r}
matchups_filt <- matchups %>%
filter(temp_ls_qa < 3,
scene_cloud_cover < 10,
distance > 200,
temp_ls >=0,
wtemp_obs >=0,
e5_flag < 5,
join != 'day_minus_1'
)

plotter <- function(var){
matchups %>%
mutate(bins= cut_number(get(var), 5)) %>%
ggplot(aes(x = bins, y = abs_error)) +
geom_violin(draw_quantiles = c(.25,.5,.75)) +
geom_hline(aes(yintercept = 0), color = 'red')+
labs(x = var)
}

plotter('areakm')
plotter('scene_cloud_cover')
plotter('e5_flag')
plotter('distance')
plotter('temp_ls_qa')

matchups_filt <- matchups %>%
filter(areakm > .2,
temp_ls_qa < 3,
scene_cloud_cover < 20,
distance > 250,
temp_ls >= 0,
wtemp_obs >=0,
e5_flag < 5,
join != 'day_minus_1'
)

ggplot(matchups_filt, aes(x = wtemp_obs, y = temp_ls)) + geom_hex() +
scale_fill_viridis_c(trans = 'log10') +
geom_smooth(method = 'lm') +
stat_poly_eq(formula = y~x,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE) +
labs(x = 'Observed Temps', y = 'Landsat Temps', title = 'Landsat and Observed')

matchups_filt %>% summarise(rmse = rmse(wtemp_obs, temp_ls),
mae = mae(wtemp_obs,temp_ls),
smape = smape(wtemp_obs,temp_ls),
bias = bias(wtemp_obs, temp_ls)) %>%
mutate(across(everything(), ~round(.,4))) %>%
kableExtra::kable()

check <- matchups_filt %>% mutate(se = (wtemp_obs - temp_ls)^2)
```

## Finally, just big lakes where we would expect to be most confident (distance to shore \> 1000m)

```{r}
ggplot(matchups_filt %>% filter(distance > 1000), aes(x = wtemp_obs, y = temp_e5_mix)) + geom_hex() +
scale_fill_viridis_c(trans = 'log10') +
geom_smooth(method = 'lm') +
stat_poly_eq(formula = y~x,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE) +
labs(x = 'Observed Temps', y = 'Landsat Temps', title = 'Landsat and Observed')

matchups_filt %>% filter(distance > 1000) %>%
summarise(rmse = rmse(wtemp_obs, temp_ls),
mae = mae(wtemp_obs,temp_ls),
smape = smape(wtemp_obs,temp_ls),
bias = bias(wtemp_obs, temp_ls)) %>%
mutate(across(everything(), ~round(.,4))) %>%
kableExtra::kable() %>%
kableExtra::kable_styling()
```

## Sliced and diced a couple ways

```{r}
lake_data <- read_csv('data/in/lake_metadata.csv')

matchups <- matchups %>% left_join(lake_data) %>%
mutate(residual = wtemp_obs - temp_ls,
abs_error = abs(residual))

ggplot(matchups, aes(x = residual, fill = sat)) +
geom_density(alpha = .4) +
geom_vline(aes(xintercept = 0), color = 'red') +
labs(title = 'Residuals across sensors')

ggplot(matchups, aes(x = residual, fill = join)) +
geom_density(alpha = .4) +
geom_vline(aes(xintercept = 0), color = 'red') +
labs(title = 'Residuals across sensors')

matchups %>%
mutate(dist_group = cut_number(distance, 6)) %>%
ggplot(aes(x = dist_group, y = residual)) +
geom_violin(draw_quantiles = .5) +
labs(title = 'Residuals binned by distance to shore')


## Spatially
matchups.sf <- matchups %>% st_as_sf(coords = c('lake_lon_deg','lake_lat_deg'), crs = 4326) %>% st_transform(5070)

usa <- maps::map('usa', plot = F) %>% st_as_sf() %>% st_transform(5070)

grid <- st_make_grid(usa, cellsize = c(200000,200000), square = F) %>% st_as_sf() %>% mutate(ID = row_number())

rmse.sf <- matchups.sf %>% st_join(grid) %>% st_set_geometry(NULL) %>%
group_by(ID) %>%
summarise(rmse = rmse(wtemp_obs, temp_ls),
matchup.count = n()) %>%
inner_join(grid) %>%
st_as_sf()

ggplot(rmse.sf) + geom_sf(aes(fill = rmse)) +
scale_fill_viridis_c() +
geom_sf(data = usa, fill = 'transparent', color = 'black') +
theme_minimal() +
labs(title = 'Spatial Error Distribution')

ggplot(rmse.sf) + geom_sf(aes(fill = matchup.count)) +
scale_fill_viridis_c(trans='log10') +
geom_sf(data = usa, fill = 'transparent', color = 'black') +
theme_minimal() +
labs(title = 'Matchup Count')

```

## Take a look at within scene variance.
```{r}
summary_in <- matchups_filt

scene_sd <- summary_in %>% group_by(LandsatID) %>%
summarise(count = n(),
resid_sd = sd(resid)) %>%
filter(count >5)

overall_sd <- summary_in %>% filter(LandsatID %in% scene_sd$LandsatID) %>%
summarise(overall = sd(resid)) %>% .[[1]]

ggplot(scene_sd, aes(x = resid_sd)) + geom_histogram() + geom_vline(aes(xintercept = overall_sd, color = 'Overall residual SD'))

scene_counts <- dplyr::count(summary_in, LandsatID) %>% filter(n > 5)

scene_bias <- summary_in %>%
filter(LandsatID %in% scene_counts$LandsatID) %>%
group_by(LandsatID) %>%
summarise(bias = bias(wtemp_obs, temp_ls))

scene_stats <- summary_in %>%
filter(LandsatID %in% scene_counts$LandsatID) %>%
left_join(scene_bias) %>%
mutate(temp_no_bias = temp_ls+bias) %>%
group_by(LandsatID) %>%
summarise(rmse_bias_removed = rmse(wtemp_obs, temp_no_bias),
scene_bias = mean(bias))

overall_bias = summary_in %>% filter(LandsatID %in% scene_counts$LandsatID) %>%
summarise(bias = bias(wtemp_obs, temp_ls)) %>%
.[[1]]

overall_rmse = summary_in %>% filter(LandsatID %in% scene_counts$LandsatID) %>%
mutate(temp_ls = temp_ls + overall_bias) %>%
summarise(rmse = rmse(wtemp_obs,temp_ls))
rmses <- tibble(overall_rmse_bias_removed = overall_rmse[[1]], median_scene_rmse_bias_removed = median(scene_stats$rmse_bias_removed)) %>%
pivot_longer(everything(), names_to = "RMSE_Summaries")

ggplot(scene_stats, aes(x = rmse_bias_removed)) + geom_histogram() +
geom_vline(data = rmses, aes(xintercept = value, color = RMSE_Summaries)) +
labs(x = "Distribution of image specific rmse after removing image bias")
```


```{r}
mendota <- read_csv('data/in/Mendota_Era5Land_1984_2021.csv')
mendotaObs <- obs %>% filter(site_id == "nhdhr_143249470")

mendota <- mendota %>% mutate(lake_mix_layer_temperature = lake_mix_layer_temperature - 273.15,
date = as.Date(date)) %>%
inner_join(mendotaObs)

library(plotly)

ggplotly(
ggplot(mendota, aes(x=date)) +
geom_point(aes(y = lake_mix_layer_temperature, color='Era5')) +
geom_point(aes(y=wtemp_obs, color = 'Obs')) +
geom_line(aes(y = lake_mix_layer_temperature, color='Era5')) +
geom_line(aes(y=wtemp_obs, color = 'Obs')))

ggplot(mendota, aes(x=lake_mix_layer_temperature)) + geom_histogram()

ggplot


mendota %>% rename(temp_e5_mix = lake_mix_layer_temperature) %>%
summarise(rmse = rmse(wtemp_obs, temp_e5_mix),
mae = mae(wtemp_obs,temp_e5_mix),
smape = smape(wtemp_obs,temp_e5_mix),
bias = bias(wtemp_obs, temp_e5_mix)) %>%
mutate(across(everything(), ~round(.,4))) %>%
kableExtra::kable() %>%
kableExtra::kable_styling()

mendota %>% rename(temp_e5_mix = lake_mix_layer_temperature) %>%
ggplot(.,aes(x = wtemp_obs, y = temp_e5_mix)) + geom_hex() +
scale_fill_viridis_c(trans = 'log10') +
geom_smooth(method = 'lm', aes(color = 'Best Fit')) +
geom_abline(aes(slope=1, intercept=0,color = '1:1')) +
stat_poly_eq(formula = y~x,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),
parse = TRUE) +
labs(x = 'Observed Temps', y = 'Era5 Mixed Layer Temps', 'Era5 and Observed', titel='blue=Best Fit', 'Red=1:1')
```

Loading