CitSci/01_height_munge.Rmd at master · GlobalHydrologyLab/CitSci · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
---
title: "01_height_munge"
output: html_document
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
# tidyverse: read_csv()/write_csv(), the dplyr verbs, %>% and purrr's
# map_dfr() used in the chunks below
library(tidyverse)
# lubridate: mdy(), used to parse the Wisconsin `Date` column
library(lubridate)

# Show each chunk's source code in the knitted document by default
knitr::opts_chunk$set(echo = TRUE)
```

## First take the raw heights and standardize lakes with multiple gauges based on the GPS readings

```{r cars}

## Pull in the raw heights. The code below relies on the `gauge_id` and
## `height` columns of this file.
heights.raw <- read_csv('data/in/LOCSS_raw_heights.csv')

## Standardize lakes with multiple gauges based on preliminary GPS data.
## Within each lake one gauge is left untouched (the reference) and every
## other gauge is shifted by (reference GPS value - its own GPS value), so
## that all gauges on a lake share the reference gauge's datum.
gauge.offsets <- c(
  # Waccamaw -- reference ZZN2/NC1009 : GPS 2.24
  DMN2 = 2.24 - 2.18,   # DMN2 : 2.18
  HCN2 = 2.24 - 2.18,   # HCN2 : GPS 2.18
  WCN2 = 2.24 - 1.77,   # WCN2 : GPS 1.77
  # Phelps -- reference FDN2 : GPS 2.59
  PHN2 = 2.59 - 11.79,  # PHN2 : GPS 11.79
  # White -- reference NWN2 : GPS 1.88
  WHN2 = 1.88 - 1.18,   # WHN2 : GPS 1.18
  # Lawrence -- reference LAW2 : GPS 0.4
  XAW2 = 0.4 - 0.38     # XAW2 : GPS 0.38
)

## Look up each reading's offset in one vectorised pass. match() returns NA
## for gauges without an offset *and* for missing gauge_ids, so neither can
## leak an NA into the subscript of the assignment below (an NA subscript
## aborts a multi-element subassignment).
offset.idx <- match(heights.raw$gauge_id, names(gauge.offsets))
has.offset <- !is.na(offset.idx)

heights.raw$height[has.offset] <-
  heights.raw$height[has.offset] +
  unname(gauge.offsets[offset.idx[has.offset]])

```

## Pull in the data from Wisconsin and standardize its format to match LOCSS data

```{r}
## Pull in the Wisconsin data and reformat it
paths <- list.files('data/in/Wisc_raw_heights', full.names = TRUE)

## list.files() silently returns character(0) for a missing or empty
## directory; stop here with a clear message instead of failing further down
## on a column that does not exist.
if (length(paths) == 0) {
  stop("No files found in 'data/in/Wisc_raw_heights'", call. = FALSE)
}

## Read every file, stack the rows, and keep only the columns that match the
## LOCSS layout: name, date, height, region, name.std.
wisc.heights.raw <- map_dfr(paths, read_csv) %>%
  transmute(
    ## Two duplicate names between Wisconsin and Washington, make explicit
    name = case_when(
      Lake_Name == 'Deep Lake' ~ 'Deep Lake Wisc.',
      Lake_Name == 'Phantom Lake' ~ 'Phantom Lake Wisc.',
      TRUE ~ Lake_Name
    ),
    ## `Date` is parsed as month-day-year
    date = mdy(Date),
    height = Level,
    region = 'WI',
    ## The (disambiguated) lake name doubles as the standardized name
    name.std = name
  ) %>%
  ## Several readings for the same lake and day collapse to their mean
  group_by(name.std, name, date, region) %>%
  summarise(height = mean(height)) %>%
  ungroup()
```


## Make sure all the lakes/gauges have standardized names, then average first by gauge and then by lake to get one height measurement per lake per day

```{r}
## Join with standardized names
lakes <- read_csv('data/in/lake_properties.csv')

## Check for mismatches (each check overwrites `check`; inspect it
## interactively before running the next line)
## LOCSS readings whose gauge_id is absent from `lakes`, one row per name:
check <- heights.raw %>% filter(!gauge_id %in% lakes$gauge_id) %>%
  distinct(name, .keep_all = TRUE)
## There are 14, but I think these are international gauges
## Wisconsin rows with a missing `nSat` after joining `lakes` on the shared
## columns:
check <- wisc.heights.raw %>% left_join(lakes) %>% filter(is.na(nSat))

## Readings outside [min.height, max.height] are discarded before averaging.
## NOTE(review): the rationale for the 3.3 upper limit is not stated in this
## file -- confirm it is the intended plausibility range.
min.height <- 0
max.height <- 3.3

## 0.3048 is the foot-to-metre factor, so heights are presumably recorded in
## feet. NOTE(review): confirm this also holds for the Wisconsin `Level`
## values, which receive the same conversion.
ft.to.m <- 0.3048

## Average per gauge and day first, then across gauges, which leaves one
## height per lake per day.
heights.raw <- heights.raw %>%
  left_join(
    lakes %>%
      mutate(name.std = name) %>%
      select(name.std, gauge_id, region)
  ) %>%
  filter(height >= min.height,
         height <= max.height) %>%
  group_by(name.std, gauge_id, region, date) %>%
  summarise(height = mean(height)) %>%
  group_by(name.std, date, region) %>%
  summarise(height = mean(height)) %>%
  ungroup() %>%
  ## Drop rows without a region -- presumably the gauges that found no match
  ## in `lakes` during the join above.
  filter(!is.na(region))


## Stack LOCSS and Wisconsin heights (Wisconsin's extra `name` column is
## dropped so both share the same columns) and convert the heights.
heights.munged <- heights.raw %>%
  bind_rows(wisc.heights.raw %>% select(-name)) %>%
  mutate(height = height * ft.to.m)

## Check counts: number of daily heights per lake, fewest first
check <- heights.munged %>% group_by(name.std, region) %>%
  summarise(count = n()) %>%
  arrange(count)

## Make sure the output directory exists so the write cannot fail on a fresh
## checkout; this is a no-op when it is already there.
dir.create('data/out', showWarnings = FALSE, recursive = TRUE)
write_csv(heights.munged, 'data/out/heights_munged.csv')
```