MackenzieConnectivity/4_trendAnalysis.Rmd at main · whyana/MackenzieConnectivity · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
---
title: "4_trendAnalysis"
output: html_document
date: "2023-03-13"
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
# Set the default for every chunk in this document: echo = TRUE.
knitr::opts_chunk$set(echo = TRUE)
```

# Libraries
```{r}
library(tidyverse)
library(sf)
library(lubridate)
library(grDevices)
library(mapview)
library(extrafont)
library(ggpubr)
library(ggmap)
library(RgoogleMaps)
library(broom)
library(feather)
library(tidyhydat)
library(sp)
library(data.table)
library(ggalluvial)
library(patchwork)
library(magick)
library(units)
library(Kendall)
library(ggspatial)
library(dtplyr)
#Import libraries for Random Forest
library(caret)
library(e1071)
library(Boruta)
library(tidymodels)
library(skimr)
library(vip)
```

# Import files / set constants
```{r}
# Version stamp (first data-join phase); it is pasted into the name of the
# classification file that is read further down.
todayDate <- "20230324"

# Directories: intermediate working data and exported images.
int.wd <- "~/WRR Submission 2 Data/Script 4"
images.wd <- "~/images"

# Shapefile names, resolved relative to int.wd: lake polygons and the SWORD
# reach file.
lakes.shapeFile <- "mackenzieGoodLakes.shp"
import.sword <- "na_sword_reaches_hb82_v14.shp"

# Read the lake polygons from the intermediate working directory.
setwd(int.wd)
lakes.sf <- st_read(lakes.shapeFile)
```

# Import river centerlines and set the projection for all future plots, import classifications
```{r}
# Projection string shared by all later plots (PROJ "tcea", central meridian
# at -134.3847656, WGS84 datum, metres).
crs.plot <- "+proj=tcea +lon_0=-134.3847656 +datum=WGS84 +units=m +no_defs"

# All files below are read relative to the intermediate working directory.
setwd(int.wd)

# Study area: bounding box of four lon/lat corner points (EPSG:4326),
# reprojected to the plotting CRS. Only the bounding box is kept, so the
# extent is set by the min/max of the listed coordinates.
study.area.large <- data.frame(
  lon = c(-136.80, -136.80, -133.47, -133.47),
  lat = c(67.25, 69.55, 69.55, 67.46)
) %>%
  st_as_sf(coords = c("lon", "lat")) %>%
  st_set_crs(4326) %>%
  st_bbox() %>%
  st_as_sfc() %>%
  st_transform(crs = crs.plot)

# River reaches: reproject, clip to the study area, keep reaches whose
# `width` attribute exceeds 90.
mack.basin.large <- st_read(import.sword) %>%
  st_transform(crs = crs.plot) %>%
  st_intersection(study.area.large) %>%
  dplyr::filter(width > 90)

# Lake classifications written by an earlier script, identified by the
# version stamp in the file name.
all.classified.filter <- read_feather(
  paste0("final.class_", todayDate, ".feather")
)
```

# Trend analysis for calibrated reflected
## Calculate peak discharge entering the delta, then filter the connectivity classification data to the month (four weeks) after peak discharge each year.
```{r}
# Get yearly timing of peak flow at Arctic Red River (station 10LC014):
# daily flows from 1973 onward, tagged with day of year, month and year.
complete.flows <- hy_daily_flows(
  station_number = c("10LC014"),
  start_date = "1973-01-01"
) %>%
  mutate(
    doy = yday(Date),
    month = month(Date),
    year = year(Date)
  )

# Freshet initiation, following https://doi.org/10.1002/2012WR013198:
# for each year from 1984 on, the first day after day-of-year 31 on which
# discharge rose by at least 3 % of that day's value over the previous row.
freshet.initiation <- complete.flows %>%
  arrange(Date) %>%
  filter(year >= 1984) %>%
  mutate(
    lag.value = lag(Value, n = 1),
    diff = Value - lag.value,
    three.pct = Value * 0.03,
    thresh.tf = diff >= three.pct
  ) %>%
  filter(thresh.tf == TRUE & doy > 31) %>%
  group_by(year) %>%
  slice(1) %>%
  ungroup() %>%
  select(year, freshet.in = doy)

# First discharge peak of each year: the first day after freshet initiation
# on which flow is at least 10000 and lower than on the previous row.
first.peak <- complete.flows %>%
  filter(STATION_NUMBER == "10LC014") %>%
  arrange(Date) %>%
  # The day-over-day change must be computed BEFORE the pre-freshet rows are
  # dropped; otherwise the first post-freshet day of each year would be
  # differenced against the last retained row of the previous year.
  mutate(diff = Value - lag(Value, n = 1)) %>%
  left_join(freshet.initiation, by = "year") %>%
  # Years without a freshet date (NA) are dropped by this comparison.
  filter(doy > freshet.in) %>%
  filter(diff < 0 & Value >= 10000) %>%
  group_by(year) %>%
  slice(1) %>%
  ungroup() %>%
  select(year, first.peak = doy, peak.value = Value)

# Keep lake classifications observed from the day of the first discharge peak
# through 28 days after it, and label each observation with its period.
# Years without a detected peak, and years outside 1984-2019, are dropped.
filt.obs <- all.classified.filter %>%
  left_join(first.peak, by = "year") %>%
  filter(
    !is.na(first.peak),
    doy >= first.peak & doy <= first.peak + 28
  ) %>%
  select(OBJECTID, .pred_class, date, year, month, doy, first.peak) %>%
  mutate(
    yeargroup = case_when(
      year >= 1984 & year <= 2001 ~ "1984-2001",
      year >= 2002 & year <= 2019 ~ "2002-2019"
    )
  ) %>%
  filter(!is.na(yeargroup))

# Mean predicted class per lake and year (post-peak window), by period.
results.yeargroup <- filt.obs %>%
  group_by(OBJECTID, year, yeargroup) %>%
  summarise(
    class.mean = mean(as.numeric(as.character(.pred_class)), na.rm = TRUE),
    count = n(),
    .groups = "drop"
  )

# The same per-lake, per-year means, labelled as the entire period ("all").
results.all <- filt.obs %>%
  group_by(OBJECTID, year) %>%
  summarise(
    class.mean = mean(as.numeric(as.character(.pred_class)), na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(yeargroup = "all")

# Stack the per-period rows and the entire-period rows.
results.summary <- rbind.data.frame(
  results.yeargroup %>% as_tibble(),
  results.all %>% as_tibble()
)

# Number of years with data per lake and period; keep lake/period pairs
# (two sub-periods only) that have at least 10 years.
good.ids <- results.summary %>%
  count(OBJECTID, yeargroup) %>%
  filter(yeargroup %in% c("1984-2001", "2002-2019") & n >= 10)

# Lakes that reach the 10-year threshold in BOTH sub-periods.
best.ids <- good.ids %>%
  count(OBJECTID) %>%
  filter(n == 2)
best.ids

# Restrict to those lakes (the join also carries the `n` column along) and
# nest the yearly rows by lake and period.
nested.data <- results.summary %>%
  inner_join(best.ids, by = c("OBJECTID")) %>%
  group_by(OBJECTID, yeargroup) %>%
  nest() %>%
  ungroup() %>%
  as_tibble()

```

## Calculate trends using the Mann-Kendall test
```{r}
## For each lake and period, calculate the Mann-Kendall trend (tau) and
## p-value. If a single class.mean value accounts for at least 95 % of the
## years (almost no variability in connectivity) we assume there is no trend:
## no test is run and the series is labelled by the range that value is in.
## Result: `row.combo`, one row per lake/period with columns
## OBJECTID, yeargroup, class, pval, S, tau (the last three NA when untested).
row.list <- vector("list", nrow(nested.data))
for (i in seq_len(nrow(nested.data))) {
  print(i)
  dat <- nested.data$data[[i]] %>% arrange(year)
  n.obs <- nrow(dat)

  # Share of years taken by each distinct class.mean value.
  obs.count <- dat %>%
    group_by(class.mean) %>%
    count() %>%
    ungroup() %>%
    mutate(
      all.obs = n.obs,
      pct = n / n.obs
    )
  value <- obs.count$class.mean
  share <- obs.count$pct

  # isTRUE() is only TRUE for a length-one TRUE, so each test passes only
  # when exactly one distinct value lies in the range and its share >= 0.95.
  # The middle range needs `&`: with `|` every value matches, and a lake with
  # one constant value above 1.33 was labelled "always 0.66-1.33".
  if (isTRUE(share[value <= 0.66] >= 0.95)) {
    trend.class <- "always less than 0.66"
  } else if (isTRUE(share[value > 0.66 & value <= 1.33] >= 0.95)) {
    trend.class <- "always 0.66-1.33"
  } else if (isTRUE(share[value > 1.33] >= 0.95)) {
    trend.class <- "always >1.33"
  } else {
    trend.class <- "trendtest"
  }

  if (trend.class == "trendtest") {
    test.obj <- MannKendall(dat$class.mean)
    # `[[1]]` keeps the bare number and drops any attributes on the result.
    S <- test.obj$S[[1]]
    tau <- test.obj$tau[[1]]
    pval <- test.obj$sl[[1]]
  } else {
    S <- NA_real_
    tau <- NA_real_
    pval <- NA_real_
  }

  # Collect rows in a preallocated list instead of growing a data frame.
  row.list[[i]] <- data.frame(
    OBJECTID = nested.data$OBJECTID[[i]],
    yeargroup = nested.data$yeargroup[[i]],
    class = trend.class,
    pval = pval,
    S = S,
    tau = tau,
    stringsAsFactors = FALSE
  )
}
row.combo <- bind_rows(row.list)

## Print for the table in the manuscript (table 2)
# Label every lake/period row, then count lakes per label with one column per
# period. Significance threshold is p < 0.05; the "no monotonic trend" test
# uses >= so that p == 0.05 is labelled rather than left as NA.
trend.summary <- row.combo %>%
  as_tibble() %>%
  mutate(
    trend = case_when(
      tau > 0 & pval < 0.05 ~ "increasing sig. connectivity trend",
      tau < 0 & pval < 0.05 ~ "decreasing sig. connectivity trend",
      pval >= 0.05 ~ "no monotonic trend",
      is.na(tau) & class == "always less than 0.66" ~ "always less than 0.66",
      is.na(tau) & class == "always 0.66-1.33" ~ "always 0.66-1.33",
      is.na(tau) & class == "always >1.33" ~ "always >1.33"
    )
  ) %>%
  count(yeargroup, trend) %>%
  pivot_wider(names_from = yeargroup, values_from = n) %>%
  arrange(trend)

# Rows for lakes on which the trend test was actually run.
trend.summary %>%
  filter(
    trend %in% c(
      "no monotonic trend",
      "decreasing sig. connectivity trend",
      "increasing sig. connectivity trend"
    )
  )

# Print total lakes in each group
colSums(
  trend.summary %>% select(`1984-2001`, `2002-2019`, `all`),
  na.rm = TRUE
)

```