ConservationInternational · jmaccarthy · Feb 28, 2014 · Apr 28, 2014 · Apr 28, 2014 · May 1, 2014
diff --git a/clim_functions b/clim_functions
@@ -0,0 +1,255 @@
+require(stringr)
+require(lubridate)
+require(compiler)
+require(ggplot2)
+
+#Performs range test on relative humidity based on Estevez et al. (2011)####
+#RH acceptable range = 0-100%
+f.rh.r1 <- function(data) {
+  #Vectorize and format the relative humidity data
+  rh.1 <- as.numeric(data$Sensor1_RelativeHumidity)
+  rh.2 <- as.numeric(data$Sensor2_RelativeHumidity)
+  rh.3 <- as.numeric(data$Sensor3_RelativeHumidity)
+  #Index rows that fail the first/only range test for relative humidity
+  rh.1.fail.r1 <- which(rh.1 < 0 | rh.1 > 100)
+  rh.2.fail.r1 <- which(rh.2 < 0 | rh.2 > 100)
+  rh.3.fail.r1 <- which(rh.3 < 0 | rh.3 > 100)
+  #Create a data frame with results of the test
+  final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation,
+                         TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rh.1 = rh.1,
+                         rh.1.rangeTest = NA, rh.2 = rh.2, rh.2.rangeTest = NA, rh.3 = rh.3, rh.3.rangeTest = NA)
+  #Use the failed test index to store the error code
+  final.df$rh.1.rangeTest[rh.1.fail.r1] <- 'R1'
+  final.df$rh.2.rangeTest[rh.2.fail.r1] <- 'R1'
+  final.df$rh.3.rangeTest[rh.3.fail.r1] <- 'R1'
+  #Replace NAs with a blank
+  final.df$rh.1.rangeTest[is.na(final.df$rh.1.rangeTest)] <- ' '
+  final.df$rh.2.rangeTest[is.na(final.df$rh.2.rangeTest)] <- ' '
+  final.df$rh.3.rangeTest[is.na(final.df$rh.3.rangeTest)] <- ' '
+  return(final.df)
+}
+
+#Performs first range test on temperature based on Estevez et al. (2011)####
+#Acceptable values fall between -30 and 50; revisit these numbers (Africa max high is 55--WMO)
+f.temp.r1 <- function(data) {
+  #Vectorize and format the temperature data
+  temp.1 <- as.numeric(data$Sensor1_AvgTemp)
+  temp.2 <- as.numeric(data$Sensor2_AvgTemp)
+  temp.3 <- as.numeric(data$Sensor3_AvgTemp)
+  #Index rows that have failed the range test for temperature
+  temp.1.fail.r1 <- which(temp.1 < -30 | temp.1 > 50)
+  temp.2.fail.r1 <- which(temp.2 < -30 | temp.2 > 50)
+  temp.3.fail.r1 <- which(temp.3 < -30 | temp.3 > 50)
+  #Create a data frame with results of the test
+  final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation,
+                         TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, temp.1 = temp.1,
+                         temp.1.rangeTest = NA, temp.2 = temp.2, temp.2.rangeTest = NA, temp.3 = temp.3,
+                         temp.3.rangeTest = NA)
+  #Use the failed test index to store the error code
+  final.df$temp.1.rangeTest[temp.1.fail.r1] <- 'R1'
+  final.df$temp.2.rangeTest[temp.2.fail.r1] <- 'R1'
+  final.df$temp.3.rangeTest[temp.3.fail.r1] <- 'R1'
+  #Replace NAs with a blank
+  final.df$temp.1.rangeTest[is.na(final.df$temp.1.rangeTest)] <- ' '
+  final.df$temp.2.rangeTest[is.na(final.df$temp.2.rangeTest)] <- ' '
+  final.df$temp.3.rangeTest[is.na(final.df$temp.3.rangeTest)] <- ' '
+  return(final.df)
+}
+
+#Performs second range test on temperature based on Estevez et al. (2011)--UNDER CONSTRUCTION####
+#Need to find an accurate list of country max/min temperatures; ideally long-term stations close to TEAM station
+
+#Performs first range test on rainfall based on Estevez et al. (2011)####
+#Acceptable values fall between 0 and 250 mm over a half hour period
+#NEED TO PASTE TIME AND DATE (JUST DATE NOW) PRIOR TO JUNE 2010; FOR VB AT LEAST
+f.rain.r1 <- function(data) {
+  #Vectorize and format the rain data
+  rain <- as.numeric(data$Rainfall)
+  #Format observation date as POSIX
+  data$Observation <- ymd_hms(data$Observation)
+  #Store the first day of the data (where to start the interval)
+  start.date <- min(data$Observation) #ymd('2010-07-01')
+  #Create a vector to store the index info for failed tests
+  rain.fail.r1 <- vector()
+  #Create a half hour interval window to select data during that period
+  int <- new_interval(start = start.date, end = start.date + minutes(30))
+  #For each record use the interval to index data and then sum rainfall measurements
+  for(i in 1:10000) {
+    #If data is missing for a particular interval, shift the interval by five minutes and start again
+    #at next record
+    if(length(which(data$Observation %within% int)) == 0) {
+      int <- int_shift(int = int, by = minutes(5))
+    } else {
+      #If the interval exists in the data, sum the data over the interval
+      half.hr.sum <- sum(rain[which(data$Observation %within% int)])
+      #If the sum of the interval is greater than 250 or less than 0, store the row number in the index
+      if(half.hr.sum > 250 | half.hr.sum < 0) {
+        rain.fail.r1[i] <- which(data$Observation %within% int)[1]
+      }
+      #After storing the row number, shift the interval by five minutes and start again at next record
+      int <- int_shift(int = int, by = minutes(5))
+    }
+  }
+  #Create a data frame with results of the test
+  final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation,
+                         TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rain = rain,
+                         rain.rangeTest = NA)
+  #Remove NAs that were introduced to the failed test index
+  rain.fail.r1 <- rain.fail.r1[!is.na(rain.fail.r1)]
+  #Use the failed test index to store the error code
+  final.df$rain.rangeTest[rain.fail.r1] <- 'R1'
+  #Replace NAs with a blank
+  final.df$rain.rangeTest[is.na(final.df$rain.rangeTest)] <- ' '
+  return(final.df)
+}
+
+#Performs second range test on rainfall based on Estevez et al. (2011)####
+#Acceptable values fall between 0 and 500 mm
+f.rain.r2 <- function(data) {
+  data$Observation <- ymd_hms(data$Observation)
+  #Vectorize and format rain data
+  rain <- as.numeric(data$Rainfall)
+  #Grab just the date of each record
+  TempDay <- floor_date(data$Observation, 'day')
+  TempDay <- ymd(TempDay)
+  #Generate daily summaries
+  Daily.Climate.Data <- data.frame(Rain = tapply(rain, TempDay, sum))
+  #Index the rows in which a day had more than 500 mm or less than 0 mm of rain
+  rain.fail.r2 <- as.vector(which(Daily.Climate.Data$Rain > 500 | Daily.Climate.Data$Rain < 0))
+  #Get the dates from the index number to flag the results
+  if(length(rain.fail.r2) != 0) {
+    dates.fail <- ymd(row.names(Daily.Climate.Data)[rain.fail.r2])
+    #Creates an index again to store the dates in the data that match the days that
+    #failed in the previous index
+    rain.fail.r2 <- which(TempDay %in% dates.fail)
+  }
+  #Create a data frame with results of the test
+  final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation,
+                         TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rain = rain,
+                         rain.rangeTest = NA)
+  #Use the failed test index to store the error code
+  final.df$rain.rangeTest[rain.fail.r2] <- 'R2'
+  #Replace NAs with a blank
+  final.df$rain.rangeTest[is.na(final.df$rain.rangeTest)] <- ' '
+  return(final.df)
+}
+
+#Performs first persistence test for temperature####
+f.temp.rh.p1 <- function(data) {
+  data$Observation <- ymd_hms(data$Observation)
+  #Vectorize and format the temperature data
+  temp.1 <- as.numeric(data$Sensor1_AvgTemp)
+  temp.2 <- as.numeric(data$Sensor2_AvgTemp)
+  temp.3 <- as.numeric(data$Sensor3_AvgTemp)
+  rh.1 <- as.numeric(data$Sensor1_RelativeHumidity)
+  rh.2 <- as.numeric(data$Sensor2_RelativeHumidity)
+  rh.3 <- as.numeric(data$Sensor3_RelativeHumidity)
+  #Create a list of variables
+  var.list <- list(temp.3, rh.1, rh.2, rh.3)
+  #Run the persistence function to generate index of failed records for:
+  #first record (with data), and 1 and 2 days prior to current record
+  #See f.persistence function for the real magic; setup with data set
+  #and variable of interest
+  #Takes roughly 8 hours per sensor - should be able to speed this up
+  temp.1.fail.p1 <- f.persistence(data, temp.1)
+  #Start @ 11:49 AM
+  temp.2.fail.p1 <- f.persistence(data, temp.2)
+  temp.3.fail.p1 <- f.persistence(data, temp.3)
+  #Create a data frame with results of the test
+  final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation,
+                         TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, temp.1 = temp.1,
+                         temp.1.perTest = NA, temp.2 = temp.2, temp.2.perTest = NA, temp.3 = temp.3,
+                         temp.3.perTest = NA)
+  #Use the failed test index to store the error code
+  final.df$temp.1.perTest[temp.1.fail.p1] <- 'P1'
+  final.df$temp.2.perTest[temp.2.fail.p1] <- 'P1'
+  final.df$temp.3.perTest[temp.3.fail.p1] <- 'P1'
+  #Replace NAs with a blank
+  final.df$temp.1.perTest[is.na(final.df$temp.1.perTest)] <- ' '
+  final.df$temp.2.perTest[is.na(final.df$temp.2.perTest)] <- ' '
+  final.df$temp.3.perTest[is.na(final.df$temp.3.perTest)] <- ' '
+  return(final.df)
+}
+
+#This function creates an index to compare records to exactly 1 and 2 days prior to the
+#current record and then returns an index of records that fail the test
+#All three records will be flagged if they are the same value
+#Must use a valid data set and variable
+f.persistence <- function(data, var) {
+  #Find the date (and time) of the first record in the data set
+  this.date <- which(data$Observation == min(data$Observation))
+  #Get the index for a 2 day delayed start to make the for loop work properly
+  delayed.start <- which(data$Observation == data$Observation[this.date] + days(2))
+  #If there is no data for the first record, index the first record that has data
+  delayed.start <- ifelse(test=delayed.start < which(!is.na(var))[1],
+                          yes=which(!is.na(var))[1],
+                          no=delayed.start)
+  #Get the index for a 2 day delayed start from the first record that has data
+  #to make the for loop work properly (155-163 CAN PROBABLY BE CLEANED UP)
+  delayed.start <- which(data$Observation == data$Observation[delayed.start] + days(2))
+  #Create an empty list to store the index values for records that fail the test
+  fail.list <- list()
+  #Starts at the earliest record with data and fills list with records that fail test
+  for(i in delayed.start:nrow(data)) {
+    #Index the date and time of the current record
+    this.date <- which(data$Observation == data$Observation[i])
+    #index the date and time of the record exactly 1 and 2 days prior to current record 
+    day.before <- which(data$Observation == data$Observation[i] - days(1))
+    days2.before <- which(data$Observation == data$Observation[i] - days(2))
+    #If the variable for the current record is equal to the value exactly 1 and 2 days
+    #before the current record, index all three rows
+    if(length(var[i] == var[day.before] | var[i] == days2.before ) == 0) {
+      this.date <- this.date
+    } else if(is.na(var[i]) | is.na(var[day.before]) | is.na(var[days2.before])) {
+      this.date <- this.date
+    } else if(var[i] == var[day.before] & var[i] == var[days2.before])  {
+      fail.list[[i]] <- c(this.date, day.before, days2.before)
+    }
+  }
+  #Convert the list index for failed records into a vector index and return
+  fail.index <- unlist(fail.list)
+  return(fail.index)
+}
+
+#Takes list of variables and runs persistence test on all
+f.persistence.list <- function(data, var.list) {
+  #Find the date (and time) of the first record in the data set
+  this.date <- which(data$Observation == min(data$Observation))
+  #Get the index for a 2 day delayed start to make the for loop work properly
+  for (j in 1:length(var.list)) {
+    delayed.start <- vector()
+    delayed.start[j] <- which(data$Observation == data$Observation[this.date] + days(2))
+    #If there is no data for the first record, index the first record that has data
+    delayed.start[j] <- ifelse(test=delayed.start < which(!is.na(var.list[[j]]))[1],
+                            yes=which(!is.na(var.list[[j]]))[1],
+                            no=delayed.start)
+    #Get the index for a 2 day delayed start from the first record that has data
+    #to make the for loop work properly (155-163 CAN PROBABLY BE CLEANED UP)
+    delayed.start[j] <- which(data$Observation == data$Observation[delayed.start] + days(2))
+  }
+  #Create an empty list to store the index values for records that fail the test
+  fail.list <- list()
+  #Starts at the earliest record with data and fills list with records that fail test
+  for(i in delayed.start:nrow(data)) {
+    #Index the date and time of the current record
+    this.date <- which(data$Observation == data$Observation[i])
+    #index the date and time of the record exactly 1 and 2 days prior to current record 
+    day.before <- which(data$Observation == data$Observation[i] - days(1))
+    days2.before <- which(data$Observation == data$Observation[i] - days(2))
+    #If the variable for the current record is equal to the value exactly 1 and 2 days
+    #before the current record, index all three rows
+    if(is.na(var[i]) | is.na(var[day.before]) | is.na(var[days2.before])) {
+      this.date <- this.date
+    }
+    if(length(var[i] == var[day.before] & var[i] == days2.before ) == 0) {
+      this.date <- this.date
+    } else {
+      if(var[i] == var[day.before] & var[i] == var[days2.before])  {
+        fail.list[[i]] <- c(this.date, day.before, days2.before)
+      }
+    }
+  }
+  #Convert the list index for failed records into a vector index and return
+  fail.index <- unlist(fail.list)
+  return(fail.index)
+}