From 36addd8bd9d4a3b9668a72714f7e08ad160be871 Mon Sep 17 00:00:00 2001 From: jmaccarthy Date: Fri, 28 Feb 2014 17:09:59 -0500 Subject: [PATCH 01/10] testing forked commits --- team_db_query.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/team_db_query.R b/team_db_query.R index 89c0589..f9a7bbc 100644 --- a/team_db_query.R +++ b/team_db_query.R @@ -1,6 +1,6 @@ # team_db_query.r # -#Hello! +#Hello! This is different from the master version. f.teamdb.query <- function(dataset) { # Database query function of TEAM production database. Returns a R Object for # each dataset and saves to the workspace. The function contains views or pre- From 1194443814e76ce0c1aaaa8243f8e7b0a7e58740 Mon Sep 17 00:00:00 2001 From: jmaccarthy Date: Mon, 28 Apr 2014 15:07:04 -0400 Subject: [PATCH 02/10] this file contains range test functions --- clim_functions | 125 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 clim_functions diff --git a/clim_functions b/clim_functions new file mode 100644 index 0000000..98dcd42 --- /dev/null +++ b/clim_functions @@ -0,0 +1,125 @@ +require(stringr) +require(lubridate) +require(compiler) +require(ggplot2) + +#Performs range test on relative humidity based on Estevez et al. (2011)#### +#RH acceptable range = 0-100% +f.rh.r1 <- function(data) { + #Vectorize and format the relative humidity data + rh.1 <- as.numeric(data$Sensor1_RelativeHumidity) + rh.2 <- as.numeric(data$Sensor2_RelativeHumidity) + rh.3 <- as.numeric(data$Sensor3_RelativeHumidity) + #Index rows that fail the first/only range test for relative humidity + rh.1.fail.r1 <- which(rh.1 < 0 | rh.1 > 100) + rh.2.fail.r1 <- which(rh.2 < 0 | rh.2 > 100) + rh.3.fail.r1 <- which(rh.3 < 0 | rh.3 > 100) + #Create a dataframe with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rh.1 = rh.1, + rh.1.rangeTest = NA, rh.2 = rh.2, rh.2.rangeTest = NA, rh.3 = rh.3, rh.3.rangeTest = NA) + #Use the failed test index to store the error code + final.df$rh.1.rangeTest[rh.1.fail.r1] <- 'R1' + final.df$rh.2.rangeTest[rh.2.fail.r1] <- 'R1' + final.df$rh.3.rangeTest[rh.3.fail.r1] <- 'R1' + #Replace NAs with a blank + final.df$rh.1.rangeTest[is.na(final.df$rh.1.rangeTest)] <- ' ' + final.df$rh.2.rangeTest[is.na(final.df$rh.2.rangeTest)] <- ' ' + final.df$rh.3.rangeTest[is.na(final.df$rh.3.rangeTest)] <- ' ' + return(final.df) +} + +#Performs first range test on temperature based on Estevez et al. (2011)#### +#Acceptable values fall between -30 and 50; revisit these numbers (Africa max high is 55--WMO) +f.temp.r1 <- function(data) { + #Vectorize and format the temperature data + temp.1 <- as.numeric(data$Sensor1_AvgTemp) + temp.2 <- as.numeric(data$Sensor2_AvgTemp) + temp.3 <- as.numeric(data$Sensor3_AvgTemp) + #Index rows that have failed the range test for temperature + temp.1.fail.r1 <- which(temp.1 < -30 | temp.1 > 50) + temp.2.fail.r1 <- which(temp.2 < -30 | temp.2 > 50) + temp.3.fail.r1 <- which(temp.3 < -30 | temp.3 > 50) + #Create a dataframe with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, temp.1 = temp.1, + temp.1.rangeTest = NA, temp.2 = temp.2, temp.2.rangeTest = NA, temp.3 = temp.3, + temp.3.rangeTest = NA) + #Use the failed test index to store the error code + final.df$temp.1.rangeTest[temp.1.fail.r1] <- 'R1' + final.df$temp.2.rangeTest[temp.2.fail.r1] <- 'R1' + final.df$temp.3.rangeTest[temp.3.fail.r1] <- 'R1' + #Replace NAs with a blank + final.df$temp.1.rangeTest[is.na(final.df$temp.1.rangeTest)] <- ' ' + final.df$temp.2.rangeTest[is.na(final.df$temp.2.rangeTest)] <- ' ' + final.df$temp.3.rangeTest[is.na(final.df$temp.3.rangeTest)] <- ' ' + return(final.df) +} + +#Performs second range test on temperature based on Estevez et al. (2011)--UNDER CONSTRUCTION#### +#Need to find an accurate list of country max/min temperatures; ideally long-term stations close to TEAM station + +#Performs first range test on rainfall based on Estevez et al. (2011)#### +#NEED TO PASTE TIME AND DATE (JUST DATE NOW) PRIOR TO JUNE 2010; FOR VB AT LEAST +f.rain.r1 <- function(data) { + #Vectorize and format the rain data + rain <- as.numeric(data$Rainfall) + #Format observation date as POSIX + data$Observation <- ymd_hms(data$Observation) + #Store the first day of the data (where to start the interval) + start.date <- min(data$Observation) #ymd('2010-07-01') + #Create a vector to store the index info for failed tests + rain.fail.r1 <- vector() + #Create a half hour interval window to select data during that period + int <- new_interval(start = start.date, end = start.date + minutes(30)) + #For each record use the interval to index data and then sum rainfall measurements + for(i in 1:10000) { + #If data is missing for a particular interval, shift the interval by five minutes and start again + #at next record + if(length(which(data$Observation %within% int)) == 0) { + int <- int_shift(int = int, by = minutes(5)) + } else { + #If the interval exists in the data, sum the data over the interval + half.hr.sum <- sum(rain[which(data$Observation %within% int)]) + #If the sum of the interval is greater than 250 or less than 0, store the row number in the index + if(half.hr.sum > 250 | half.hr.sum < 0) { + rain.fail.r1[i] <- which(data$Observation %within% int)[1] + } + #After storing the row number, shift the interval by five minutes and start again at next record + int <- int_shift(int = int, by = minutes(5)) + } + } + #Create a dataframe with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rain = rain, + rain.rangeTest = NA) + #Remove NAs that were introduced to the failed test index + rain.fail.r1 <- rain.fail.r1[!is.na(rain.fail.r1)] + #Use the failed test index to store the error code + final.df$rain.rangeTest[rain.fail.r1] <- 'R1' + #Replace NAs with a blank + final.df$rain.rangeTest[is.na(final.df$rain.rangeTest)] <- ' ' + return(final.df) +} + +#Performs second range test on rainfall based on Estevez et al. (2011)#### +f.rain.r2 <- function(data) { + data$Observation <- ymd_hms(data$Observation) + rain <- as.numeric(data$Rainfall) + TempDay <- floor_date(data$Observation, 'day') + Daily.Climate.Data <- data.frame(Rain = tapply(rain, TempDay, sum)) + rain.fail.r2 <- as.vector(which(Daily.Climate.Data$Rain > 500 | Daily.Climate.Data$Rain < 0)) + dates.fail <- ymd(row.names(Daily.Climate.Data)[rain.fail.r2]) + rain.fail.r2 <- which(TempDay %in% dates.fail) + #Create a dataframe with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rain = rain, + rain.rangeTest = NA) + #Use the failed test index to store the error code + final.df$rain.rangeTest[rain.fail.r2] <- 'R2' + #Replace NAs with a blank + final.df$rain.rangeTest[is.na(final.df$rain.rangeTest)] <- ' ' + return(final.df) +} +#head(str_split_fixed(data$Observation, pattern = ' ', 2)) +#strsplit(data$Observation, ' ') From ca08c2e6c0a7b52764f20c4ddcc29b4355436ee9 Mon Sep 17 00:00:00 2001 From: jmaccarthy Date: Mon, 28 Apr 2014 15:08:06 -0400 Subject: [PATCH 03/10] range test program --- rangeTest | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 rangeTest diff --git a/rangeTest b/rangeTest new file mode 100644 index 0000000..fd5cc99 --- /dev/null +++ b/rangeTest @@ -0,0 +1,27 @@ +#source('~/Jimmy_Code/teamcode/tools/team_db_query.R') + +#Load in the TEAM climate data +#cldata <- f.teamdb.query('climate') +#clmeta <- cldata$cl_temp_maxmin +#cldata <- cldata$cl_data + +load('cl_data_new2014-04-24.gzip') +vb.data.3.0 <- cl_data_all[which(cl_data_all$TEAMSiteName == unique(cl_data_all$TEAMSiteName)[3] & + cl_data_all$ProtocolVersion == 'Climate 3'), ] + +#Relative humidity range test 1 +rh.r1.test <- f.rh.r1(vb.data) + +#Temperature range test 1 +temp.r1.test <- f.temp.r1(vb.data) + +#NEED TO ADD TEMPERATURE RANGE TEST 2 + +#Rainfall range test 1 +system.time(rain.r1.test <- f.rain.r1(vb.data.3.0)) +#g<-cmpfun(f.rain.r1) +#compare <- microbenchmark(f.rain.r1(vb.data.3.0), g(vb.data.3.0), times = 1000) +#autoplot(compare) + +#Rainfall range test 2 +rain.r2.test From 9c9bc5a244d01c33dab627202d94991e9629f565 Mon Sep 17 00:00:00 2001 From: jmaccarthy Date: Wed, 30 Apr 2014 23:22:46 -0400 Subject: [PATCH 04/10] Update clim_functions adds persistence tests --- clim_functions | 85 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 6 deletions(-) diff --git a/clim_functions b/clim_functions index 98dcd42..412af14 100644 --- a/clim_functions +++ b/clim_functions @@ -14,7 +14,7 @@ f.rh.r1 <- function(data) { rh.1.fail.r1 <- which(rh.1 < 0 | rh.1 > 100) rh.2.fail.r1 <- which(rh.2 < 0 | rh.2 > 100) rh.3.fail.r1 <- which(rh.3 < 0 | rh.3 > 100) - #Create a dataframe with results of the test + #Create a data frame with results of the test final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rh.1 = rh.1, rh.1.rangeTest = NA, rh.2 = rh.2, rh.2.rangeTest = NA, rh.3 = rh.3, rh.3.rangeTest = NA) @@ -40,7 +40,7 @@ f.temp.r1 <- function(data) { temp.1.fail.r1 <- which(temp.1 < -30 | temp.1 > 50) temp.2.fail.r1 <- which(temp.2 < -30 | temp.2 > 50) temp.3.fail.r1 <- which(temp.3 < -30 | temp.3 > 50) - #Create a dataframe with results of the test + #Create a data frame with results of the test final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, temp.1 = temp.1, temp.1.rangeTest = NA, temp.2 = temp.2, temp.2.rangeTest = NA, temp.3 = temp.3, @@ -60,6 +60,7 @@ f.temp.r1 <- function(data) { #Need to find an accurate list of country max/min temperatures; ideally long-term stations close to TEAM station #Performs first range test on rainfall based on Estevez et al. (2011)#### +#Acceptable values fall between 0 and 250 mm over a half hour period #NEED TO PASTE TIME AND DATE (JUST DATE NOW) PRIOR TO JUNE 2010; FOR VB AT LEAST f.rain.r1 <- function(data) { #Vectorize and format the rain data @@ -89,7 +90,7 @@ f.rain.r1 <- function(data) { int <- int_shift(int = int, by = minutes(5)) } } - #Create a dataframe with results of the test + #Create a data frame with results of the test final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rain = rain, rain.rangeTest = NA) @@ -103,15 +104,23 @@ f.rain.r1 <- function(data) { } #Performs second range test on rainfall based on Estevez et al. (2011)#### +#Acceptable values fall between 0 and 500 mm f.rain.r2 <- function(data) { data$Observation <- ymd_hms(data$Observation) + #Vectorize and format rain data rain <- as.numeric(data$Rainfall) + #Grab just the date of each record TempDay <- floor_date(data$Observation, 'day') + #Generate daily summaries Daily.Climate.Data <- data.frame(Rain = tapply(rain, TempDay, sum)) + #Index the rows in which a day had more than 500 mm or less than 0 mm of rain rain.fail.r2 <- as.vector(which(Daily.Climate.Data$Rain > 500 | Daily.Climate.Data$Rain < 0)) + #Get the dates from the index number to flag the results dates.fail <- ymd(row.names(Daily.Climate.Data)[rain.fail.r2]) + #Creates an index again to store the dates in the data that match the days that + #failed in the previous index (CAN 116-121 BE CLEANED UP AT ALL?) rain.fail.r2 <- which(TempDay %in% dates.fail) - #Create a dataframe with results of the test + #Create a data frame with results of the test final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rain = rain, rain.rangeTest = NA) @@ -121,5 +130,69 @@ f.rain.r2 <- function(data) { final.df$rain.rangeTest[is.na(final.df$rain.rangeTest)] <- ' ' return(final.df) } -#head(str_split_fixed(data$Observation, pattern = ' ', 2)) -#strsplit(data$Observation, ' ') + +#Performs first persistence test for temperature#### +f.temp.p1 <- function(data) { + data$Observation <- ymd_hms(data$Observation) + #Vectorize and format the temperature data + temp.1 <- as.numeric(data$Sensor1_AvgTemp) + temp.2 <- as.numeric(data$Sensor2_AvgTemp) + temp.3 <- as.numeric(data$Sensor3_AvgTemp) + #Run the persistence function to generate index of failed records for: + #first record (with data), and 1 and 2 days prior to current record + #See f.persistence function for the real magic; setup with data set + #and variable of interest + temp.1.fail.p1 <- f.persistence(data, temp.1) + temp.2.fail.p1 <- f.persistence(data, temp.2) + temp.3.fail.p1 <- f.persistence(data, temp.3) + #Create a data frame with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, temp.1 = temp.1, + temp.1.perTest = NA, temp.2 = temp.2, temp.2.perTest = NA, temp.3 = temp.3, + temp.3.perTest = NA) + #Use the failed test index to store the error code + final.df$temp.1.perTest[temp.1.fail.p1] <- 'P1' + final.df$temp.2.perTest[temp.2.fail.p1] <- 'P1' + final.df$temp.3.perTest[temp.3.fail.p1] <- 'P1' + #Replace NAs with a blank + final.df$temp.1.perTest[is.na(final.df$temp.1.perTest)] <- ' ' + final.df$temp.2.perTest[is.na(final.df$temp.2.perTest)] <- ' ' + final.df$temp.3.perTest[is.na(final.df$temp.3.perTest)] <- ' ' + return(final.df) +} + +#This function creates an index to compare records to exactly 1 and 2 days prior to the +#current record and then returns an index of records that fail the test +#All three records will be flagged if they are the same value +#Must use a valid data set and variable +f.persistence <- function(data, var) { + #Find the date (and time) of the first record in the data set + this.date <- which(data$Observation == min(data$Observation)) + #Get the index for a 2 day delayed start to make the for loop work properly + delayed.start <- which(data$Observation == data$Observation[this.date] + days(2)) + #If there is no data for the first record, index the first record that has data + delayed.start <- ifelse(test=delayed.start < which(!is.na(var))[1], + yes=which(!is.na(var))[1], + no=delayed.start) + #Get the index for a 2 day delayed start from the first record that has data + #to make the for loop work properly (155-163 CAN PROBABLY BE CLEANED UP) + delayed.start <- which(data$Observation == data$Observation[delayed.start] + days(2)) + #Create an empty list to store the index values for records that fail the test + fail.list <- list() + #Starts at the earliest record with data and fills list with records that fail test + for(i in delayed.start:379732) { + #Index the date and time of the current record + this.date <- which(data$Observation == data$Observation[i]) + #index the date and time of the record exactly 1 and 2 days prior to current record + day.before <- which(data$Observation == data$Observation[i] - days(1)) + days2.before <- which(data$Observation == data$Observation[i] - days(2)) + #If the variable for the current record is equal to the value exactly 1 and 2 days + #before the current record, index all three rows + if(var[i] == var[day.before] & var[i] == var[days2.before]) { + fail.list[[i]] <- c(this.date, day.before, days2.before) + } + } + #Convert the list index for failed records into a vector index and return + fail.index <- unlist(fail.list) + return(fail.index) +} From 5a7615a301c2bfa25eb28403d10bf16ff3ca94d0 Mon Sep 17 00:00:00 2001 From: jmaccarthy Date: Wed, 30 Apr 2014 23:24:02 -0400 Subject: [PATCH 05/10] Create clim_run this program runs the supplementary clim_functions file --- clim_run | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 clim_run diff --git a/clim_run b/clim_run new file mode 100644 index 0000000..bfb63fe --- /dev/null +++ b/clim_run @@ -0,0 +1,31 @@ +#source('~/Jimmy_Code/teamcode/tools/team_db_query.R') + +#Load in the TEAM climate data +#cldata <- f.teamdb.query('climate') +#clmeta <- cldata$cl_temp_maxmin +#cldata <- cldata$cl_data + +load('cl_data_new2014-04-24.gzip') +vb.data.3.0 <- cl_data_all[which(cl_data_all$TEAMSiteName == unique(cl_data_all$TEAMSiteName)[3] & + cl_data_all$ProtocolVersion == 'Climate 3'), ] +vb.data.3.0.st2 <- vb.data.3.0[which(vb.data.3.0$SamplingUnitName == 'CL-VB-2'), ] + +#Relative humidity range test 1 +rh.r1.test <- f.rh.r1(vb.data.3.0.st2) + +#Temperature range test 1 +temp.r1.test <- f.temp.r1(vb.data.3.0.st2) + +#NEED TO ADD TEMPERATURE RANGE TEST 2 + +#Rainfall range test 1 +rain.r1.test <- f.rain.r1(vb.data.3.0.st2) +#g<-cmpfun(f.rain.r1) +#compare <- microbenchmark(f.rain.r1(vb.data.3.0), g(vb.data.3.0), times = 1000) +#autoplot(compare) + +#Rainfall range test 2 +rain.r2.test <- f.rain.r2(vb.data.3.0) + +#Temperature persistence test 1 +temp.p1.test <- f.temp.p1(vb.data.3.0) From f6fe51b704b42db50e2109c311cc5eac850ddf11 Mon Sep 17 00:00:00 2001 From: jmaccarthy Date: Thu, 1 May 2014 12:47:37 -0400 Subject: [PATCH 06/10] Update clim_functions --- clim_functions | 71 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/clim_functions b/clim_functions index 412af14..0688e06 100644 --- a/clim_functions +++ b/clim_functions @@ -111,15 +111,18 @@ f.rain.r2 <- function(data) { rain <- as.numeric(data$Rainfall) #Grab just the date of each record TempDay <- floor_date(data$Observation, 'day') + TempDay <- ymd(TempDay) #Generate daily summaries Daily.Climate.Data <- data.frame(Rain = tapply(rain, TempDay, sum)) #Index the rows in which a day had more than 500 mm or less than 0 mm of rain rain.fail.r2 <- as.vector(which(Daily.Climate.Data$Rain > 500 | Daily.Climate.Data$Rain < 0)) #Get the dates from the index number to flag the results - dates.fail <- ymd(row.names(Daily.Climate.Data)[rain.fail.r2]) - #Creates an index again to store the dates in the data that match the days that - #failed in the previous index (CAN 116-121 BE CLEANED UP AT ALL?) - rain.fail.r2 <- which(TempDay %in% dates.fail) + if(length(rain.fail.r2) != 0) { + dates.fail <- ymd(row.names(Daily.Climate.Data)[rain.fail.r2]) + #Creates an index again to store the dates in the data that match the days that + #failed in the previous index + rain.fail.r2 <- which(TempDay %in% dates.fail) + } #Create a data frame with results of the test final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rain = rain, @@ -132,17 +135,24 @@ f.rain.r2 <- function(data) { } #Performs first persistence test for temperature#### -f.temp.p1 <- function(data) { +f.temp.rh.p1 <- function(data) { data$Observation <- ymd_hms(data$Observation) #Vectorize and format the temperature data temp.1 <- as.numeric(data$Sensor1_AvgTemp) temp.2 <- as.numeric(data$Sensor2_AvgTemp) temp.3 <- as.numeric(data$Sensor3_AvgTemp) + rh.1 <- as.numeric(data$Sensor1_RelativeHumidity) + rh.2 <- as.numeric(data$Sensor2_RelativeHumidity) + rh.3 <- as.numeric(data$Sensor3_RelativeHumidity) + #Create a list of variables + var.list <- list(temp.3, rh.1, rh.2, rh.3) #Run the persistence function to generate index of failed records for: #first record (with data), and 1 and 2 days prior to current record #See f.persistence function for the real magic; setup with data set #and variable of interest + #Takes roughly 8 hours per sensor - should be able to speed this up temp.1.fail.p1 <- f.persistence(data, temp.1) + #Start @ 11:49 AM temp.2.fail.p1 <- f.persistence(data, temp.2) temp.3.fail.p1 <- f.persistence(data, temp.3) #Create a data frame with results of the test @@ -180,7 +190,7 @@ f.persistence <- function(data, var) { #Create an empty list to store the index values for records that fail the test fail.list <- list() #Starts at the earliest record with data and fills list with records that fail test - for(i in delayed.start:379732) { + for(i in delayed.start:nrow(data)) { #Index the date and time of the current record this.date <- which(data$Observation == data$Observation[i]) #index the date and time of the record exactly 1 and 2 days prior to current record @@ -188,7 +198,11 @@ f.persistence <- function(data, var) { days2.before <- which(data$Observation == data$Observation[i] - days(2)) #If the variable for the current record is equal to the value exactly 1 and 2 days #before the current record, index all three rows - if(var[i] == var[day.before] & var[i] == var[days2.before]) { + if(length(var[i] == var[day.before] | var[i] == days2.before ) == 0) { + this.date <- this.date + } else if(is.na(var[i]) | is.na(var[day.before]) | is.na(var[days2.before])) { + this.date <- this.date + } else if(var[i] == var[day.before] & var[i] == var[days2.before]) { fail.list[[i]] <- c(this.date, day.before, days2.before) } } @@ -196,3 +210,46 @@ f.persistence <- function(data, var) { fail.index <- unlist(fail.list) return(fail.index) } + +#Takes list of variables and runs persistence test on all +f.persistence.list <- function(data, var.list) { + #Find the date (and time) of the first record in the data set + this.date <- which(data$Observation == min(data$Observation)) + #Get the index for a 2 day delayed start to make the for loop work properly + for (j in 1:length(var.list)) { + delayed.start <- vector() + delayed.start[j] <- which(data$Observation == data$Observation[this.date] + days(2)) + #If there is no data for the first record, index the first record that has data + delayed.start[j] <- ifelse(test=delayed.start < which(!is.na(var.list[[j]]))[1], + yes=which(!is.na(var.list[[j]]))[1], + no=delayed.start) + #Get the index for a 2 day delayed start from the first record that has data + #to make the for loop work properly (155-163 CAN PROBABLY BE CLEANED UP) + delayed.start[j] <- which(data$Observation == data$Observation[delayed.start] + days(2)) + } + #Create an empty list to store the index values for records that fail the test + fail.list <- list() + #Starts at the earliest record with data and fills list with records that fail test + for(i in delayed.start:nrow(data)) { + #Index the date and time of the current record + this.date <- which(data$Observation == data$Observation[i]) + #index the date and time of the record exactly 1 and 2 days prior to current record + day.before <- which(data$Observation == data$Observation[i] - days(1)) + days2.before <- which(data$Observation == data$Observation[i] - days(2)) + #If the variable for the current record is equal to the value exactly 1 and 2 days + #before the current record, index all three rows + if(is.na(var[i]) | is.na(var[day.before]) | is.na(var[days2.before])) { + this.date <- this.date + } + if(length(var[i] == var[day.before] & var[i] == days2.before ) == 0) { + this.date <- this.date + } else { + if(var[i] == var[day.before] & var[i] == var[days2.before]) { + fail.list[[i]] <- c(this.date, day.before, days2.before) + } + } + } + #Convert the list index for failed records into a vector index and return + fail.index <- unlist(fail.list) + return(fail.index) +} From e42f0568db2cc0157be6f0bde14039ff30b41f6f Mon Sep 17 00:00:00 2001 From: jmaccarthy Date: Thu, 1 May 2014 12:48:07 -0400 Subject: [PATCH 07/10] Update clim_run --- clim_run | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/clim_run b/clim_run index bfb63fe..dd2308c 100644 --- a/clim_run +++ b/clim_run @@ -11,21 +11,25 @@ vb.data.3.0 <- cl_data_all[which(cl_data_all$TEAMSiteName == unique(cl_data_all$ vb.data.3.0.st2 <- vb.data.3.0[which(vb.data.3.0$SamplingUnitName == 'CL-VB-2'), ] #Relative humidity range test 1 +#Less than a minute for 342637 records rh.r1.test <- f.rh.r1(vb.data.3.0.st2) #Temperature range test 1 +#Less than a minute for 342637 records temp.r1.test <- f.temp.r1(vb.data.3.0.st2) #NEED TO ADD TEMPERATURE RANGE TEST 2 #Rainfall range test 1 +#Runs at 5 minutes for 342637 records rain.r1.test <- f.rain.r1(vb.data.3.0.st2) #g<-cmpfun(f.rain.r1) #compare <- microbenchmark(f.rain.r1(vb.data.3.0), g(vb.data.3.0), times = 1000) #autoplot(compare) #Rainfall range test 2 -rain.r2.test <- f.rain.r2(vb.data.3.0) +#Runs at less than a minute for 342637 records +rain.r2.test <- f.rain.r2(vb.data.3.0.st2) #Temperature persistence test 1 temp.p1.test <- f.temp.p1(vb.data.3.0) From 7ab8888468e76384ba268b6f7f00c448a230e3b7 Mon Sep 17 00:00:00 2001 From: jmaccarthy Date: Fri, 13 Jun 2014 16:46:25 -0400 Subject: [PATCH 08/10] Create climate_tests_v2 Range and persistence tests for TEAM climate data --- climate_tests_v2 | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 climate_tests_v2 diff --git a/climate_tests_v2 b/climate_tests_v2 new file mode 100644 index 0000000..dddcd7e --- /dev/null +++ b/climate_tests_v2 @@ -0,0 +1,37 @@ +#source('~/Jimmy_Code/teamcode/tools/team_db_query.R') + +#Load in the TEAM climate data +#cldata <- f.teamdb.query('climate') +#clmeta <- cldata$cl_temp_maxmin +#cldata <- cldata$cl_data + +load('cl_data_new2014-04-24.gzip') +vb.data.3.0 <- cl_data_all[which(cl_data_all$TEAMSiteName == unique(cl_data_all$TEAMSiteName)[3] & + cl_data_all$ProtocolVersion == 'Climate 3'), ] +nak.data.3.0 <- cl_data_all[which(cl_data_all$TEAMSiteName == unique(cl_data_all$TEAMSiteName)[1] & + cl_data_all$ProtocolVersion == 'Climate 3'), ] +vb.data.3.0.st2 <- vb.data.3.0[which(vb.data.3.0$SamplingUnitName == 'CL-VB-2'), ] +vb.use.data <- vb.data.3.0.st2[1:30000, ] +data<-vb.use.data[1:5000, ] + +#Range tests for relative humidity, temperature, and rainfall +#System Time: 54.2 seconds for 5,000 records +#System Time: 23396.4 seconds (6.5 hours) for 384,907 +system.time(rangeTest.Output <- f.rangeTest(nak.data.3.0)) + +#NEED TO ADD TEMPERATURE RANGE TEST 2 + +#Persistence tests for solar radiation, relative humidity, and temperature +#System Time: 75.1 seconds for 5,000 records +#System Time: 20883.3 seconds (5.8 hours) for 384,907 +system.time(persistenceTest.Output <- f.persistenceTest(nak.data.3.0)) + +#Calculate the error rate +#Returns a list of the error rate tables: range, step, consistency, and persistence +#System Time: 4.6 seconds for 2 tests +system.time(errorRate <- f.errorRate(rangeTest.Output, persistenceTest.Output)) + + +#g<-cmpfun(f.rain.r1) +#compare <- microbenchmark(f.rain.r1(vb.data.3.0), g(vb.data.3.0), times = 1000) +#autoplot(compare) From e56ec7990e590eb5dd10a3db6d513ed3dedf7eb1 Mon Sep 17 00:00:00 2001 From: jmaccarthy Date: Fri, 13 Jun 2014 16:47:17 -0400 Subject: [PATCH 09/10] Rename climate_tests_v2 to clim_tests_v2 --- climate_tests_v2 => clim_tests_v2 | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename climate_tests_v2 => clim_tests_v2 (100%) diff --git a/climate_tests_v2 b/clim_tests_v2 similarity index 100% rename from climate_tests_v2 rename to clim_tests_v2 From 7e646f1880812a504fc2ad6c8485b78d06275cb2 Mon Sep 17 00:00:00 2001 From: jmaccarthy Date: Fri, 13 Jun 2014 16:48:21 -0400 Subject: [PATCH 10/10] Create clim_funks_v2 functions for performing range and persistence tests on TEAM climate data...also includes error rate helper function --- clim_funks_v2 | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 clim_funks_v2 diff --git a/clim_funks_v2 b/clim_funks_v2 new file mode 100644 index 0000000..251f15c --- /dev/null +++ b/clim_funks_v2 @@ -0,0 +1,226 @@ +require(stringr) +require(lubridate) +require(compiler) +require(ggplot2) +require(zoo) + +#Performs range test for relative humidity, and temperature based on Estevez et al. (2011)#### +#RH acceptable range = 0-100% +#Temp acceptable rante = -30 and 50; revisit these numbers (Africa max high is 55--WMO) +f.rangeTest <- function(data) { + #Vectorize and format the relative humidity, temperature, and rain data#### + #Relative Humidity + rh.1 <- as.numeric(data$Sensor1_RelativeHumidity) + rh.2 <- as.numeric(data$Sensor2_RelativeHumidity) + rh.3 <- as.numeric(data$Sensor3_RelativeHumidity) + #Temperature + temp.1 <- as.numeric(data$Sensor1_AvgTemp) + temp.2 <- as.numeric(data$Sensor2_AvgTemp) + temp.3 <- as.numeric(data$Sensor3_AvgTemp) + #Rain + rain <- as.numeric(data$Rainfall) + data$Observation <- ymd_hms(data$Observation) + #Create indices for rows that fail the range test#### + #Relative Humidity + rh.1.fail.r1 <- which(rh.1 < 0 | rh.1 > 100) + rh.2.fail.r1 <- which(rh.2 < 0 | rh.2 > 100) + rh.3.fail.r1 <- which(rh.3 < 0 | rh.3 > 100) + #Temperature - Test 1 + temp.1.fail.r1 <- which(temp.1 < -30 | temp.1 > 50) + temp.2.fail.r1 <- which(temp.2 < -30 | temp.2 > 50) + temp.3.fail.r1 <- which(temp.3 < -30 | temp.3 > 50) + #Temperature - Test 2 + #Performs second range test on temperature based on Estevez et al. (2011)--UNDER CONSTRUCTION#### + #Need to find an accurate list of country max/min temperatures; ideally long-term stations close to TEAM station + #Rain - Test 1 + #Store the first day of the data (where to start the interval) + start.date <- min(data$Observation) #ymd('2010-07-01') + #Create a vector to store the index info for failed tests + rain.fail.r1 <- vector() + #Create a half hour interval window to select data during that period + int <- new_interval(start = start.date, end = start.date + minutes(30)) + #For each record use the interval to index data and then sum rainfall measurements + for(i in 1:nrow(data)) { + #If there is no data for a particular interval, shift the interval by five minutes and start again + #at next record + if(length(which(data$Observation %within% int)) == 0) { + int <- int_shift(int = int, by = minutes(5)) + } else { + #If the any data exists in the interval, sum the data over the half-hour interval + half.hr.sum <- sum(rain[which(data$Observation %within% int)]) + #If the sum of the interval is greater than 120 or less than 0, store the row number in the index + if(half.hr.sum > 120 | half.hr.sum < 0) { + rain.fail.r1[i] <- which(data$Observation %within% int)[1] + } + #After storing the row number in the index, shift the interval by five minutes and repeat + int <- int_shift(int = int, by = minutes(5)) + } + } + #Rain - Test 2 + #Grab just the date of each record + TempDay <- floor_date(data$Observation, 'day') + TempDay <- ymd(TempDay) + #Generate daily summaries + Daily.Climate.Data <- data.frame(Rain = tapply(rain, TempDay, sum)) + #Index the rows in which a day had more than 500 mm or less than 0 mm of rain + rain.fail.r2 <- as.vector(which(Daily.Climate.Data$Rain > 500 | Daily.Climate.Data$Rain < 0)) + #Get the dates from the index number to flag the results + if(length(rain.fail.r2) != 0) { + dates.fail <- ymd(row.names(Daily.Climate.Data)[rain.fail.r2]) + #Creates an index again to store the dates in the data that match the days that + #failed in the previous index + rain.fail.r2 <- which(TempDay %in% dates.fail) + } + #Create a data frame with results of the test#### + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rh.1 = rh.1, + rh.1.rangeTest = NA, rh.2 = rh.2, rh.2.rangeTest = NA, rh.3 = rh.3, rh.3.rangeTest = NA, + temp.1 = temp.1, temp.1.rangeTest = NA, temp.2 = temp.2, temp.2.rangeTest = NA, + temp.3 = temp.3, temp.3.rangeTest = NA, rain = rain, rain.rangeTest1 = NA, + rain.rangeTest2 = NA) + #Use the failed test index to store the error code#### + #Relative Humidity + final.df$rh.1.rangeTest[rh.1.fail.r1] <- 'R1' + final.df$rh.2.rangeTest[rh.2.fail.r1] <- 'R1' + final.df$rh.3.rangeTest[rh.3.fail.r1] <- 'R1' + #Temperature + final.df$temp.1.rangeTest[temp.1.fail.r1] <- 'R1' + final.df$temp.2.rangeTest[temp.2.fail.r1] <- 'R1' + final.df$temp.3.rangeTest[temp.3.fail.r1] <- 'R1' + #Rain + #Remove NAs that were introduced to the failed test index + rain.fail.r1 <- rain.fail.r1[!is.na(rain.fail.r1)] + final.df$rain.rangeTest1[rain.fail.r1] <- 'R1' + final.df$rain.rangeTest2[rain.fail.r2] <- 'R2' + #Replace NAs with a blank#### + #Relative Humidity + final.df$rh.1.rangeTest[is.na(final.df$rh.1.rangeTest)] <- ' ' + final.df$rh.2.rangeTest[is.na(final.df$rh.2.rangeTest)] <- ' ' + final.df$rh.3.rangeTest[is.na(final.df$rh.3.rangeTest)] <- ' ' + #Temperature + final.df$temp.1.rangeTest[is.na(final.df$temp.1.rangeTest)] <- ' ' + final.df$temp.2.rangeTest[is.na(final.df$temp.2.rangeTest)] <- ' ' + final.df$temp.3.rangeTest[is.na(final.df$temp.3.rangeTest)] <- ' ' + #Rain + final.df$rain.rangeTest1[is.na(final.df$rain.rangeTest1)] <- ' ' + final.df$rain.rangeTest2[is.na(final.df$rain.rangeTest2)] <- ' ' + return(final.df) +} + +#Performs persistence test for solar radiation, relative humidity, and temperatur based on Estevez et al. (2011)e#### +#This function creates an index to compare records to exactly 1 and 2 days prior to the +#current record and then returns an index of records that fail the test +#All three records will be flagged if they are the same value +f.persistenceTest <- function(data) { + data$Observation <- ymd_hms(data$Observation) + #Vectorize and format the data#### + #Solar Radiation + sol.1 <- as.numeric(data$Sensor1_AvgSolarRadiation) + sol.2 <- as.numeric(data$Sensor2_AvgSolarRadiation) + #Relative Humidity + rh.1 <- as.numeric(data$Sensor1_RelativeHumidity) + rh.2 <- as.numeric(data$Sensor2_RelativeHumidity) + rh.3 <- as.numeric(data$Sensor3_RelativeHumidity) + #Temperature + temp.1 <- as.numeric(data$Sensor1_AvgTemp) + temp.2 <- as.numeric(data$Sensor2_AvgTemp) + temp.3 <- as.numeric(data$Sensor3_AvgTemp) + + #Find the date (and time) of the first record in the data set + start.date <- which(data$Observation == min(data$Observation)) + #Get the index for a 2 day delayed start to make the for loop work properly + delayed.start <- which(data$Observation == data$Observation[start.date] + days(2)) + index.list <- list() + for(i in delayed.start:nrow(data)) { + index.list[[i]] <- c(which(data$Observation == data$Observation[i]), which(data$Observation == data$Observation[i] - days(1)), + which(data$Observation == data$Observation[i] - days(2))) + } + #Create an empty list to store the index values for records that fail the test + sol.1.index.vector <- vector() + sol.2.index.vector <- vector() + rh.1.index.vector <- vector() + rh.2.index.vector <- vector() + rh.3.index.vector <- vector() + temp.1.index.vector <- vector() + temp.2.index.vector <- vector() + temp.3.index.vector <- vector() + for(i in 1:length(index.list)) { + sol.1.index.vector[i] <- ifelse(sol.1[index.list[[i]][1]] == sol.1[index.list[[i]][2]] & + sol.1[index.list[[i]][1]] == sol.1[index.list[[i]][3]], 'P1', ' ') + sol.2.index.vector[i] <- ifelse(sol.2[index.list[[i]][1]] == sol.2[index.list[[i]][2]] & + sol.2[index.list[[i]][1]] == sol.2[index.list[[i]][3]], 'P1', ' ') + rh.1.index.vector[i] <- ifelse(temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][2]] & + temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][3]], 'P1', ' ') + rh.2.index.vector[i] <- ifelse(temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][2]] & + temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][3]], 'P1', ' ') + rh.3.index.vector[i] <- ifelse(temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][2]] & + temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][3]], 'P1', ' ') + temp.1.index.vector[i] <- ifelse(temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][2]] & + temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][3]], 'P1', ' ') + temp.2.index.vector[i] <- ifelse(temp.2[index.list[[i]][1]] == temp.2[index.list[[i]][2]] & + temp.2[index.list[[i]][1]] == temp.2[index.list[[i]][3]], 'P1', ' ') + temp.3.index.vector[i] <- ifelse(temp.3[index.list[[i]][1]] == temp.3[index.list[[i]][2]] & + temp.3[index.list[[i]][1]] == temp.3[index.list[[i]][3]], 'P1', ' ') + } + #Convert the list index for failed records into a vector index and return + sol.1.fail.list <- unlist(index.list[which(sol.1.index.vector == 'P1')]) + sol.2.fail.list <- unlist(index.list[which(sol.2.index.vector == 'P1')]) + rh.1.fail.list <- unlist(index.list[which(rh.1.index.vector == 'P1')]) + rh.2.fail.list <- unlist(index.list[which(rh.2.index.vector == 'P1')]) + rh.3.fail.list <- unlist(index.list[which(rh.3.index.vector == 'P1')]) + temp.1.fail.list <- unlist(index.list[which(temp.1.index.vector == 'P1')]) + temp.2.fail.list <- unlist(index.list[which(temp.2.index.vector == 'P1')]) + temp.3.fail.list <- unlist(index.list[which(temp.3.index.vector == 'P1')]) + #Create a data frame with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, sol.1 = sol.1, + sol.1.perTest = NA, sol.2 = sol.2, sol.2.perTest = NA, rh.1 = rh.1, rh.1.perTest = NA, + rh.2 = rh.2, rh.2.perTest = NA, rh.3 = rh.3, rh.3.perTest = NA, temp.1 = temp.1, + temp.1.perTest = NA, temp.2 = temp.2, temp.2.perTest = NA, temp.3 = temp.3, + temp.3.perTest = NA) + #Use the failed test index to store the error code + final.df$sol.1.perTest[sol.1.fail.list] <- 'P1' + final.df$sol.2.perTest[sol.2.fail.list] <- 'P1' + final.df$rh.1.perTest[rh.1.fail.list] <- 'P1' + final.df$rh.2.perTest[rh.2.fail.list] <- 'P1' + final.df$rh.3.perTest[rh.3.fail.list] <- 'P1' + final.df$temp.1.perTest[temp.1.fail.list] <- 'P1' + final.df$temp.2.perTest[temp.2.fail.list] <- 'P1' + final.df$temp.3.perTest[temp.3.fail.list] <- 'P1' + #Replace NAs with a blank + final.df$sol.1.perTest[is.na(final.df$sol.1.perTest)] <- ' ' + final.df$sol.2.perTest[is.na(final.df$sol.2.perTest)] <- ' ' + final.df$rh.1.perTest[is.na(final.df$rh.1.perTest)] <- ' ' + final.df$rh.2.perTest[is.na(final.df$rh.2.perTest)] <- ' ' + final.df$rh.3.perTest[is.na(final.df$rh.3.perTest)] <- ' ' + final.df$temp.1.perTest[is.na(final.df$temp.1.perTest)] <- ' ' + final.df$temp.2.perTest[is.na(final.df$temp.2.perTest)] <- ' ' + final.df$temp.3.perTest[is.na(final.df$temp.3.perTest)] <- ' ' + return(final.df) +} + +#Helper function to tally error rate +f.errorRate <- function(range, persistence) { + #Range error rate + noRows.range <- data.frame(noRows = c(nrow(range[!is.na(range$rh.1), ]), nrow(range[!is.na(range$rh.2), ]), nrow(range[!is.na(range$rh.3), ]), + nrow(range[!is.na(range$temp.1), ]), nrow(range[!is.na(range$temp.2), ]), nrow(range[!is.na(range$temp.3), ]), + nrow(range[!is.na(range$rain), ]), nrow(range[!is.na(range$rain), ]))) + row.names(noRows.range) <- c('rh.1', 'rh.2', 'rh.3', 'temp.1', 'temp.2', 'temp.3', 'rain1', 'rain2') + noRows.range$failed <- c(length(which(range$rh.1.rangeTest == 'R1')), length(which(range$rh.2.rangeTest == 'R1')), + length(which(range$rh.3.rangeTest == 'R1')), length(which(range$temp.1.rangeTest == 'R1')), + length(which(range$temp.2.rangeTest == 'R1')), length(which(range$temp.3.rangeTest == 'R1')), + length(which(range$rain.rangeTest1 == 'R1')), length(which(range$rain.rangeTest2 == 'R2'))) + noRows.range$errorRate <- noRows.range$failed / noRows.range$noRows + #Persistence error rate + noRows.per <- data.frame(noRows = c(nrow(persistence[!is.na(persistence$sol.1), ]), nrow(persistence[!is.na(persistence$sol.2), ]), nrow(persistence[!is.na(persistence$rh.1), ]), + nrow(persistence[!is.na(persistence$rh.2), ]), nrow(persistence[!is.na(persistence$rh.3), ]), nrow(persistence[!is.na(persistence$temp.1), ]), + nrow(persistence[!is.na(persistence$temp.2), ]), nrow(persistence[!is.na(persistence$temp.3), ]))) + row.names(noRows.per) <- c('sol.1', 'sol.2', 'rh.1', 'rh.2', 'rh.3', 'temp.1', 'temp.2', 'temp.3') + noRows.per$failed <- c(length(which(persistence$sol.1.perTest == 'P1')), length(which(persistence$sol.2.perTest == 'P1')), + length(which(persistence$rh.1.perTest == 'P1')), length(which(persistence$rh.2.perTest == 'P1')), + length(which(persistence$rh.3.perTest == 'P1')), length(which(persistence$temp.1.perTest == 'P1')), + length(which(persistence$temp.2.perTest == 'P1')), length(which(persistence$temp.3.perTest == 'P1'))) + noRows.per$errorRate <- noRows.per$failed / noRows.per$noRows + final.list <- list(noRows.range, noRows.per) + return(final.list) +}