diff --git a/clim_functions b/clim_functions new file mode 100644 index 0000000..0688e06 --- /dev/null +++ b/clim_functions @@ -0,0 +1,255 @@ +require(stringr) +require(lubridate) +require(compiler) +require(ggplot2) + +#Performs range test on relative humidity based on Estevez et al. (2011)#### +#RH acceptable range = 0-100% +f.rh.r1 <- function(data) { + #Vectorize and format the relative humidity data + rh.1 <- as.numeric(data$Sensor1_RelativeHumidity) + rh.2 <- as.numeric(data$Sensor2_RelativeHumidity) + rh.3 <- as.numeric(data$Sensor3_RelativeHumidity) + #Index rows that fail the first/only range test for relative humidity + rh.1.fail.r1 <- which(rh.1 < 0 | rh.1 > 100) + rh.2.fail.r1 <- which(rh.2 < 0 | rh.2 > 100) + rh.3.fail.r1 <- which(rh.3 < 0 | rh.3 > 100) + #Create a data frame with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rh.1 = rh.1, + rh.1.rangeTest = NA, rh.2 = rh.2, rh.2.rangeTest = NA, rh.3 = rh.3, rh.3.rangeTest = NA) + #Use the failed test index to store the error code + final.df$rh.1.rangeTest[rh.1.fail.r1] <- 'R1' + final.df$rh.2.rangeTest[rh.2.fail.r1] <- 'R1' + final.df$rh.3.rangeTest[rh.3.fail.r1] <- 'R1' + #Replace NAs with a blank + final.df$rh.1.rangeTest[is.na(final.df$rh.1.rangeTest)] <- ' ' + final.df$rh.2.rangeTest[is.na(final.df$rh.2.rangeTest)] <- ' ' + final.df$rh.3.rangeTest[is.na(final.df$rh.3.rangeTest)] <- ' ' + return(final.df) +} + +#Performs first range test on temperature based on Estevez et al. (2011)#### +#Acceptable values fall between -30 and 50; revisit these numbers (Africa max high is 55--WMO) +f.temp.r1 <- function(data) { + #Vectorize and format the temperature data + temp.1 <- as.numeric(data$Sensor1_AvgTemp) + temp.2 <- as.numeric(data$Sensor2_AvgTemp) + temp.3 <- as.numeric(data$Sensor3_AvgTemp) + #Index rows that have failed the range test for temperature + temp.1.fail.r1 <- which(temp.1 < -30 | temp.1 > 50) + temp.2.fail.r1 <- which(temp.2 < -30 | temp.2 > 50) + temp.3.fail.r1 <- which(temp.3 < -30 | temp.3 > 50) + #Create a data frame with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, temp.1 = temp.1, + temp.1.rangeTest = NA, temp.2 = temp.2, temp.2.rangeTest = NA, temp.3 = temp.3, + temp.3.rangeTest = NA) + #Use the failed test index to store the error code + final.df$temp.1.rangeTest[temp.1.fail.r1] <- 'R1' + final.df$temp.2.rangeTest[temp.2.fail.r1] <- 'R1' + final.df$temp.3.rangeTest[temp.3.fail.r1] <- 'R1' + #Replace NAs with a blank + final.df$temp.1.rangeTest[is.na(final.df$temp.1.rangeTest)] <- ' ' + final.df$temp.2.rangeTest[is.na(final.df$temp.2.rangeTest)] <- ' ' + final.df$temp.3.rangeTest[is.na(final.df$temp.3.rangeTest)] <- ' ' + return(final.df) +} + +#Performs second range test on temperature based on Estevez et al. (2011)--UNDER CONSTRUCTION#### +#Need to find an accurate list of country max/min temperatures; ideally long-term stations close to TEAM station + +#Performs first range test on rainfall based on Estevez et al. (2011)#### +#Acceptable values fall between 0 and 250 mm over a half hour period +#NEED TO PASTE TIME AND DATE (JUST DATE NOW) PRIOR TO JUNE 2010; FOR VB AT LEAST +f.rain.r1 <- function(data) { + #Vectorize and format the rain data + rain <- as.numeric(data$Rainfall) + #Format observation date as POSIX + data$Observation <- ymd_hms(data$Observation) + #Store the first day of the data (where to start the interval) + start.date <- min(data$Observation) #ymd('2010-07-01') + #Create a vector to store the index info for failed tests + rain.fail.r1 <- vector() + #Create a half hour interval window to select data during that period + int <- new_interval(start = start.date, end = start.date + minutes(30)) + #For each record use the interval to index data and then sum rainfall measurements + for(i in 1:10000) { + #If data is missing for a particular interval, shift the interval by five minutes and start again + #at next record + if(length(which(data$Observation %within% int)) == 0) { + int <- int_shift(int = int, by = minutes(5)) + } else { + #If the interval exists in the data, sum the data over the interval + half.hr.sum <- sum(rain[which(data$Observation %within% int)]) + #If the sum of the interval is greater than 250 or less than 0, store the row number in the index + if(half.hr.sum > 250 | half.hr.sum < 0) { + rain.fail.r1[i] <- which(data$Observation %within% int)[1] + } + #After storing the row number, shift the interval by five minutes and start again at next record + int <- int_shift(int = int, by = minutes(5)) + } + } + #Create a data frame with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rain = rain, + rain.rangeTest = NA) + #Remove NAs that were introduced to the failed test index + rain.fail.r1 <- rain.fail.r1[!is.na(rain.fail.r1)] + #Use the failed test index to store the error code + final.df$rain.rangeTest[rain.fail.r1] <- 'R1' + #Replace NAs with a blank + final.df$rain.rangeTest[is.na(final.df$rain.rangeTest)] <- ' ' + return(final.df) +} + +#Performs second range test on rainfall based on Estevez et al. (2011)#### +#Acceptable values fall between 0 and 500 mm +f.rain.r2 <- function(data) { + data$Observation <- ymd_hms(data$Observation) + #Vectorize and format rain data + rain <- as.numeric(data$Rainfall) + #Grab just the date of each record + TempDay <- floor_date(data$Observation, 'day') + TempDay <- ymd(TempDay) + #Generate daily summaries + Daily.Climate.Data <- data.frame(Rain = tapply(rain, TempDay, sum)) + #Index the rows in which a day had more than 500 mm or less than 0 mm of rain + rain.fail.r2 <- as.vector(which(Daily.Climate.Data$Rain > 500 | Daily.Climate.Data$Rain < 0)) + #Get the dates from the index number to flag the results + if(length(rain.fail.r2) != 0) { + dates.fail <- ymd(row.names(Daily.Climate.Data)[rain.fail.r2]) + #Creates an index again to store the dates in the data that match the days that + #failed in the previous index + rain.fail.r2 <- which(TempDay %in% dates.fail) + } + #Create a data frame with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rain = rain, + rain.rangeTest = NA) + #Use the failed test index to store the error code + final.df$rain.rangeTest[rain.fail.r2] <- 'R2' + #Replace NAs with a blank + final.df$rain.rangeTest[is.na(final.df$rain.rangeTest)] <- ' ' + return(final.df) +} + +#Performs first persistence test for temperature#### +f.temp.rh.p1 <- function(data) { + data$Observation <- ymd_hms(data$Observation) + #Vectorize and format the temperature data + temp.1 <- as.numeric(data$Sensor1_AvgTemp) + temp.2 <- as.numeric(data$Sensor2_AvgTemp) + temp.3 <- as.numeric(data$Sensor3_AvgTemp) + rh.1 <- as.numeric(data$Sensor1_RelativeHumidity) + rh.2 <- as.numeric(data$Sensor2_RelativeHumidity) + rh.3 <- as.numeric(data$Sensor3_RelativeHumidity) + #Create a list of variables + var.list <- list(temp.3, rh.1, rh.2, rh.3) + #Run the persistence function to generate index of failed records for: + #first record (with data), and 1 and 2 days prior to current record + #See f.persistence function for the real magic; setup with data set + #and variable of interest + #Takes roughly 8 hours per sensor - should be able to speed this up + temp.1.fail.p1 <- f.persistence(data, temp.1) + #Start @ 11:49 AM + temp.2.fail.p1 <- f.persistence(data, temp.2) + temp.3.fail.p1 <- f.persistence(data, temp.3) + #Create a data frame with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, temp.1 = temp.1, + temp.1.perTest = NA, temp.2 = temp.2, temp.2.perTest = NA, temp.3 = temp.3, + temp.3.perTest = NA) + #Use the failed test index to store the error code + final.df$temp.1.perTest[temp.1.fail.p1] <- 'P1' + final.df$temp.2.perTest[temp.2.fail.p1] <- 'P1' + final.df$temp.3.perTest[temp.3.fail.p1] <- 'P1' + #Replace NAs with a blank + final.df$temp.1.perTest[is.na(final.df$temp.1.perTest)] <- ' ' + final.df$temp.2.perTest[is.na(final.df$temp.2.perTest)] <- ' ' + final.df$temp.3.perTest[is.na(final.df$temp.3.perTest)] <- ' ' + return(final.df) +} + +#This function creates an index to compare records to exactly 1 and 2 days prior to the +#current record and then returns an index of records that fail the test +#All three records will be flagged if they are the same value +#Must use a valid data set and variable +f.persistence <- function(data, var) { + #Find the date (and time) of the first record in the data set + this.date <- which(data$Observation == min(data$Observation)) + #Get the index for a 2 day delayed start to make the for loop work properly + delayed.start <- which(data$Observation == data$Observation[this.date] + days(2)) + #If there is no data for the first record, index the first record that has data + delayed.start <- ifelse(test=delayed.start < which(!is.na(var))[1], + yes=which(!is.na(var))[1], + no=delayed.start) + #Get the index for a 2 day delayed start from the first record that has data + #to make the for loop work properly (155-163 CAN PROBABLY BE CLEANED UP) + delayed.start <- which(data$Observation == data$Observation[delayed.start] + days(2)) + #Create an empty list to store the index values for records that fail the test + fail.list <- list() + #Starts at the earliest record with data and fills list with records that fail test + for(i in delayed.start:nrow(data)) { + #Index the date and time of the current record + this.date <- which(data$Observation == data$Observation[i]) + #index the date and time of the record exactly 1 and 2 days prior to current record + day.before <- which(data$Observation == data$Observation[i] - days(1)) + days2.before <- which(data$Observation == data$Observation[i] - days(2)) + #If the variable for the current record is equal to the value exactly 1 and 2 days + #before the current record, index all three rows + if(length(var[i] == var[day.before] | var[i] == days2.before ) == 0) { + this.date <- this.date + } else if(is.na(var[i]) | is.na(var[day.before]) | is.na(var[days2.before])) { + this.date <- this.date + } else if(var[i] == var[day.before] & var[i] == var[days2.before]) { + fail.list[[i]] <- c(this.date, day.before, days2.before) + } + } + #Convert the list index for failed records into a vector index and return + fail.index <- unlist(fail.list) + return(fail.index) +} + +#Takes list of variables and runs persistence test on all +f.persistence.list <- function(data, var.list) { + #Find the date (and time) of the first record in the data set + this.date <- which(data$Observation == min(data$Observation)) + #Get the index for a 2 day delayed start to make the for loop work properly + for (j in 1:length(var.list)) { + delayed.start <- vector() + delayed.start[j] <- which(data$Observation == data$Observation[this.date] + days(2)) + #If there is no data for the first record, index the first record that has data + delayed.start[j] <- ifelse(test=delayed.start < which(!is.na(var.list[[j]]))[1], + yes=which(!is.na(var.list[[j]]))[1], + no=delayed.start) + #Get the index for a 2 day delayed start from the first record that has data + #to make the for loop work properly (155-163 CAN PROBABLY BE CLEANED UP) + delayed.start[j] <- which(data$Observation == data$Observation[delayed.start] + days(2)) + } + #Create an empty list to store the index values for records that fail the test + fail.list <- list() + #Starts at the earliest record with data and fills list with records that fail test + for(i in delayed.start:nrow(data)) { + #Index the date and time of the current record + this.date <- which(data$Observation == data$Observation[i]) + #index the date and time of the record exactly 1 and 2 days prior to current record + day.before <- which(data$Observation == data$Observation[i] - days(1)) + days2.before <- which(data$Observation == data$Observation[i] - days(2)) + #If the variable for the current record is equal to the value exactly 1 and 2 days + #before the current record, index all three rows + if(is.na(var[i]) | is.na(var[day.before]) | is.na(var[days2.before])) { + this.date <- this.date + } + if(length(var[i] == var[day.before] & var[i] == days2.before ) == 0) { + this.date <- this.date + } else { + if(var[i] == var[day.before] & var[i] == var[days2.before]) { + fail.list[[i]] <- c(this.date, day.before, days2.before) + } + } + } + #Convert the list index for failed records into a vector index and return + fail.index <- unlist(fail.list) + return(fail.index) +} diff --git a/clim_funks_v2 b/clim_funks_v2 new file mode 100644 index 0000000..251f15c --- /dev/null +++ b/clim_funks_v2 @@ -0,0 +1,226 @@ +require(stringr) +require(lubridate) +require(compiler) +require(ggplot2) +require(zoo) + +#Performs range test for relative humidity, and temperature based on Estevez et al. (2011)#### +#RH acceptable range = 0-100% +#Temp acceptable rante = -30 and 50; revisit these numbers (Africa max high is 55--WMO) +f.rangeTest <- function(data) { + #Vectorize and format the relative humidity, temperature, and rain data#### + #Relative Humidity + rh.1 <- as.numeric(data$Sensor1_RelativeHumidity) + rh.2 <- as.numeric(data$Sensor2_RelativeHumidity) + rh.3 <- as.numeric(data$Sensor3_RelativeHumidity) + #Temperature + temp.1 <- as.numeric(data$Sensor1_AvgTemp) + temp.2 <- as.numeric(data$Sensor2_AvgTemp) + temp.3 <- as.numeric(data$Sensor3_AvgTemp) + #Rain + rain <- as.numeric(data$Rainfall) + data$Observation <- ymd_hms(data$Observation) + #Create indices for rows that fail the range test#### + #Relative Humidity + rh.1.fail.r1 <- which(rh.1 < 0 | rh.1 > 100) + rh.2.fail.r1 <- which(rh.2 < 0 | rh.2 > 100) + rh.3.fail.r1 <- which(rh.3 < 0 | rh.3 > 100) + #Temperature - Test 1 + temp.1.fail.r1 <- which(temp.1 < -30 | temp.1 > 50) + temp.2.fail.r1 <- which(temp.2 < -30 | temp.2 > 50) + temp.3.fail.r1 <- which(temp.3 < -30 | temp.3 > 50) + #Temperature - Test 2 + #Performs second range test on temperature based on Estevez et al. (2011)--UNDER CONSTRUCTION#### + #Need to find an accurate list of country max/min temperatures; ideally long-term stations close to TEAM station + #Rain - Test 1 + #Store the first day of the data (where to start the interval) + start.date <- min(data$Observation) #ymd('2010-07-01') + #Create a vector to store the index info for failed tests + rain.fail.r1 <- vector() + #Create a half hour interval window to select data during that period + int <- new_interval(start = start.date, end = start.date + minutes(30)) + #For each record use the interval to index data and then sum rainfall measurements + for(i in 1:nrow(data)) { + #If there is no data for a particular interval, shift the interval by five minutes and start again + #at next record + if(length(which(data$Observation %within% int)) == 0) { + int <- int_shift(int = int, by = minutes(5)) + } else { + #If the any data exists in the interval, sum the data over the half-hour interval + half.hr.sum <- sum(rain[which(data$Observation %within% int)]) + #If the sum of the interval is greater than 120 or less than 0, store the row number in the index + if(half.hr.sum > 120 | half.hr.sum < 0) { + rain.fail.r1[i] <- which(data$Observation %within% int)[1] + } + #After storing the row number in the index, shift the interval by five minutes and repeat + int <- int_shift(int = int, by = minutes(5)) + } + } + #Rain - Test 2 + #Grab just the date of each record + TempDay <- floor_date(data$Observation, 'day') + TempDay <- ymd(TempDay) + #Generate daily summaries + Daily.Climate.Data <- data.frame(Rain = tapply(rain, TempDay, sum)) + #Index the rows in which a day had more than 500 mm or less than 0 mm of rain + rain.fail.r2 <- as.vector(which(Daily.Climate.Data$Rain > 500 | Daily.Climate.Data$Rain < 0)) + #Get the dates from the index number to flag the results + if(length(rain.fail.r2) != 0) { + dates.fail <- ymd(row.names(Daily.Climate.Data)[rain.fail.r2]) + #Creates an index again to store the dates in the data that match the days that + #failed in the previous index + rain.fail.r2 <- which(TempDay %in% dates.fail) + } + #Create a data frame with results of the test#### + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, rh.1 = rh.1, + rh.1.rangeTest = NA, rh.2 = rh.2, rh.2.rangeTest = NA, rh.3 = rh.3, rh.3.rangeTest = NA, + temp.1 = temp.1, temp.1.rangeTest = NA, temp.2 = temp.2, temp.2.rangeTest = NA, + temp.3 = temp.3, temp.3.rangeTest = NA, rain = rain, rain.rangeTest1 = NA, + rain.rangeTest2 = NA) + #Use the failed test index to store the error code#### + #Relative Humidity + final.df$rh.1.rangeTest[rh.1.fail.r1] <- 'R1' + final.df$rh.2.rangeTest[rh.2.fail.r1] <- 'R1' + final.df$rh.3.rangeTest[rh.3.fail.r1] <- 'R1' + #Temperature + final.df$temp.1.rangeTest[temp.1.fail.r1] <- 'R1' + final.df$temp.2.rangeTest[temp.2.fail.r1] <- 'R1' + final.df$temp.3.rangeTest[temp.3.fail.r1] <- 'R1' + #Rain + #Remove NAs that were introduced to the failed test index + rain.fail.r1 <- rain.fail.r1[!is.na(rain.fail.r1)] + final.df$rain.rangeTest1[rain.fail.r1] <- 'R1' + final.df$rain.rangeTest2[rain.fail.r2] <- 'R2' + #Replace NAs with a blank#### + #Relative Humidity + final.df$rh.1.rangeTest[is.na(final.df$rh.1.rangeTest)] <- ' ' + final.df$rh.2.rangeTest[is.na(final.df$rh.2.rangeTest)] <- ' ' + final.df$rh.3.rangeTest[is.na(final.df$rh.3.rangeTest)] <- ' ' + #Temperature + final.df$temp.1.rangeTest[is.na(final.df$temp.1.rangeTest)] <- ' ' + final.df$temp.2.rangeTest[is.na(final.df$temp.2.rangeTest)] <- ' ' + final.df$temp.3.rangeTest[is.na(final.df$temp.3.rangeTest)] <- ' ' + #Rain + final.df$rain.rangeTest1[is.na(final.df$rain.rangeTest1)] <- ' ' + final.df$rain.rangeTest2[is.na(final.df$rain.rangeTest2)] <- ' ' + return(final.df) +} + +#Performs persistence test for solar radiation, relative humidity, and temperatur based on Estevez et al. (2011)e#### +#This function creates an index to compare records to exactly 1 and 2 days prior to the +#current record and then returns an index of records that fail the test +#All three records will be flagged if they are the same value +f.persistenceTest <- function(data) { + data$Observation <- ymd_hms(data$Observation) + #Vectorize and format the data#### + #Solar Radiation + sol.1 <- as.numeric(data$Sensor1_AvgSolarRadiation) + sol.2 <- as.numeric(data$Sensor2_AvgSolarRadiation) + #Relative Humidity + rh.1 <- as.numeric(data$Sensor1_RelativeHumidity) + rh.2 <- as.numeric(data$Sensor2_RelativeHumidity) + rh.3 <- as.numeric(data$Sensor3_RelativeHumidity) + #Temperature + temp.1 <- as.numeric(data$Sensor1_AvgTemp) + temp.2 <- as.numeric(data$Sensor2_AvgTemp) + temp.3 <- as.numeric(data$Sensor3_AvgTemp) + + #Find the date (and time) of the first record in the data set + start.date <- which(data$Observation == min(data$Observation)) + #Get the index for a 2 day delayed start to make the for loop work properly + delayed.start <- which(data$Observation == data$Observation[start.date] + days(2)) + index.list <- list() + for(i in delayed.start:nrow(data)) { + index.list[[i]] <- c(which(data$Observation == data$Observation[i]), which(data$Observation == data$Observation[i] - days(1)), + which(data$Observation == data$Observation[i] - days(2))) + } + #Create an empty list to store the index values for records that fail the test + sol.1.index.vector <- vector() + sol.2.index.vector <- vector() + rh.1.index.vector <- vector() + rh.2.index.vector <- vector() + rh.3.index.vector <- vector() + temp.1.index.vector <- vector() + temp.2.index.vector <- vector() + temp.3.index.vector <- vector() + for(i in 1:length(index.list)) { + sol.1.index.vector[i] <- ifelse(sol.1[index.list[[i]][1]] == sol.1[index.list[[i]][2]] & + sol.1[index.list[[i]][1]] == sol.1[index.list[[i]][3]], 'P1', ' ') + sol.2.index.vector[i] <- ifelse(sol.2[index.list[[i]][1]] == sol.2[index.list[[i]][2]] & + sol.2[index.list[[i]][1]] == sol.2[index.list[[i]][3]], 'P1', ' ') + rh.1.index.vector[i] <- ifelse(temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][2]] & + temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][3]], 'P1', ' ') + rh.2.index.vector[i] <- ifelse(temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][2]] & + temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][3]], 'P1', ' ') + rh.3.index.vector[i] <- ifelse(temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][2]] & + temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][3]], 'P1', ' ') + temp.1.index.vector[i] <- ifelse(temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][2]] & + temp.1[index.list[[i]][1]] == temp.1[index.list[[i]][3]], 'P1', ' ') + temp.2.index.vector[i] <- ifelse(temp.2[index.list[[i]][1]] == temp.2[index.list[[i]][2]] & + temp.2[index.list[[i]][1]] == temp.2[index.list[[i]][3]], 'P1', ' ') + temp.3.index.vector[i] <- ifelse(temp.3[index.list[[i]][1]] == temp.3[index.list[[i]][2]] & + temp.3[index.list[[i]][1]] == temp.3[index.list[[i]][3]], 'P1', ' ') + } + #Convert the list index for failed records into a vector index and return + sol.1.fail.list <- unlist(index.list[which(sol.1.index.vector == 'P1')]) + sol.2.fail.list <- unlist(index.list[which(sol.2.index.vector == 'P1')]) + rh.1.fail.list <- unlist(index.list[which(rh.1.index.vector == 'P1')]) + rh.2.fail.list <- unlist(index.list[which(rh.2.index.vector == 'P1')]) + rh.3.fail.list <- unlist(index.list[which(rh.3.index.vector == 'P1')]) + temp.1.fail.list <- unlist(index.list[which(temp.1.index.vector == 'P1')]) + temp.2.fail.list <- unlist(index.list[which(temp.2.index.vector == 'P1')]) + temp.3.fail.list <- unlist(index.list[which(temp.3.index.vector == 'P1')]) + #Create a data frame with results of the test + final.df <- data.frame(ReferenceID = data$ReferenceID, RecordID = data$RecordID, Observation = data$Observation, + TEAMSiteName = data$TEAMSiteName, SamplingUnitName = data$SamplingUnitName, sol.1 = sol.1, + sol.1.perTest = NA, sol.2 = sol.2, sol.2.perTest = NA, rh.1 = rh.1, rh.1.perTest = NA, + rh.2 = rh.2, rh.2.perTest = NA, rh.3 = rh.3, rh.3.perTest = NA, temp.1 = temp.1, + temp.1.perTest = NA, temp.2 = temp.2, temp.2.perTest = NA, temp.3 = temp.3, + temp.3.perTest = NA) + #Use the failed test index to store the error code + final.df$sol.1.perTest[sol.1.fail.list] <- 'P1' + final.df$sol.2.perTest[sol.2.fail.list] <- 'P1' + final.df$rh.1.perTest[rh.1.fail.list] <- 'P1' + final.df$rh.2.perTest[rh.2.fail.list] <- 'P1' + final.df$rh.3.perTest[rh.3.fail.list] <- 'P1' + final.df$temp.1.perTest[temp.1.fail.list] <- 'P1' + final.df$temp.2.perTest[temp.2.fail.list] <- 'P1' + final.df$temp.3.perTest[temp.3.fail.list] <- 'P1' + #Replace NAs with a blank + final.df$sol.1.perTest[is.na(final.df$sol.1.perTest)] <- ' ' + final.df$sol.2.perTest[is.na(final.df$sol.2.perTest)] <- ' ' + final.df$rh.1.perTest[is.na(final.df$rh.1.perTest)] <- ' ' + final.df$rh.2.perTest[is.na(final.df$rh.2.perTest)] <- ' ' + final.df$rh.3.perTest[is.na(final.df$rh.3.perTest)] <- ' ' + final.df$temp.1.perTest[is.na(final.df$temp.1.perTest)] <- ' ' + final.df$temp.2.perTest[is.na(final.df$temp.2.perTest)] <- ' ' + final.df$temp.3.perTest[is.na(final.df$temp.3.perTest)] <- ' ' + return(final.df) +} + +#Helper function to tally error rate +f.errorRate <- function(range, persistence) { + #Range error rate + noRows.range <- data.frame(noRows = c(nrow(range[!is.na(range$rh.1), ]), nrow(range[!is.na(range$rh.2), ]), nrow(range[!is.na(range$rh.3), ]), + nrow(range[!is.na(range$temp.1), ]), nrow(range[!is.na(range$temp.2), ]), nrow(range[!is.na(range$temp.3), ]), + nrow(range[!is.na(range$rain), ]), nrow(range[!is.na(range$rain), ]))) + row.names(noRows.range) <- c('rh.1', 'rh.2', 'rh.3', 'temp.1', 'temp.2', 'temp.3', 'rain1', 'rain2') + noRows.range$failed <- c(length(which(range$rh.1.rangeTest == 'R1')), length(which(range$rh.2.rangeTest == 'R1')), + length(which(range$rh.3.rangeTest == 'R1')), length(which(range$temp.1.rangeTest == 'R1')), + length(which(range$temp.2.rangeTest == 'R1')), length(which(range$temp.3.rangeTest == 'R1')), + length(which(range$rain.rangeTest1 == 'R1')), length(which(range$rain.rangeTest2 == 'R2'))) + noRows.range$errorRate <- noRows.range$failed / noRows.range$noRows + #Persistence error rate + noRows.per <- data.frame(noRows = c(nrow(persistence[!is.na(persistence$sol.1), ]), nrow(persistence[!is.na(persistence$sol.2), ]), nrow(persistence[!is.na(persistence$rh.1), ]), + nrow(persistence[!is.na(persistence$rh.2), ]), nrow(persistence[!is.na(persistence$rh.3), ]), nrow(persistence[!is.na(persistence$temp.1), ]), + nrow(persistence[!is.na(persistence$temp.2), ]), nrow(persistence[!is.na(persistence$temp.3), ]))) + row.names(noRows.per) <- c('sol.1', 'sol.2', 'rh.1', 'rh.2', 'rh.3', 'temp.1', 'temp.2', 'temp.3') + noRows.per$failed <- c(length(which(persistence$sol.1.perTest == 'P1')), length(which(persistence$sol.2.perTest == 'P1')), + length(which(persistence$rh.1.perTest == 'P1')), length(which(persistence$rh.2.perTest == 'P1')), + length(which(persistence$rh.3.perTest == 'P1')), length(which(persistence$temp.1.perTest == 'P1')), + length(which(persistence$temp.2.perTest == 'P1')), length(which(persistence$temp.3.perTest == 'P1'))) + noRows.per$errorRate <- noRows.per$failed / noRows.per$noRows + final.list <- list(noRows.range, noRows.per) + return(final.list) +} diff --git a/clim_run b/clim_run new file mode 100644 index 0000000..dd2308c --- /dev/null +++ b/clim_run @@ -0,0 +1,35 @@ +#source('~/Jimmy_Code/teamcode/tools/team_db_query.R') + +#Load in the TEAM climate data +#cldata <- f.teamdb.query('climate') +#clmeta <- cldata$cl_temp_maxmin +#cldata <- cldata$cl_data + +load('cl_data_new2014-04-24.gzip') +vb.data.3.0 <- cl_data_all[which(cl_data_all$TEAMSiteName == unique(cl_data_all$TEAMSiteName)[3] & + cl_data_all$ProtocolVersion == 'Climate 3'), ] +vb.data.3.0.st2 <- vb.data.3.0[which(vb.data.3.0$SamplingUnitName == 'CL-VB-2'), ] + +#Relative humidity range test 1 +#Less than a minute for 342637 records +rh.r1.test <- f.rh.r1(vb.data.3.0.st2) + +#Temperature range test 1 +#Less than a minute for 342637 records +temp.r1.test <- f.temp.r1(vb.data.3.0.st2) + +#NEED TO ADD TEMPERATURE RANGE TEST 2 + +#Rainfall range test 1 +#Runs at 5 minutes for 342637 records +rain.r1.test <- f.rain.r1(vb.data.3.0.st2) +#g<-cmpfun(f.rain.r1) +#compare <- microbenchmark(f.rain.r1(vb.data.3.0), g(vb.data.3.0), times = 1000) +#autoplot(compare) + +#Rainfall range test 2 +#Runs at less than a minute for 342637 records +rain.r2.test <- f.rain.r2(vb.data.3.0.st2) + +#Temperature persistence test 1 +temp.p1.test <- f.temp.p1(vb.data.3.0) diff --git a/clim_tests_v2 b/clim_tests_v2 new file mode 100644 index 0000000..dddcd7e --- /dev/null +++ b/clim_tests_v2 @@ -0,0 +1,37 @@ +#source('~/Jimmy_Code/teamcode/tools/team_db_query.R') + +#Load in the TEAM climate data +#cldata <- f.teamdb.query('climate') +#clmeta <- cldata$cl_temp_maxmin +#cldata <- cldata$cl_data + +load('cl_data_new2014-04-24.gzip') +vb.data.3.0 <- cl_data_all[which(cl_data_all$TEAMSiteName == unique(cl_data_all$TEAMSiteName)[3] & + cl_data_all$ProtocolVersion == 'Climate 3'), ] +nak.data.3.0 <- cl_data_all[which(cl_data_all$TEAMSiteName == unique(cl_data_all$TEAMSiteName)[1] & + cl_data_all$ProtocolVersion == 'Climate 3'), ] +vb.data.3.0.st2 <- vb.data.3.0[which(vb.data.3.0$SamplingUnitName == 'CL-VB-2'), ] +vb.use.data <- vb.data.3.0.st2[1:30000, ] +data<-vb.use.data[1:5000, ] + +#Range tests for relative humidity, temperature, and rainfall +#System Time: 54.2 seconds for 5,000 records +#System Time: 23396.4 seconds (6.5 hours) for 384,907 +system.time(rangeTest.Output <- f.rangeTest(nak.data.3.0)) + +#NEED TO ADD TEMPERATURE RANGE TEST 2 + +#Persistence tests for solar radiation, relative humidity, and temperature +#System Time: 75.1 seconds for 5,000 records +#System Time: 20883.3 seconds (5.8 hours) for 384,907 +system.time(persistenceTest.Output <- f.persistenceTest(nak.data.3.0)) + +#Calculate the error rate +#Returns a list of the error rate tables: range, step, consistency, and persistence +#System Time: 4.6 seconds for 2 tests +system.time(errorRate <- f.errorRate(rangeTest.Output, persistenceTest.Output)) + + +#g<-cmpfun(f.rain.r1) +#compare <- microbenchmark(f.rain.r1(vb.data.3.0), g(vb.data.3.0), times = 1000) +#autoplot(compare) diff --git a/rangeTest b/rangeTest new file mode 100644 index 0000000..fd5cc99 --- /dev/null +++ b/rangeTest @@ -0,0 +1,27 @@ +#source('~/Jimmy_Code/teamcode/tools/team_db_query.R') + +#Load in the TEAM climate data +#cldata <- f.teamdb.query('climate') +#clmeta <- cldata$cl_temp_maxmin +#cldata <- cldata$cl_data + +load('cl_data_new2014-04-24.gzip') +vb.data.3.0 <- cl_data_all[which(cl_data_all$TEAMSiteName == unique(cl_data_all$TEAMSiteName)[3] & + cl_data_all$ProtocolVersion == 'Climate 3'), ] + +#Relative humidity range test 1 +rh.r1.test <- f.rh.r1(vb.data) + +#Temperature range test 1 +temp.r1.test <- f.temp.r1(vb.data) + +#NEED TO ADD TEMPERATURE RANGE TEST 2 + +#Rainfall range test 1 +system.time(rain.r1.test <- f.rain.r1(vb.data.3.0)) +#g<-cmpfun(f.rain.r1) +#compare <- microbenchmark(f.rain.r1(vb.data.3.0), g(vb.data.3.0), times = 1000) +#autoplot(compare) + +#Rainfall range test 2 +rain.r2.test diff --git a/team_db_query.R b/team_db_query.R index 89c0589..f9a7bbc 100644 --- a/team_db_query.R +++ b/team_db_query.R @@ -1,6 +1,6 @@ # team_db_query.r # -#Hello! +#Hello! This is different from the master version. f.teamdb.query <- function(dataset) { # Database query function of TEAM production database. Returns a R Object for # each dataset and saves to the workspace. The function contains views or pre-