From 43b69fa7ed0b90dda7ccb644f22c74d7e0cbbb3f Mon Sep 17 00:00:00 2001 From: afilazzola Date: Mon, 15 Sep 2025 14:52:53 -0400 Subject: [PATCH 1/6] Revise how cell IDs are extracted to be memory optimized and reduce crashing --- src/4-background-data-generation.R | 298 ++++++++++++++++++----------- 1 file changed, 189 insertions(+), 109 deletions(-) diff --git a/src/4-background-data-generation.R b/src/4-background-data-generation.R index 95ade79..c219b39 100644 --- a/src/4-background-data-generation.R +++ b/src/4-background-data-generation.R @@ -9,7 +9,7 @@ # source dependencies ---------------------------------------------------------- -library(rsyncrosim) # install.packages("C:/GitHub/rsyncrosim", type="source", repos=NULL) +library(rsyncrosim) # install.packages("C:/GitHub/rsyncrosim", type="source", repos=NULL) library(terra) library(tidyr) library(dplyr) @@ -27,14 +27,28 @@ myProject <- rsyncrosim::project() myScenario <- scenario() # temp directory -ssimTempDir <- ssimEnvironment()$TransferDirectory +ssimTempDir <- ssimEnvironment()$TransferDirectory # Read in datasheets templateSheet <- datasheet(myScenario, "wisdm_TemplateRaster") -covariateDataSheet <- datasheet(myScenario, "wisdm_CovariateData", optional = T, lookupsAsFactors = F) +covariateDataSheet <- datasheet( + myScenario, + "wisdm_CovariateData", + optional = T, + lookupsAsFactors = F +) fieldDataSheet <- datasheet(myScenario, "wisdm_FieldData", optional = T) -backgroundDataOptionsSheet <- datasheet(myScenario, "wisdm_BackgroundDataOptions", optional = T) -siteDataSheet <- datasheet(myScenario, "wisdm_SiteData", optional = T, lookupsAsFactors = F) +backgroundDataOptionsSheet <- datasheet( + myScenario, + "wisdm_BackgroundDataOptions", + optional = T +) +siteDataSheet <- datasheet( + myScenario, + "wisdm_SiteData", + optional = T, + lookupsAsFactors = F +) # Set progress bar ------------------------------------------------------------- @@ -43,180 +57,246 @@ progressBar(type = "begin", totalSteps = steps) # Prep inputs ------------------------------------------------------------------ -# drop no data (-9999) sites that resulted from spatial aggregation -fieldDataSheet <- fieldDataSheet[!fieldDataSheet$Response == -9999,] +# drop no data (-9999) sites that resulted from spatial aggregation +fieldDataSheet <- fieldDataSheet[!fieldDataSheet$Response == -9999, ] -# Set defaults ---------------------------------------------------------------- +# Set defaults ---------------------------------------------------------------- ## Background data options sheet -if(nrow(backgroundDataOptionsSheet)<1){ - backgroundDataOptionsSheet <- addRow(backgroundDataOptionsSheet, list(GenerateBackgroundSites = FALSE)) +if (nrow(backgroundDataOptionsSheet) < 1) { + backgroundDataOptionsSheet <- addRow( + backgroundDataOptionsSheet, + list(GenerateBackgroundSites = FALSE) + ) } -if(is.na(backgroundDataOptionsSheet$GenerateBackgroundSites)){ backgroundDataOptionsSheet$GenerateBackgroundSites <- FALSE } -if(backgroundDataOptionsSheet$GenerateBackgroundSites){ - if(is.na(backgroundDataOptionsSheet$BackgroundSiteCount)){backgroundDataOptionsSheet$BackgroundSiteCount <- sum(fieldDataSheet$Response) } - if(is.na(backgroundDataOptionsSheet$BackgroundGenerationMethod)){backgroundDataOptionsSheet$BackgroundGenerationMethod <- "Kernel Density Estimate (KDE)"} - if(is.na(backgroundDataOptionsSheet$KDESurface)){ - if(backgroundDataOptionsSheet$BackgroundGenerationMethod == "Kernel Density Estimate (KDE)"){ +if (is.na(backgroundDataOptionsSheet$GenerateBackgroundSites)) { + backgroundDataOptionsSheet$GenerateBackgroundSites <- FALSE +} +if (backgroundDataOptionsSheet$GenerateBackgroundSites) { + if (is.na(backgroundDataOptionsSheet$BackgroundSiteCount)) { + backgroundDataOptionsSheet$BackgroundSiteCount <- sum( + fieldDataSheet$Response + ) + } + if (is.na(backgroundDataOptionsSheet$BackgroundGenerationMethod)) { + backgroundDataOptionsSheet$BackgroundGenerationMethod <- "Kernel Density Estimate (KDE)" + } + if (is.na(backgroundDataOptionsSheet$KDESurface)) { + if ( + backgroundDataOptionsSheet$BackgroundGenerationMethod == + "Kernel Density Estimate (KDE)" + ) { backgroundDataOptionsSheet$KDESurface <- "Continuous" - }} - if(is.na(backgroundDataOptionsSheet$Isopleth)){ - if(backgroundDataOptionsSheet$KDESurface == "Binary" | backgroundDataOptionsSheet$BackgroundGenerationMethod == "Minimum Convex Polygon (MCP)"){ + } + } + if (is.na(backgroundDataOptionsSheet$Isopleth)) { + if ( + backgroundDataOptionsSheet$KDESurface == "Binary" | + backgroundDataOptionsSheet$BackgroundGenerationMethod == + "Minimum Convex Polygon (MCP)" + ) { backgroundDataOptionsSheet$Isopleth <- 95 } } } -saveDatasheet(myScenario, backgroundDataOptionsSheet, "wisdm_BackgroundDataOptions") +saveDatasheet( + myScenario, + backgroundDataOptionsSheet, + "wisdm_BackgroundDataOptions" +) # Generate pseudo-absences (if applicable) ------------------------------------- -if(backgroundDataOptionsSheet$GenerateBackgroundSites){ - - if(backgroundDataOptionsSheet$BackgroundGenerationMethod == "Kernel Density Estimate (KDE)"){ - methodInputs <- list("method"="kde", - "surface"=backgroundDataOptionsSheet$KDESurface, - "isopleth"=backgroundDataOptionsSheet$Isopleth) - } - if(backgroundDataOptionsSheet$BackgroundGenerationMethod == "Minimum Convex Polygon (MCP)"){ - methodInputs <- list("method"="mcp", - "surface" = NA, - "isopleth"=backgroundDataOptionsSheet$Isopleth) - } +if (backgroundDataOptionsSheet$GenerateBackgroundSites) { + if ( + backgroundDataOptionsSheet$BackgroundGenerationMethod == + "Kernel Density Estimate (KDE)" + ) { + methodInputs <- list( + "method" = "kde", + "surface" = backgroundDataOptionsSheet$KDESurface, + "isopleth" = backgroundDataOptionsSheet$Isopleth + ) + } + if ( + backgroundDataOptionsSheet$BackgroundGenerationMethod == + "Minimum Convex Polygon (MCP)" + ) { + methodInputs <- list( + "method" = "mcp", + "surface" = NA, + "isopleth" = backgroundDataOptionsSheet$Isopleth + ) + } templateRaster <- rast(templateSheet$RasterFilePath) - - # create mask polygon from extent of valid data in template + + # create mask polygon from extent of valid data in template templateVector <- as.polygons(templateRaster) - + # generate background surface - backgroundSurfaceGeneration(sp = "species", - template = templateRaster, - method = methodInputs, - mask = templateVector, - outputDir = ssimTempDir, - dat = fieldDataSheet) - + backgroundSurfaceGeneration( + sp = "species", + template = templateRaster, + method = methodInputs, + mask = templateVector, + outputDir = ssimTempDir, + dat = fieldDataSheet + ) + progressBar() - rm(templateVector); gc() - + rm(templateVector) + gc() + # generate background (psuedo-absence) points - backgroundPointGeneration(sp = "species", - outputDir = ssimTempDir, - n = backgroundDataOptionsSheet$BackgroundSiteCount+100, - method = methodInputs, - # target_file = backgroundDataOptionsSheet, # this is an external input... - overwrite = T) - + backgroundPointGeneration( + sp = "species", + outputDir = ssimTempDir, + n = backgroundDataOptionsSheet$BackgroundSiteCount + 100, + method = methodInputs, + # target_file = backgroundDataOptionsSheet, # this is an external input... + overwrite = T + ) + progressBar() # add background point to field data - bgData <- read.csv(file.path(ssimTempDir, paste0("species_", methodInputs$method, "_bg_pts.csv"))) - - startId <- max(fieldDataSheet$SiteID, siteDataSheet$SiteID)+1 - bgData$SiteID <- startId:(startId+nrow(bgData)-1) + bgData <- read.csv(file.path( + ssimTempDir, + paste0("species_", methodInputs$method, "_bg_pts.csv") + )) + + startId <- max(fieldDataSheet$SiteID, siteDataSheet$SiteID) + 1 + bgData$SiteID <- startId:(startId + nrow(bgData) - 1) bgData$UseInModelEvaluation <- NA bgData$ModelSelectionSplit <- NA bgData$Weight <- NA - + fieldData <- rbind(fieldDataSheet, bgData) - + ## Remove background points that occur in pixels with presence points ----- - + # rasterize points data - r <- rast(ext(templateRaster), resolution = res(templateRaster), crs = crs(templateRaster)) + r <- rast( + ext(templateRaster), + resolution = res(templateRaster), + crs = crs(templateRaster) + ) pts <- vect(fieldData, geom = c("X", "Y"), crs = crs(templateRaster)) - rm(templateRaster, fieldData); gc() - - rIDs <- rast(r, vals = 1:(dim(r)[1]*dim(r)[2])) + rIDs <- init(r, "cell") + + # Extract cell IDs for each point cellPerPt <- terra::extract(rIDs, pts) cellPerPt$SiteID <- pts$SiteID names(cellPerPt)[2] <- "PixelID" - pts <- merge(pts, cellPerPt[,c("PixelID", "SiteID")]) - rm(rIDs, cellPerPt); gc() - + pts <- merge(pts, cellPerPt[, c("PixelID", "SiteID")]) + gc() + # check for duplicate pixel ids - if(any(duplicated(pts$PixelID))){ - dups <- pts[which(duplicated(pts$PixelID)),] + if (any(duplicated(pts$PixelID))) { + dups <- pts[which(duplicated(pts$PixelID)), ] dropSites <- dups$SiteID[which(dups$Response == -9998)] - pts <- pts[!pts$SiteID %in% dropSites,] + pts <- pts[!pts$SiteID %in% dropSites, ] } - + bgPts <- pts[pts$Response == -9998] bgPts$PixelID <- NULL - rm(pts, dups, dropSites); gc() - + rm(pts, dups, dropSites) + gc() + # remove extra bg sites - if(nrow(bgPts)>backgroundDataOptionsSheet$BackgroundSiteCount){ - bgPts <- sample(x = bgPts, size = backgroundDataOptionsSheet$BackgroundSiteCount, replace = F) + if (nrow(bgPts) > backgroundDataOptionsSheet$BackgroundSiteCount) { + bgPts <- sample( + x = bgPts, + size = backgroundDataOptionsSheet$BackgroundSiteCount, + replace = F + ) } - if(nrow(bgPts)% distinct(SiteID, CovariatesID, .keep_all = T) - rm(bgSiteData); gc() - + siteDataSheet <- siteDataSheet %>% + distinct(SiteID, CovariatesID, .keep_all = T) + rm(bgSiteData) + gc() + # save site data to scenario saveDatasheet(myScenario, siteDataSheet, "wisdm_SiteData") - rm(siteDataSheet); gc() - + rm(siteDataSheet) + gc() + # update field data - bgData <- bgData[which(bgData$SiteID %in% bgPts$SiteID),] - if(any(!is.na(fieldDataSheet$Weight))){ bgData$Weight <- 1 } + bgData <- bgData[which(bgData$SiteID %in% bgPts$SiteID), ] + if (any(!is.na(fieldDataSheet$Weight))) { + bgData$Weight <- 1 + } fieldDataSheet <- rbind(fieldDataSheet, bgData) fieldDataSheet$SiteID <- format(fieldDataSheet$SiteID, scientific = F) - rm(bgData, bgPts); gc() - + rm(bgData, bgPts) + gc() + # save updated field data to scenario saveDatasheet(myScenario, fieldDataSheet, "wisdm_FieldData", append = F) } progressBar(type = "end") - From 4d8e55fcf528d9431daa672037a76fe4c4116a56 Mon Sep 17 00:00:00 2001 From: afilazzola Date: Mon, 15 Sep 2025 14:55:22 -0400 Subject: [PATCH 2/6] added same optimization to the background point extraction --- src/4-background-data-generation.R | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/4-background-data-generation.R b/src/4-background-data-generation.R index c219b39..1d483b5 100644 --- a/src/4-background-data-generation.R +++ b/src/4-background-data-generation.R @@ -178,19 +178,22 @@ if (backgroundDataOptionsSheet$GenerateBackgroundSites) { ## Remove background points that occur in pixels with presence points ----- # rasterize points data - r <- rast( - ext(templateRaster), - resolution = res(templateRaster), - crs = crs(templateRaster) - ) - pts <- vect(fieldData, geom = c("X", "Y"), crs = crs(templateRaster)) - rIDs <- init(r, "cell") + rastPts <- rasterize(bgPts, r) + # Replace matrix conversion with direct value extraction + vals <- values(rastPts) + keep <- which(!is.na(vals)) - # Extract cell IDs for each point - cellPerPt <- terra::extract(rIDs, pts) - cellPerPt$SiteID <- pts$SiteID + rIDs <- init(r, "cell") + cellPerPt <- terra::extract(rIDs, bgPts) + cellPerPt$SiteID <- bgPts$SiteID names(cellPerPt)[2] <- "PixelID" - pts <- merge(pts, cellPerPt[, c("PixelID", "SiteID")]) + bgPts <- merge(bgPts, cellPerPt[, c("PixelID", "SiteID")]) + + rPixels <- rasterize(bgPts, r, field = "PixelID") + pixelVals <- values(rPixels) + PixelIDs <- pixelVals[keep] + + PixelData <- data.frame(PixelID = PixelIDs) gc() # check for duplicate pixel ids From bbfd5e0afa5fa937bdabdf59456600698ef7e08b Mon Sep 17 00:00:00 2001 From: afilazzola Date: Mon, 15 Sep 2025 15:03:03 -0400 Subject: [PATCH 3/6] Removed rasterization that was crashing with high res and high background occs. Now pulls cell ids directly --- src/4-background-data-generation.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/4-background-data-generation.R b/src/4-background-data-generation.R index 1d483b5..5969eb3 100644 --- a/src/4-background-data-generation.R +++ b/src/4-background-data-generation.R @@ -231,10 +231,8 @@ if (backgroundDataOptionsSheet$GenerateBackgroundSites) { ## Extract covariate data for background sites ----- # rasterize bg data - rastPts <- rasterize(bgPts, r) - matPts <- as.matrix(rastPts, wide = T) - keep <- which(!is.na(matPts)) - rm(rastPts) + cellIds <- cellFromXY(r, geom(bgPts)[, c("x", "y")]) + keep <- unique(cellIds) gc() rIDs <- rast(r, vals = 1:(dim(r)[1] * dim(r)[2])) From 8cea14e02d5f6e97062382de6c8d6d0fe251ad64 Mon Sep 17 00:00:00 2001 From: afilazzola Date: Mon, 15 Sep 2025 15:08:25 -0400 Subject: [PATCH 4/6] removed all matrix and used only the raster Id indexing for highest performance --- src/4-background-data-generation.R | 37 +++++++++++++++--------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/4-background-data-generation.R b/src/4-background-data-generation.R index 5969eb3..4451e97 100644 --- a/src/4-background-data-generation.R +++ b/src/4-background-data-generation.R @@ -230,34 +230,33 @@ if (backgroundDataOptionsSheet$GenerateBackgroundSites) { ## Extract covariate data for background sites ----- - # rasterize bg data + # extract bg points data cellIds <- cellFromXY(r, geom(bgPts)[, c("x", "y")]) keep <- unique(cellIds) - gc() - rIDs <- rast(r, vals = 1:(dim(r)[1] * dim(r)[2])) - cellPerPt <- terra::extract(rIDs, bgPts) - cellPerPt$SiteID <- bgPts$SiteID - names(cellPerPt)[2] <- "PixelID" - bgPts <- merge(bgPts, cellPerPt[, c("PixelID", "SiteID")]) - rm(rIDs) - gc() + # Get cell IDs for each point (simpler approach) + cellPerPt <- data.frame( + ID = 1:length(bgPts), # temporary ID for merging + PixelID = cellIds, + SiteID = bgPts$SiteID + ) + + bgPts$ID <- 1:length(bgPts) + bgPts <- merge(bgPts, cellPerPt[, c("ID", "PixelID", "SiteID")], by = "ID") + bgPts$ID <- NULL # Remove temporary ID + + PixelIDs <- keep + + cellToPixel <- data.frame(cell = cellIds, PixelID = cellIds) + cellToPixel <- cellToPixel[!duplicated(cellToPixel$cell), ] + PixelIDs <- cellToPixel$PixelID[match(keep, cellToPixel$cell)] - rPixels <- rasterize(bgPts, r, field = "PixelID") - matPixs <- as.matrix(rPixels, wide = T) - PixelIDs <- matPixs[keep] PixelData <- data.frame(PixelID = PixelIDs) - rm(rPixels, matPixs, PixelIDs) - gc() for (i in 1:nrow(covariateDataSheet)) { ri <- rast(covariateDataSheet$RasterFilePath[i]) - mi <- as.matrix(ri, wide = TRUE) - outMat <- mi * matPts - vals <- outMat[keep] - rm(ri, mi, outMat) - gc() + vals <- ri[keep] PixelData[covariateDataSheet$CovariatesID[i]] <- vals progressBar() From ffe838af72edfeb11a8363310620f60b25300195 Mon Sep 17 00:00:00 2001 From: afilazzola Date: Mon, 15 Sep 2025 15:28:41 -0400 Subject: [PATCH 5/6] Pretty sure the logic here should be fail any(fieldDataSheet$Response == -9998) --- src/8-fit-maxent.R | 430 ++++++++++++++++++++++++++++++++------------- 1 file changed, 308 insertions(+), 122 deletions(-) diff --git a/src/8-fit-maxent.R b/src/8-fit-maxent.R index 1d41ecd..b95b846 100644 --- a/src/8-fit-maxent.R +++ b/src/8-fit-maxent.R @@ -4,20 +4,20 @@ ## -------------------- # built under R version 4.1.3, SyncroSim 3.1.10 & rsyncrosim 2.1.3 -# script pulls in pre-processed field, site and covariate data; -# fits maxent model; builds model diagnostic and validation plots +# script pulls in pre-processed field, site and covariate data; +# fits maxent model; builds model diagnostic and validation plots # source dependencies ---------------------------------------------------------- -library(rsyncrosim) +library(rsyncrosim) library(tidyverse) -library(zip) +library(zip) packageDir <- Sys.getenv("ssim_package_directory") source(file.path(packageDir, "00-helper-functions.R")) source(file.path(packageDir, "08-fit-model-functions.R")) -# disable scientific notation +# disable scientific notation options(scipen = 999) # Set progress bar ------------------------------------------------------------- @@ -39,100 +39,174 @@ covariatesSheet <- datasheet(myScenario, "wisdm_Covariates", optional = T) modelsSheet <- datasheet(myScenario, "wisdm_Models") fieldDataSheet <- datasheet(myScenario, "wisdm_FieldData", optional = T) validationDataSheet <- datasheet(myScenario, "wisdm_ValidationOptions") -retainedCovariatesSheet <- datasheet(myScenario, "wisdm_RetainedCovariates", lookupsAsFactors = F) +retainedCovariatesSheet <- datasheet( + myScenario, + "wisdm_RetainedCovariates", + lookupsAsFactors = F +) siteDataSheet <- datasheet(myScenario, "wisdm_SiteData", lookupsAsFactors = F) maxentSheet <- datasheet(myScenario, "wisdm_Maxent") mulitprocessingSheet <- datasheet(myScenario, "core_Multiprocessing") -modelOutputsSheet <- datasheet(myScenario, "wisdm_OutputModel", optional = T, returnInvisible = T, empty = T, lookupsAsFactors = F) %>% drop_na() +modelOutputsSheet <- datasheet( + myScenario, + "wisdm_OutputModel", + optional = T, + returnInvisible = T, + empty = T, + lookupsAsFactors = F +) %>% + drop_na() progressBar() # Error handling --------------------------------------------------------------- # check for for background data -if(!any(fieldDataSheet$Response == -9998)){ - if(any(siteDataSheet$Response == 0)){ - stop("Maxent is a presence-background method; please select a method suitable for presence-absence data before continuing.") +if (any(fieldDataSheet$Response == -9998)) { + if (any(siteDataSheet$Response == 0)) { + stop( + "Maxent is a presence-background method; please select a method suitable for presence-absence data before continuing." + ) } - stop("Maxent is a presence-background method; please generate pseudo-absences before continuing. Note: Pseuso-absences can be generated in the '4 - Background Data Generation' Stage of the Pipeline. See 'Background Data Options' for more details.") + stop( + "Maxent is a presence-background method; please generate pseudo-absences before continuing. Note: Pseuso-absences can be generated in the '4 - Background Data Generation' Stage of the Pipeline. See 'Background Data Options' for more details." + ) } -# Set defaults ---------------------------------------------------------------- +# Set defaults ---------------------------------------------------------------- ## Validation Sheet -if(nrow(validationDataSheet)<1){ - validationDataSheet <- addRow(validationDataSheet, list(SplitData = FALSE, - CrossValidate = FALSE)) +if (nrow(validationDataSheet) < 1) { + validationDataSheet <- addRow( + validationDataSheet, + list(SplitData = FALSE, CrossValidate = FALSE) + ) +} +if (is.na(validationDataSheet$CrossValidate)) { + validationDataSheet$CrossValidate <- FALSE +} +if (is.na(validationDataSheet$SplitData)) { + validationDataSheet$SplitData <- FALSE } -if(is.na(validationDataSheet$CrossValidate)){validationDataSheet$CrossValidate <- FALSE} -if(is.na(validationDataSheet$SplitData)){validationDataSheet$SplitData <- FALSE} -## Maxent Sheet -if(nrow(maxentSheet)<1 | all(is.na(maxentSheet))){ +## Maxent Sheet +if (nrow(maxentSheet) < 1 | all(is.na(maxentSheet))) { maxentSheet <- maxentSheet %>% drop_na() - maxentSheet <- addRow(maxentSheet, list(MemoryLimit = 512, - VisibleInterface = FALSE, - MaximumBackgroundPoints = 10000, - SaveMaxentFiles = FALSE)) + maxentSheet <- addRow( + maxentSheet, + list( + MemoryLimit = 512, + VisibleInterface = FALSE, + MaximumBackgroundPoints = 10000, + SaveMaxentFiles = FALSE + ) + ) } -if(is.na(maxentSheet$MemoryLimit)){maxentSheet$MemoryLimit <- 512} -if(is.na(maxentSheet$MaximumBackgroundPoints)){maxentSheet$MaximumBackgroundPoints <- 10000} -if(is.na(maxentSheet$VisibleInterface)){maxentSheet$VisibleInterface <- FALSE} -if(is.na(maxentSheet$SaveMaxentFiles)){maxentSheet$SaveMaxentFiles <- FALSE} +if (is.na(maxentSheet$MemoryLimit)) { + maxentSheet$MemoryLimit <- 512 +} +if (is.na(maxentSheet$MaximumBackgroundPoints)) { + maxentSheet$MaximumBackgroundPoints <- 10000 +} +if (is.na(maxentSheet$VisibleInterface)) { + maxentSheet$VisibleInterface <- FALSE +} +if (is.na(maxentSheet$SaveMaxentFiles)) { + maxentSheet$SaveMaxentFiles <- FALSE +} -if(is.na(mulitprocessingSheet$EnableMultiprocessing) | mulitprocessingSheet$EnableMultiprocessing == FALSE){ +if ( + is.na(mulitprocessingSheet$EnableMultiprocessing) | + mulitprocessingSheet$EnableMultiprocessing == FALSE +) { maxentSheet$MultiprocessingThreads <- 1 - } else { - if(is.na(maxentSheet$MultiprocessingThreads)){ - maxentSheet$MultiprocessingThreads <- mulitprocessingSheet$MaximumJobs - } +} else { + if (is.na(maxentSheet$MultiprocessingThreads)) { + maxentSheet$MultiprocessingThreads <- mulitprocessingSheet$MaximumJobs } - -saveDatasheet(myScenario, maxentSheet, "wisdm_Maxent") +} + +saveDatasheet(myScenario, maxentSheet, "wisdm_Maxent") progressBar() # Prep data for model fitting -------------------------------------------------- siteDataWide <- spread(siteDataSheet, key = CovariatesID, value = "Value") -rm(siteDataSheet); gc() +rm(siteDataSheet) +gc() # remove variables dropped due to correlation -siteDataWide <- siteDataWide[,c('SiteID', retainedCovariatesSheet$CovariatesID)] +siteDataWide <- siteDataWide[, c( + 'SiteID', + retainedCovariatesSheet$CovariatesID +)] # merge field and site data siteDataWide <- merge(fieldDataSheet, siteDataWide, by = "SiteID") -rm(fieldDataSheet); gc() +rm(fieldDataSheet) +gc() -# remove sites with incomplete data +# remove sites with incomplete data allCases <- nrow(siteDataWide) -siteDataWide <- siteDataWide[complete.cases(subset(siteDataWide, select = c(-UseInModelEvaluation, -ModelSelectionSplit, -Weight))),] +siteDataWide <- siteDataWide[ + complete.cases(subset( + siteDataWide, + select = c(-UseInModelEvaluation, -ModelSelectionSplit, -Weight) + )), +] compCases <- nrow(siteDataWide) -if(compCases/allCases < 0.9){updateRunLog(paste("\nWarning: ", round((1-compCases/allCases)*100,digits=2),"% of cases were removed because of missing values.\n",sep=""))} +if (compCases / allCases < 0.9) { + updateRunLog(paste( + "\nWarning: ", + round((1 - compCases / allCases) * 100, digits = 2), + "% of cases were removed because of missing values.\n", + sep = "" + )) +} # set site weights to default of 1 if not already supplied -if(all(is.na(siteDataWide$Weight))){siteDataWide$Weight <- 1} +if (all(is.na(siteDataWide$Weight))) { + siteDataWide$Weight <- 1 +} -# set pseudo absences to zero -if(any(siteDataWide$Response == -9998)){pseudoAbs <- TRUE} else {pseudoAbs <- FALSE} +# set pseudo absences to zero +if (any(siteDataWide$Response == -9998)) { + pseudoAbs <- TRUE +} else { + pseudoAbs <- FALSE +} siteDataWide$Response[siteDataWide$Response == -9998] <- 0 # set categorical variable to factor -factorInputVars <- covariatesSheet$CovariateName[which(covariatesSheet$IsCategorical == T & covariatesSheet$CovariateName %in% names(siteDataWide))] -if(length(factorInputVars)>0){ - for (i in factorInputVars){ - siteDataWide[,i] <- factor(siteDataWide[,i]) +factorInputVars <- covariatesSheet$CovariateName[which( + covariatesSheet$IsCategorical == T & + covariatesSheet$CovariateName %in% names(siteDataWide) +)] +if (length(factorInputVars) > 0) { + for (i in factorInputVars) { + siteDataWide[, i] <- factor(siteDataWide[, i]) } } -# identify training and testing sites -trainTestDatasets <- split(siteDataWide, f = siteDataWide[,"UseInModelEvaluation"], drop = T) -rm(siteDataWide); gc() +# identify training and testing sites +trainTestDatasets <- split( + siteDataWide, + f = siteDataWide[, "UseInModelEvaluation"], + drop = T +) +rm(siteDataWide) +gc() trainingData <- trainTestDatasets$`FALSE` -if(!validationDataSheet$CrossValidate){trainingData$ModelSelectionSplit <- FALSE} +if (!validationDataSheet$CrossValidate) { + trainingData$ModelSelectionSplit <- FALSE +} testingData <- trainTestDatasets$`TRUE` -rm(trainTestDatasets); gc() -if(!is.null(testingData)){testingData$ModelSelectionSplit <- FALSE} +rm(trainTestDatasets) +gc() +if (!is.null(testingData)) { + testingData$ModelSelectionSplit <- FALSE +} progressBar() # Model definitions ------------------------------------------------------------ @@ -145,12 +219,12 @@ out$modType <- modType <- "maxent" ## Model options out$modOptions <- maxentSheet -out$modOptions$thresholdOptimization <- "Sens=Spec" # To Do: link to defined Threshold Optimization Method in UI - currently set to default: sensitivity=specificity +out$modOptions$thresholdOptimization <- "Sens=Spec" # To Do: link to defined Threshold Optimization Method in UI - currently set to default: sensitivity=specificity -## Model family +## Model family out$modelFamily <- "binomial" -## Candidate variables +## Candidate variables out$inputVars <- retainedCovariatesSheet$CovariatesID out$factorInputVars <- factorInputVars @@ -158,9 +232,9 @@ out$factorInputVars <- factorInputVars out$pseudoAbs <- pseudoAbs ## Validation options -out$validationOptions <- validationDataSheet +out$validationOptions <- validationDataSheet -## path to temp ssim storage +## path to temp ssim storage out$tempDir <- ssimTempDir # Format and save data for use in Maxent ------------------------------------------- @@ -171,72 +245,118 @@ dir.create(file.path(ssimTempDir, "Outputs")) ## training data -out$swdPath <- swdPath <- file.path(ssimTempDir, "Inputs", "training-swd.csv", fsep = "\\") +out$swdPath <- swdPath <- file.path( + ssimTempDir, + "Inputs", + "training-swd.csv", + fsep = "\\" +) trainingData %>% mutate(Species = case_when(Response == 1 ~ "species")) %>% drop_na(Species) %>% - select(-SiteID, -Response, -UseInModelEvaluation, -ModelSelectionSplit, -Weight) %>% + select( + -SiteID, + -Response, + -UseInModelEvaluation, + -ModelSelectionSplit, + -Weight + ) %>% relocate(Species, .before = X) %>% write.csv(swdPath, row.names = F) out$data$train <- trainingData -if(pseudoAbs){ - - out$backgroundPath <- backgroundPath <- file.path(ssimTempDir, "Inputs", "background-swd.csv", fsep = "\\") - +if (pseudoAbs) { + out$backgroundPath <- backgroundPath <- file.path( + ssimTempDir, + "Inputs", + "background-swd.csv", + fsep = "\\" + ) + trainingData %>% mutate(Species = case_when(Response != 1 ~ "background")) %>% drop_na(Species) %>% - select(-SiteID, -Response, -UseInModelEvaluation, -ModelSelectionSplit, -Weight) %>% + select( + -SiteID, + -Response, + -UseInModelEvaluation, + -ModelSelectionSplit, + -Weight + ) %>% relocate(Species, .before = X) %>% write.csv(backgroundPath, row.names = F) - } +} ## testing data -if(!is.null(testingData)){ - out$testDataPath <- testDataPath <- file.path(ssimTempDir, "Inputs", "testing-swd.csv", fsep = "\\") +if (!is.null(testingData)) { + out$testDataPath <- testDataPath <- file.path( + ssimTempDir, + "Inputs", + "testing-swd.csv", + fsep = "\\" + ) testingData %>% mutate(Species = case_when(Response == 1 ~ "species")) %>% drop_na(Species) %>% - select(-SiteID, -Response, -UseInModelEvaluation, -ModelSelectionSplit, -Weight) %>% + select( + -SiteID, + -Response, + -UseInModelEvaluation, + -ModelSelectionSplit, + -Weight + ) %>% relocate(Species, .before = X) %>% write.csv(testDataPath, row.names = F) - + out$data$test <- testingData - rm(testingData); gc() - } + rm(testingData) + gc() +} out$batchPath <- file.path(ssimTempDir, "Inputs", "runMaxent.bat", fsep = "\\") # out$maxJobs <- mulitprocessingSheet$MaximumJobs # Create output text file ------------------------------------------------------ -capture.output(cat("Maxent Results"), file = file.path(ssimTempDir, paste0(modType, "_output.txt"))) -on.exit(capture.output(cat("Model Failed\n\n"),file = file.path(ssimTempDir, paste0(modType, "_output.txt")),append=TRUE)) +capture.output( + cat("Maxent Results"), + file = file.path(ssimTempDir, paste0(modType, "_output.txt")) +) +on.exit(capture.output( + cat("Model Failed\n\n"), + file = file.path(ssimTempDir, paste0(modType, "_output.txt")), + append = TRUE +)) # Review model data ------------------------------------------------------------ -if(nrow(out$data$train)/(length(out$inputVars)-1)<10){ - updateRunLog(paste("\nYou have approximately ", round(nrow(out$data$train)/(ncol(out$data$train)-1),digits=1), - " observations for every predictor\n consider reducing the number of predictors before continuing\n",sep="")) +if (nrow(out$data$train) / (length(out$inputVars) - 1) < 10) { + updateRunLog(paste( + "\nYou have approximately ", + round(nrow(out$data$train) / (ncol(out$data$train) - 1), digits = 1), + " observations for every predictor\n consider reducing the number of predictors before continuing\n", + sep = "" + )) } progressBar() # Fit model -------------------------------------------------------------------- -finalMod <- fitModel(dat = NULL, # maxent code pulls in data from csv files built/saved above - out = out) +finalMod <- fitModel( + dat = NULL, # maxent code pulls in data from csv files built/saved above + out = out +) # finalMod$trainingData <- trainingData # save model to temp storage # saveRDS(finalMod, file = paste0(ssimTempDir,"\\Data\\", modType, "_model.rds")) # finalMod$trainingData <- NULL -# add relevant model details to out +# add relevant model details to out out$finalMod <- finalMod out$finalVars <- out$inputVars # maxent doesn't drop variables out$nVarsFinal <- length(out$finalVars) @@ -244,52 +364,84 @@ out$nVarsFinal <- length(out$finalVars) # load maxent run results runReults <- read.csv(file.path(ssimTempDir, "Outputs", "maxentResults.csv")) -modSummary <- data.frame("Variabels" = gsub(".contribution", "", names(runReults)[grep("contribution",names(runReults))])) -modSummary$Contribution <- t(runReults[grep("contribution",names(runReults))]) +modSummary <- data.frame( + "Variabels" = gsub( + ".contribution", + "", + names(runReults)[grep("contribution", names(runReults))] + ) +) +modSummary$Contribution <- t(runReults[grep("contribution", names(runReults))]) updateRunLog("\nSummary of Model:\n") coeftbl <- modSummary rownames(coeftbl) <- NULL -updateRunLog(pander::pandoc.table.return(coeftbl, style = "simple", split.tables = 100)) -capture.output(cat("\n\n"), modSummary, file = file.path(ssimTempDir, paste0(modType, "_output.txt")), append=TRUE) +updateRunLog(pander::pandoc.table.return( + coeftbl, + style = "simple", + split.tables = 100 +)) +capture.output( + cat("\n\n"), + modSummary, + file = file.path(ssimTempDir, paste0(modType, "_output.txt")), + append = TRUE +) progressBar() # Test model predictions ------------------------------------------------------- -out$data$train$predicted <- pred.fct(x=out$data$train, mod=finalMod, modType=modType) # alternatively should these be pulled from the samplePredictions.csv output by maxent?? - -if(validationDataSheet$SplitData){ - out$data$test$predicted <- pred.fct(x=out$data$test, mod=finalMod, modType=modType) +out$data$train$predicted <- pred.fct( + x = out$data$train, + mod = finalMod, + modType = modType +) # alternatively should these be pulled from the samplePredictions.csv output by maxent?? + +if (validationDataSheet$SplitData) { + out$data$test$predicted <- pred.fct( + x = out$data$test, + mod = finalMod, + modType = modType + ) } progressBar() # Evaluate thresholds (for use with binary output) ---------------------------- -predOcc <- out$data$train[out$data$train$Response >= 1 , "predicted"] -predAbs <- out$data$train[out$data$train$Response == 0 , "predicted"] +predOcc <- out$data$train[out$data$train$Response >= 1, "predicted"] +predAbs <- out$data$train[out$data$train$Response == 0, "predicted"] evalOut <- modelEvaluation(predOcc = predOcc, predAbs = predAbs) finalMod$binThresholds <- thresholds <- evalOut@thresholds -rm(predOcc, predAbs, evalOut); gc() +rm(predOcc, predAbs, evalOut) +gc() -names(thresholds) <- c("Max kappa", "Max sensitivity and specificity", "No omission", - "Prevalence", "Sensitivity equals specificity") +names(thresholds) <- c( + "Max kappa", + "Max sensitivity and specificity", + "No omission", + "Prevalence", + "Sensitivity equals specificity" +) updateRunLog("\nThresholds:\n") -tbl <- round(thresholds, 6) -updateRunLog(pander::pandoc.table.return(tbl, style = "simple", split.tables = 100)) +tbl <- round(thresholds, 6) +updateRunLog(pander::pandoc.table.return( + tbl, + style = "simple", + split.tables = 100 +)) # save model info to temp storage finalMod$trainingData <- trainingData saveRDS(finalMod, file = file.path(ssimTempDir, paste0(modType, "_model.rds"))) -rm(finalMod); gc() +rm(finalMod) +gc() # Run Cross Validation (if specified) ------------------------------------------ -if(validationDataSheet$CrossValidate){ - - out <- cv.fct(out = out, - nfolds = validationDataSheet$NumberOfFolds) +if (validationDataSheet$CrossValidate) { + out <- cv.fct(out = out, nfolds = validationDataSheet$NumberOfFolds) } progressBar() @@ -297,7 +449,7 @@ progressBar() ## AUC/ROC - Residual Plots - Variable Importance - Calibration - Confusion Matrix ## -out <- suppressWarnings(makeModelEvalPlots(out=out)) +out <- suppressWarnings(makeModelEvalPlots(out = out)) progressBar() ## Response Curves ## @@ -310,31 +462,66 @@ progressBar() tempFiles <- list.files(ssimTempDir) # add model Outputs to datasheet -modelOutputsSheet <- addRow(modelOutputsSheet, - list(ModelsID = modelsSheet$ModelName[modelsSheet$ModelType == modType], - ModelRDS = file.path(ssimTempDir, paste0(modType, "_model.rds")), - ResponseCurves = file.path(ssimTempDir, paste0(modType, "_ResponseCurves.png")), - TextOutput = file.path(ssimTempDir, paste0(modType, "_output.txt")), - ResidualSmoothPlot = file.path(ssimTempDir, paste0(modType, "_ResidualSmoothPlot.png")), - ResidualSmoothRDS = file.path(ssimTempDir, paste0(modType, "_ResidualSmoothFunction.rds")), - ConfusionMatrix = file.path(ssimTempDir, paste0(modType, "_ConfusionMatrix.png")), - VariableImportancePlot = file.path(ssimTempDir, paste0(modType, "_VariableImportance.png")), - VariableImportanceData = file.path(ssimTempDir, paste0(modType, "_VariableImportance.csv")), - ROCAUCPlot = file.path(ssimTempDir, paste0(modType, "_ROCAUCPlot.png")), - CalibrationPlot = file.path(ssimTempDir, paste0(modType, "_CalibrationPlot.png")))) - - -if("maxent_StandardResidualPlots.png" %in% tempFiles){ modelOutputsSheet$ResidualsPlot <- file.path(ssimTempDir, paste0(modType, "_StandardResidualPlots.png")) } -if("maxent_AUCPRPlot.png" %in% tempFiles){ modelOutputsSheet$AUCPRPlot <- file.path(ssimTempDir, paste0(modType, "_AUCPRPlot.png")) } +modelOutputsSheet <- addRow( + modelOutputsSheet, + list( + ModelsID = modelsSheet$ModelName[modelsSheet$ModelType == modType], + ModelRDS = file.path(ssimTempDir, paste0(modType, "_model.rds")), + ResponseCurves = file.path( + ssimTempDir, + paste0(modType, "_ResponseCurves.png") + ), + TextOutput = file.path(ssimTempDir, paste0(modType, "_output.txt")), + ResidualSmoothPlot = file.path( + ssimTempDir, + paste0(modType, "_ResidualSmoothPlot.png") + ), + ResidualSmoothRDS = file.path( + ssimTempDir, + paste0(modType, "_ResidualSmoothFunction.rds") + ), + ConfusionMatrix = file.path( + ssimTempDir, + paste0(modType, "_ConfusionMatrix.png") + ), + VariableImportancePlot = file.path( + ssimTempDir, + paste0(modType, "_VariableImportance.png") + ), + VariableImportanceData = file.path( + ssimTempDir, + paste0(modType, "_VariableImportance.csv") + ), + ROCAUCPlot = file.path(ssimTempDir, paste0(modType, "_ROCAUCPlot.png")), + CalibrationPlot = file.path( + ssimTempDir, + paste0(modType, "_CalibrationPlot.png") + ) + ) +) + + +if ("maxent_StandardResidualPlots.png" %in% tempFiles) { + modelOutputsSheet$ResidualsPlot <- file.path( + ssimTempDir, + paste0(modType, "_StandardResidualPlots.png") + ) +} +if ("maxent_AUCPRPlot.png" %in% tempFiles) { + modelOutputsSheet$AUCPRPlot <- file.path( + ssimTempDir, + paste0(modType, "_AUCPRPlot.png") + ) +} # save maxent files to compressed zip -if(maxentSheet$SaveMaxentFiles){ +if (maxentSheet$SaveMaxentFiles) { setwd(out$tempDir) - + # create zip file of all maxent specific inputs/outputs - zipName <- paste0("Maxent-",ssimEnvironment()$ScenarioId, ".zip") + zipName <- paste0("Maxent-", ssimEnvironment()$ScenarioId, ".zip") zip(zipfile = zipName, files = c("Inputs", "Outputs")) - + # save zip to model outputs modelOutputsSheet$MaxentFiles <- file.path(ssimTempDir, zipName) } @@ -342,4 +529,3 @@ if(maxentSheet$SaveMaxentFiles){ # save outputs datasheet saveDatasheet(myScenario, modelOutputsSheet, "wisdm_OutputModel", append = T) progressBar(type = "end") - From 319c167053623e49316b6f1c04c9f5ab56cdbb70 Mon Sep 17 00:00:00 2001 From: afilazzola Date: Mon, 15 Sep 2025 15:33:38 -0400 Subject: [PATCH 6/6] Fixed typos in variable reduction run log prints --- src/6-variable-reduction.R | 333 ++++++++++++++++++++++++------------- 1 file changed, 218 insertions(+), 115 deletions(-) diff --git a/src/6-variable-reduction.R b/src/6-variable-reduction.R index 87c67e4..9e0d662 100644 --- a/src/6-variable-reduction.R +++ b/src/6-variable-reduction.R @@ -10,7 +10,7 @@ # source dependencies ---------------------------------------------------------- -library(rsyncrosim) # install.packages("C:/GitHub/rsyncrosim", type="source", repos=NULL) +library(rsyncrosim) # install.packages("C:/GitHub/rsyncrosim", type="source", repos=NULL) library(tidyr) library(dplyr) library(shiny) @@ -31,30 +31,49 @@ myProject <- rsyncrosim::project() myScenario <- scenario() # path to ssim directories -ssimTempDir <- ssimEnvironment()$TransferDirectory +ssimTempDir <- ssimEnvironment()$TransferDirectory # Read in datasheets -covariatesSheet <- datasheet(myProject, "wisdm_Covariates", optional = T, includeKey = T) +covariatesSheet <- datasheet( + myProject, + "wisdm_Covariates", + optional = T, + includeKey = T +) fieldDataSheet <- datasheet(myScenario, "wisdm_FieldData", optional = T) -siteDataSheet <- datasheet(myScenario, "wisdm_SiteData", lookupsAsFactors = F) -covariateSelectionSheet <- datasheet(myScenario, "wisdm_CovariateSelectionOptions", optional = T) +siteDataSheet <- datasheet(myScenario, "wisdm_SiteData", lookupsAsFactors = F) +covariateSelectionSheet <- datasheet( + myScenario, + "wisdm_CovariateSelectionOptions", + optional = T +) retainedCovariatesSheet <- datasheet(myScenario, "wisdm_RetainedCovariates") -covariateCorrelationSheet <- datasheet(myScenario, "wisdm_OutputCovariateCorrelationMatrix", optional = T) %>% drop_na() +covariateCorrelationSheet <- datasheet( + myScenario, + "wisdm_OutputCovariateCorrelationMatrix", + optional = T +) %>% + drop_na() progressBar() # Set defaults ----------------------------------------------------------------- -## Covariate selection -if(nrow(covariateSelectionSheet)<1){ - covariateSelectionSheet <- addRow(covariateSelectionSheet, list(SelectionMethod = "Interactive (Correlation Viewer)")) +## Covariate selection +if (nrow(covariateSelectionSheet) < 1) { + covariateSelectionSheet <- addRow( + covariateSelectionSheet, + list(SelectionMethod = "Interactive (Correlation Viewer)") + ) } -if(is.na(covariateSelectionSheet$SelectionMethod)){covariateSelectionSheet$SelectionMethod <- "Interactive (Correlation Viewer)"} +if (is.na(covariateSelectionSheet$SelectionMethod)) { + covariateSelectionSheet$SelectionMethod <- "Interactive (Correlation Viewer)" +} ## Retained covariates -if(any(is.na(retainedCovariatesSheet$CovariatesID))){ +if (any(is.na(retainedCovariatesSheet$CovariatesID))) { retainedCovariatesSheet <- na.omit(retainedCovariatesSheet) } @@ -63,37 +82,52 @@ if(any(is.na(retainedCovariatesSheet$CovariatesID))){ # merge field and site data siteDataWide <- spread(siteDataSheet, key = CovariatesID, value = "Value") siteDataWide <- merge(fieldDataSheet, siteDataWide, by = "SiteID") -siteData <- select(siteDataWide, -c(SiteID, X, Y, UseInModelEvaluation, ModelSelectionSplit, Weight)) # -rm(siteDataSheet, fieldDataSheet, siteDataWide); gc() +siteData <- select( + siteDataWide, + -c(SiteID, X, Y, UseInModelEvaluation, ModelSelectionSplit, Weight) +) # +rm(siteDataSheet, fieldDataSheet, siteDataWide) +gc() # identify categorical covariates and drop any with a single level -if(sum(covariatesSheet$IsCategorical, na.rm = T)>0){ - factorVars <- covariatesSheet$CovariateName[which(covariatesSheet$IsCategorical == T & covariatesSheet$CovariateName %in% names(siteData))] - if(length(factorVars)>0){ +if (sum(covariatesSheet$IsCategorical, na.rm = T) > 0) { + factorVars <- covariatesSheet$CovariateName[which( + covariatesSheet$IsCategorical == T & + covariatesSheet$CovariateName %in% names(siteData) + )] + if (length(factorVars) > 0) { badFactors <- NULL - for (i in 1:length(factorVars)){ - factor.table <- table(siteData[,factorVars[i]]) - if(length(factor.table)<2){ badFactors <- c(badFactors, factorVars[i]) } + for (i in 1:length(factorVars)) { + factor.table <- table(siteData[, factorVars[i]]) + if (length(factor.table) < 2) { + badFactors <- c(badFactors, factorVars[i]) + } } - if(length(badFactors) > 0){ + if (length(badFactors) > 0) { factorVars <- factorVars[-which(factorVars %in% badFactors)] - if(length(factorVars) == 0){ factorVars <- NULL } - updateRunLog(paste0("\nThe following categorical response variables were removed from consideration\n", - "because they had only one level: ",paste(badFactors, collapse=","),"\n")) + if (length(factorVars) == 0) { + factorVars <- NULL + } + updateRunLog(paste0( + "\nThe following categorical response variables were removed from consideration\n", + "because they had only one level: ", + paste(badFactors, collapse = ","), + "\n" + )) } - } else { + } else { badFactors <- NULL - } -} else { + } +} else { factorVars <- NULL badFactors <- NULL - } +} -# model family +# model family modelFamily <- "binomial" # Ignore background data if present -siteData <- siteData[!siteData$Response == -9999,] +siteData <- siteData[!siteData$Response == -9999, ] # update response for pseudo-absence sites siteData$Response[siteData$Response == -9998] <- 0 @@ -105,137 +139,206 @@ progressBar() # Automatic selection (using VIF) ---------------------------------------------- -if(covariateSelectionSheet$SelectionMethod == "Automatic (Variance Inflation Factor)"){ - - ## Covariate selection - if(is.na(covariateSelectionSheet$CorrelationThreshold)){covariateSelectionSheet$CorrelationThreshold <- 0.7} - if(is.na(covariateSelectionSheet$VIFThreshold)){covariateSelectionSheet$VIFThreshold <- 10} - - saveDatasheet(myScenario, covariateSelectionSheet, "wisdm_CovariateSelectionOptions") - +if ( + covariateSelectionSheet$SelectionMethod == + "Automatic (Variance Inflation Factor)" +) { + ## Covariate selection + if (is.na(covariateSelectionSheet$CorrelationThreshold)) { + covariateSelectionSheet$CorrelationThreshold <- 0.7 + } + if (is.na(covariateSelectionSheet$VIFThreshold)) { + covariateSelectionSheet$VIFThreshold <- 10 + } + + saveDatasheet( + myScenario, + covariateSelectionSheet, + "wisdm_CovariateSelectionOptions" + ) + source(file.path(packageDir, "usdm-vif.R")) - - if(length(retainedCovariatesSheet$CovariatesID)>0){ - colin <- vifstep(x = covData, th = covariateSelectionSheet$VIFThreshold, keep = as.character(retainedCovariatesSheet$CovariatesID)) - } else{ + + if (length(retainedCovariatesSheet$CovariatesID) > 0) { + colin <- vifstep( + x = covData, + th = covariateSelectionSheet$VIFThreshold, + keep = as.character(retainedCovariatesSheet$CovariatesID) + ) + } else { colin <- vifstep(x = covData, th = covariateSelectionSheet$VIFThreshold) } - - selectedCovariates <- colin@results$Variables - + + selectedCovariates <- colin@results$Variables + dropCovs <- colin@excluded allCovs <- colin@variables - - updateRunLog("\n",length(dropCovs), " of the ", length(allCovs), " input variables were removed from the covartite list due to collinarity:\n\n", paste(dropCovs, collapse = "\n")) - updateRunLog("VIFs of remaining varaibles") - updateRunLog(pander::pandoc.table.return(colin@results, style = "simple", split.tables = 100)) - + + updateRunLog( + "\n", + length(dropCovs), + " of the ", + length(allCovs), + " input variables were removed from the covariate list due to collinearity:\n\n", + paste(dropCovs, collapse = "\n") + ) + updateRunLog("VIFs of remaining variables") + updateRunLog(pander::pandoc.table.return( + colin@results, + style = "simple", + split.tables = 100 + )) + corMat <- colin@corMatrix corMat[lower.tri(corMat, diag = TRUE)] <- NA - corVals <- unique(corMat[corMat > covariateSelectionSheet$CorrelationThreshold]) + corVals <- unique(corMat[ + corMat > covariateSelectionSheet$CorrelationThreshold + ]) corVals <- corVals[!is.na(corVals)] - - if(length(corVals)>0){ - + + if (length(corVals) > 0) { updateRunLog("\nThe following correlated covariates were retained:\n") - - for(i in corVals){ - row <- row.names(corMat)[as.vector(which(corMat==i,arr.ind=TRUE))[1]] - col <- colnames(corMat)[as.vector(which(corMat==i,arr.ind=TRUE))[2]] - updateRunLog(paste0(row, " ~ ", col, ": ", round(i,4))) + + for (i in corVals) { + row <- row.names(corMat)[as.vector(which(corMat == i, arr.ind = TRUE))[1]] + col <- colnames(corMat)[as.vector(which(corMat == i, arr.ind = TRUE))[2]] + updateRunLog(paste0(row, " ~ ", col, ": ", round(i, 4))) } } -progressBar() + progressBar() } # end if "Automatic" # Interactive Selection (using viewer) ------------------------------------------ -if(covariateSelectionSheet$SelectionMethod == "Interactive (Correlation Viewer)"){ - - ## Covariate selection - if(is.na(covariateSelectionSheet$DisplayHighestCorrelations)){covariateSelectionSheet$DisplayHighestCorrelations <- TRUE} - if(is.na(covariateSelectionSheet$CorrelationThreshold)){covariateSelectionSheet$CorrelationThreshold <- 0.7} - if(is.na(covariateSelectionSheet$NumberOfPlots)){covariateSelectionSheet$NumberOfPlots <- 5} - - saveDatasheet(myScenario, covariateSelectionSheet, "wisdm_CovariateSelectionOptions") - +if ( + covariateSelectionSheet$SelectionMethod == "Interactive (Correlation Viewer)" +) { + ## Covariate selection + if (is.na(covariateSelectionSheet$DisplayHighestCorrelations)) { + covariateSelectionSheet$DisplayHighestCorrelations <- TRUE + } + if (is.na(covariateSelectionSheet$CorrelationThreshold)) { + covariateSelectionSheet$CorrelationThreshold <- 0.7 + } + if (is.na(covariateSelectionSheet$NumberOfPlots)) { + covariateSelectionSheet$NumberOfPlots <- 5 + } + + saveDatasheet( + myScenario, + covariateSelectionSheet, + "wisdm_CovariateSelectionOptions" + ) + # prep deviance explained data devExp <- vector() - for(i in (1:ncol(covData))){ - devExp[i] <- try(my.panel.smooth(x = covData[,i], - y = siteData$Response, - plot.it=FALSE, - family=modelFamily),silent=TRUE) + for (i in (1:ncol(covData))) { + devExp[i] <- try( + my.panel.smooth( + x = covData[, i], + y = siteData$Response, + plot.it = FALSE, + family = modelFamily + ), + silent = TRUE + ) } - devExp <- round(devExp,2) + devExp <- round(devExp, 2) devInfo <- as.data.frame(devExp) devInfo$covs <- names(covData) devInfo$covDE <- paste0(devInfo$covs, " (", devInfo$devExp, ")") covsDE <- devInfo$covs names(covsDE) <- devInfo$covDE - + # run pairs explore with all variables ----------------------------------------- - + options <- covariateSelectionSheet - options$NumberOfPlots <- ncol(select(siteData, -Response, -all_of(badFactors))) - - pairsExplore(inputData = siteData, - options = options, - selectedCovs = names(select(siteData, -Response, -all_of(badFactors))), - factorVars = factorVars, - family = modelFamily, - outputFile = file.path(ssimTempDir, "InitialCovariateCorrelationMatrix.png")) - + options$NumberOfPlots <- ncol(select( + siteData, + -Response, + -all_of(badFactors) + )) + + pairsExplore( + inputData = siteData, + options = options, + selectedCovs = names(select(siteData, -Response, -all_of(badFactors))), + factorVars = factorVars, + family = modelFamily, + outputFile = file.path(ssimTempDir, "InitialCovariateCorrelationMatrix.png") + ) + # run shiny app ---------------------------------------------------------------- - + # inputs covData <- select(siteData, -Response, -all_of(badFactors)) selectedCovariates <- names(covData) # covsDE options <- covariateSelectionSheet - - # TO DO: find better way to access default web app + + # TO DO: find better way to access default web app browser.path <- NULL - if(file.exists("C:/Program Files/Google/Chrome/Application/chrome.exe")){ - browser.path <- "C:/Program Files/Google/Chrome/Application/chrome.exe" - } else if(file.exists("C:/Program Files(x86)/Google/Chrome/Application/chrome.exe")){ + if (file.exists("C:/Program Files/Google/Chrome/Application/chrome.exe")) { + browser.path <- "C:/Program Files/Google/Chrome/Application/chrome.exe" + } else if ( + file.exists("C:/Program Files(x86)/Google/Chrome/Application/chrome.exe") + ) { browser.path <- "C:/Program Files(x86)/Google/Chrome/Application/chrome.exe" - } else if(file.exists("C:/Program Files/Mozilla Firefox/firefox.exe")){ + } else if (file.exists("C:/Program Files/Mozilla Firefox/firefox.exe")) { browser.path <- "C:/Program Files/Mozilla Firefox/firefox.exe" - # } else if(file.exists("C:/Program Files/Internet Explorer/iexplore.exe")){ - # browser.path <- "C:/Program Files/Internet Explorer/iexplore.exe" + # } else if(file.exists("C:/Program Files/Internet Explorer/iexplore.exe")){ + # browser.path <- "C:/Program Files/Internet Explorer/iexplore.exe" } - - # portable chrome - to large to store on git + + # portable chrome - to large to store on git # browser.path = file.path(packageDir,"Apps/chrome/chrome.exe") - - if(is.null(browser.path)){ - runApp(appDir = file.path(packageDir, "06-covariate-correlation-app.R"), - launch.browser = TRUE) + + if (is.null(browser.path)) { + runApp( + appDir = file.path(packageDir, "06-covariate-correlation-app.R"), + launch.browser = TRUE + ) } else { - runApp(appDir = file.path(packageDir, "06-covariate-correlation-app.R"), - launch.browser = function(shinyurl) { - system(paste0("\"", browser.path, "\" --app=", shinyurl, " -incognito"), wait = F) - }) + runApp( + appDir = file.path(packageDir, "06-covariate-correlation-app.R"), + launch.browser = function(shinyurl) { + system( + paste0("\"", browser.path, "\" --app=", shinyurl, " -incognito"), + wait = F + ) + } + ) } - + # save image files covariateCorrelationSheet <- addRow( - covariateCorrelationSheet, - data.frame(InitialMatrix = file.path(ssimTempDir, - "InitialCovariateCorrelationMatrix.png"), - SelectedMatrix = file.path(ssimTempDir, - "SelectedCovariateCorrelationMatrix.png"))) - saveDatasheet(myScenario, covariateCorrelationSheet, - "wisdm_OutputCovariateCorrelationMatrix") + covariateCorrelationSheet, + data.frame( + InitialMatrix = file.path( + ssimTempDir, + "InitialCovariateCorrelationMatrix.png" + ), + SelectedMatrix = file.path( + ssimTempDir, + "SelectedCovariateCorrelationMatrix.png" + ) + ) + ) + saveDatasheet( + myScenario, + covariateCorrelationSheet, + "wisdm_OutputCovariateCorrelationMatrix" + ) progressBar() -} # end if "Interactive" +} # end if "Interactive" # save reduced covariate list -------------------------------------------------- -selectedCovariates <- unique(c(as.character(retainedCovariatesSheet$CovariatesID), selectedCovariates)) +selectedCovariates <- unique(c( + as.character(retainedCovariatesSheet$CovariatesID), + selectedCovariates +)) retainedCovariatesSheet <- data.frame(CovariatesID = selectedCovariates) saveDatasheet(myScenario, retainedCovariatesSheet, "wisdm_RetainedCovariates") progressBar(type = "end") -