From f0235189e6d61432470b63e3625fea8388db2be7 Mon Sep 17 00:00:00 2001 From: matthewsrogan <87720818+matthewsrogan@users.noreply.github.com> Date: Wed, 7 Sep 2022 13:28:21 -0400 Subject: [PATCH 1/3] additional functionality identify records outside a region of interest (mol_roi.R), identify records flagged according to a quality control attribute provided by the source (mol_flagged_by_source.R), and identify spatiotemporal duplicates (mol_spatiotemporal_duplicate.R). --- R/mol_flagged_by_source.R | 64 ++++++++ R/mol_roi.R | 253 +++++++++++++++++++++++++++++++ R/mol_spatiotemporal_duplicate.R | 31 ++++ 3 files changed, 348 insertions(+) create mode 100644 R/mol_flagged_by_source.R create mode 100644 R/mol_roi.R create mode 100644 R/mol_spatiotemporal_duplicate.R diff --git a/R/mol_flagged_by_source.R b/R/mol_flagged_by_source.R new file mode 100644 index 0000000..157943c --- /dev/null +++ b/R/mol_flagged_by_source.R @@ -0,0 +1,64 @@ +#' Flag records that have prexisting quality control attributes +#' +#' This function flags records with quality control attributes provided by the +#' source (eg GBIF 'issues'). The flag is applied my matching a vector of +#' user-specified strings to a given column. +#' +#' @family prefilter +#' @param data data.frame. Containing quality control attributes. +#' @param flagStrings character. A vector of terms denoting sub-quality observations. +#' @param flagCols character string. One or more column names containing quality +#' control attributes (e.g. 'issue' for GBIF data). +#' +#' @details The function uses regex matching of multiple terms to identify +#' observations tagged with one or more of the quality control attributes. If +#' length of flagStrings is greater than one, terms are collapsed using the "|" +#' operator. +#' +#' @return A data.frame containing the column ".flagged_by_source" +#' .Compliant (TRUE) for observations that match one or more flagStrings; +#' otherwise "FALSE". +#' +#' @importFrom dplyr if_any +#' @importFrom stringr str_c str_detect +#' +#' @export +#' +#' @examples +#' x <- data.frame( +#' issue = c("", "", "", "TAXON_MATCH_FUZZY"), +#' occurrenceStatus = c("", "ABSENT", "", "") +#' ) +#' +#' mol_flagged_by_source( +#' data = x, +#' flagStrings = c("ABSENT", "TAXON_MATCH_FUZZY"), +#' flagCols = c("issue", "occurrenceStatus") +#' ) +mol_flagged_by_source( + data, + flagStrings, + flagCols +){ + + check_col(data, flagCols) + + terms <- str_c(flagStrings, collapse = "|") + + data <- data %>% + mutate(.flagged_by_source = if_any(dplyr::all_of(flagCols), + ~str_detect(.x, terms))) + + message( + paste( + "\mol_flagged_by_source:\nFlagged", + sum(data$.flagged_by_source == FALSE), + "observations with flagged attributes.", + "\nOne column was added to the database.\n" + ) + ) + + return(data) +} + + diff --git a/R/mol_roi.R b/R/mol_roi.R new file mode 100644 index 0000000..5f8546b --- /dev/null +++ b/R/mol_roi.R @@ -0,0 +1,253 @@ +#' Identify records located outside a given region of interest. +#' +#' This function flags records that are outside a particular spatial region of +#' interest (roi). ROIs can be specified in a variety of raster and vector +#' formats. +#' +#' @family space +#' @param data data.frame. Containing geographical coordinates. +#' @param roi spatial region of interest. Can be provided as an sf, sfc, +#' SpatialPolygons, SpatialPolygonsDataFrame, rasterLayer, spatRaster, +#' or as a file path with extension #' ".shp", ".gpkg", or ".tif". +#' @param lat character string. The column name with latitude in decimal degrees +#' and WGS84. Default = "decimalLatitude". +#' @param lon character string. The column with longitude in decimal degrees and +#' WGS84. Default = "decimalLongitude". +#' @param byExtOnly logical. If true, occurrences are only flagged if located +#' outside the roi extent. If false, occurrences are checked for whether they are +#' located within the roi polygons or raster mask. +#' @param maskValue numeric or character. Raster value corresponding to "outside" +#' the region of interest. Ignored if roi is provided as a vector layer. +#' +#' @details This test identifies records outside a particular region of interest +#' defined by a raster or vector layer +#' +#' @return A data.frame containing the column ".outside_roi". Compliant +#' (TRUE) if coordinate is outside user defined roi; otherwise "FALSE". +#' +#' @importFrom sf st_bbox st_as_sf st_as_sfc st_within +#' @importFrom stringr str_detect +#' @importFrom terra crs rast geom vect +#' +#' @export mol_roi +#' +#' @examples +#' x <- data.frame( +#' decimalLatitude = c(1, 2, 3, 5), +#' decimalLongitude = c(1, 2, 3, 4) +#' ) +#' +#' roi <- sf::st_as_sfc(sf::st_bbox(c(xmin = 0, xmax = 4, ymax = 4, ymin = 0), crs = 4326)) +#' +#' mol_roi( +#' data = x, +#' roi = roi, +#' lat = "decimalLatitude", +#' lon = "decimalLongitude" +#' ) +#' + +mol_roi <- + function(data, + roi, + lat = "decimalLatitude", + lon = "decimalLongitude", + byExtOnly = FALSE, + maskValue = NA){ + + ### Run checks + if (!is.data.frame(data)) { + stop(deparse(substitute(data)), " is not a data.frame", call. = FALSE) + } + + check_col(data, c(lat, lon)) + + if(!any(c("character", + "sf", + "sfc", + "SpatialPolygons", + "RasterLayer", + "SpatRaster")) %in% class(roi)){ + stop("The region of interest is not in a valid format.") + } + + ### converts coordinates columns to numeric + data <- + data %>% + dplyr::rowid_to_column("id_temp") %>% + dplyr::mutate(decimalLatitude = as.numeric(.data[[lat]]), + decimalLongitude = as.numeric(.data[[lon]]), + .outside_roi = TRUE) + + ### Screen NAs + dataCoords %>% + dplyr::select(dplyr::all_of(c("id_temp", lon, lat))) %>% + dplyr::filter(!is.na(.data[[lat]]), + !is.na(.data[[lon]])) + + if(nrow(dataCoords) == 0) stop("No valid coordinates.") + + ### Read file path + if(is.character(roi)){ + + # check file exists + if(!file.exists(roi)) stop("The ROI filepath is invalid.") + + # check proper format + if(!any(str_detect(roi, + c("\\.shp$", "\\.gpkg$", "\\.tif$")))){ + stop("ROI input files must have '.shp', '.gpkg' or '.tif' file extensions.") + } + + # load roi + if(str_detect(roi, "\\.tif")){ + roi <- terra::rast(roi) + } else{ + roi <- st_read(roi) + } + + } + + ### Convert to sf/terra + if("SpatialPolygons" %in% .class2(roi)) roi <- st_as_sfc(roi) + if(class(roi) == "RasterLayer") roi <- rast(roi) + + ### Run appropriate check + if(any(c("sf", "sfc") %in% class(roi))){ + + if(!any(is.na(maskValue), is.null(maskValue))){ + warning("maskValue is ignored when ROI is provided as a vector layer.") + } + + unflagged <- roi_sf(dataCoords, + roi, + lat, + lon) + } else( + unflagged <- roi_rast(dataCoords, + roi, + lat, + lon) + ) + + ### Update data frame + data$.outside_roi[data$id_temp %in% unflagged] <- FALSE + out <- data %>% + select(-id_temp) + + if(sum(out$.outside_roi) == 0){ + message("No coordinates were located outside the region of interest.") + } else( + message(paste(sum(out$.outside_roi), + "occurrences were flagged as outside the region of interest.", + "One column was added to the database.")) + ) + + return(out) + + } + +roi_sf <- + function(dataCoords, + roi, + lat, + lon, + byExtentOnly){ + + ### Check CRS + if(st_crs(roi) != st_crs(4326)){ + message("Reprojecting ROI to WGS84.") + roi <- roi %>% st_transform(4326) + } + + ### Filter by extent + ext <- st_bbox(roi) + + crpd <- dataCoords %>% + dplyr::filter(dplyr::between(.data[[lat]], ext["ymin"], ext["ymax"]), + dplyr::between(.data[[lon]], ext["xmin"], ext["xmax"])) + + + if(any(byExtOnly, nrow(crpd) == 0)){ + unflgd <- crpd$id_temp + + } else{ + + ### Consolidate roi + suppressWarnings({ + s2_status <- sf_use_s2() + sf_use_s2(FALSE) + + roi <- roi %>% + sf::st_union() %>% + sf::st_combine() + + if(s2_status) sf_use_s2(TRUE) + }) + + + + unflgd <- crpd %>% + dplyr::mutate(within = st_within(., + roi, + sparse = F)[,1]) %>% + dplyr::filter(within) %>% + dplyr::pull(id_temp) + } + + return(unflgd) + } + +roi_rast <- + function(dataCoords, + roi, + lat, + lon, + byExtentOnly, + maskValue){ + + ### Reproject to raster CRS + dataCoords[, c("tempX", "tempY")] <- geom(project(vect(dataCoords, + geom = c(lon, lat), + crs = "EPSG:4326"), + crs(roi)), + wkt = FALSE, + hex = FALSE, + df = TRUE)[,c("x", "y")] + + ### Filter by extent + Ext <- ext(roi) + + crpd <- dataCoords %>% + dplyr::filter(dplyr::between(tempX, Ext[1], Ext[2]), + dplyr::between(tempY, Ext[3], Ext[4])) + + + if(any(byExtOnly, nrow(crpd) == 0)){ + unflgd <- crpd$id_temp + + } else{ + if(is.null(maskValue)){ + warning("No maskValue specied. Assuming maskValue is NA") + maskValue <- NA + } + smpld <- crpd %>% + dplyr::mutate(value = terra::extract(roi, + crpd[,c("tempX", "tempY")], + method = "simple")[,2]) %>% + filter(!is.na(value)) + + if(!is.na(maskValue)){ + smpld <- smpld %>% + filter(value != maskValue) + } + + unflgd <- smpld$id_temp + } + + return(unflgd) + } + + + + diff --git a/R/mol_spatiotemporal_duplicate.R b/R/mol_spatiotemporal_duplicate.R new file mode 100644 index 0000000..33436ab --- /dev/null +++ b/R/mol_spatiotemporal_duplicate.R @@ -0,0 +1,31 @@ +#' Identify spatiotemporally duplicated records +#' +#' This function flags records that are spatialtemporal duplicates +#' +#' @param data data.frame. Containing information about the location, taxonomy, +#' date, and source of the observation. +#' @param lat character string. The column name with latitude in decimal degree +#' and in WGS84. Default = "decimalLatitude". +#' @param lon character string. The column with longitude in decimal degree and +#' in WGS84. Default = "decimalLongitude". +#' @param eventDate Numeric or date. The column with event date information. +#' @param src character string. The column name denoting the source of each +#' observation. +#' @param srcPriority character. Order in which to prioritize retaining +#' data from each source. +#' @param recordIDcols character. Columns with recordIDs distinguishing +#' samples collected at the same time and place. +#' @param cols_to_remove character. Which columns should be removed prior to +#' checking for duplicates? Default = "all", which means that all columns +#' containing the results of data quality tests are removed. +#' +#' @details +#' +#' @return A data.frame containing the column ".spatiotemporal_duplicate" +#' .Compliant (TRUE) if if observation is a spatiotemporal duplicate; otherwise "FALSE". +#' +#' @importFrom +#' +#' @export +#' +#' @examples From 3d299962c26d7cd2951e1c1c5645c9a3183ea74b Mon Sep 17 00:00:00 2001 From: matthewsrogan <87720818+matthewsrogan@users.noreply.github.com> Date: Fri, 30 Sep 2022 17:24:48 -0400 Subject: [PATCH 2/3] ongoing development --- R/mol_flagged_by_source.R | 16 ++-- R/mol_roi.R | 43 +++++++---- R/mol_spatiotemporal_duplicate.R | 121 +++++++++++++++++++++++++++---- 3 files changed, 143 insertions(+), 37 deletions(-) diff --git a/R/mol_flagged_by_source.R b/R/mol_flagged_by_source.R index 157943c..42ac8e0 100644 --- a/R/mol_flagged_by_source.R +++ b/R/mol_flagged_by_source.R @@ -1,8 +1,6 @@ #' Flag records that have prexisting quality control attributes #' -#' This function flags records with quality control attributes provided by the -#' source (eg GBIF 'issues'). The flag is applied my matching a vector of -#' user-specified strings to a given column. +#' This function flags records with quality control attributes provided by the source (eg GBIF 'issues'). The flag is applied my matching a vector of user-specified strings to one or more columns. #' #' @family prefilter #' @param data data.frame. Containing quality control attributes. @@ -19,8 +17,10 @@ #' .Compliant (TRUE) for observations that match one or more flagStrings; #' otherwise "FALSE". #' -#' @importFrom dplyr if_any +#' @importFrom dplyr if_any all_of #' @importFrom stringr str_c str_detect +#' +#' @author Matthew S. Rogan #' #' @export #' @@ -35,7 +35,7 @@ #' flagStrings = c("ABSENT", "TAXON_MATCH_FUZZY"), #' flagCols = c("issue", "occurrenceStatus") #' ) -mol_flagged_by_source( +bdc_flagged_by_source <- function( data, flagStrings, flagCols @@ -43,11 +43,11 @@ mol_flagged_by_source( check_col(data, flagCols) - terms <- str_c(flagStrings, collapse = "|") + terms <- stringr::str_c(flagStrings, collapse = "|") data <- data %>% - mutate(.flagged_by_source = if_any(dplyr::all_of(flagCols), - ~str_detect(.x, terms))) + dplyr::mutate(.flagged_by_source = dplyr::if_any(tidyselect::all_of(flagCols), + ~stringr::str_detect(.x, terms))) message( paste( diff --git a/R/mol_roi.R b/R/mol_roi.R index 5f8546b..789ac79 100644 --- a/R/mol_roi.R +++ b/R/mol_roi.R @@ -8,7 +8,7 @@ #' @param data data.frame. Containing geographical coordinates. #' @param roi spatial region of interest. Can be provided as an sf, sfc, #' SpatialPolygons, SpatialPolygonsDataFrame, rasterLayer, spatRaster, -#' or as a file path with extension #' ".shp", ".gpkg", or ".tif". +#' or as a file path with extension ".shp", ".gpkg", or ".tif". #' @param lat character string. The column name with latitude in decimal degrees #' and WGS84. Default = "decimalLatitude". #' @param lon character string. The column with longitude in decimal degrees and @@ -24,6 +24,8 @@ #' #' @return A data.frame containing the column ".outside_roi". Compliant #' (TRUE) if coordinate is outside user defined roi; otherwise "FALSE". +#' +#' @author Matthew S. Rogan #' #' @importFrom sf st_bbox st_as_sf st_as_sfc st_within #' @importFrom stringr str_detect @@ -47,7 +49,7 @@ #' ) #' -mol_roi <- +bdc_roi <- function(data, roi, lat = "decimalLatitude", @@ -94,23 +96,31 @@ mol_roi <- if(!file.exists(roi)) stop("The ROI filepath is invalid.") # check proper format - if(!any(str_detect(roi, - c("\\.shp$", "\\.gpkg$", "\\.tif$")))){ + if(!any(stringr::str_detect(roi, + c("\\.shp$", "\\.gpkg$", "\\.tif$")))){ stop("ROI input files must have '.shp', '.gpkg' or '.tif' file extensions.") } # load roi - if(str_detect(roi, "\\.tif")){ + if(stringr::str_detect(roi, "\\.tif")){ + suppressWarnings({ + check_require_cran("terra") + }) roi <- terra::rast(roi) } else{ - roi <- st_read(roi) + roi <- sf::st_read(roi) } } ### Convert to sf/terra - if("SpatialPolygons" %in% .class2(roi)) roi <- st_as_sfc(roi) - if(class(roi) == "RasterLayer") roi <- rast(roi) + if("SpatialPolygons" %in% .class2(roi)) roi <- sf::st_as_sfc(roi) + if(class(roi) == "RasterLayer"){ + suppressWarnings({ + check_require_cran("terra") + }) + roi <- terra::rast(roi) + } ### Run appropriate check if(any(c("sf", "sfc") %in% class(roi))){ @@ -133,7 +143,7 @@ mol_roi <- ### Update data frame data$.outside_roi[data$id_temp %in% unflagged] <- FALSE out <- data %>% - select(-id_temp) + dplyr::select(-id_temp) if(sum(out$.outside_roi) == 0){ message("No coordinates were located outside the region of interest.") @@ -155,13 +165,13 @@ roi_sf <- byExtentOnly){ ### Check CRS - if(st_crs(roi) != st_crs(4326)){ + if(sf::st_crs(roi) != sf::st_crs(4326)){ message("Reprojecting ROI to WGS84.") - roi <- roi %>% st_transform(4326) + roi <- roi %>% sf::st_transform(4326) } ### Filter by extent - ext <- st_bbox(roi) + ext <- sf::st_bbox(roi) crpd <- dataCoords %>% dplyr::filter(dplyr::between(.data[[lat]], ext["ymin"], ext["ymax"]), @@ -188,9 +198,10 @@ roi_sf <- unflgd <- crpd %>% - dplyr::mutate(within = st_within(., - roi, - sparse = F)[,1]) %>% + dplyr::mutate(within = sf::st_within(., + roi, + sparse = F)[,1]) %>% + sf::st_drop_geometry() %>% dplyr::filter(within) %>% dplyr::pull(id_temp) } @@ -207,7 +218,7 @@ roi_rast <- maskValue){ ### Reproject to raster CRS - dataCoords[, c("tempX", "tempY")] <- geom(project(vect(dataCoords, + dataCoords[, c("tempX", "tempY")] <- terra::geom(project(vect(dataCoords, geom = c(lon, lat), crs = "EPSG:4326"), crs(roi)), diff --git a/R/mol_spatiotemporal_duplicate.R b/R/mol_spatiotemporal_duplicate.R index 33436ab..329103a 100644 --- a/R/mol_spatiotemporal_duplicate.R +++ b/R/mol_spatiotemporal_duplicate.R @@ -8,24 +8,119 @@ #' and in WGS84. Default = "decimalLatitude". #' @param lon character string. The column with longitude in decimal degree and #' in WGS84. Default = "decimalLongitude". -#' @param eventDate Numeric or date. The column with event date information. -#' @param src character string. The column name denoting the source of each -#' observation. -#' @param srcPriority character. Order in which to prioritize retaining -#' data from each source. -#' @param recordIDcols character. Columns with recordIDs distinguishing -#' samples collected at the same time and place. -#' @param cols_to_remove character. Which columns should be removed prior to -#' checking for duplicates? Default = "all", which means that all columns -#' containing the results of data quality tests are removed. +#' @param date character string. The column name with temporal information of the occurrence record. +#' @param recordCols character string. Names of columns that distinguish records collected simultaneously (e.g., specimenID). +#' @param priorityCol character. Name of column that indicates how to prioritize which duplicate record is flagged. +#' @param priorityOrder a vector of values in the priority column in descending order of priority. +#' @param ndec integer. Number of decimal places in the decimalDegree coordinates at which to determine spatial duplicates. #' -#' @details +#' @details Records with NA in the lat, lon, or date columns are excluded. If 'date' argument is NULL, function flags spatial duplicates and the flag column name is changed accordingly. +#' +#' priorityCol is used to arrange the data prior to identifying duplicates. The vector in priorityOrder is used to define levels in a factor prior to ordering. If priorityOrder is NULL and priorityCol isn't, priorityCol is simply sorted in ascending order. Values in the priority column that are not specified in the priorityOrder will be arranged as lowest priority, with a warning. #' -#' @return A data.frame containing the column ".spatiotemporal_duplicate" +#' @return A data.frame containing the column ".spatiotemporal_duplicate", or ".spatial_duplicate" when date = NULL. #' .Compliant (TRUE) if if observation is a spatiotemporal duplicate; otherwise "FALSE". #' -#' @importFrom +#' @importFrom lubridate as_date #' #' @export #' #' @examples + +bdc_spatiotemporal_duplicate <- function( + data, + lat = "decimalLatitude", + lon = "decimalLongitude", + date = "eventDate", + recordCols = NULL, + priorityCol = NULL, + priorityOrder = NULL, + digits = 3){ + + ### Run checks + if (!is.data.frame(data)) { + stop(deparse(substitute(data)), " is not a data.frame", call. = FALSE) + } + + dup_cols <- c(lon, lat, date, recordCols) + + check_col(data, c(dup_cols, priorityCols)) + + if(!is.null(priorityOrder) & is.null(priorityCols)){ + stop("No column specified for sorting by priority.\nEither specify a valid column name as priorityCol or set priorityOrder to NULL.") + } + + # check mismatches between priorityCol and priorityOrder + if(!is.null(priorityCol){ + if(is.null(priorityOrder)){ + warning("The order of priority for values in the priority column has not been specified.\nValues will be prioritized in ascending order.") + } + + if(!is.null(priorityOrder)){ + if(!any(priorityOrder %in% unique(data[[priorityCol]]))){ + stop("The values specified as the priority order do not appear in the priority column. Recheck arguments.") + } + + if(!all(priorityOrder %in% unique(data[[priorityCol]]))){ + warning("Not all values in the priority order occur in the priority column.\nThese values will be ignored.") + } + + if(!all(unique(data[[priorityCol]]) %in% priorityOrder)){ + warning("Not all values in the priority column are included in the priority order.\nThese values will be treated as lowest priority.") + } + } + } + + if(!is.numeric(ndec)) stop("the number of decimals (ndec) argument must be provided as an integer.") + + if(as.integer(ndec) != ndec) stop("the number of decimals ('ndec') argument must be provided as an integer.") + + ### update data + flagCol <- dplyr::if_else(is.null(date), + ".spatial_duplicate", + ".spatiotemporal_duplicate") + data <- data %>% + dplyr::mutate(decimalLatitude = as.numeric(.data[[lat]]), + decimalLongitude = as.numeric(.data[[lon]]), + !! flagCol := FALSE) %>% + tibble::rowid_to_column("id_temp") + + + ### subset and rearrange data + data_temp <- data %>% + dplyr::select(dplyr::all_of(c(lat, lon, date, recordCols, priorityCol))) %>% + dplyr::filter(!dplyr::if_any(c(lon, lat, date), ~is.na(.))) %>% + dplyr::mutate(dplyr::across(dplyr::all_of(c(lon, lat)), ~round(., digits = ndec))) + + if(!is.null(priorityCol)){ + if(!is.null(priorityOrder)){ + data_temp[[priorityCol]] <- factor(data_temp[[priorityCol]], + levels = priorityOrder) + } + + data_temp <- data_temp %>% + dplyr::arrange(across(all_of(priorityCol))) + } + + ### get flagged ids + flagged <- data_temp %>% + filter(duplicated(dplyr::select(., all_of(dup_cols)))) %>% + pull(id_temp) + + ### update flag col + data[[flagCol]][data$id_temp %in% flagged] <- TRUE + + message( + paste( + "\nbdc_spatiotemporal_duplicate:\nFlagged", + sum(data[[flagCol]]), + "observations that were", + paste0(str_remove(flagCol, "^\\."), "s."), + "\nOne column was added to the database.\n" + ) + ) + + return(data) + +} + From 3aa22593a3e3cf0965e193584ea2697da6ddf861 Mon Sep 17 00:00:00 2001 From: matthewsrogan <87720818+matthewsrogan@users.noreply.github.com> Date: Fri, 21 Oct 2022 17:48:10 -0400 Subject: [PATCH 3/3] Updated mol_roi.R debugged raster and vector helper functions. --- R/mol_roi.R | 68 +++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/R/mol_roi.R b/R/mol_roi.R index 789ac79..e207a06 100644 --- a/R/mol_roi.R +++ b/R/mol_roi.R @@ -22,14 +22,13 @@ #' @details This test identifies records outside a particular region of interest #' defined by a raster or vector layer #' -#' @return A data.frame containing the column ".outside_roi". Compliant +#' @return A tibble containing the column ".outside_roi". Compliant #' (TRUE) if coordinate is outside user defined roi; otherwise "FALSE". #' #' @author Matthew S. Rogan #' #' @importFrom sf st_bbox st_as_sf st_as_sfc st_within #' @importFrom stringr str_detect -#' @importFrom terra crs rast geom vect #' #' @export mol_roi #' @@ -41,7 +40,7 @@ #' #' roi <- sf::st_as_sfc(sf::st_bbox(c(xmin = 0, xmax = 4, ymax = 4, ymin = 0), crs = 4326)) #' -#' mol_roi( +#' bdc_roi( #' data = x, #' roi = roi, #' lat = "decimalLatitude", @@ -55,7 +54,7 @@ bdc_roi <- lat = "decimalLatitude", lon = "decimalLongitude", byExtOnly = FALSE, - maskValue = NA){ + maskValue = NULL){ ### Run checks if (!is.data.frame(data)) { @@ -69,20 +68,21 @@ bdc_roi <- "sfc", "SpatialPolygons", "RasterLayer", - "SpatRaster")) %in% class(roi)){ + "SpatRaster") %in% class(roi))){ stop("The region of interest is not in a valid format.") } ### converts coordinates columns to numeric data <- data %>% - dplyr::rowid_to_column("id_temp") %>% + tibble::as_tibble() %>% + tibble::rowid_to_column("id_temp") %>% dplyr::mutate(decimalLatitude = as.numeric(.data[[lat]]), decimalLongitude = as.numeric(.data[[lon]]), .outside_roi = TRUE) ### Screen NAs - dataCoords %>% + dataCoords <- data %>% dplyr::select(dplyr::all_of(c("id_temp", lon, lat))) %>% dplyr::filter(!is.na(.data[[lat]]), !is.na(.data[[lon]])) @@ -93,11 +93,14 @@ bdc_roi <- if(is.character(roi)){ # check file exists - if(!file.exists(roi)) stop("The ROI filepath is invalid.") + if(!file.exists(roi)) stop("The ROI filepath does not exist.") # check proper format if(!any(stringr::str_detect(roi, - c("\\.shp$", "\\.gpkg$", "\\.tif$")))){ + c("\\.shp$", + "\\.gpkg$", + "\\.tif$", + "\\.tiff$")))){ stop("ROI input files must have '.shp', '.gpkg' or '.tif' file extensions.") } @@ -108,14 +111,14 @@ bdc_roi <- }) roi <- terra::rast(roi) } else{ - roi <- sf::st_read(roi) + roi <- sf::read_sf(roi) } } ### Convert to sf/terra if("SpatialPolygons" %in% .class2(roi)) roi <- sf::st_as_sfc(roi) - if(class(roi) == "RasterLayer"){ + if("RasterLayer" %in% class(roi)){ suppressWarnings({ check_require_cran("terra") }) @@ -132,13 +135,16 @@ bdc_roi <- unflagged <- roi_sf(dataCoords, roi, lat, - lon) - } else( + lon, + byExtOnly = byExtOnly) + } else{ unflagged <- roi_rast(dataCoords, roi, lat, - lon) - ) + lon, + byExtOnly = byExtOnly, + maskValue = maskValue) + } ### Update data frame data$.outside_roi[data$id_temp %in% unflagged] <- FALSE @@ -147,11 +153,11 @@ bdc_roi <- if(sum(out$.outside_roi) == 0){ message("No coordinates were located outside the region of interest.") - } else( + } else{ message(paste(sum(out$.outside_roi), "occurrences were flagged as outside the region of interest.", "One column was added to the database.")) - ) + } return(out) @@ -162,7 +168,7 @@ roi_sf <- roi, lat, lon, - byExtentOnly){ + byExtOnly = FALSE){ ### Check CRS if(sf::st_crs(roi) != sf::st_crs(4326)){ @@ -184,26 +190,26 @@ roi_sf <- } else{ ### Consolidate roi - suppressWarnings({ - s2_status <- sf_use_s2() - sf_use_s2(FALSE) + suppressMessages({ + s2_status <- sf::sf_use_s2() + sf::sf_use_s2(FALSE) roi <- roi %>% sf::st_union() %>% sf::st_combine() - if(s2_status) sf_use_s2(TRUE) - }) + if(s2_status) sf::sf_use_s2(TRUE) - - unflgd <- crpd %>% + sf::st_as_sf(coords = c(lon, lat), + crs = 4326) %>% dplyr::mutate(within = sf::st_within(., roi, sparse = F)[,1]) %>% sf::st_drop_geometry() %>% dplyr::filter(within) %>% dplyr::pull(id_temp) + }) } return(unflgd) @@ -214,8 +220,8 @@ roi_rast <- roi, lat, lon, - byExtentOnly, - maskValue){ + byExtOnly = FALSE, + maskValue = NA){ ### Reproject to raster CRS dataCoords[, c("tempX", "tempY")] <- terra::geom(project(vect(dataCoords, @@ -244,13 +250,13 @@ roi_rast <- } smpld <- crpd %>% dplyr::mutate(value = terra::extract(roi, - crpd[,c("tempX", "tempY")], - method = "simple")[,2]) %>% - filter(!is.na(value)) + crpd[,c("tempX", "tempY")], + method = "simple")[,2]) %>% + dplyr::filter(!is.na(value)) if(!is.na(maskValue)){ smpld <- smpld %>% - filter(value != maskValue) + dplyr::filter(value != maskValue) } unflgd <- smpld$id_temp