diff --git a/.gitignore b/.gitignore index 097b3de..7bb888c 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,4 @@ ignore/ doc Meta *cache +src/Mapper.dll diff --git a/R/CoverRef.R b/R/CoverRef.R index 9e62ec7..416543f 100644 --- a/R/CoverRef.R +++ b/R/CoverRef.R @@ -1,17 +1,17 @@ #' Cover abstract class #' @aliases cover -#' @description Reference Class (R6) implementation of a Cover. This class is meant to act as an abstract class to derive other +#' @description Reference Class (R6) implementation of a Cover. This class is meant to act as an abstract class to derive other #' types of covering generators with. Minimally, a derived covering class must implement the -#' 'construct_cover' method to populate the 'level_sets' list with point indices, and any parameters -#' that the derived class requires. -#' -#' Additional methods may also be added to improve the efficiency of the cover. -#' See the \href{https://peekxc.github.io/Mapper/articles/UsingCustomCover.html}{vignette} on creating a custom -#' cover for details. -#' +#' 'construct_cover' method to populate the 'level_sets' list with point indices, and any parameters +#' that the derived class requires. +#' +#' Additional methods may also be added to improve the efficiency of the cover. +#' See the \href{https://peekxc.github.io/Mapper/articles/UsingCustomCover.html}{vignette} on creating a custom +#' cover for details. +#' #' @section Fields: -#' The following is a list of the fields available for derived classes. Each may be accessed -#' by the \code{self} environment. See \code{?R6} for more details. +#' The following is a list of the fields available for derived classes. Each may be accessed +#' by the \code{self} environment. See \code{?R6} for more details. 
#' \itemize{ #' \item{\emph{level_sets}:}{ named list, indexed by \code{index_set}, whose values represent indices in the original data set to cluster over.} #' \item{\emph{index_set}:}{ character vector of keys that uniquely index the open sets of the cover.} @@ -20,14 +20,14 @@ #' @format An \code{\link{R6Class}} generator object #' @author Matt Piekenbrock #' @export CoverRef -CoverRef <- R6::R6Class("CoverRef", +CoverRef <- R6::R6Class("CoverRef", private = list( .level_sets = NULL, - .index_set = NULL, + .index_set = NULL, .typename = character(0) - ), + ), lock_class = FALSE, ## Feel free to add your own members - lock_objects = FALSE ## Or change existing ones + lock_objects = FALSE ## Or change existing ones ) ## Cover initialization @@ -43,7 +43,7 @@ CoverRef$set("public", "format", function(...){ ## Typename field ## typename ---- -CoverRef$set("active", "typename", +CoverRef$set("active", "typename", function(value){ if (missing(value)){ private$.typename } else { stop("Cover 'typename' member is read-only.") @@ -53,7 +53,7 @@ CoverRef$set("active", "typename", ## The index set may be composed of any data type, but the collection of indices must uniquely ## index the level sets list via the `[[` operator. ## index_set ---- -CoverRef$set("active", "index_set", +CoverRef$set("active", "index_set", function(value){ if (missing(value)){ private$.index_set @@ -65,10 +65,10 @@ CoverRef$set("active", "index_set", } }) -## The level sets must be a list indexed by the index set. If the list is named, a check is performed to make sure the -## names match the values of the index set, and in the proper order. Otherwise, the order is assumed to be correct. +## The level sets must be a list indexed by the index set. If the list is named, a check is performed to make sure the +## names match the values of the index set, and in the proper order. Otherwise, the order is assumed to be correct. 
## level_sets ---- -CoverRef$set("active", "level_sets", +CoverRef$set("active", "level_sets", function(value){ if (missing(value)){ private$.level_sets @@ -80,23 +80,23 @@ CoverRef$set("active", "level_sets", } ) -## Default cover +## Default cover ## construct_cover ---- CoverRef$set("public", "construct_cover", function(index=NULL){ stop("Base class cover construction called. This method must be overridden to be used.") }) -## Given an index in the index set, returns the indices of point intersect the image -## of f in the cover. The default method relies on the construct_cover method. +## Given an index in the index set, returns the indices of point intersect the image +## of f in the cover. The default method relies on the construct_cover method. # CoverRef$set("public", "construct_pullback", function(index){ -# +# # }) -## Which indices of the index set should be compared in constructing the k-simplices? -## This can be customized based on the cover to (dramatically) reduce -## the number of intersection checks needed to generate the k-skeletons, where k >= 1. -## Defaults to every pairwise combination of level sets. +## Which indices of the index set should be compared in constructing the k-simplices? +## This can be customized based on the cover to (dramatically) reduce +## the number of intersection checks needed to generate the k-skeletons, where k >= 1. +## Defaults to every pairwise combination of level sets. ## neighborhood ---- CoverRef$set("public", "neighborhood", function(filter, k=1){ if (length(private$.index_set) <= 1L){ return(NULL) } @@ -104,7 +104,7 @@ CoverRef$set("public", "neighborhood", function(filter, k=1){ relist(private$.index_set[k_combs], k_combs) }) -## Validates that the constructed cover is indeed a valid cover. +## Validates that the constructed cover is indeed a valid cover. 
## validate ---- CoverRef$set("public", "validate", function(){ if ( length(private$.index_set) != length(private$.level_sets) ){ @@ -129,13 +129,14 @@ covers_available <- function(){ line_format <- " %-28s %-34s %-15s" writeLines(c( sprintf("Typename:%-20sGenerator:%-25sParameters:%-26s", "", "", ""), - sprintf(line_format, "fixed interval", "FixedIntervalCover", paste0(c("number_intervals", "percent_overlap"), collapse = ", ")), + sprintf(line_format, "fixed interval", "FixedIntervalCover", paste0(c("number_intervals", "percent_overlap"), collapse = ", ")), sprintf(line_format, "restrained interval", "RestrainedIntervalCover", paste0(c("number_intervals", "percent_overlap"), collapse = ", ")), # sprintf(line_format, "adaptive", "AdaptiveCover", paste0(c("number_intervals", "percent_overlap", "quantile_method"), collapse = ", ")), - sprintf(line_format, "ball", "BallCover", paste0("epsilon", collapse = ", ")) + sprintf(line_format, "ball", "BallCover", paste0("epsilon", collapse = ", ")), + sprintf(line_format, "landmark_ball", "LandmarkBallCover", paste0(c("epsilon", "num_sets", "seed_index", "seed_method"), collapse = ", ")), + sprintf(line_format, "neighborhood", "NeighborhoodCover", paste0(c("k", "seed_index", "seed_method"), collapse = ", ")) )) } -# TODO: add ability to convert cover to new type to preserve copying the filter points multiple times, or -# move the filter points to the MapperRef object - +# TODO: add ability to convert cover to new type to preserve copying the filter points multiple times, or +# move the filter points to the MapperRef object diff --git a/R/LandmarkBallCover.R b/R/LandmarkBallCover.R new file mode 100644 index 0000000..c93ba5e --- /dev/null +++ b/R/LandmarkBallCover.R @@ -0,0 +1,116 @@ +#' Landmark Ball Cover +#' +#' @docType class +#' @description This class provides a cover whose open sets are formed by balls centered about each +#' point in a landmark set. 
Given a radius \deqn{\epsilon}, choose a set of landmark points via the
+#' algorithm presented in Dłotko to produce a cover by balls of radius \deqn{\epsilon}. Alternatively,
+#' given a number of cover sets \code{n}, choose \code{n} landmarks via maxmin algorithm. If no seed
+#' or seed_method is specified, default behavior uses the first data point as the seed.
+#'
+#' This differs from BallCover.R in that it does NOT union intersecting cover sets.
+#'
+#' Using this class requires the \code{RANN} package to be installed, and thus explicitly assumes
+#' the filter space endowed with the euclidean metric.
+#'
+#' @field epsilon := radius of the ball to form around each landmark point
+#' @field num_sets := desired number of balls/cover sets
+#' @field seed_index := index of data point to use as the seed for landmark set calculation
+#' @field seed_method := method to select a seed ("SPEC" : user specified index | "RAND" : random index
+#' | "ECC" : point with highest eccentricity in the filter space)
+#' @author Yara Skaf, Cory Brunson
+#' @family cover
+#' @references Dłotko, Paweł. "Ball Mapper: A Shape Summary for Topological Data Analysis." (2019). Web.
+
+library(proxy)
+
+#' @export
+LandmarkBallCover <- R6::R6Class(
+  classname = "LandmarkBallCover",
+  inherit = CoverRef,
+  public = list(epsilon=NULL, num_sets=NULL, seed_index=1, seed_method="SPEC")
+)
+
+## initialize ------
+#' @export
+LandmarkBallCover$set("public", "initialize", function(...){
+  super$initialize(typename="landmark_ball")
+  params <- c(...) 
+ if ("epsilon" %in% names(params)){ self$epsilon <- params[["epsilon"]] } + if ("num_sets" %in% names(params)){ self$num_sets <- params[["num_sets"]] } + if ("seed_index" %in% names(params)){ self$seed_index <- params[["seed_index"]] } + if ("seed_method" %in% names(params)){ self$seed_method <- params[["seed_method"]] } +}) + +## validate ------ +LandmarkBallCover$set("public", "validate", function(filter){ + ## Get filter values + fv <- filter() + f_size <- nrow(fv) + + ## validate parameters + stopifnot(!is.null(self$epsilon) || !is.null(self$num_sets)) # require either radius or # balls + stopifnot(is.null(self$num_sets) || (self$num_sets <= f_size && self$num_sets > 0)) # cannot have more cover sets than data points + stopifnot(is.null(self$epsilon) || self$epsilon >= 0) # radius must be positive + stopifnot(self$seed_index <= f_size && self$seed_index > 0) # seed index must be within the range of the data indices + stopifnot(all(self$seed_method == "RAND") || all(self$seed_method == "SPEC") || all(self$seed_method == "ECC")) # must use one of available seed methods +}) + +## format ---- +LandmarkBallCover$set("public", "format", function(...){ + titlecase <- function(x){ + s <- strsplit(x, " ")[[1]] + paste(toupper(substring(s, 1, 1)), substring(s, 2), sep = "", collapse = " ") + } + if (!is.null(self$num_sets)) { + sprintf("%s Cover: (number of sets = %s, seed index = %s)", titlecase(private$.typename), self$num_sets, self$seed_index) + }else if (!is.null(self$epsilon)){ + sprintf("%s Cover: (epsilon = %s, seed index = %s)", titlecase(private$.typename), self$epsilon, self$seed_index) + } +}) + +## construct_cover ------ +LandmarkBallCover$set("public", "construct_cover", function(filter, index=NULL){ + if (!requireNamespace("RANN", quietly = TRUE)){ + stop("Package \"RANN\" is needed for to use this cover.", call. 
= FALSE) + } + self$validate(filter) + + ## Get filter values + fv <- filter() + f_size <- nrow(fv) + + ## Construct the balls + if(is.null(index)){ + ## Set the seed index if necessary + if(all(self$seed_method == "RAND")) { self$seed_index = sample(1:f_size, 1) } + if(all(self$seed_method == "ECC")) { self$seed_index = which.max(eccentricity(from=fv, x=fv)) } + + ## Compute the landmark set + if (!is.null(self$num_sets)) { eps_lm <- unique(landmarks(x=fv, n=self$num_sets, seed_index=self$seed_index)) + } else if (!is.null(self$epsilon)) { eps_lm <- landmarks(x=fv, eps=self$epsilon, seed_index=self$seed_index) } + + ## Construct the index set + self$index_set <- as.character(eps_lm) + + ## Get distance from each point to landmarks + dist_to_lm <- proxy::dist(fv, fv[eps_lm,,drop=FALSE]) + pts_within_eps <- function(lm_dist){ which(lm_dist <= self$epsilon) } + + ## Calculate an epsilon if one was not given + if (!is.null(self$num_sets)) { + sortedDists = matrix(apply(dist_to_lm,1,sort),nrow=f_size,byrow=TRUE) + max = which.max(sortedDists[,1]) + self$epsilon = sortedDists[max,1] # radius should be distance of the farthest pt from the landmark set so that all pts are in at least one ball + } + + x = apply(dist_to_lm, 2, pts_within_eps) + + # if all level sets contain the same number of points, apply returns a matrix -> need to split columns into list elements + if(is.matrix(x)){ self$level_sets <- structure(split(x, rep(1:ncol(x), each = nrow(x))), names=self$index_set) + } else { self$level_sets <- structure(as.list(x), names=self$index_set) } + } + if (!missing(index)){ return(self$level_sets[[index]]) } + + ## Always return self + invisible(self) +}) diff --git a/R/NeighborhoodCover.R b/R/NeighborhoodCover.R new file mode 100644 index 0000000..133cb89 --- /dev/null +++ b/R/NeighborhoodCover.R @@ -0,0 +1,114 @@ +#' Neighborhood Cover +#' +#' @docType class +#' @description This class provides a cover whose open sets are formed by \code{k}-neighborhoods about a 
landmark
+#' set. If no seed or seed_method is specified, default behavior uses the first data point as the seed. Cover sets
+#' may contain more than \code{k} points if there are more than \code{k} points equidistant from the central point.
+#' Using this class requires the \code{RANN} package to be installed, and thus explicitly assumes the filter space
+#' endowed with the euclidean metric.
+#'
+#' @field k := desired number of neighbors to include in a cover set
+#' @field seed_index := index of data point to use as the seed for landmark set calculation
+#' @field seed_method := method to select a seed ("SPEC" : user specified index | "RAND" : random index
+#' | "ECC" : point with highest eccentricity in the filter space)
+#' @author Yara Skaf, Cory Brunson
+#' @family cover
+#' @export
+
+library(proxy)
+
+#' @export
+NeighborhoodCover <- R6::R6Class(
+  classname = "NeighborhoodCover",
+  inherit = CoverRef,
+  public = list(k=NULL, seed_index=1, seed_method="SPEC")
+)
+
+## initialize ------
+#' @export
+NeighborhoodCover$set("public", "initialize", function(...){
+  super$initialize(typename="neighborhood")
+  params <- c(...) 
+  if ("k" %in% names(params)){ self$k <- params[["k"]] }
+  if ("seed_index" %in% names(params)){ self$seed_index <- params[["seed_index"]] }
+  if ("seed_method" %in% names(params)){ self$seed_method <- params[["seed_method"]] }
+})
+
+## validate ------
+NeighborhoodCover$set("public", "validate", function(filter){
+  ## Get filter values
+  fv <- filter()
+  f_size <- nrow(fv)
+
+  ## validate parameters
+  stopifnot(!is.null(self$k)) # require neighborhood size
+  stopifnot(self$k >= 2 && self$k <= f_size) # size must be at least 2 points
+  stopifnot(self$seed_index <= f_size && self$seed_index > 0) # seed index must be within the range of the data indices
+  stopifnot(all(self$seed_method == "RAND") || all(self$seed_method == "SPEC") || all(self$seed_method == "ECC")) # must use one of available seed methods
+})
+
+## format ----
+NeighborhoodCover$set("public", "format", function(...){
+  titlecase <- function(x){
+    s <- strsplit(x, " ")[[1]]
+    paste(toupper(substring(s, 1, 1)), substring(s, 2), sep = "", collapse = " ")
+  }
+  sprintf("%s Cover: (k = %s, seed index = %s)", titlecase(private$.typename), self$k, self$seed_index)
+})
+
+## construct_cover ------
+NeighborhoodCover$set("public", "construct_cover", function(filter, index=NULL, dist_method = "euclidean"){
+  if (!requireNamespace("RANN", quietly = TRUE)){
+    stop("Package \"RANN\" is needed for to use this cover.", call. 
= FALSE) + } + self$validate(filter) + stopifnot(toupper(dist_method) %in% toupper(proxy::pr_DB$get_entry_names())) + + ## Get filter values + fv <- filter() + f_dim <- ncol(fv) + f_size <- nrow(fv) + + if(is.null(index)){ + ## Set the seed index if necessary + if(all(self$seed_method == "RAND")) { self$seed_index = sample(1:f_size, 1) } + if(all(self$seed_method == "ECC")) { self$seed_index = which.max(eccentricity(from=fv, x=fv)) } + + ## Compute the set of k-nhds + C = c() # create an empty list to store indices of centers + k_nhds = list() # create empty list of lists to store points in each neighborhood + ptsLeft = c(1:f_size) # keep track of indices that are still available to be chosen as a center + + nextC = self$seed_index # use the seed as the first center + while(TRUE){ + # add the new center to the list and compute its k-neighborhood + C = append(C, nextC) + neighbors = proxy::dist(matrix(fv[nextC,], ncol=f_dim), fv, method = dist_method) + sortedNeighbors = sort(neighbors) + + # include all points that are equidistant from the center, even if those points create a nhd size > k + i = 1 + while( ((self$k+i) <= f_size) && (sortedNeighbors[self$k] == sortedNeighbors[self$k + i]) ){i = i + 1} + nhd = order(neighbors)[1:(self$k + i - 1)] + k_nhds = append(k_nhds, list(nhd)) + + # points that are included in a nhd are no longer eligible to be chosen as centers + ptsLeft = setdiff(ptsLeft, nhd) + if(length(ptsLeft) <= 0){break} + + # select the next center + dists = proxy::dist(matrix(fv[C,], ncol=f_dim), matrix(fv[ptsLeft,], ncol=f_dim), method = dist_method) + sortedDists = matrix(apply(dists,2,sort),ncol=length(ptsLeft)) + max = which.max(sortedDists[1,]) + nextC = ptsLeft[max] + } + + ## Construct the index set and level sets + self$index_set = as.character(C) + self$level_sets = structure(k_nhds, names=self$index_set) + } + if (!missing(index)){ return(self$level_sets[[index]]) } + + ## Always return self + invisible(self) +}) diff --git a/R/landmarks.R 
b/R/landmarks.R index 4791643..14045cc 100644 --- a/R/landmarks.R +++ b/R/landmarks.R @@ -1,39 +1,69 @@ #' landmarks #' @description Computes a set of \code{n} landmarks of a point cloud. -#' @details This function uses the greedy maxmin procedure to produce a set of \code{n} evenly spaced landmark points in the data set -#' \code{x}. Maxmin is a simple greedy algorithm that is relatively efficient, but it has a tendency to pick out extremal points. -#' If the distance metric is euclidean, an efficient Rcpp implementation is used. If another metric is requested, -#' the algorithm is performed in R. -#' @param x a data matrix. -#' @param n the number of landmarks requested. +#' @details This function uses the greedy maxmin procedure to produce a set of \code{n} evenly spaced landmark points in the data set +#' \code{x}. Maxmin is a simple greedy algorithm that is relatively efficient, but it has a tendency to pick out extremal points. +#' If the distance metric is euclidean, an efficient Rcpp implementation is used. If another metric is requested, +#' the algorithm is performed in R. +#' Alternatively, a radius \deqn{\epsilon} can be specified so that the landmark set is computed to cover the space with +#' \deqn{\epsilon}-balls centered at the landmarks. +#' @param x a data matrix. +#' @param n the number of landmarks requested. +#' @param eps the desired radius of balls in the cover set. #' @param dist_method the distance metric to use. Any distance measure in the \code{proxy} package is supported. -#' @param seed_index the first landmark to seed the algorithm. +#' @param seed_index the first landmark to seed the algorithm. #' @param shuffle_data whether to first randomly shuffle the data. #' @references De Silva, Vin, and Gunnar E. Carlsson. "Topological estimation using witness complexes." SPBG 4 (2004): 157-166. +#' @references Dłotko, Paweł. "Ball Mapper: A Shape Summary for Topological Data Analysis." (2019). Web. 
#' @export -landmarks <- function(x, n, dist_method = "euclidean", seed_index = 1, shuffle_data=FALSE){ +landmarks <- function(x, n=NULL, eps=NULL, dist_method = "euclidean", seed_index = 1, shuffle_data=FALSE){ stopifnot(is.matrix(x)) - stopifnot(seed_index >= 1L && seed_index < nrow(x)) + stopifnot(seed_index >= 1L && seed_index <= nrow(x)) + stopifnot(!is.null(n) || !is.null(eps)) # must specify a number of balls or a radius + shuffle_idx <- NA - if (shuffle_data){ + if (shuffle_data){ shuffle_idx <- sample(seq(nrow(x))) - x <- x[,,drop=FALSE] + x <- x[,,drop=FALSE] } - if (missing(dist_method) || toupper(dist_method) == "EUCLIDEAN"){ - landmark_idx <- landmark_maxmin(x, n, seed_index-1L) - } else if (requireNamespace("proxy", quietly = TRUE)){ + + if(!is.null(n)){ + if (missing(dist_method) || toupper(dist_method) == "EUCLIDEAN"){ + landmark_idx <- landmark_maxmin(x, n, seed_index-1L) + } else if (requireNamespace("proxy", quietly = TRUE)){ + stopifnot(toupper(dist_method) %in% toupper(proxy::pr_DB$get_entry_names())) + landmark_idx <- vector(mode="integer", n) + landmark_idx[1L] <- seed_index + landmark_min_dist <- rep(Inf, nrow(x)) + for (i in 2L:n){ + landmark_dist <- proxy::dist(x, x[landmark_idx[i-1L],,drop=FALSE], method = dist_method) + landmark_min_dist <- pmin(landmark_dist, landmark_min_dist) + potential_idx <- setdiff(seq(nrow(x)), landmark_idx[c(1:i)]) + landmark_idx[i] <- potential_idx[which.max(landmark_min_dist[potential_idx])] + } + } else { + stop(sprintf("Unsupported distance method passed: %s\n", dist_method)) + } + } else if(!is.null(eps)){ stopifnot(toupper(dist_method) %in% toupper(proxy::pr_DB$get_entry_names())) - landmark_idx <- vector(mode="integer", n) - landmark_idx[1L] <- seed_index - landmark_min_dist <- rep(Inf, nrow(x)) - for (i in 2L:n){ - landmark_dist <- proxy::dist(x, x[landmark_idx[i-1L],,drop=FALSE], method = dist_method) - landmark_min_dist <- pmin(landmark_dist, landmark_min_dist) - potential_idx <- setdiff(seq(nrow(x)), 
landmark_idx[c(1:i)]) - landmark_idx[i] <- potential_idx[which.max(landmark_min_dist[potential_idx])] + f_dim <- ncol(x) + f_size <- nrow(x) + + # algorithm and variable names as specified in Dlotko paper + C = c() # create a list to store indices of centers/landmarks + next_pt = seed_index # first landmark should be the seed point + while(TRUE){ + C = append(C, next_pt) # add new point to list of landmarks + + # compute distance between landmark set and each point in the space + dists = proxy::dist(matrix(x[C,], ncol=f_dim), x, method = dist_method) + sortedDists = matrix(apply(dists,2,sort),ncol=f_size) + + # the next landmark is the point with greatest distance from the current landmark set + next_pt = which.max(sortedDists[1,]) + d = sortedDists[1,next_pt] # done when this max distance is < eps, i.e. when all pts are contained in an eps-ball + if(d < eps){break} } - } else { - stop(sprintf("Unsupported distance method passed: %s\n", dist_method)) + landmark_idx = C } if (is.na(shuffle_idx)){ landmark_idx } else { shuffle_idx[landmark_idx] } -} \ No newline at end of file +} diff --git a/R/mapperRef.R b/R/mapperRef.R index 93c3d5a..765ed4e 100644 --- a/R/mapperRef.R +++ b/R/mapperRef.R @@ -2,14 +2,14 @@ #' @name MapperRef #' @aliases Mapper #' @description `MapperRef` is an \link{R6} class built for parameterizing and computing mappers efficiently. -#' @section Details: To create a new \code{MapperRef} instance object, instantiate the class with the \code{\link[R6:R6Class]{R6 new}} operator. +#' @section Details: To create a new \code{MapperRef} instance object, instantiate the class with the \code{\link[R6:R6Class]{R6 new}} operator. #' Instantiation of a \code{MapperRef} objects requires a data matrix, or a function returning one. \cr -#' The primary output of the Mapper method is a simplicial complex. -#' \cr -#' The underlying complex does not need to be modified by the user, i.e. 
is completely maintained by \code{\link{MapperRef}} methods +#' The primary output of the Mapper method is a simplicial complex. +#' \cr +#' The underlying complex does not need to be modified by the user, i.e. is completely maintained by \code{\link{MapperRef}} methods #' (e.g. \code{\link{construct_nerve}}, \code{\link{construct_k_skeleton}}, etc.). #' @section Usage: -#' \preformatted{m = MapperRef$new(X)} +#' \preformatted{m = MapperRef$new(X)} #' @section Arguments: #' \itemize{ #' \item{\strong{X}}: The data, either as a matrix, or a function that returns a matrix. @@ -17,15 +17,15 @@ #' @section Fields: #' \itemize{ #' \item{\strong{X}}: The data, as a function. Evaluation returns the data as a matrix. -#' \item{\strong{filter}}: The filter, as a function. Evaluation returns the filtered data as a matrix. -#' \item{\strong{cover}}: The cover (a \code{\link{CoverRef}} derived object). +#' \item{\strong{filter}}: The filter, as a function. Evaluation returns the filtered data as a matrix. +#' \item{\strong{cover}}: The cover (a \code{\link{CoverRef}} derived object). #' \item{\strong{clustering_algorithm}}: The clustering algorithm to use in the pullback. -#' \item{\strong{measure}}: Distance measure to use to compute distances in ambient space. See \code{use_distance_measure} for more details. -#' \item{\strong{pullback}}: Mapping between the sets in the cover (by index) and the vertices (by id). -#' \item{\strong{vertices}}: The mapper vertices. +#' \item{\strong{measure}}: Distance measure to use to compute distances in ambient space. See \code{use_distance_measure} for more details. +#' \item{\strong{pullback}}: Mapping between the sets in the cover (by index) and the vertices (by id). +#' \item{\strong{vertices}}: The mapper vertices. #' \item{\strong{simplicial_complex}}: A \code{\link[simplextree:simplextree]{simplex tree}} object. 
#' } -#' +#' #' @section Methods: #' \itemize{ #' \item{\code{\link{use_filter}}: Specifies the filter.} @@ -41,9 +41,9 @@ #' } #' @section More information: #' Full documentation available \href{https://peekxc.github.io/Mapper}{online}. -#' +#' #' @return \code{\link{MapperRef}} instance equipped with methods for building the mapper. -#' +#' #' @import methods #' @importFrom Rcpp sourceCpp #' @author Matt Piekenbrock, \email{matt.piekenbrock@@gmail.com} @@ -53,23 +53,23 @@ NULL #' @export -MapperRef <- R6::R6Class("MapperRef", +MapperRef <- R6::R6Class("MapperRef", private = list( - .X = NULL, + .X = NULL, .filter = NULL, - .cover = NULL, - .clustering_algorithm = NULL, - .measure = "euclidean", - .measure_opt = NULL, + .cover = NULL, + .clustering_algorithm = NULL, + .measure = "euclidean", + .measure_opt = NULL, .pullback=list(), .vertices = list(), .simplicial_complex = NULL ), lock_class = FALSE, ## Feel free to add your own members - lock_objects = FALSE ## Or change existing ones + lock_objects = FALSE ## Or change existing ones ) -## To add a public member function +## To add a public member function ## add function ---- MapperRef$set("public", "add_function", function(name, FUN) { self[[name]] <- FUN @@ -78,9 +78,9 @@ MapperRef$set("public", "add_function", function(name, FUN) { ## The set index -> (vertex) decomposition mapping. ## pullback ---- -MapperRef$set("active", "pullback", - function(value){ - if (missing(value)){ private$.pullback } +MapperRef$set("active", "pullback", + function(value){ + if (missing(value)){ private$.pullback } else { private$.pullback <- value } } ) @@ -88,16 +88,16 @@ MapperRef$set("active", "pullback", ## cover ---- ## @title Mapper cover ## @name cover -## @description Every \code{\link{MapperRef}} object requires a \code{\link{CoverRef}} object as -## is \code{cover} member field. 
In the context of Mapper, a cover is used to discretize the filter -## space into a partition, which is then used via a \emph{pullback} operation to construct the vertices. \cr -## \cr -## The \code{\link{MapperRef}} class makes no restrictions on the cover that is used; only that it fulfills the +## @description Every \code{\link{MapperRef}} object requires a \code{\link{CoverRef}} object as +## is \code{cover} member field. In the context of Mapper, a cover is used to discretize the filter +## space into a partition, which is then used via a \emph{pullback} operation to construct the vertices. \cr +## \cr +## The \code{\link{MapperRef}} class makes no restrictions on the cover that is used; only that it fulfills the ## requirements of being a valid \code{\link{CoverRef}} instance. -## @seealso \code{\link{CoverRef}} -MapperRef$set("active", "cover", +## @seealso \code{\link{CoverRef}} +MapperRef$set("active", "cover", function(value){ #function(fv, type = c("restrained rectangular"), ...) - if (missing(value)){ private$.cover } + if (missing(value)){ private$.cover } else { stopifnot(inherits(value, "CoverRef")) private$.cover <- value @@ -108,7 +108,7 @@ MapperRef$set("active", "cover", ## Mapper stores the vertices as a list ## vertices ---- -MapperRef$set("active", "vertices", +MapperRef$set("active", "vertices", function(value){ if(missing(value)){ private$.vertices @@ -116,32 +116,32 @@ MapperRef$set("active", "vertices", private$.vertices <- value # stop("`$vertices` is read-only. To update the vertex membership, use the 'compute_vertices' function.", call. = FALSE) } - } + } ) ## simplicial_complex ---- -## @title Simplicial Complex -## @name simplicial_complex -## @description The relational information of the Mapper construction. -## @details The primary output of the Mapper method is a simplicial complex. With \code{\link{MapperRef}} objects, -## the simplicial complex is stored as a \code{\link[simplextree]{simplextree}}. 
-## \cr -## The underlying complex does not need to be modified by the user, i.e. is completely maintained by \code{\link{MapperRef}} methods +## @title Simplicial Complex +## @name simplicial_complex +## @description The relational information of the Mapper construction. +## @details The primary output of the Mapper method is a simplicial complex. With \code{\link{MapperRef}} objects, +## the simplicial complex is stored as a \code{\link[simplextree]{simplextree}}. +## \cr +## The underlying complex does not need to be modified by the user, i.e. is completely maintained by \code{\link{MapperRef}} methods ## (e.g. \code{\link{construct_nerve}}, \code{\link{construct_k_skeleton}}, etc.). -MapperRef$set("active", "simplicial_complex", +MapperRef$set("active", "simplicial_complex", function(value){ if (missing(value)){ private$.simplicial_complex } else { stopifnot(is(value, "Rcpp_SimplexTree")) - private$.simplicial_complex <- value + private$.simplicial_complex <- value # stop("`$simplicial_complex` is read-only. To change the complex, use the objects (simplex tree) methods directly.", call. 
= FALSE) } } ) ## clustering_algorithm ---- -## Clustering algorithm must be a function that takes as arguments 'pid' and 'idx' -MapperRef$set("active", "clustering_algorithm", +## Clustering algorithm must be a function that takes as arguments 'pid' and 'idx' +MapperRef$set("active", "clustering_algorithm", function(value){ if (missing(value)){ private$.clustering_algorithm } else { @@ -157,9 +157,9 @@ MapperRef$set("active", "data", function(value){ else { if (is.matrix(value)){ X_acc <- function(data_matrix){ - function(idx=NULL){ + function(idx=NULL){ if (missing(idx)){ return(data_matrix) } - return(data_matrix[idx,,drop=FALSE]) + return(data_matrix[idx,,drop=FALSE]) } } private$.X <- X_acc(value) @@ -170,19 +170,19 @@ MapperRef$set("active", "data", function(value){ } } }) - + ## X ---- ## The data should be held fixed -MapperRef$set("active", "X", +MapperRef$set("active", "X", function(value){ if (missing(value)){ return(private$.X) } else { if (is.matrix(value)){ X_acc <- function(data_matrix){ - function(idx=NULL){ + function(idx=NULL){ if (missing(idx)){ return(data_matrix) } - return(data_matrix[idx,,drop=FALSE]) + return(data_matrix[idx,,drop=FALSE]) } } private$.X <- X_acc(value) @@ -198,16 +198,16 @@ MapperRef$set("active", "X", ## filter ---- ## The filter function associated with the Mapper instance -MapperRef$set("active", "filter", +MapperRef$set("active", "filter", function(value){ if (missing(value)){ return(private$.filter) } else { # browser() if (is.matrix(value)){ filter_acc <- function(filter_matrix){ - function(idx=NULL){ + function(idx=NULL){ if (missing(idx) || is.null(idx)){ return(filter_matrix) } - return(filter_matrix[idx,,drop=FALSE]) + return(filter_matrix[idx,,drop=FALSE]) } } private$.filter <- filter_acc(value) @@ -236,7 +236,7 @@ MapperRef$set("active", "measure", ) ## initialize ---- -## Initialization method relies on active binding +## Initialization method relies on active binding MapperRef$set("public", "initialize", 
function(X = NULL){ private$.simplicial_complex <- simplextree::simplex_tree() self$use_distance_measure(measure = "euclidean") @@ -249,19 +249,19 @@ MapperRef$set("public", "initialize", function(X = NULL){ #' @name use_data #' @title Sets the data #' @description Sets the data matrix to associate with the mapper. -#' @param data either a matrix, a function, or a data set name. See details. +#' @param data either a matrix, a function, or a data set name. See details. #' @param ... additional parameters to pass to the filter function. -#' @details This function sets the data for the Mapper to work on. If \code{data} is a string, -#' it must be one of the (illustrative) data sets listed in \code{data(package="Mapper")}. -#' Otherwise, \code{data} must be either a matrix of coordinate values, a function that returns a matrix of +#' @details This function sets the data for the Mapper to work on. If \code{data} is a string, +#' it must be one of the (illustrative) data sets listed in \code{data(package="Mapper")}. +#' Otherwise, \code{data} must be either a matrix of coordinate values, a function that returns a matrix of #' coordinate values. MapperRef$set("public", "use_data", function(X){ if (is.character(X)){ stopifnot(X %in% c("noisy_circle", "wvs_us_wave6")) - self$X <- local({ + self$X <- local({ load(system.file(sprintf("data/%s.Rdata", X), package="Mapper")) X <- eval(parse(text=X)) - return(scale(as.matrix(X))) + return(scale(as.matrix(X))) }) } else { self$X <- X } return(invisible(self)) @@ -271,10 +271,10 @@ MapperRef$set("public", "use_data", function(X){ #' @name use_filter #' @title Sets the filter #' @description Sets the map, or \emph{filter}, to associate with the instance -#' @param filter either the filter name to use, a matrix, or a function. See details. +#' @param filter either the filter name to use, a matrix, or a function. See details. #' @param ... additional parameters to pass to the filter function. 
-#' @details \code{filter} must be either a matrix of coordinate values, a function that returns a matrix of -#' coordinate values, or one of the following predefined filters: +#' @details \code{filter} must be either a matrix of coordinate values, a function that returns a matrix of +#' coordinate values, or one of the following predefined filters: #' \itemize{ #' \item{\strong{PC}}{ Principle components (with \code{\link[stats:prcomp]{prcomp}})} #' \item{\strong{IC}}{ Independent components (with \code{\link[fastICA:fastICA]{fastICA}})} @@ -287,15 +287,15 @@ MapperRef$set("public", "use_data", function(X){ #' \item{\strong{UMAP}}{ Uniform Manifold Approximation and Projection (with \code{\link[umap:umap]{umap}})} #' } #' Nearly all the pre-configured filters essentially call functions in other packages with -#' somewhat reasonable default parameters to perform the mapping. Any parameters supplied to \code{...} -#' are passed to the corresponding package function linked above, which override any default parameters. -#' If the package needed to compute the filter is not installed, a prompt is given asking the user +#' somewhat reasonable default parameters to perform the mapping. Any parameters supplied to \code{...} +#' are passed to the corresponding package function linked above, which override any default parameters. +#' If the package needed to compute the filter is not installed, a prompt is given asking the user #' whether they would like to install it. \cr #' \cr -#' \strong{NOTE:} The predefined filters are meant to be used for exploratory or illustrative purposes only---this function +#' \strong{NOTE:} The predefined filters are meant to be used for exploratory or illustrative purposes only---this function #' is \emph{not} meant to act as a comprensive interface to the functions each filter corresponds too. 
MapperRef$set("public", "use_filter", function(filter=c("PC", "IC", "ECC", "KDE", "DTM", "MDS", "ISOMAP", "LE", "UMAP"), ...){ - if (is.function(filter)){ private$.filter <- filter } + if (is.function(filter)){ private$.filter <- filter } else if (is.matrix(filter)){ self$filter <- filter } else if (is.numeric(filter) && is.vector(filter)){ @@ -323,13 +323,13 @@ MapperRef$set("public", "use_filter", function(filter=c("PC", "IC", "ECC", "KDE" } return(params) } - + ## Basic filters if (filter == "PC"){ default_params <- list(x = self$X(), scale. = TRUE, center = TRUE, rank. = 2L) params <- modifyList(default_params, given_params) res <- do.call(stats::prcomp, params) - self$filter <- matrix(res$x, ncol = params[["rank."]]) + self$filter <- matrix(res$x, ncol = params[["rank."]]) # c("mapper_filter", "function") } else if (filter == "IC"){ @@ -338,15 +338,15 @@ MapperRef$set("public", "use_filter", function(filter=c("PC", "IC", "ECC", "KDE" params <- modifyList(default_params, given_params) res <- do.call(fastICA::fastICA, params) self$filter <- matrix(res$S, ncol = params[["n.comp"]]) ## S stores independent components - } + } else if (filter == "ECC"){ ## eccentricity has_pd <- requireNamespace("parallelDist", quietly = TRUE) dist_f <- ifelse(has_pd, parallelDist::parallelDist, stats::dist) params <- modifyList(list(p=1), given_params) p_str <- c("manhattan", "euclidean", "maximum")[match(params$p, list(1, 2, Inf))] - if (params$p != Inf) { + if (params$p != Inf) { self$filter <- matrix(colMeans(as.matrix(dist_f(self$X())^(1/params$p))), ncol = 1) - } else { self$filter <- matrix(apply(as.matrix(dist_f(self$X())), 2, max), ncol = 1) } + } else { self$filter <- matrix(apply(as.matrix(dist_f(self$X())), 2, max), ncol = 1) } } else if (filter == "KDE"){ require_but_ask("ks") @@ -403,39 +403,39 @@ MapperRef$set("public", "use_filter", function(filter=c("PC", "IC", "ECC", "KDE" ## use_distance_measure ---- #' @name use_distance_measure #' @title Assign a distance 
measure -#' @description Assigns a distance measure to the \code{\link{MapperRef}} instance to use in the clustering algorithm. +#' @description Assigns a distance measure to the \code{\link{MapperRef}} instance to use in the clustering algorithm. #' @section Usage: #' \preformatted{ $use_distance_measure(measure, ...) } -#' @section Arguments: +#' @section Arguments: #' \describe{ #' \item{\code{measure}}{The distance measure to use (string).} #' \item{\code{...}}{Extra parameters passed to the distance function. See details. } #' } -#' @section Details: Unless the \code{\link[Mapper:use_clustering_algorithm]{clustering_algorithm}} has been replaced by the user, -#' by default, Mapper requires a notion of distance between objects to be defined in creating the vertices of the construction. -#' +#' @section Details: Unless the \code{\link[Mapper:use_clustering_algorithm]{clustering_algorithm}} has been replaced by the user, +#' by default, Mapper requires a notion of distance between objects to be defined in creating the vertices of the construction. +#' #' The distance function is determined based on the supplied \code{measure}. \code{measure} must be one of: #' \preformatted{["euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"]} #' or, if the \pkg{proxy} and \code{\link[parallelDist]{parallelDist}} packages are installed, any name in \code{proxy::pr_DB$get_entry_names()}.\cr #' \cr -#' Additional parameters passed via \code{...} are passed to either \code{\link[stats]{dist}} (or -#' \code{\link[parallelDist]{parallelDist}} if installed). +#' Additional parameters passed via \code{...} are passed to either \code{\link[stats]{dist}} (or +#' \code{\link[parallelDist]{parallelDist}} if installed). #' @section Value: -#' The mapper instance, with the measure field assigned. -#' @examples +#' The mapper instance, with the measure field assigned. 
+#' @examples #' data(noisy_circle) #' m <- MapperRef$new(noisy_circle) #' m$use_filter(noisy_circle[,1]) #' m$use_cover("fixed interval", number_intervals = 5, percent_overlap = 25) -#' +#' #' ## Constructs clusters with euclidean metric (default) #' m$use_distance_measure("euclidean") -#' m$construct_pullback() -#' +#' m$construct_pullback() +#' #' ## Constructs clusters with p-norm (p = 1) #' m$use_distance_measure("Minkowski", p = 1L) -#' m$construct_pullback() -#' +#' m$construct_pullback() +#' #' \dontrun{ #' ## To see list of available measures, use: #' proxy::pr_DB$get_entry_names() @@ -448,26 +448,26 @@ MapperRef$set("public", "use_distance_measure", function(measure, ...){ }) ## use_cover ---- -#' @name use_cover -#' @title Selects one of the available covering methods to use. -#' @description -#' Convenience method to select, parameterize, and construct a cover and associate it with the calling objects \code{cover} field. -#' @param cover Either a pre-configured cover, or name of the cover to use. -#' @param ... Additional parameter values to pass to the covering generators initialize method. -#' @details Every \code{\link{MapperRef}} object requires a \code{\link{CoverRef}} object as -#' is \code{cover} member field. In the context of Mapper, a cover is used to discretize the filter -#' space into a partition, which is then used via a \emph{pullback} operation to construct the vertices. \cr -#' \cr -#' The \code{\link{MapperRef}} class makes no restrictions on the cover that is used; only that it fulfills the -#' requirements of being a valid \code{\link{CoverRef}} instance (e.g. a \code{\link{FixedIntervalCover}}), -#' or one of the cover typenames listed in the \code{\link{covers_available}()}. -#' If a typename is given, the cover is automatically constructed before being assigned to the \code{$cover} field. -#' @examples +#' @name use_cover +#' @title Selects one of the available covering methods to use. 
+#' @description +#' Convenience method to select, parameterize, and construct a cover and associate it with the calling objects \code{cover} field. +#' @param cover Either a pre-configured cover, or name of the cover to use. +#' @param ... Additional parameter values to pass to the covering generators initialize method. +#' @details Every \code{\link{MapperRef}} object requires a \code{\link{CoverRef}} object as +#' is \code{cover} member field. In the context of Mapper, a cover is used to discretize the filter +#' space into a partition, which is then used via a \emph{pullback} operation to construct the vertices. \cr +#' \cr +#' The \code{\link{MapperRef}} class makes no restrictions on the cover that is used; only that it fulfills the +#' requirements of being a valid \code{\link{CoverRef}} instance (e.g. a \code{\link{FixedIntervalCover}}), +#' or one of the cover typenames listed in the \code{\link{covers_available}()}. +#' If a typename is given, the cover is automatically constructed before being assigned to the \code{$cover} field. 
+#' @examples #' data(noisy_circle) #' m <- MapperRef$new(noisy_circle) #' m$use_filter(noisy_circle[,1]) #' m$use_cover("fixed interval", number_intervals = 5, percent_overlap = 25) -#' +#' #' ## Alternative way to specify (and construct) the cover #' cover <- FixedIntervalCover$new(number_intervals = 5, percent_overlap = 25) #' cover$construct_cover(filter = m$filter) @@ -475,73 +475,75 @@ MapperRef$set("public", "use_distance_measure", function(measure, ...){ MapperRef$set("public", "use_cover", function(cover="fixed interval", ...){ stopifnot(!is.null(self$filter)) if (missing(cover)){ cover <- "fixed interval"} - self$cover <- switch(cover, - "fixed interval"=FixedIntervalCover$new(...)$construct_cover(self$filter), + self$cover <- switch(cover, + "fixed interval"=FixedIntervalCover$new(...)$construct_cover(self$filter), "restrained interval"=RestrainedIntervalCover$new(...)$construct_cover(self$filter), # "adaptive"=AdaptiveCover$new(...)$construct_cover(self$filter), "ball"=BallCover$new(...)$construct_cover(self$filter), + "landmark_ball"=LandmarkBallCover$new(...)$construct_cover(self$filter), + "neighborhood"=NeighborhoodCover$new(...)$construct_cover(self$filter), stop(sprintf("Unknown cover type: %s, please specify a cover typename listed in `covers_available()`", cover)) ) invisible(self) }) -#@param filter_values (n x d) numeric matrix of values giving the results of the map. +#@param filter_values (n x d) numeric matrix of values giving the results of the map. ## use_clustering_algorithm ---- -#' @name use_clustering_algorithm -#' @title Sets the clustering algorithm +#' @name use_clustering_algorithm +#' @title Sets the clustering algorithm #' @description Sets the clustering algorithm used to construct the connected components in the pullback cover. #' @param cl Either one of the link criteria used in the \code{\link[stats]{hclust}} function (string) or a function. See details. #' @param cutoff_method Type of heuristic to determine cut value. 
See details. Ignored is \code{cl} is a function. -#' @param ... Additional parameters passed as defaults to the cutoff method. See details. -#' @details If \code{cl} is a linkage criterion, a standard hierarchical clustering algorithm is used with suitable default parameters. -#' If it is a function, it must have a signature compatible with \code{function(pid, idx, self, ...)} where -#' \code{pid} is the index of the (pullback) set to cluster (see \code{CoverRef}), +#' @param ... Additional parameters passed as defaults to the cutoff method. See details. +#' @details If \code{cl} is a linkage criterion, a standard hierarchical clustering algorithm is used with suitable default parameters. +#' If it is a function, it must have a signature compatible with \code{function(pid, idx, self, ...)} where +#' \code{pid} is the index of the (pullback) set to cluster (see \code{CoverRef}), #' \code{idx} are the indices of the points in \code{X} to cluster on, and -#' \code{self} is the \code{MapperRef} instance environment. +#' \code{self} is the \code{MapperRef} instance environment. #' \cr #' The cutoff method may be one of either c("continuous", "histogram"). See \code{\link{cutoff_first_bin}} and \code{\link{cutoff_first_threshold}} -#' for what they each correspond to, respectively. Additional named parameters passed via \code{...} act as defaults to the cutting method chosen. -#' If not chosen, reasonable defaults are used. \cr +#' for what they each correspond to, respectively. Additional named parameters passed via \code{...} act as defaults to the cutting method chosen. +#' If not chosen, reasonable defaults are used. \cr #' \cr -#' Additional named parameters passed via the dots in \code{$clustering_algorithm} -#' will taken precedence and override all previously set default settings. 
-MapperRef$set("public", "use_clustering_algorithm", - function(cl = c("single", "ward.D", "ward.D2", "complete", "average", "mcquitty", "median", "centroid"), +#' Additional named parameters passed via the dots in \code{$clustering_algorithm} +#' will taken precedence and override all previously set default settings. +MapperRef$set("public", "use_clustering_algorithm", + function(cl = c("single", "ward.D", "ward.D2", "complete", "average", "mcquitty", "median", "centroid"), cutoff_method = c("continuous", "histogram"), ...){ ## Use a linkage criterion + cutting rule default_cutting_params <- list(...) if (missing(cl)) { cl <- "single" } - if (missing(cutoff_method)){ cutoff_method <- "continuous" } + if (missing(cutoff_method)){ cutoff_method <- "continuous" } if (class(cl) == "character"){ hclust_opts <- c("single", "ward.D", "ward.D2", "complete", "average", "mcquitty", "median", "centroid") if (!cl %in% hclust_opts){ stop(sprint("Unknown linkage method passed. Please use one of (%s). See ?hclust for details.", paste0(hclust_opts, collapse = ", "))) } stopifnot(is.character(self$measure)) - ## Closured representation to substitute default parameters. + ## Closured representation to substitute default parameters. create_cl <- function(cl, cutoff_method, cut_defaults){ cutoff_f <- switch(cutoff_method, "histogram"=cutoff_first_bin, "continuous"=cutoff_first_threshold) function(pid, idx, self, ...){ if (is.null(idx) || length(idx) == 0){ return(integer(0L)) } if (length(idx) <= 2L){ return(rep(1L, length(idx))); } override_params <- list(...) 
- + ## Use parallelDist package if available has_pd <- requireNamespace("parallelDist", quietly = TRUE) dist_f <- ifelse(has_pd, parallelDist::parallelDist, stats::dist) dist_params <- list(x=self$X(idx), method=tolower(self$measure)) if (!is.null(private$.measure_opt)){ dist_params <- append(dist_params, private$.measure_opt) } dist_x <- do.call(dist_f, dist_params) - - ## Use fastcluster if available + + ## Use fastcluster if available has_fc <- requireNamespace("fastcluster", quietly = TRUE) cl_f <- ifelse(has_fc, fastcluster::hclust, stats::hclust) hcl <- do.call(cl_f, list(d=dist_x, method=cl)) - - cutoff_params <- if (cutoff_method == "histogram"){ - list(hcl = hcl, num_bins = 10L) + + cutoff_params <- if (cutoff_method == "histogram"){ + list(hcl = hcl, num_bins = 10L) } else { list(hcl = hcl, threshold = 0.0) } cutoff_params <- modifyList(modifyList(cutoff_params, cut_defaults), override_params) - + ## Use heuristic cutting value to produce the partitioning eps <- do.call(cutoff_f, cutoff_params) return(cutree(hcl, h = eps)) @@ -556,18 +558,18 @@ MapperRef$set("public", "use_clustering_algorithm", ) ## construct_pullback ---- -#' @name construct_pullback -#' @title Constructs the (decomposed) pullback cover. -#' @description Executes the clustering algorithm on subsets of \code{X}, +#' @name construct_pullback +#' @title Constructs the (decomposed) pullback cover. +#' @description Executes the clustering algorithm on subsets of \code{X}, #' @param pullback_ids indices of the \code{\link[Mapper:CoverRef]{covers}} \code{index_set}, or \code{NULL}. #' @param ... additional parameters to pass to the \code{clustering_algorithm}. -#' @details This methods uses the function given by \code{clustering_algorithm} field to -#' decompose the preimages returned by the \code{cover} member into connected components, which are -#' stored as \code{vertices}. 
Indices may be passed to limit which sets are decomposed, otherwise +#' @details This methods uses the function given by \code{clustering_algorithm} field to +#' decompose the preimages returned by the \code{cover} member into connected components, which are +#' stored as \code{vertices}. Indices may be passed to limit which sets are decomposed, otherwise #' the sets in the cover all considered. #' \cr -#' Note that this method removes the \code{vertices} associated with \code{pullback_ids}, but does not -#' modify the simplicial complex. +#' Note that this method removes the \code{vertices} associated with \code{pullback_ids}, but does not +#' modify the simplicial complex. #' @seealso \code{\link{construct_k_skeleton}} \code{\link{construct_nerve}} \code{\link{use_clustering_algorithm}} MapperRef$set("public", "construct_pullback", function(pullback_ids=NULL, ...){ stopifnot(!is.null(self$cover$level_sets)) @@ -575,7 +577,6 @@ MapperRef$set("public", "construct_pullback", function(pullback_ids=NULL, ...){ pids_supplied <- (!missing(pullback_ids) && !is.null(pullback_ids)) if (pids_supplied){ stopifnot(all(pullback_ids %in% self$cover$index_set)) } pullback_ids <- if (pids_supplied){ pullback_ids } else { self$cover$index_set } - ## If specific pullback ids not given, then resist the pullback mapping if (!pids_supplied || length(private$.pullback) == 0){ n_sets <- length(self$cover$index_set) @@ -583,10 +584,9 @@ MapperRef$set("public", "construct_pullback", function(pullback_ids=NULL, ...){ self$vertices <- list() private$.pullback <- structure(replicate(n_sets, integer(0)), names = self$cover$index_set) } - - ## Update the vertices for the given level sets. This requires updating both the simplex tree's - ## internal representation as well as the outward-facing vertex list. The new vertices are returned - ## to replace the current list. The pullback map is also updated. + ## Update the vertices for the given level sets. 
This requires updating both the simplex tree's + ## internal representation as well as the outward-facing vertex list. The new vertices are returned + ## to replace the current list. The pullback map is also updated. calc_preimage <- function(){ function(index){ self$cover$construct_cover(self$filter, index) } } @@ -603,29 +603,27 @@ MapperRef$set("public", "construct_pullback", function(pullback_ids=NULL, ...){ # private$.simplicial_complex$insert(as.list(ids)) # return(ids) # } - ## Do the decomposition private$.vertices <- Mapper:::decompose_preimages( pullback_ids = as.character(pullback_ids), - cluster_f = partial_cluster(...), - level_set_f = calc_preimage(), + cluster_f = partial_cluster(...), + level_set_f = calc_preimage(), vertices = private$.vertices, # id_generator = id_gen, pullback = private$.pullback ) - ## Return self invisible(self) }) ## construct_nerve ---- -#' @name construct_nerve -#' @title Compute the nerve of the cover. -#' @description Computes (or updates) the k-simplices composing the Mapper, where k >= 0. -#' @param k The order of the simplices to construct. See details. +#' @name construct_nerve +#' @title Compute the nerve of the cover. +#' @description Computes (or updates) the k-simplices composing the Mapper, where k >= 0. +#' @param k The order of the simplices to construct. See details. #' @param indices (n x k) matrix of indices of the covers index set to update. #' @param min_weight minimum intersection size to consider as a simplex. Defaults to 1. -#' @details Compared to \code{construct_k_skeleton}, this method \emph{only} intersections +#' @details Compared to \code{construct_k_skeleton}, this method \emph{only} intersections #' between (k-1) simplices in the complex. 
MapperRef$set("public", "construct_nerve", function(k, indices = NULL, min_weight=1L, modify=TRUE){ stopifnot(length(private$.vertices) > 0) @@ -639,49 +637,49 @@ MapperRef$set("public", "construct_nerve", function(k, indices = NULL, min_weigh ## If k==0 is specified, just build the vertices stree_ptr <- private$.simplicial_complex$as_XPtr() - if (k == 0){ - if (!modify){ return(as.integer(names(self$vertices))) } + if (k == 0){ + if (!modify){ return(as.integer(names(self$vertices))) } build_0_skeleton(as.integer(names(self$vertices)), stree_ptr) return(invisible(self)) } - - ## Retrieve the valid level set index pairs to compare. In the worst case, with no cover-specific optimization, + + ## Retrieve the valid level set index pairs to compare. In the worst case, with no cover-specific optimization, ## this may just be all pairwise combinations of LSFI's for the full simplicial complex. indices <- if (idx_specified){ indices } else { self$cover$neighborhood(self$filter, k) } - + ## If no indices to compute, nothing to do for k > 1. return self invisibly. 
if (is.null(indices)){ return(invisible(self)) } - + ## Build the parameter list params <- list(pullback_ids = indices, vertices = self$vertices, pullback = private$.pullback, stree = stree_ptr, modify = modify) - if (k == 1){ - params <- modifyList(params, list(min_sz = min_weight)) - if (!params$modify){ return(do.call(rbind, do.call(build_1_skeleton, params)) ) } + if (k == 1){ + params <- modifyList(params, list(min_sz = min_weight)) + if (!params$modify){ return(do.call(rbind, do.call(build_1_skeleton, params)) ) } do.call(build_1_skeleton, params) return(invisible(self)) - } else { - params <- modifyList(params, list(k = k)) - if (!params$modify){ return(do.call(rbind, do.call(build_k_skeleton, params)) ) } + } else { + params <- modifyList(params, list(k = k)) + if (!params$modify){ return(do.call(rbind, do.call(build_k_skeleton, params)) ) } do.call(build_k_skeleton, params) return(invisible(self)) - } + } }) ## construct_k_skeleton ---- -#' @name construct_k_skeleton +#' @name construct_k_skeleton #' @title Constructs the k-skeleton #' @param k the maximal dimension to consider. #' @description Computes the k-skeleton of the mapper by computing the nerve of the pullback of \code{cover} member. -#' @details -#' The primary output of the Mapper method is a simplicial complex. With \code{\link{MapperRef}} objects, -#' the simplicial complex is stored as a \code{\link[simplextree]{simplextree}}. The underlying complex does not need to be modified by the user, i.e. is completely maintained by \code{\link{MapperRef}} methods -#' (e.g. this method, \code{\link{construct_nerve}}, etc.). \cr -#' \cr -#' This function computes the k-skeleton inductively, e.g. by first computing the vertices, -#' then the edges, etc. up to the dimension specified. A check is performed to ensure the pullback has been decomposed, and +#' @details +#' The primary output of the Mapper method is a simplicial complex. 
With \code{\link{MapperRef}} objects, +#' the simplicial complex is stored as a \code{\link[simplextree]{simplextree}}. The underlying complex does not need to be modified by the user, i.e. is completely maintained by \code{\link{MapperRef}} methods +#' (e.g. this method, \code{\link{construct_nerve}}, etc.). \cr +#' \cr +#' This function computes the k-skeleton inductively, e.g. by first computing the vertices, +#' then the edges, etc. up to the dimension specified. A check is performed to ensure the pullback has been decomposed, and #' if not, then \code{\link{construct_pullback}} is called. \cr #' \cr -#' For an algorithmic description of this process, see Singh et. al, section 3.2. +#' For an algorithmic description of this process, see Singh et. al, section 3.2. #' @references Gurjeet Singh, Facundo Mémoli, and Gunnar Carlsson. "Topological methods for the analysis of high dimensional data sets and 3d object recognition." SPBG. 2007. MapperRef$set("public", "construct_k_skeleton", function(k=1L){ stopifnot(k >= 0) @@ -705,19 +703,19 @@ MapperRef$set("public", "format", function(...){ }) ## as_igraph ---- -#' @name as_igraph +#' @name as_igraph #' @title Exports Mapper as an igraph object. #' @description Exports the 1-skeleton to a graph using the igraph library. -#' @param vertex_scale scaling function for the vertex sizes. -#' @param vertex_min minimum vertex size. -#' @param vertex_min maximum vertex size. -#' @param col_pal color palette to color the vertices by. -#' @details This method converts the 1-skeleton of the Mapper to an igraph object, and assigns some -#' default visual properties. Namely, the vertex attributes "color", "size", and "label" and the -#' edge attribute "color" are assigned. -#' The vertex colors are colored according on the given color palette (default is rainbow) according -#' to their mean filter value (see \code{\link{bin_color}}). 
The vertex sizes are scaled according -#' to the number of points they contain, scaled by \code{vertex_scale}, and bounded between +#' @param vertex_scale scaling function for the vertex sizes. +#' @param vertex_min minimum vertex size. +#' @param vertex_min maximum vertex size. +#' @param col_pal color palette to color the vertices by. +#' @details This method converts the 1-skeleton of the Mapper to an igraph object, and assigns some +#' default visual properties. Namely, the vertex attributes "color", "size", and "label" and the +#' edge attribute "color" are assigned. +#' The vertex colors are colored according on the given color palette (default is rainbow) according +#' to their mean filter value (see \code{\link{bin_color}}). The vertex sizes are scaled according +#' to the number of points they contain, scaled by \code{vertex_scale}, and bounded between #' (\code{vertex_min}, \code{vertex_max}). The vertex labels are in the format ":".\cr #' \cr #' The edges are colored similarly by the average filter value of the points intersecting @@ -728,15 +726,15 @@ MapperRef$set("public", "as_igraph", function(vertex_scale=c("linear", "log"), v am <- private$.simplicial_complex$as_adjacency_matrix() colnames(am) <- as.character(private$.simplicial_complex$vertices) G <- igraph::graph_from_adjacency_matrix(am, mode = "undirected", add.colnames = NULL) ## NULL makes named vertices - + ## Coloring + aggregation functions - agg_val <- function(lst) { sapply(sapply(lst, function(idx){ rowMeans(self$filter(idx)) }), mean) } - + agg_val <- function(lst) { sapply(sapply(lst, function(idx){ rowMeans(self$filter(idx)) }), mean) } + ## Aggregate node filter values v_idx <- match(private$.simplicial_complex$vertices, as.integer(names(self$vertices))) agg_node_val <- agg_val(private$.vertices) igraph::vertex_attr(G, name = "color") <- bin_color(agg_node_val[v_idx], col_pal = col_pal) - + ## Extract indices in the edges edges <- igraph::as_edgelist(G) edge_idx <- 
lapply(seq(nrow(edges)), function(i){ @@ -745,9 +743,9 @@ MapperRef$set("public", "as_igraph", function(vertex_scale=c("linear", "log"), v }) agg_edge_val <- agg_val(edge_idx) igraph::edge_attr(G, name = "color") <- bin_color(agg_edge_val, col_pal = col_pal) - + ## Normalize between 0-1, unless all the same - normalize <- function(x) { + normalize <- function(x) { if (all(x == x[1])){ return(rep(1, length(x))) } else { (x - min(x))/(max(x) - min(x)) } } @@ -784,75 +782,75 @@ MapperRef$set("public", "as_igraph", function(vertex_scale=c("linear", "log"), v # top_vids <- names(m$vertices)[order(v_len, decreasing = TRUE)][1:v_idx] # avg_v_f <- sapply(vids, function(vid) { mean(rowMeans(m$filter(m$vertices[[vid]]))) }) # row_col <- bin_color(avg_v_f)[match(top_vids, vids)] -# # list("matrix_rows, colors = c(Boston = "green", NYC = "navy", LA = "purple"), +# # list("matrix_rows, colors = c(Boston = "green", NYC = "navy", LA = "purple"), # # alpha = 0.5))) # vids_int <- as.integer(top_vids) -# -# +# +# # ## Color intersections by f # res_f <- m$simplicial_complex$ltraverse(empty_face, function(simplex){ # if (all(simplex %in% vids_int)){ -# list(simplex = simplex, f_val = mean(avg_v_f[match(simplex, vids)])) +# list(simplex = simplex, f_val = mean(avg_v_f[match(simplex, vids)])) # } -# }, type = "dfs") +# }, type = "dfs") # res_f <- Filter(function(x) { !is.null(x) }, res_f) -# params <- lapply(res_f, function(el){ -# list(query = intersects, +# params <- lapply(res_f, function(el){ +# list(query = intersects, # params = as.list(paste0("v", as.character(el$simplex))), -# color = bin_color(avg_v_f)[match(el$simplex, vids)], +# color = bin_color(avg_v_f)[match(el$simplex, vids)], # active = FALSE # ) # }) # sapply(params, function(p) unlist(p[[2]])) -# +# # # queries = list(list(query = intersects, params = list("Drama"), color = "red", active = F) # us_df <- as.data.frame(x_groups) # meta_data <- data.frame(sets=as.factor(paste0("v", top_vids))) # meta_data$cat_id <- 
as.character(paste0("v", top_vids)) # id_color_map <- structure(row_col, names=as.character(meta_data$cat_id)) -# UpSetR::upset(data = us_df, nsets = v_idx, -# sets.x.label = "Node size", -# scale.intersections = "identity", +# UpSetR::upset(data = us_df, nsets = v_idx, +# sets.x.label = "Node size", +# scale.intersections = "identity", # set.metadata = list( # data=meta_data, # plots=list(list(type="matrix_rows", column="cat_id", colors=id_color_map, alpha=0.5)) # ) # ) -# -# # queries = wut, +# +# # queries = wut, # # queries = list(list(query = intersects, params = list("v43"), color = "red", active = F)), -# #queries = list(list(query = intersects, params = list("v43", "v37"), color = "red", active = F)), +# #queries = list(list(query = intersects, params = list("v43", "v37"), color = "red", active = F)), # mb.ratio = c(0.35, 0.65)) # sets.bar.color = row_col -# -# -# -# +# +# +# +# # }) # MapperRef$set("public", "as_grapher", function(construct_widget=TRUE, ...){ # requireNamespace("grapher", quietly = TRUE) -# -# ## Make the igraph +# +# ## Make the igraph # am <- private$.simplicial_complex$as_adjacency_matrix() -# G <- igraph::graph_from_adjacency_matrix(am, mode = "undirected", add.colnames = NA) +# G <- igraph::graph_from_adjacency_matrix(am, mode = "undirected", add.colnames = NA) # json_config <- grapher::getDefaultJsonConfig(network=G) -# +# # ## Color nodes and edges by a default rainbow palette # rbw_pal <- rev(rainbow(100, start = 0, end = 4/6)) -# agg_node_val <- sapply(sapply(private$.vertices, function(v_idx){ +# agg_node_val <- sapply(sapply(private$.vertices, function(v_idx){ # apply(as.matrix(self$cover$filter_values[v_idx,]), 1, mean) # }), mean) # binned_idx <- cut(agg_node_val, breaks = 100, labels = F) # vertex_colors <- rbw_pal[binned_idx] -# vertex_sizes <- sapply(private$.vertices, length) -# +# vertex_sizes <- sapply(private$.vertices, length) +# # ## Normalize between 0-1, unless all the same -# normalize <- function(x) { +# normalize 
<- function(x) { # if (all(x == x[1])){ return(rep(1, length(x))) } # else { (x - min(x))/(max(x) - min(x)) } # } -# +# # ## Create the vertices. By default, color them on a rainbow palette according to their mean filter value. # if (length(igraph::V(G)) > 0){ # vertex_radii <- (7L - 2L)*normalize(vertex_sizes) + 2L @@ -865,7 +863,7 @@ MapperRef$set("public", "as_igraph", function(vertex_scale=c("linear", "log"), v # } else { # json_config$nodes <- integer(length = 0L) # } -# +# # ## Create the edges w/ a similar coloring scheme. # if (length(igraph::E(G)) > 0){ # el <- igraph::as_edgelist(G, names = FALSE) @@ -873,14 +871,14 @@ MapperRef$set("public", "as_igraph", function(vertex_scale=c("linear", "log"), v # edge_links <- matrix(apply(el, 2, as.integer) - 1L, ncol = 2) # json_config$links$from <- edge_links[,1] # json_config$links$to <- edge_links[,2] -# json_config$links$color <- substr(rbw_pal[edge_binned_idx], start = 0, stop = 7) +# json_config$links$color <- substr(rbw_pal[edge_binned_idx], start = 0, stop = 7) # } else { # json_config$links <- integer(length = 0L) # } -# -# ## Return the grapher instance or just the json config if requested +# +# ## Return the grapher instance or just the json config if requested # if (construct_widget){ -# grapher::grapher(json_config) +# grapher::grapher(json_config) # } else { # return(json_config) # } @@ -889,22 +887,22 @@ MapperRef$set("public", "as_igraph", function(vertex_scale=c("linear", "log"), v ## as_pixiplex ---- #' @name as_pixiplex #' @title Exports the complex as a pixiplex object. -#' @description Uses the pixiplex library. +#' @description Uses the pixiplex library. 
MapperRef$set("public", "as_pixiplex", function(...){ requireNamespace("pixiplex", quietly = TRUE) pp <- pixiplex::pixiplex(private$.simplicial_complex) - + ## Default styling rbw_pal <- rev(rainbow(100, start = 0, end = 4/6)) - agg_node_val <- sapply(sapply(private$.vertices, function(v_idx){ + agg_node_val <- sapply(sapply(private$.vertices, function(v_idx){ rowMeans(self$filter(v_idx)) }), mean) binned_idx <- cut(agg_node_val, breaks = 100, labels = F) vertex_colors <- rbw_pal[binned_idx] - vertex_sizes <- sapply(private$.vertices, length) - + vertex_sizes <- sapply(private$.vertices, length) + ## Normalize between 0-1, unless all the same - normalize <- function(x) { + normalize <- function(x) { if (all(x == x[1])){ return(rep(1, length(x))) } else { (x - min(x))/(max(x) - min(x)) } } @@ -919,20 +917,20 @@ MapperRef$set("public", "as_pixiplex", function(...){ #' @title Exports minimal information about the Mapper #' @description This function exports a few core characteristics about the mapper complex as a list. #' @param graph_type export preference on the structure the graph output. -#' @return List with the following members: +#' @return List with the following members: #' \itemize{ #' \item \emph{vertices} list of the indices of the original data the current vertex intersects with. #' \item \emph{graph} some adjacency representation of the mapper graph. #' \item \emph{pullback} map connecting the index set of the cover and the vertices. #' } -#' @details \code{graph_type} must be one of 'adjacency_matrix', 'adjacency_list', or 'edgelist'. +#' @details \code{graph_type} must be one of 'adjacency_matrix', 'adjacency_list', or 'edgelist'. 
MapperRef$set("public", "exportMapper", function(graph_type=c("adjacency_matrix", "adjacency_list", "edgelist")){ result <- list(pullback = private$.pullback, vertices = private$.vertices) if (missing(graph_type)){ graph_type <- "adjacency_matrix" } - result$graph <- switch(graph_type, + result$graph <- switch(graph_type, "adjacency_matrix"=self$simplicial_complex$as_adjacency_matrix(), - "adjacency_list"=self$simplicial_complex$as_adjacency_list(), - "edgelist"=self$simplicial_complex$as_edge_list(), + "adjacency_list"=self$simplicial_complex$as_adjacency_list(), + "edgelist"=self$simplicial_complex$as_edge_list(), stop("'graph_type' must be one of: 'adjacency_matrix', 'adjacency_list', 'edgelist'")) n_simplices <- self$simplicial_complex$n_simplices x_dim <- ncol(self$X()) @@ -944,4 +942,3 @@ MapperRef$set("public", "exportMapper", function(graph_type=c("adjacency_matrix" class(result) <- "Mapper" return(result) }) - diff --git a/src/landmarks.cpp b/src/landmarks.cpp index 19e1588..b414b17 100644 --- a/src/landmarks.cpp +++ b/src/landmarks.cpp @@ -12,7 +12,7 @@ inline double sq_dist(const NumericVector& x, const NumericVector& y){ } // Eccentricity -// Computes a distance measure from the barycenter of the data, or from a given center, if provided. +// Computes a distance measure from the barycenter of the data, or from a given center, if provided. // [[Rcpp::export]] NumericVector eccentricity(const NumericMatrix& from, const NumericMatrix& x, const int type = 1) { if (from.ncol() != x.ncol()){ stop("Matrices must have identical number of columns."); } @@ -29,25 +29,26 @@ NumericVector eccentricity(const NumericMatrix& from, const NumericMatrix& x, co return(wrap(pt_ecc)); } -// Uses the euclidean maxmin procedure to choose n landmarks. +// Uses the euclidean maxmin procedure to choose n landmarks. 
// [[Rcpp::export]] IntegerVector landmark_maxmin(const NumericMatrix& x, const int n, const int seed = 0) { const size_t n_pts = x.nrow(); std::vector< double > lm_dist(n_pts, std::numeric_limits::infinity()); IntegerVector landmark_idx = no_init_vector(n); - landmark_idx[seed] = 0; + //landmark_idx[seed] = 0; + landmark_idx[0] = seed; IntegerVector::iterator c_landmark = landmark_idx.begin(); double new_min_dist; std::generate(landmark_idx.begin()+1, landmark_idx.end(), [&](){ size_t i = 0; new_min_dist = std::numeric_limits::infinity(); - + // Replace nearest landmark distances std::replace_if(lm_dist.begin(), lm_dist.end(), [&i, &x, &c_landmark, &new_min_dist](const double c_dist){ new_min_dist = sq_dist(x.row(*c_landmark), x.row(i++)); return(new_min_dist < c_dist); }, new_min_dist); - + // Find the point that maximizes said distances, move to next landmark c_landmark++; auto max_landmark = std::max_element(lm_dist.begin(), lm_dist.end());