diff --git a/DESCRIPTION b/DESCRIPTION index e428f59..f9c9654 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,8 @@ Imports: purrr, Matrix, broom, - methods + methods, + tibble Suggests: ggraph, igraph, diff --git a/NAMESPACE b/NAMESPACE index e41bb8c..8e93aec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(multi_scale) export(pairwise_cor) export(pairwise_cor_) export(pairwise_count) diff --git a/R/multi_scale.R b/R/multi_scale.R new file mode 100644 index 0000000..88d0199 --- /dev/null +++ b/R/multi_scale.R @@ -0,0 +1,60 @@ +#' Multidimensional Scaling of documents separated by a distance measure. +#' +#' Given a tbl or data frame of pairwise distances between documents, scale each document +#' to a *k* dimensional space that ensures the distance between all documents is maintained. +#' **NOTE:** Doesnt work when the pairwise distance tbl is formed using \code{upper = FALSE}. +#' +#' @param tbl Table obtained by running a pairwise distance method \code{pairwise_delta} or \code{pairwise_dist} +#' @param item1 first item +#' @param item2 second item +#' @param value Value +#' @param k number of dimensions, defaults to 2 +#' +#' @return Returns a function that takes at least four arguments: +#' \item{item}{Column to store documents or items separated by various distances as used prior to calling \code{multi_scale()}} +#' \item{V1}{First Dimension} +#' \item{V2}{Second Dimension} +#' \item{...}{Other Dimensions as specified by k's value} +#' +#' @examples +#' +#' library(janeaustenr) +#' library(dplyr) +#' library(tidyr) +#' library(tidytext) +#' +#' # closest documents in terms of 1000 most frequent words +#' austen_delta <- austen_books() %>% +#' unnest_tokens(word, text) %>% +#' count(book, word) %>% +#' pairwise_delta(book, word, n) +#' +#' austen_delta +#' +#' austen_delta %>% +#' multi_scale(item1, item2, delta) +#' +#' @export + +multi_scale <- function(tbl, item1, item2, value, k = 2) { + multi_scale_(tbl, + col_name(substitute(item1)), + col_name(substitute(item2)), + col_name(substitute(value)), + k = 2) +} + + +multi_scale_ <- function(tbl, item1, item2, value, k = 2) { + tbl_matrix <- tbl %>% + tidyr::spread(item2, col_name(value), fill = 0) %>% + as.data.frame() %>% + tibble::remove_rownames() %>% + tibble::column_to_rownames("item1") %>% + as.matrix() + + stats::cmdscale(tbl_matrix, k = k) %>% + as.data.frame() %>% + tibble::rownames_to_column("item") %>% + tibble::as_tibble() +} diff --git a/R/pairwise_cor.R b/R/pairwise_cor.R index 8b238cc..5a3e8a1 100644 --- a/R/pairwise_cor.R +++ b/R/pairwise_cor.R @@ -37,8 +37,8 @@ #' #' @export pairwise_cor <- function(tbl, item, feature, value, - method = c("pearson", "kendall", "spearman"), - use = "everything", ...) { + method = c("pearson", "kendall", "spearman"), + use = "everything", ...) { if (missing(value)) { tbl$..value <- 1 val <- "..value" @@ -47,19 +47,19 @@ pairwise_cor <- function(tbl, item, feature, value, } pairwise_cor_(tbl, - col_name(substitute(item)), - col_name(substitute(feature)), - val, - method = method, use = use, ...) + col_name(substitute(item)), + col_name(substitute(feature)), + val, + method = method, use = use, ...) } #' @rdname pairwise_cor #' @export pairwise_cor_ <- function(tbl, item, feature, value, - method = c("pearson", "kendall", "spearman"), - use = "everything", - ...) { + method = c("pearson", "kendall", "spearman"), + use = "everything", + ...) { method <- match.arg(method) sparse <- (method == "pearson" & use == "everything") @@ -68,7 +68,7 @@ pairwise_cor_ <- function(tbl, item, feature, value, } else { function(x) stats::cor(t(x), method = method, use = use) } - cor_func <- squarely_(f, sparse = sparse, ...) + cor_func <- squarely_(f, sparse = sparse, fill_value = NA, ...) tbl %>% ungroup() %>% diff --git a/R/widely.R b/R/widely.R index 01c4497..4fbb37f 100644 --- a/R/widely.R +++ b/R/widely.R @@ -11,6 +11,7 @@ #' non-sparse matrix to be created. Set to NULL to allow any size #' matrix. #' @param sparse Whether to cast to a sparse matrix +#' @param fill_value value to be used to replace NAs when converted from long to wide format. #' #' @return Returns a function that takes at least four arguments: #' \item{tbl}{A table} @@ -50,12 +51,16 @@ widely <- function(.f, sort = FALSE, sparse = FALSE, - maximum_size = 1e7) { + maximum_size = 1e7, + fill_value = 0) { function(tbl, row, column, value, ...) { + + inner_func <- widely_(.f, sort = sort, sparse = sparse, - maximum_size = maximum_size) + maximum_size = maximum_size, + fill_value = fill_value) inner_func(tbl, col_name(substitute(row)), @@ -71,7 +76,8 @@ widely <- function(.f, widely_ <- function(.f, sort = FALSE, sparse = FALSE, - maximum_size = 1e7) { + maximum_size = 1e7, + fill_value = 0) { f <- function(tbl, row, column, value, ...) { if (inherits(tbl, "grouped_df")) { # perform within each group, then restore groups @@ -87,7 +93,7 @@ widely_ <- function(.f, if (!sparse) { if (!is.null(maximum_size)) { matrix_size <- (length(unique(tbl[[row]])) * - length(unique(tbl[[column]]))) + length(unique(tbl[[column]]))) if (matrix_size > maximum_size) { stop("Size of acast matrix, ", matrix_size, " will be too large. Set maximum_size = NULL to avoid ", @@ -98,7 +104,8 @@ widely_ <- function(.f, form <- stats::as.formula(paste(row, column, sep = " ~ ")) - input <- reshape2::acast(tbl, form, value.var = value, fill = 0) + input <- reshape2::acast(tbl, form, value.var = value, fill = fill_value) + } else { input <- tidytext::cast_sparse_(tbl, row, column, value) } diff --git a/man/multi_scale.Rd b/man/multi_scale.Rd new file mode 100644 index 0000000..e3f1785 --- /dev/null +++ b/man/multi_scale.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multi_scale.R +\name{multi_scale} +\alias{multi_scale} +\title{Multidimensional Scaling of documents separated by a distance measure.} +\usage{ +multi_scale(tbl, item1, item2, value, k = 2) +} +\arguments{ +\item{tbl}{Table obtained by running a pairwise distance method \code{pairwise_delta} or \code{pairwise_dist}} + +\item{item1}{first item} + +\item{item2}{second item} + +\item{value}{Value} + +\item{k}{number of dimensions, defaults to 2} +} +\value{ +Returns a function that takes at least four arguments: + \item{item}{Column to store documents or items separated by various distances as used prior to calling \code{multi_scale()}} + \item{V1}{First Dimension} + \item{V2}{Second Dimension} + \item{...}{Other Dimensions as specified by k's value} +} +\description{ +Given a tbl or data frame of pairwise distances between documents, scale each document +to a *k* dimensional space that ensures the distance between all documents is maintained. +**NOTE:** Doesnt work when the pairwise distance tbl is formed using \code{upper = FALSE}. +} +\examples{ + +library(janeaustenr) +library(dplyr) +library(tidyr) +library(tidytext) + +# closest documents in terms of 1000 most frequent words +austen_delta <- austen_books() \%>\% + unnest_tokens(word, text) \%>\% + count(book, word) \%>\% + pairwise_delta(book, word, n) + +austen_delta + +austen_delta \%>\% + multi_scale(item1, item2, delta) + +} diff --git a/man/widely.Rd b/man/widely.Rd index 573cf7b..dc8d25f 100644 --- a/man/widely.Rd +++ b/man/widely.Rd @@ -6,9 +6,11 @@ \title{Adverb for functions that operate on matrices in "wide" format} \usage{ -widely(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07) +widely(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07, + fill_value = 0) -widely_(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07) +widely_(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07, + fill_value = 0) } \arguments{ \item{.f}{Function being wrapped} @@ -20,6 +22,8 @@ widely_(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07) \item{maximum_size}{To prevent crashing, a maximum size of a non-sparse matrix to be created. Set to NULL to allow any size matrix.} + +\item{fill_value}{value to be used to replace NAs when converted from long to wide format.} } \value{ Returns a function that takes at least four arguments: